[llvm] r311740 - [x86] Back out one aspect of r311318: don't generically set

Thu Aug 24 17:56:05 PDT 2017

Author: chandlerc
Date: Thu Aug 24 17:56:05 2017
New Revision: 311740

URL: http://llvm.org/viewvc/llvm-project?rev=311740&view=rev
Log:
[x86] Back out one aspect of r311318: don't generically set
FeatureSlowUAMem32.

The idea was to mark things that are slow on widely available processors
as slow in the generic CPU so that the code generated for that CPU would
be fast across those processors. However, for this feature that doesn't
work out very well at all.

The problem here is that you can very easily enable AVX or AVX2 on top
of this generic CPU. For example, this can happen just by using AVX2
intrinsics from Clang within a region of code guarded by a dynamic CPU
feature test. When you do that, the generated code with SlowUAMem32 set
is ... amazingly slower. The problem is that there really aren't very
good alternatives to the unaligned loads, and so our vector codegen
regresses significantly.

The other issue is that there are plenty of AMD CPUs with AVX1 that
don't set FeatureSlowUAMem32 and so we shouldn't just check for AVX2
instead of this special feature. =/

It would be nice to have the target attriute logic be able to
enable/disable more than just one feature at a time and control this in
a more fine grained and useful way, but that doesn't seem easy. Given
that it is only Sandybridge and Ivybridge that set this feature, for now
I'm just backing it out of the generic CPU. That has the additional
advantage of going back to the previous state that people seemed vaguely
happy with.

Modified:
    llvm/trunk/lib/Target/X86/X86.td
    llvm/trunk/test/CodeGen/X86/avx-schedule.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll

Modified: llvm/trunk/lib/Target/X86/X86.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86.td?rev=311740&r1=311739&r2=311740&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86.td (original)
+++ llvm/trunk/lib/Target/X86/X86.td Thu Aug 24 17:56:05 2017
@@ -874,8 +874,7 @@ def : ProcessorModel<"x86-64", SandyBrid
   Feature64Bit,
   FeatureSlow3OpsLEA,
   FeatureSlowBTMem,
-  FeatureSlowIncDec,
-  FeatureSlowUAMem32
+  FeatureSlowIncDec
 ]>;
 
 //===----------------------------------------------------------------------===//

Modified: llvm/trunk/test/CodeGen/X86/avx-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx-schedule.ll?rev=311740&r1=311739&r2=311740&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx-schedule.ll Thu Aug 24 17:56:05 2017
@@ -752,9 +752,7 @@ define <8 x float> @test_cvtdq2ps(<8 x i
 ; GENERIC-LABEL: test_cvtdq2ps:
 ; GENERIC:       # BB#0:
 ; GENERIC-NEXT:    vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
-; GENERIC-NEXT:    vmovaps (%rdi), %xmm1 # sched: [6:0.50]
-; GENERIC-NEXT:    vinsertf128 $1, 16(%rdi), %ymm1, %ymm1 # sched: [7:0.50]
-; GENERIC-NEXT:    vcvtdq2ps %ymm1, %ymm1 # sched: [3:1.00]
+; GENERIC-NEXT:    vcvtdq2ps (%rdi), %ymm1 # sched: [10:1.00]
 ; GENERIC-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -1956,11 +1954,9 @@ define <8 x float> @test_movsldup(<8 x f
 define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) {
 ; GENERIC-LABEL: test_movupd:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vmovups (%rdi), %xmm0 # sched: [6:0.50]
-; GENERIC-NEXT:    vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; GENERIC-NEXT:    vmovupd (%rdi), %ymm0 # sched: [7:0.50]
 ; GENERIC-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; GENERIC-NEXT:    vextractf128 $1, %ymm0, 16(%rsi) # sched: [5:1.00]
-; GENERIC-NEXT:    vmovupd %xmm0, (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT:    vmovupd %ymm0, (%rsi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movupd:
@@ -2001,11 +1997,9 @@ define <4 x double> @test_movupd(<4 x do
 define <8 x float> @test_movups(<8 x float> *%a0, <8 x float> *%a1) {
 ; GENERIC-LABEL: test_movups:
 ; GENERIC:       # BB#0:
-; GENERIC-NEXT:    vmovups (%rdi), %xmm0 # sched: [6:0.50]
-; GENERIC-NEXT:    vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; GENERIC-NEXT:    vmovups (%rdi), %ymm0 # sched: [7:0.50]
 ; GENERIC-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; GENERIC-NEXT:    vextractf128 $1, %ymm0, 16(%rsi) # sched: [5:1.00]
-; GENERIC-NEXT:    vmovups %xmm0, (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT:    vmovups %ymm0, (%rsi) # sched: [5:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_movups:

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll?rev=311740&r1=311739&r2=311740&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll Thu Aug 24 17:56:05 2017
@@ -299,8 +299,7 @@ define <4 x i32> @test_v16i32_0_1_2_12 (
 define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) {
 ; ALL-LABEL: shuffle_v16f32_extract_256:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovups 32(%rsi), %xmm0
-; ALL-NEXT:    vinsertf128 $1, 48(%rsi), %ymm0, %ymm0
+; ALL-NEXT:    vmovups 32(%rsi), %ymm0
 ; ALL-NEXT:    retq
   %ptr_a = bitcast float* %a to <16 x float>*
   %v_a = load <16 x float>, <16 x float>* %ptr_a, align 4