[llvm] r222544 - Add a feature flag for slow 32-byte unaligned memory accesses [x86].
Sanjay Patel
spatel at rotateright.com
Fri Nov 21 09:40:04 PST 2014
Author: spatel
Date: Fri Nov 21 11:40:04 2014
New Revision: 222544
URL: http://llvm.org/viewvc/llvm-project?rev=222544&view=rev
Log:
Add a feature flag for slow 32-byte unaligned memory accesses [x86].
This patch adds a feature flag to avoid unaligned 32-byte load/store AVX codegen
for Sandy Bridge and Ivy Bridge. There is no functionality change intended for
those chips. Previously, the absence of AVX2 was being used as a proxy to detect
this feature. But that hindered codegen for AVX-enabled AMD chips such as btver2
that do not have the 32-byte unaligned access slowdown.
Performance measurements are included in PR21541 ( http://llvm.org/bugs/show_bug.cgi?id=21541 ).
Differential Revision: http://reviews.llvm.org/D6355
Added:
llvm/trunk/test/CodeGen/X86/unaligned-32-byte-memops.ll
Removed:
llvm/trunk/test/CodeGen/X86/2012-05-19-avx2-store.ll
Modified:
llvm/trunk/lib/Target/X86/X86.td
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/lib/Target/X86/X86Subtarget.cpp
llvm/trunk/lib/Target/X86/X86Subtarget.h
Modified: llvm/trunk/lib/Target/X86/X86.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86.td?rev=222544&r1=222543&r2=222544&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86.td (original)
+++ llvm/trunk/lib/Target/X86/X86.td Fri Nov 21 11:40:04 2014
@@ -82,6 +82,9 @@ def FeatureSlowSHLD : SubtargetFeature<"
def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem",
"IsUAMemFast", "true",
"Fast unaligned memory access">;
+def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32",
+ "IsUAMem32Slow", "true",
+ "Slow unaligned 32-byte memory access">;
def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true",
"Support SSE 4a instructions",
[FeatureSSE3]>;
@@ -271,12 +274,14 @@ def : ProcessorModel<"westmere", SandyBr
// rather than a superset.
def : ProcessorModel<"corei7-avx", SandyBridgeModel,
[FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem,
- FeaturePOPCNT, FeatureAES, FeaturePCLMUL]>;
+ FeatureSlowUAMem32, FeaturePOPCNT, FeatureAES,
+ FeaturePCLMUL]>;
// Ivy Bridge
def : ProcessorModel<"core-avx-i", SandyBridgeModel,
[FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem,
- FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND,
- FeatureF16C, FeatureFSGSBase]>;
+ FeatureSlowUAMem32, FeaturePOPCNT, FeatureAES,
+ FeaturePCLMUL, FeatureRDRAND, FeatureF16C,
+ FeatureFSGSBase]>;
// Haswell
def : ProcessorModel<"core-avx2", HaswellModel,
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=222544&r1=222543&r2=222544&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Nov 21 11:40:04 2014
@@ -24376,11 +24376,12 @@ static SDValue PerformLOADCombine(SDNode
SDLoc dl(Ld);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- // On Sandybridge unaligned 256bit loads are inefficient.
+ // For chips with slow 32-byte unaligned loads, break the 32-byte operation
+ // into two 16-byte operations.
ISD::LoadExtType Ext = Ld->getExtensionType();
unsigned Alignment = Ld->getAlignment();
bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
- if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
+ if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
!DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
unsigned NumElems = RegVT.getVectorNumElements();
if (NumElems < 2)
@@ -24423,13 +24424,11 @@ static SDValue PerformSTORECombine(SDNod
SDValue StoredVal = St->getOperand(1);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- // If we are saving a concatenation of two XMM registers, perform two stores.
- // On Sandy Bridge, 256-bit memory operations are executed by two
- // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
- // memory operation.
+ // If we are saving a concatenation of two XMM registers and 32-byte stores
+ // are slow, such as on Sandy Bridge, perform two 16-byte stores.
unsigned Alignment = St->getAlignment();
bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
- if (VT.is256BitVector() && !Subtarget->hasInt256() &&
+ if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
StVT == VT && !IsAligned) {
unsigned NumElems = VT.getVectorNumElements();
if (NumElems < 2)
Modified: llvm/trunk/lib/Target/X86/X86Subtarget.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Subtarget.cpp?rev=222544&r1=222543&r2=222544&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86Subtarget.cpp Fri Nov 21 11:40:04 2014
@@ -264,6 +264,7 @@ void X86Subtarget::initializeEnvironment
IsBTMemSlow = false;
IsSHLDSlow = false;
IsUAMemFast = false;
+ IsUAMem32Slow = false;
HasVectorUAMem = false;
HasCmpxchg16b = false;
UseLeaForSP = false;
Modified: llvm/trunk/lib/Target/X86/X86Subtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Subtarget.h?rev=222544&r1=222543&r2=222544&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.h (original)
+++ llvm/trunk/lib/Target/X86/X86Subtarget.h Fri Nov 21 11:40:04 2014
@@ -159,6 +159,9 @@ protected:
/// IsUAMemFast - True if unaligned memory access is fast.
bool IsUAMemFast;
+ /// True if unaligned 32-byte memory accesses are slow.
+ bool IsUAMem32Slow;
+
/// HasVectorUAMem - True if SIMD operations can have unaligned memory
/// operands. This may require setting a feature bit in the processor.
bool HasVectorUAMem;
@@ -374,6 +377,7 @@ public:
bool isBTMemSlow() const { return IsBTMemSlow; }
bool isSHLDSlow() const { return IsSHLDSlow; }
bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
+ bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }
bool hasVectorUAMem() const { return HasVectorUAMem; }
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
bool useLeaForSP() const { return UseLeaForSP; }
Removed: llvm/trunk/test/CodeGen/X86/2012-05-19-avx2-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/2012-05-19-avx2-store.ll?rev=222543&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/2012-05-19-avx2-store.ll (original)
+++ llvm/trunk/test/CodeGen/X86/2012-05-19-avx2-store.ll (removed)
@@ -1,13 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx2 | FileCheck %s
-
-define void @double_save(<4 x i32>* %Ap, <4 x i32>* %Bp, <8 x i32>* %P) nounwind ssp {
-entry:
- ; CHECK: vmovaps
- ; CHECK: vinsertf128 $1, ([[A0:%rdi|%rsi]]),
- ; CHECK: vmovups
- %A = load <4 x i32>* %Ap
- %B = load <4 x i32>* %Bp
- %Z = shufflevector <4 x i32>%A, <4 x i32>%B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- store <8 x i32> %Z, <8 x i32>* %P, align 16
- ret void
-}
Added: llvm/trunk/test/CodeGen/X86/unaligned-32-byte-memops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/unaligned-32-byte-memops.ll?rev=222544&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/unaligned-32-byte-memops.ll (added)
+++ llvm/trunk/test/CodeGen/X86/unaligned-32-byte-memops.ll Fri Nov 21 11:40:04 2014
@@ -0,0 +1,46 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=SANDYB
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=SANDYB
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=HASWELL
+
+; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte load
+; because that is slower than two 16-byte loads.
+; Other AVX-capable chips don't have that problem.
+
+define <8 x float> @load32bytes(<8 x float>* %Ap) {
+ ; CHECK-LABEL: load32bytes
+
+ ; SANDYB: vmovaps
+ ; SANDYB: vinsertf128
+ ; SANDYB: retq
+
+ ; BTVER2: vmovups
+ ; BTVER2: retq
+
+ ; HASWELL: vmovups
+ ; HASWELL: retq
+
+ %A = load <8 x float>* %Ap, align 16
+ ret <8 x float> %A
+}
+
+; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte store
+; because that is slowerthan two 16-byte stores.
+; Other AVX-capable chips don't have that problem.
+
+define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
+ ; CHECK-LABEL: store32bytes
+
+ ; SANDYB: vextractf128
+ ; SANDYB: vmovaps
+ ; SANDYB: retq
+
+ ; BTVER2: vmovups
+ ; BTVER2: retq
+
+ ; HASWELL: vmovups
+ ; HASWELL: retq
+
+ store <8 x float> %A, <8 x float>* %P, align 16
+ ret void
+}
More information about the llvm-commits
mailing list