[llvm] r295697 - [X86] Use SHLD with both inputs from the same register to implement rotate on Sandy Bridge and later Intel CPUs
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 20 22:39:13 PST 2017
Author: ctopper
Date: Tue Feb 21 00:39:13 2017
New Revision: 295697
URL: http://llvm.org/viewvc/llvm-project?rev=295697&view=rev
Log:
[X86] Use SHLD with both inputs from the same register to implement rotate on Sandy Bridge and later Intel CPUs
Summary:
Sandy Bridge and later CPUs have better throughput using a SHLD to implement rotate versus the normal rotate instructions. Additionally it saves one uop and avoids a partial flag update dependency.
This patch implements this change on any Sandy Bridge or later processor without BMI2 instructions. With BMI2 we will use RORX as we currently do.
Reviewers: zvi
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D30181
Modified:
llvm/trunk/lib/Target/X86/X86.td
llvm/trunk/lib/Target/X86/X86InstrInfo.td
llvm/trunk/lib/Target/X86/X86InstrShiftRotate.td
llvm/trunk/lib/Target/X86/X86Subtarget.cpp
llvm/trunk/lib/Target/X86/X86Subtarget.h
llvm/trunk/test/CodeGen/X86/rot32.ll
llvm/trunk/test/CodeGen/X86/rot64.ll
Modified: llvm/trunk/lib/Target/X86/X86.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86.td?rev=295697&r1=295696&r2=295697&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86.td (original)
+++ llvm/trunk/lib/Target/X86/X86.td Tue Feb 21 00:39:13 2017
@@ -263,6 +263,15 @@ def FeatureFastLZCNT
"fast-lzcnt", "HasFastLZCNT", "true",
"LZCNT instructions are as fast as most simple integer ops">;
+
+// Sandy Bridge and newer processors can use SHLD with the same source on both
+// inputs to implement rotate to avoid the partial flag update of the normal
+// rotate instructions.
+def FeatureFastSHLDRotate
+ : SubtargetFeature<
+ "fast-shld-rotate", "HasFastSHLDRotate", "true",
+ "SHLD can be used as a faster rotate">;
+
//===----------------------------------------------------------------------===//
// X86 processors supported.
//===----------------------------------------------------------------------===//
@@ -458,7 +467,8 @@ def SNBFeatures : ProcessorFeatures<[],
FeatureXSAVE,
FeatureXSAVEOPT,
FeatureLAHFSAHF,
- FeatureFastScalarFSQRT
+ FeatureFastScalarFSQRT,
+ FeatureFastSHLDRotate
]>;
class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.td?rev=295697&r1=295696&r2=295697&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.td Tue Feb 21 00:39:13 2017
@@ -897,6 +897,7 @@ def FavorMemIndirectCall : Predicate<"!
def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
+def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
def HasMFence : Predicate<"Subtarget->hasMFence()">;
//===----------------------------------------------------------------------===//
Modified: llvm/trunk/lib/Target/X86/X86InstrShiftRotate.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrShiftRotate.td?rev=295697&r1=295696&r2=295697&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrShiftRotate.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrShiftRotate.td Tue Feb 21 00:39:13 2017
@@ -846,6 +846,15 @@ def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
} // Defs = [EFLAGS]
+// Sandy Bridge and newer Intel processors support faster rotates using
+// SHLD to avoid a partial flag update on the normal rotate instructions.
+let Predicates = [HasFastSHLDRotate], AddedComplexity = 5 in {
+ def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
+ (SHLD32rri8 GR32:$src, GR32:$src, imm:$shamt)>;
+ def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
+ (SHLD64rri8 GR64:$src, GR64:$src, imm:$shamt)>;
+}
+
def ROT32L2R_imm8 : SDNodeXForm<imm, [{
// Convert a ROTL shamt to a ROTR shamt on 32-bit integer.
return getI8Imm(32 - N->getZExtValue(), SDLoc(N));
Modified: llvm/trunk/lib/Target/X86/X86Subtarget.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Subtarget.cpp?rev=295697&r1=295696&r2=295697&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86Subtarget.cpp Tue Feb 21 00:39:13 2017
@@ -302,6 +302,7 @@ void X86Subtarget::initializeEnvironment
HasFastScalarFSQRT = false;
HasFastVectorFSQRT = false;
HasFastLZCNT = false;
+ HasFastSHLDRotate = false;
HasSlowDivide32 = false;
HasSlowDivide64 = false;
PadShortFunctions = false;
Modified: llvm/trunk/lib/Target/X86/X86Subtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Subtarget.h?rev=295697&r1=295696&r2=295697&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.h (original)
+++ llvm/trunk/lib/Target/X86/X86Subtarget.h Tue Feb 21 00:39:13 2017
@@ -229,6 +229,9 @@ protected:
/// True if LZCNT instruction is fast.
bool HasFastLZCNT;
+ /// True if SHLD based rotate is fast.
+ bool HasFastSHLDRotate;
+
/// True if the short functions should be padded to prevent
/// a stall when returning too early.
bool PadShortFunctions;
@@ -466,6 +469,7 @@ public:
bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
bool hasFastLZCNT() const { return HasFastLZCNT; }
+ bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
bool hasSlowDivide32() const { return HasSlowDivide32; }
bool hasSlowDivide64() const { return HasSlowDivide64; }
bool padShortFunctions() const { return PadShortFunctions; }
Modified: llvm/trunk/test/CodeGen/X86/rot32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/rot32.ll?rev=295697&r1=295696&r2=295697&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/rot32.ll (original)
+++ llvm/trunk/test/CodeGen/X86/rot32.ll Tue Feb 21 00:39:13 2017
@@ -1,4 +1,5 @@
; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=corei7-avx | FileCheck %s --check-prefix=SHLD
; RUN: llc < %s -march=x86 -mcpu=core-avx2 | FileCheck %s --check-prefix=BMI2
define i32 @foo(i32 %x, i32 %y, i32 %z) nounwind readnone {
@@ -49,6 +50,8 @@ define i32 @xfoo(i32 %x, i32 %y, i32 %z)
entry:
; CHECK-LABEL: xfoo:
; CHECK: roll $7
+; SHLD-LABEL: xfoo:
+; SHLD: shldl $7
; BMI2-LABEL: xfoo:
; BMI2: rorxl $25
%0 = lshr i32 %x, 25
@@ -61,6 +64,8 @@ define i32 @xfoop(i32* %p) nounwind read
entry:
; CHECK-LABEL: xfoop:
; CHECK: roll $7
+; SHLD-LABEL: xfoop:
+; SHLD: shldl $7
; BMI2-LABEL: xfoop:
; BMI2: rorxl $25
%x = load i32, i32* %p
@@ -84,6 +89,8 @@ define i32 @xun(i32 %x, i32 %y, i32 %z)
entry:
; CHECK-LABEL: xun:
; CHECK: roll $25
+; SHLD-LABEL: xun:
+; SHLD: shldl $25
; BMI2-LABEL: xun:
; BMI2: rorxl $7
%0 = lshr i32 %x, 7
@@ -96,6 +103,8 @@ define i32 @xunp(i32* %p) nounwind readn
entry:
; CHECK-LABEL: xunp:
; CHECK: roll $25
+; SHLD-LABEL: xunp:
+; SHLD: shldl $25
; BMI2-LABEL: xunp:
; BMI2: rorxl $7
%x = load i32, i32* %p
Modified: llvm/trunk/test/CodeGen/X86/rot64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/rot64.ll?rev=295697&r1=295696&r2=295697&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/rot64.ll (original)
+++ llvm/trunk/test/CodeGen/X86/rot64.ll Tue Feb 21 00:39:13 2017
@@ -1,4 +1,5 @@
; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s --check-prefix=SHLD
; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s --check-prefix=BMI2
define i64 @foo(i64 %x, i64 %y, i64 %z) nounwind readnone {
@@ -49,6 +50,8 @@ define i64 @xfoo(i64 %x, i64 %y, i64 %z)
entry:
; CHECK-LABEL: xfoo:
; CHECK: rolq $7
+; SHLD-LABEL: xfoo:
+; SHLD: shldq $7
; BMI2-LABEL: xfoo:
; BMI2: rorxq $57
%0 = lshr i64 %x, 57
@@ -61,6 +64,8 @@ define i64 @xfoop(i64* %p) nounwind read
entry:
; CHECK-LABEL: xfoop:
; CHECK: rolq $7
+; SHLD-LABEL: xfoop:
+; SHLD: shldq $7
; BMI2-LABEL: xfoop:
; BMI2: rorxq $57
%x = load i64, i64* %p
@@ -84,6 +89,8 @@ define i64 @xun(i64 %x, i64 %y, i64 %z)
entry:
; CHECK-LABEL: xun:
; CHECK: rolq $57
+; SHLD-LABEL: xun:
+; SHLD: shldq $57
; BMI2-LABEL: xun:
; BMI2: rorxq $7
%0 = lshr i64 %x, 7
@@ -96,6 +103,8 @@ define i64 @xunp(i64* %p) nounwind readn
entry:
; CHECK-LABEL: xunp:
; CHECK: rolq $57
+; SHLD-LABEL: xunp:
+; SHLD: shldq $57
; BMI2-LABEL: xunp:
; BMI2: rorxq $7
%x = load i64, i64* %p
More information about the llvm-commits
mailing list