[llvm] r195383 - SHLD/SHRD are VectorPath (microcode) instructions known to have poor latency on certain architectures. While generating SHLD/SHRD instructions is acceptable when optimizing for size, optimizing for speed on these platforms should be implemented using alternative sequences of instructions composed of ADD, ADC, SHR, SHL, OR, and LEA, which are DirectPath instructions. These alternative instructions not only have lower latency but also increase decode bandwidth by allowing simultaneous decoding of a third DirectPath instruction.

Ekaterina Romanova katya_romanova at playstation.sony.com
Thu Nov 21 15:21:27 PST 2013


Author: kromanova
Date: Thu Nov 21 17:21:26 2013
New Revision: 195383

URL: http://llvm.org/viewvc/llvm-project?rev=195383&view=rev
Log:
SHLD/SHRD are VectorPath (microcode) instructions known to have poor latency on certain architectures. While generating SHLD/SHRD instructions is acceptable when optimizing for size, optimizing for speed on these platforms should be implemented using alternative sequences of instructions composed of ADD, ADC, SHR, SHL, OR, and LEA, which are DirectPath instructions. These alternative instructions not only have lower latency but also increase decode bandwidth by allowing simultaneous decoding of a third DirectPath instruction.

AMD's processor families K7, K8, K10, K12, K15, and K16 are known to have SHLD/SHRD instructions with very poor latency. The optimization guides for these processors recommend using an alternative sequence of instructions. For these AMD processors, I disabled the folding of (or (x << c) | (y >> (64 - c))) when we are not optimizing for size.

It might be beneficial to disable this folding for some of Intel's processors as well. However, since I couldn't find specific recommendations about the use of SHLD/SHRD instructions on Intel's processors, I haven't disabled this peephole for Intel.
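
As a rough illustration (not taken from the commit itself; register choices are
illustrative and assume the SysV x86-64 calling convention with 'a' in %rdi and
'b' in %rsi), the two possible code sequences for a constant double-precision
shift such as (a << 1) | (b >> 63) look like this:

    # SHLD form, still emitted when optimizing for size:
    shldq   $1, %rsi, %rdi        # rdi = (a << 1) | (b >> 63)

    # Alternative DirectPath sequence, emitted for speed on slow-SHLD targets
    # (this is what the CHECK lines in the new tests below expect):
    addq    %rdi, %rdi            # a << 1, done as an add
    shrq    $63, %rsi             # b >> 63
    leaq    (%rsi,%rdi), %rax     # combine the halves; add equals or here
                                  # because the low bit of (a << 1) is zero

Since "slow-shld" is an ordinary subtarget feature, it should also be possible
to toggle this behavior explicitly with -mattr=+slow-shld or -mattr=-slow-shld
when experimenting with llc.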


Added:
    llvm/trunk/test/CodeGen/X86/x86-64-double-precision-shift-left.ll
    llvm/trunk/test/CodeGen/X86/x86-64-double-precision-shift-right.ll
    llvm/trunk/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
    llvm/trunk/test/CodeGen/X86/x86-64-double-shifts-var.ll
Modified:
    llvm/trunk/lib/Target/X86/X86.td
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/lib/Target/X86/X86Subtarget.cpp
    llvm/trunk/lib/Target/X86/X86Subtarget.h

Modified: llvm/trunk/lib/Target/X86/X86.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86.td?rev=195383&r1=195382&r2=195383&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86.td (original)
+++ llvm/trunk/lib/Target/X86/X86.td Thu Nov 21 17:21:26 2013
@@ -73,6 +73,8 @@ def FeatureCMPXCHG16B : SubtargetFeature
                                       [Feature64Bit]>;
 def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
                                        "Bit testing of memory is slow">;
+def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
+                                       "SHLD instruction is slow">;
 def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem",
                                         "IsUAMemFast", "true",
                                         "Fast unaligned memory access">;
@@ -268,46 +270,53 @@ def : ProcessorModel<"knl", HaswellModel
 def : Proc<"k6",              [FeatureMMX]>;
 def : Proc<"k6-2",            [Feature3DNow]>;
 def : Proc<"k6-3",            [Feature3DNow]>;
-def : Proc<"athlon",          [Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-tbird",    [Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-4",        [FeatureSSE1,   Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-xp",       [FeatureSSE1,   Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-mp",       [FeatureSSE1,   Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon",          [Feature3DNowA, FeatureSlowBTMem, 
+                               FeatureSlowSHLD]>;
+def : Proc<"athlon-tbird",    [Feature3DNowA, FeatureSlowBTMem,
+                               FeatureSlowSHLD]>;
+def : Proc<"athlon-4",        [FeatureSSE1,   Feature3DNowA, FeatureSlowBTMem, 
+                               FeatureSlowSHLD]>;
+def : Proc<"athlon-xp",       [FeatureSSE1,   Feature3DNowA, FeatureSlowBTMem,
+                               FeatureSlowSHLD]>;
+def : Proc<"athlon-mp",       [FeatureSSE1,   Feature3DNowA, FeatureSlowBTMem,
+                               FeatureSlowSHLD]>;
 def : Proc<"k8",              [FeatureSSE2,   Feature3DNowA, Feature64Bit,
-                               FeatureSlowBTMem]>;
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
 def : Proc<"opteron",         [FeatureSSE2,   Feature3DNowA, Feature64Bit,
-                               FeatureSlowBTMem]>;
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
 def : Proc<"athlon64",        [FeatureSSE2,   Feature3DNowA, Feature64Bit,
-                               FeatureSlowBTMem]>;
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
 def : Proc<"athlon-fx",       [FeatureSSE2,   Feature3DNowA, Feature64Bit,
-                               FeatureSlowBTMem]>;
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
 def : Proc<"k8-sse3",         [FeatureSSE3,   Feature3DNowA, FeatureCMPXCHG16B,
-                               FeatureSlowBTMem]>;
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
 def : Proc<"opteron-sse3",    [FeatureSSE3,   Feature3DNowA, FeatureCMPXCHG16B,
-                               FeatureSlowBTMem]>;
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
 def : Proc<"athlon64-sse3",   [FeatureSSE3,   Feature3DNowA, FeatureCMPXCHG16B,
-                               FeatureSlowBTMem]>;
+                               FeatureSlowBTMem, FeatureSlowSHLD]>;
 def : Proc<"amdfam10",        [FeatureSSE4A,
                                Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT,
-                               FeaturePOPCNT, FeatureSlowBTMem]>;
+                               FeaturePOPCNT, FeatureSlowBTMem,
+                               FeatureSlowSHLD]>;
 // Bobcat
 def : Proc<"btver1",          [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B,
-                               FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT]>;
+                               FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT,
+                               FeatureSlowSHLD]>;
 // Jaguar
 def : Proc<"btver2",          [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B,
                                FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
                                FeatureBMI, FeatureF16C, FeatureMOVBE,
-                               FeatureLZCNT, FeaturePOPCNT]>;
+                               FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>;
 // Bulldozer
 def : Proc<"bdver1",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
                                FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
-                               FeatureLZCNT, FeaturePOPCNT]>;
+                               FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>;
 // Piledriver
 def : Proc<"bdver2",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
                                FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
                                FeatureF16C, FeatureLZCNT,
-                               FeaturePOPCNT, FeatureBMI,  FeatureTBM,
-                               FeatureFMA]>;
+                               FeaturePOPCNT, FeatureBMI, FeatureTBM,
+                               FeatureFMA, FeatureSlowSHLD]>;
 
 // Steamroller
 def : Proc<"bdver3",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=195383&r1=195382&r2=195383&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Nov 21 17:21:26 2013
@@ -17892,6 +17892,18 @@ static SDValue PerformOrCombine(SDNode *
     return SDValue();
 
   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool OptForSize = MF.getFunction()->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+ 
+  // SHLD/SHRD instructions have lower register pressure, but on some 
+  // platforms they have higher latency than the equivalent 
+  // series of shifts/or that would otherwise be generated. 
+  // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
+  // have higher latencies and we are not optimizing for size.
+  if (!OptForSize && Subtarget->isSHLDSlow())
+    return SDValue();
+
   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
     std::swap(N0, N1);
   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)

Modified: llvm/trunk/lib/Target/X86/X86Subtarget.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Subtarget.cpp?rev=195383&r1=195382&r2=195383&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86Subtarget.cpp Thu Nov 21 17:21:26 2013
@@ -263,6 +263,15 @@ void X86Subtarget::AutoDetectSubtargetFe
       ToggleFeature(X86::FeatureSlowBTMem);
     }
 
+    // Determine if SHLD/SHRD instructions have higher latency than the
+    // equivalent series of shifts/or instructions. 
+    // FIXME: Add Intel's processors that have SHLD instructions with very
+    // poor latency. 
+    if (IsAMD) {
+      IsSHLDSlow = true;
+      ToggleFeature(X86::FeatureSlowSHLD);
+    }
+
     // If it's an Intel chip since Nehalem and not an Atom chip, unaligned
     // memory access is fast. We hard code model numbers here because they
     // aren't strictly increasing for Intel chips it seems.
@@ -514,6 +523,7 @@ void X86Subtarget::initializeEnvironment
   HasPRFCHW = false;
   HasRDSEED = false;
   IsBTMemSlow = false;
+  IsSHLDSlow = false;
   IsUAMemFast = false;
   HasVectorUAMem = false;
   HasCmpxchg16b = false;

Modified: llvm/trunk/lib/Target/X86/X86Subtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Subtarget.h?rev=195383&r1=195382&r2=195383&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.h (original)
+++ llvm/trunk/lib/Target/X86/X86Subtarget.h Thu Nov 21 17:21:26 2013
@@ -142,6 +142,9 @@ protected:
   /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
   bool IsBTMemSlow;
 
+  /// IsSHLDSlow - True if SHLD instructions are slow.
+  bool IsSHLDSlow;
+
   /// IsUAMemFast - True if unaligned memory access is fast.
   bool IsUAMemFast;
 
@@ -292,6 +295,7 @@ public:
   bool hasPRFCHW() const { return HasPRFCHW; }
   bool hasRDSEED() const { return HasRDSEED; }
   bool isBTMemSlow() const { return IsBTMemSlow; }
+  bool isSHLDSlow() const { return IsSHLDSlow; }
   bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
   bool hasVectorUAMem() const { return HasVectorUAMem; }
   bool hasCmpxchg16b() const { return HasCmpxchg16b; }

Added: llvm/trunk/test/CodeGen/X86/x86-64-double-precision-shift-left.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/x86-64-double-precision-shift-left.ll?rev=195383&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/x86-64-double-precision-shift-left.ll (added)
+++ llvm/trunk/test/CodeGen/X86/x86-64-double-precision-shift-left.ll Thu Nov 21 17:21:26 2013
@@ -0,0 +1,77 @@
+; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
+; Verify that for the architectures that are known to have poor latency
+; double precision shift instructions, we generate an alternative sequence
+; of instructions with lower latency instead of the shld instruction.
+
+;uint64_t lshift1(uint64_t a, uint64_t b)
+;{
+;    return (a << 1) | (b >> 63);
+;}
+
+; CHECK:             lshift1:
+; CHECK:             addq    {{.*}},{{.*}}
+; CHECK-NEXT:        shrq    $63, {{.*}}
+; CHECK-NEXT:        leaq    ({{.*}},{{.*}}), {{.*}}
+
+
+define i64 @lshift1(i64 %a, i64 %b) nounwind readnone uwtable {
+entry:
+  %shl = shl i64 %a, 1
+  %shr = lshr i64 %b, 63
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+;uint64_t lshift2(uint64_t a, uint64_t b)
+;{
+;    return (a << 2) | (b >> 62);
+;}
+
+; CHECK:             lshift2:
+; CHECK:             shlq    $2, {{.*}}
+; CHECK-NEXT:        shrq    $62, {{.*}}
+; CHECK-NEXT:        leaq    ({{.*}},{{.*}}), {{.*}}
+
+define i64 @lshift2(i64 %a, i64 %b) nounwind readnone uwtable {
+entry:
+  %shl = shl i64 %a, 2
+  %shr = lshr i64 %b, 62
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+;uint64_t lshift7(uint64_t a, uint64_t b)
+;{
+;    return (a << 7) | (b >> 57);
+;}
+
+; CHECK:             lshift7:
+; CHECK:             shlq    $7, {{.*}}
+; CHECK-NEXT:        shrq    $57, {{.*}}
+; CHECK-NEXT:        leaq    ({{.*}},{{.*}}), {{.*}}
+
+define i64 @lshift7(i64 %a, i64 %b) nounwind readnone uwtable {
+entry:
+  %shl = shl i64 %a, 7
+  %shr = lshr i64 %b, 57
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+;uint64_t lshift63(uint64_t a, uint64_t b)
+;{
+;    return (a << 63) | (b >> 1);
+;}
+
+; CHECK:             lshift63:
+; CHECK:             shlq    $63, {{.*}}
+; CHECK-NEXT:        shrq    {{.*}}
+; CHECK-NEXT:        leaq    ({{.*}},{{.*}}), {{.*}}
+
+define i64 @lshift63(i64 %a, i64 %b) nounwind readnone uwtable {
+entry:
+  %shl = shl i64 %a, 63
+  %shr = lshr i64 %b, 1
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}

Added: llvm/trunk/test/CodeGen/X86/x86-64-double-precision-shift-right.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/x86-64-double-precision-shift-right.ll?rev=195383&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/x86-64-double-precision-shift-right.ll (added)
+++ llvm/trunk/test/CodeGen/X86/x86-64-double-precision-shift-right.ll Thu Nov 21 17:21:26 2013
@@ -0,0 +1,74 @@
+; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
+; Verify that for the architectures that are known to have poor latency
+; double precision shift instructions, we generate an alternative sequence
+; of instructions with lower latency instead of the shrd instruction.
+
+;uint64_t rshift1(uint64_t a, uint64_t b)
+;{
+;    return (a >> 1) | (b << 63);
+;}
+
+; CHECK:             rshift1:
+; CHECK:             shrq    {{.*}}
+; CHECK-NEXT:        shlq    $63, {{.*}}
+; CHECK-NEXT:        leaq    ({{.*}},{{.*}}), {{.*}}
+
+define i64 @rshift1(i64 %a, i64 %b) nounwind readnone uwtable {
+  %1 = lshr i64 %a, 1
+  %2 = shl i64 %b, 63
+  %3 = or i64 %2, %1
+  ret i64 %3
+}
+
+;uint64_t rshift2(uint64_t a, uint64_t b)
+;{
+;    return (a >> 2) | (b << 62);
+;}
+
+; CHECK:             rshift2:
+; CHECK:             shrq    $2, {{.*}}
+; CHECK-NEXT:        shlq    $62, {{.*}}
+; CHECK-NEXT:        leaq    ({{.*}},{{.*}}), {{.*}}
+
+
+define i64 @rshift2(i64 %a, i64 %b) nounwind readnone uwtable {
+  %1 = lshr i64 %a, 2
+  %2 = shl i64 %b, 62
+  %3 = or i64 %2, %1
+  ret i64 %3
+}
+
+;uint64_t rshift7(uint64_t a, uint64_t b)
+;{
+;    return (a >> 7) | (b << 57);
+;}
+
+; CHECK:             rshift7:
+; CHECK:             shrq    $7, {{.*}}
+; CHECK-NEXT:        shlq    $57, {{.*}}
+; CHECK-NEXT:        leaq    ({{.*}},{{.*}}), {{.*}}
+
+
+define i64 @rshift7(i64 %a, i64 %b) nounwind readnone uwtable {
+  %1 = lshr i64 %a, 7
+  %2 = shl i64 %b, 57
+  %3 = or i64 %2, %1
+  ret i64 %3
+}
+
+;uint64_t rshift63(uint64_t a, uint64_t b)
+;{
+;    return (a >> 63) | (b << 1);
+;}
+
+; CHECK:             rshift63:
+; CHECK:             shrq    $63, {{.*}}
+; CHECK-NEXT:        leaq    ({{.*}},{{.*}}), {{.*}}
+; CHECK-NEXT:        orq     {{.*}}, {{.*}}
+
+define i64 @rshift63(i64 %a, i64 %b) nounwind readnone uwtable {
+  %1 = lshr i64 %a, 63
+  %2 = shl i64 %b, 1
+  %3 = or i64 %2, %1
+  ret i64 %3
+}

Added: llvm/trunk/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll?rev=195383&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll (added)
+++ llvm/trunk/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll Thu Nov 21 17:21:26 2013
@@ -0,0 +1,67 @@
+; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
+
+; clang -Oz -c test1.cpp -emit-llvm -S -o
+; Verify that we generate the shld instruction when we are optimizing for size,
+; even for X86_64 processors that are known to have poor latency double
+; precision shift instructions.
+; uint64_t lshift10(uint64_t a, uint64_t b)
+; {
+;     return (a << 10) | (b >> 54);
+; }
+
+; Function Attrs: minsize nounwind optsize readnone uwtable
+define i64 @_Z8lshift10mm(i64 %a, i64 %b) #0 {
+entry:
+; CHECK:   shldq   $10
+  %shl = shl i64 %a, 10
+  %shr = lshr i64 %b, 54
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+attributes #0 = { minsize nounwind optsize readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+
+; clang -Os -c test2.cpp -emit-llvm -S
+; Verify that we generate the shld instruction when we are optimizing for size,
+; even for X86_64 processors that are known to have poor latency double
+; precision shift instructions.
+; uint64_t lshift11(uint64_t a, uint64_t b)
+; {
+;     return (a << 11) | (b >> 53);
+; }
+
+; Function Attrs: nounwind optsize readnone uwtable
+define i64 @_Z8lshift11mm(i64 %a, i64 %b) #1 {
+entry:
+; CHECK:   shldq   $11
+  %shl = shl i64 %a, 11
+  %shr = lshr i64 %b, 53
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+attributes #1 = { nounwind optsize readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+; clang -O2 -c test2.cpp -emit-llvm -S
+; Verify that we do not generate the shld instruction when we are not optimizing
+; for size for X86_64 processors that are known to have poor latency double
+; precision shift instructions.
+; uint64_t lshift12(uint64_t a, uint64_t b)
+; {
+;     return (a << 12) | (b >> 52);
+; }
+
+; Function Attrs: nounwind readnone uwtable
+define i64 @_Z8lshift12mm(i64 %a, i64 %b) #2 {
+entry:
+; CHECK:       shlq    $12
+; CHECK-NEXT:  shrq    $52
+  %shl = shl i64 %a, 12
+  %shr = lshr i64 %b, 52
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+

Added: llvm/trunk/test/CodeGen/X86/x86-64-double-shifts-var.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/x86-64-double-shifts-var.ll?rev=195383&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/x86-64-double-shifts-var.ll (added)
+++ llvm/trunk/test/CodeGen/X86/x86-64-double-shifts-var.ll Thu Nov 21 17:21:26 2013
@@ -0,0 +1,57 @@
+; RUN: llc < %s -march=x86-64 -mcpu=athlon | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon-tbird | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon-4 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon-xp | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon-mp | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=k8 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=opteron | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon64 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon-fx | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=k8-sse3 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=opteron-sse3 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon64-sse3 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=amdfam10 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=btver1 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=btver2 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=bdver2 | FileCheck %s
+
+; Verify that for the X86_64 processors that are known to have poor latency 
+; double precision shift instructions we do not generate 'shld' or 'shrd'
+; instructions.
+
+;uint64_t lshift(uint64_t a, uint64_t b, int c)
+;{
+;    return (a << c) | (b >> (64-c));
+;}
+
+define i64 @lshift(i64 %a, i64 %b, i32 %c) nounwind readnone {
+entry:
+; CHECK-NOT: shld 
+  %sh_prom = zext i32 %c to i64
+  %shl = shl i64 %a, %sh_prom
+  %sub = sub nsw i32 64, %c
+  %sh_prom1 = zext i32 %sub to i64
+  %shr = lshr i64 %b, %sh_prom1
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
+;uint64_t rshift(uint64_t a, uint64_t b, int c)
+;{
+;    return (a >> c) | (b << (64-c));
+;}
+
+define i64 @rshift(i64 %a, i64 %b, i32 %c) nounwind readnone {
+entry:
+; CHECK-NOT: shrd
+  %sh_prom = zext i32 %c to i64
+  %shr = lshr i64 %a, %sh_prom
+  %sub = sub nsw i32 64, %c
+  %sh_prom1 = zext i32 %sub to i64
+  %shl = shl i64 %b, %sh_prom1
+  %or = or i64 %shl, %shr
+  ret i64 %or
+}
+
+




