[llvm] r323096 - Break false dependencies for POPCNT, LZCNT, TZCNT

Marina Yatsina via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 22 02:07:01 PST 2018


Author: myatsina
Date: Mon Jan 22 02:07:01 2018
New Revision: 323096

URL: http://llvm.org/viewvc/llvm-project?rev=323096&view=rev
Log:
Break false dependencies for POPCNT, LZCNT, TZCNT

Add POPCNT, LZCNT, TZCNT to the list of instructions that have a false dependency.
Add a test to make sure BreakFalseDeps breaks the dependencies for these instructions.
Update affected tests.

This fixes bugzilla https://bugs.llvm.org/show_bug.cgi?id=33869

This is the last of multiple patches that fix this bugzilla.
Most of the patches are aimed at refactoring the existing code.

Reviews of the refactoring done to enable this change:
https://reviews.llvm.org/D40330
https://reviews.llvm.org/D40331
https://reviews.llvm.org/D40332
https://reviews.llvm.org/D40333

Differential Revision: https://reviews.llvm.org/D40334

Change-Id: If95cbf1a3f5c7dccff8f1b22ecb397542147303d

Added:
    llvm/trunk/test/CodeGen/X86/bitcnt-false-dep.ll
Modified:
    llvm/trunk/lib/Target/X86/X86.td
    llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
    llvm/trunk/lib/Target/X86/X86Subtarget.cpp
    llvm/trunk/lib/Target/X86/X86Subtarget.h

Modified: llvm/trunk/lib/Target/X86/X86.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86.td?rev=323096&r1=323095&r2=323096&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86.td (original)
+++ llvm/trunk/lib/Target/X86/X86.td Mon Jan 22 02:07:01 2018
@@ -268,6 +268,12 @@ def FeatureSlowIncDec : SubtargetFeature
 def FeatureSoftFloat
     : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
                        "Use software floating point features.">;
+def FeaturePOPCNTFalseDeps : SubtargetFeature<"false-deps-popcnt",
+                                     "HasPOPCNTFalseDeps", "true",
+                                     "POPCNT has a false dependency on dest register">;
+def FeatureLZCNTFalseDeps : SubtargetFeature<"false-deps-lzcnt-tzcnt",
+                                     "HasLZCNTFalseDeps", "true",
+                                     "LZCNT/TZCNT have a false dependency on dest register">;
 // On recent X86 (port bound) processors, its preferable to combine to a single shuffle
 // using a variable mask over multiple fixed shuffles.
 def FeatureFastVariableShuffle
@@ -619,7 +625,8 @@ def SNBFeatures : ProcessorFeatures<[],
 
 class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
                                                SNBFeatures.Value, [
-  FeatureSlowUAMem32
+  FeatureSlowUAMem32,
+  FeaturePOPCNTFalseDeps
 ]>;
 def : SandyBridgeProc<"sandybridge">;
 def : SandyBridgeProc<"corei7-avx">; // Legacy alias.
@@ -632,7 +639,8 @@ def IVBFeatures : ProcessorFeatures<SNBF
 
 class IvyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
                                              IVBFeatures.Value, [
-  FeatureSlowUAMem32
+  FeatureSlowUAMem32,
+  FeaturePOPCNTFalseDeps
 ]>;
 def : IvyBridgeProc<"ivybridge">;
 def : IvyBridgeProc<"core-avx-i">; // Legacy alias.
@@ -650,7 +658,9 @@ def HSWFeatures : ProcessorFeatures<IVBF
 
 class HaswellProc<string Name> : ProcModel<Name, HaswellModel,
                                            HSWFeatures.Value, [
-  ProcIntelHSW
+  ProcIntelHSW,
+  FeaturePOPCNTFalseDeps,
+  FeatureLZCNTFalseDeps
 ]>;
 def : HaswellProc<"haswell">;
 def : HaswellProc<"core-avx2">; // Legacy alias.
@@ -662,7 +672,9 @@ def BDWFeatures : ProcessorFeatures<HSWF
 ]>;
 class BroadwellProc<string Name> : ProcModel<Name, BroadwellModel,
                                              BDWFeatures.Value, [
-  ProcIntelBDW
+  ProcIntelBDW,
+  FeaturePOPCNTFalseDeps,
+  FeatureLZCNTFalseDeps
 ]>;
 def : BroadwellProc<"broadwell">;
 
@@ -679,7 +691,8 @@ def SKLFeatures : ProcessorFeatures<BDWF
 class SkylakeClientProc<string Name> : ProcModel<Name, SkylakeClientModel,
                                                  SKLFeatures.Value, [
   ProcIntelSKL,
-  FeatureHasFastGather
+  FeatureHasFastGather,
+  FeaturePOPCNTFalseDeps
 ]>;
 def : SkylakeClientProc<"skylake">;
 

Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.cpp?rev=323096&r1=323095&r2=323096&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp Mon Jan 22 02:07:01 2018
@@ -8061,7 +8061,8 @@ bool X86InstrInfo::expandPostRAPseudo(Ma
 ///
 /// FIXME: This should be turned into a TSFlags.
 ///
-static bool hasPartialRegUpdate(unsigned Opcode) {
+static bool hasPartialRegUpdate(unsigned Opcode,
+                                const X86Subtarget &Subtarget) {
   switch (Opcode) {
   case X86::CVTSI2SSrr:
   case X86::CVTSI2SSrm:
@@ -8100,6 +8101,21 @@ static bool hasPartialRegUpdate(unsigned
   case X86::SQRTSDr_Int:
   case X86::SQRTSDm_Int:
     return true;
+  // GPR
+  case X86::POPCNT32rm:
+  case X86::POPCNT32rr:
+  case X86::POPCNT64rm:
+  case X86::POPCNT64rr:
+    return Subtarget.hasPOPCNTFalseDeps();
+  case X86::LZCNT32rm:
+  case X86::LZCNT32rr:
+  case X86::LZCNT64rm:
+  case X86::LZCNT64rr:
+  case X86::TZCNT32rm:
+  case X86::TZCNT32rr:
+  case X86::TZCNT64rm:
+  case X86::TZCNT64rr:
+    return Subtarget.hasLZCNTFalseDeps();
   }
 
   return false;
@@ -8110,7 +8126,7 @@ static bool hasPartialRegUpdate(unsigned
 unsigned X86InstrInfo::getPartialRegUpdateClearance(
     const MachineInstr &MI, unsigned OpNum,
     const TargetRegisterInfo *TRI) const {
-  if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode()))
+  if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode(), Subtarget))
     return 0;
 
   // If MI is marked as reading Reg, the partial register update is wanted.
@@ -8316,6 +8332,20 @@ void X86InstrInfo::breakPartialRegDepend
         .addReg(XReg, RegState::Undef)
         .addReg(Reg, RegState::ImplicitDefine);
     MI.addRegisterKilled(Reg, TRI, true);
+  } else if (X86::GR64RegClass.contains(Reg)) {
+    // Clear only the 32-bit sub-register: XOR32rr has a shorter encoding and
+    // zeroes the upper bits too, so Reg is listed as an implicit def.
+    unsigned XReg = TRI->getSubReg(Reg, X86::sub_32bit);
+    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), XReg)
+        .addReg(XReg, RegState::Undef)
+        .addReg(XReg, RegState::Undef)
+        .addReg(Reg, RegState::ImplicitDefine);
+    MI.addRegisterKilled(Reg, TRI, true);
+  } else if (X86::GR32RegClass.contains(Reg)) {
+    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), Reg)
+        .addReg(Reg, RegState::Undef)
+        .addReg(Reg, RegState::Undef);
+    MI.addRegisterKilled(Reg, TRI, true);
   }
 }
 
@@ -8487,7 +8517,8 @@ MachineInstr *X86InstrInfo::foldMemoryOp
 
   // Avoid partial register update stalls unless optimizing for size.
   // TODO: we should block undef reg update as well.
-  if (!MF.getFunction().optForSize() && hasPartialRegUpdate(MI.getOpcode()))
+  if (!MF.getFunction().optForSize() &&
+      hasPartialRegUpdate(MI.getOpcode(), Subtarget))
     return nullptr;
 
   unsigned NumOps = MI.getDesc().getNumOperands();
@@ -8656,7 +8687,8 @@ X86InstrInfo::foldMemoryOperandImpl(Mach
   // Unless optimizing for size, don't fold to avoid partial
   // register update stalls
   // TODO: we should block undef reg update as well.
-  if (!MF.getFunction().optForSize() && hasPartialRegUpdate(MI.getOpcode()))
+  if (!MF.getFunction().optForSize() &&
+      hasPartialRegUpdate(MI.getOpcode(), Subtarget))
     return nullptr;
 
   // Don't fold subreg spills, or reloads that use a high subreg.
@@ -8855,7 +8887,8 @@ MachineInstr *X86InstrInfo::foldMemoryOp
 
   // Avoid partial register update stalls unless optimizing for size.
   // TODO: we should block undef reg update as well.
-  if (!MF.getFunction().optForSize() && hasPartialRegUpdate(MI.getOpcode()))
+  if (!MF.getFunction().optForSize() &&
+      hasPartialRegUpdate(MI.getOpcode(), Subtarget))
     return nullptr;
 
   // Determine the alignment of the load.

Modified: llvm/trunk/lib/Target/X86/X86Subtarget.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Subtarget.cpp?rev=323096&r1=323095&r2=323096&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86Subtarget.cpp Mon Jan 22 02:07:01 2018
@@ -329,6 +329,8 @@ void X86Subtarget::initializeEnvironment
   HasSSEUnalignedMem = false;
   HasCmpxchg16b = false;
   UseLeaForSP = false;
+  HasPOPCNTFalseDeps = false;
+  HasLZCNTFalseDeps = false;
   HasFastVariableShuffle = false;
   HasFastPartialYMMorZMMWrite = false;
   HasFastGather = false;

Modified: llvm/trunk/lib/Target/X86/X86Subtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Subtarget.h?rev=323096&r1=323095&r2=323096&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.h (original)
+++ llvm/trunk/lib/Target/X86/X86Subtarget.h Mon Jan 22 02:07:01 2018
@@ -232,6 +232,12 @@ protected:
   /// the stack pointer. This is an optimization for Intel Atom processors.
   bool UseLeaForSP;
 
+  /// True if POPCNT instruction has a false dependency on the destination register.
+  bool HasPOPCNTFalseDeps;
+
+  /// True if LZCNT/TZCNT instructions have a false dependency on the destination register.
+  bool HasLZCNTFalseDeps;
+
   /// True if its preferable to combine to a single shuffle using a variable
   /// mask over multiple fixed shuffles.
   bool HasFastVariableShuffle;
@@ -557,6 +563,8 @@ public:
   bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
   bool hasCmpxchg16b() const { return HasCmpxchg16b; }
   bool useLeaForSP() const { return UseLeaForSP; }
+  bool hasPOPCNTFalseDeps() const { return HasPOPCNTFalseDeps; }
+  bool hasLZCNTFalseDeps() const { return HasLZCNTFalseDeps; }
   bool hasFastVariableShuffle() const {
     return HasFastVariableShuffle;
   }

Added: llvm/trunk/test/CodeGen/X86/bitcnt-false-dep.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/bitcnt-false-dep.ll?rev=323096&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/bitcnt-false-dep.ll (added)
+++ llvm/trunk/test/CodeGen/X86/bitcnt-false-dep.ll Mon Jan 22 02:07:01 2018
@@ -0,0 +1,171 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=haswell -mattr=+lzcnt | FileCheck %s --check-prefix=HSW
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -mattr=+lzcnt | FileCheck %s --check-prefix=SKL
+
+; This tests a fix for bugzilla 33869 https://bugs.llvm.org/show_bug.cgi?id=33869
+
+declare i32 @llvm.ctpop.i32(i32)
+declare i64 @llvm.ctpop.i64(i64)
+declare i64 @llvm.ctlz.i64(i64, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+
+define i32 @loopdep_popcnt32(i32* nocapture %x, double* nocapture %y) nounwind {
+entry:
+  %vx = load i32, i32* %x
+  br label %loop
+loop:
+  %i = phi i32 [ 1, %entry ], [ %inc, %loop ]
+  %s1 = phi i32 [ %vx, %entry ], [ %s2, %loop ]
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+  %j = tail call i32 @llvm.ctpop.i32(i32 %i)
+  %s2 = add i32 %s1, %j
+  %inc = add nsw i32 %i, 1
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+  %exitcond = icmp eq i32 %inc, 156250000
+  br i1 %exitcond, label %ret, label %loop
+ret:
+  ret i32 %s2
+
+;HSW-LABEL:@loopdep_popcnt32
+;HSW: xorl [[GPR0:%e[a-d]x]], [[GPR0]]
+;HSW-NEXT: popcntl {{.*}}, [[GPR0]]
+
+;SKL-LABEL:@loopdep_popcnt32
+;SKL: xorl [[GPR0:%e[a-d]x]], [[GPR0]]
+;SKL-NEXT: popcntl {{.*}}, [[GPR0]]
+}
+
+define i64 @loopdep_popcnt64(i64* nocapture %x, double* nocapture %y) nounwind {
+entry:
+  %vx = load i64, i64* %x
+  br label %loop
+loop:
+  %i = phi i64 [ 1, %entry ], [ %inc, %loop ]
+  %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ]
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+  %j = tail call i64 @llvm.ctpop.i64(i64 %i)
+  %s2 = add i64 %s1, %j
+  %inc = add nsw i64 %i, 1
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+  %exitcond = icmp eq i64 %inc, 156250000
+  br i1 %exitcond, label %ret, label %loop
+ret:
+  ret i64 %s2
+
+;HSW-LABEL:@loopdep_popcnt64
+;HSW: xorl %e[[GPR0:[a-d]x]], %e[[GPR0]]
+;HSW-NEXT: popcntq {{.*}}, %r[[GPR0]]
+
+;SKL-LABEL:@loopdep_popcnt64
+;SKL: xorl %e[[GPR0:[a-d]x]], %e[[GPR0]]
+;SKL-NEXT: popcntq {{.*}}, %r[[GPR0]]
+}
+
+define i32 @loopdep_tzct32(i32* nocapture %x, double* nocapture %y) nounwind {
+entry:
+  %vx = load i32, i32* %x
+  br label %loop
+loop:
+  %i = phi i32 [ 1, %entry ], [ %inc, %loop ]
+  %s1 = phi i32 [ %vx, %entry ], [ %s2, %loop ]
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+  %j = call i32 @llvm.cttz.i32(i32 %i, i1 true)
+  %s2 = add i32 %s1, %j
+  %inc = add nsw i32 %i, 1
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+  %exitcond = icmp eq i32 %inc, 156250000
+  br i1 %exitcond, label %ret, label %loop
+ret:
+  ret i32 %s2
+
+;HSW-LABEL:@loopdep_tzct32
+;HSW: xorl [[GPR0:%e[a-d]x]], [[GPR0]]
+;HSW-NEXT: tzcntl {{.*}}, [[GPR0]]
+
+; This false dependency issue was fixed in Skylake
+;SKL-LABEL:@loopdep_tzct32
+;SKL-NOT: xor
+;SKL: tzcntl
+}
+
+define i64 @loopdep_tzct64(i64* nocapture %x, double* nocapture %y) nounwind {
+entry:
+  %vx = load i64, i64* %x
+  br label %loop
+loop:
+  %i = phi i64 [ 1, %entry ], [ %inc, %loop ]
+  %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ]
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+  %j = tail call i64 @llvm.cttz.i64(i64 %i, i1 true)
+  %s2 = add i64 %s1, %j
+  %inc = add nsw i64 %i, 1
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+  %exitcond = icmp eq i64 %inc, 156250000
+  br i1 %exitcond, label %ret, label %loop
+ret:
+  ret i64 %s2
+
+;HSW-LABEL:@loopdep_tzct64
+;HSW: xorl %e[[GPR0:[a-d]x]], %e[[GPR0]]
+;HSW-NEXT: tzcntq {{.*}}, %r[[GPR0]]
+
+; This false dependency issue was fixed in Skylake
+;SKL-LABEL:@loopdep_tzct64
+;SKL-NOT: xor
+;SKL: tzcntq
+}
+
+define i32 @loopdep_lzct32(i32* nocapture %x, double* nocapture %y) nounwind {
+entry:
+  %vx = load i32, i32* %x
+  br label %loop
+loop:
+  %i = phi i32 [ 1, %entry ], [ %inc, %loop ]
+  %s1 = phi i32 [ %vx, %entry ], [ %s2, %loop ]
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+  %j = call i32 @llvm.ctlz.i32(i32 %i, i1 true)
+  %s2 = add i32 %s1, %j
+  %inc = add nsw i32 %i, 1
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+  %exitcond = icmp eq i32 %inc, 156250000
+  br i1 %exitcond, label %ret, label %loop
+ret:
+  ret i32 %s2
+
+;HSW-LABEL:@loopdep_lzct32
+;HSW: xorl [[GPR0:%e[a-d]x]], [[GPR0]]
+;HSW-NEXT: lzcntl {{.*}}, [[GPR0]]
+
+; This false dependency issue was fixed in Skylake
+;SKL-LABEL:@loopdep_lzct32
+;SKL-NOT: xor
+;SKL: lzcntl
+}
+
+define i64 @loopdep_lzct64(i64* nocapture %x, double* nocapture %y) nounwind {
+entry:
+  %vx = load i64, i64* %x
+  br label %loop
+loop:
+  %i = phi i64 [ 1, %entry ], [ %inc, %loop ]
+  %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ]
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+  %j = tail call i64 @llvm.ctlz.i64(i64 %i, i1 true)
+  %s2 = add i64 %s1, %j
+  %inc = add nsw i64 %i, 1
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+  %exitcond = icmp eq i64 %inc, 156250000
+  br i1 %exitcond, label %ret, label %loop
+ret:
+  ret i64 %s2
+
+;HSW-LABEL:@loopdep_lzct64
+;HSW: xorl %e[[GPR0:[a-d]x]], %e[[GPR0]]
+;HSW-NEXT: lzcntq {{.*}}, %r[[GPR0]]
+
+; This false dependency issue was fixed in Skylake
+;SKL-LABEL:@loopdep_lzct64
+;SKL-NOT: xor
+;SKL: lzcntq
+}




More information about the llvm-commits mailing list