[llvm] cf8fadc - Match (xor TSize - 1, ctlz) to `bsr` instead of `lzcnt` + `xor`
Noah Goldstein via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 6 12:16:41 PST 2023
Author: Noah Goldstein
Date: 2023-02-06T14:09:17-06:00
New Revision: cf8fadcf9b9362bff30e31cce06b516aa1156ce1
URL: https://github.com/llvm/llvm-project/commit/cf8fadcf9b9362bff30e31cce06b516aa1156ce1
DIFF: https://github.com/llvm/llvm-project/commit/cf8fadcf9b9362bff30e31cce06b516aa1156ce1.diff
LOG: Match (xor TSize - 1, ctlz) to `bsr` instead of `lzcnt` + `xor`
Previously we would de-optimize when -march supported lzcnt, as there
is no reason to emit the extra instruction.
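For example, IR like the following (function name illustrative; this is
the same shape as the ctlz_bsr test) now selects `bsr` directly on
targets that have lzcnt but not fast-lzcnt:

  declare i32 @llvm.ctlz.i32(i32, i1)

  ; Bit index of the most significant set bit of %x (undefined for
  ; %x == 0, hence the i1 true on the ctlz call).
  define i32 @msb_index(i32 %x) {
    %clz = call i32 @llvm.ctlz.i32(i32 %x, i1 true)
    %msb = xor i32 %clz, 31    ; 31 == TSize - 1 for i32
    ret i32 %msb
  }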
Reviewed By: RKSimon
Differential Revision: https://reviews.llvm.org/D141464
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/clz.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c266c1872e6e..76d3a40d5688 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -52265,6 +52265,63 @@ static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
}
+static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
+ "Invalid opcode for combing with CTLZ");
+ if (Subtarget.hasFastLZCNT())
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
+ (VT != MVT::i64 || !Subtarget.is64Bit()))
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
+ N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
+ return SDValue();
+
+ SDValue OpCTLZ;
+ SDValue OpSizeTM1;
+
+ if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
+ OpCTLZ = N1;
+ OpSizeTM1 = N0;
+ } else if (N->getOpcode() == ISD::SUB) {
+ return SDValue();
+ } else {
+ OpCTLZ = N0;
+ OpSizeTM1 = N1;
+ }
+
+ if (!OpCTLZ.hasOneUse())
+ return SDValue();
+ auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
+ if (!C)
+ return SDValue();
+
+ if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
+ return SDValue();
+ SDLoc DL(N);
+ EVT OpVT = VT;
+ SDValue Op = OpCTLZ.getOperand(0);
+ if (VT == MVT::i8) {
+ // Zero extend to i32 since there is no i8 bsr.
+ OpVT = MVT::i32;
+ Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
+ }
+
+ SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+ Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op);
+ if (VT == MVT::i8)
+ Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
+
+ return Op;
+}
+
static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -52292,6 +52349,9 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
return FPLogic;
+ if (SDValue R = combineXorSubCTLZ(N, DAG, Subtarget))
+ return R;
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -55177,6 +55237,9 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
Op1.getOperand(0));
}
+ if (SDValue V = combineXorSubCTLZ(N, DAG, Subtarget))
+ return V;
+
return combineAddOrSubToADCOrSBB(N, DAG);
}
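The combine is hooked into both combineXor and combineSub, so the
subtract form of the same computation is matched as well. A minimal IR
sketch of the ISD::SUB path (illustrative, not one of the patch's
tests):

  declare i32 @llvm.ctlz.i32(i32, i1)

  ; (TSize - 1) - ctlz(x) is also the MSB index. Note the combine only
  ; accepts ctlz as the second operand of the sub: ctlz(x) - 31 is a
  ; different value and is rejected.
  define i32 @msb_index_sub(i32 %x) {
    %clz = call i32 @llvm.ctlz.i32(i32 %x, i1 true)
    %msb = sub i32 31, %clz
    ret i32 %msb
  }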
diff --git a/llvm/test/CodeGen/X86/clz.ll b/llvm/test/CodeGen/X86/clz.ll
index 19123a080eb5..92cbc1659024 100644
--- a/llvm/test/CodeGen/X86/clz.ll
+++ b/llvm/test/CodeGen/X86/clz.ll
@@ -968,7 +968,12 @@ define i32 @ctlz_i32_fold_cmov(i32 %n) {
; Don't generate any xors when a 'ctlz' intrinsic is actually used to compute
; the most significant bit, which is what 'bsr' does natively.
-; FIXME: We should probably select BSR instead of LZCNT in these circumstances.
+; NOTE: We intentionally don't select `bsr` when `fast-lzcnt` is
+; available. This is because 1) `bsr` has some drawbacks, including a
+; dependency on its destination register, 2) `bsr` has very poor
+; performance on some of the `fast-lzcnt` processors, and 3) `lzcnt`
+; runs at ALU latency/throughput, so `lzcnt` + `xor` has better
+; throughput than even a 1-uop (1c latency, 1c throughput) `bsr`.
define i32 @ctlz_bsr(i32 %n) {
; X86-LABEL: ctlz_bsr:
; X86: # %bb.0:
@@ -982,14 +987,12 @@ define i32 @ctlz_bsr(i32 %n) {
;
; X86-CLZ-LABEL: ctlz_bsr:
; X86-CLZ: # %bb.0:
-; X86-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax
-; X86-CLZ-NEXT: xorl $31, %eax
+; X86-CLZ-NEXT: bsrl {{[0-9]+}}(%esp), %eax
; X86-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_bsr:
; X64-CLZ: # %bb.0:
-; X64-CLZ-NEXT: lzcntl %edi, %eax
-; X64-CLZ-NEXT: xorl $31, %eax
+; X64-CLZ-NEXT: bsrl %edi, %eax
; X64-CLZ-NEXT: retq
;
; X64-FASTLZCNT-LABEL: ctlz_bsr:
@@ -1441,8 +1444,7 @@ define i32 @PR47603_zext(i32 %a0, ptr %a1) {
; X86-CLZ-LABEL: PR47603_zext:
; X86-CLZ: # %bb.0:
; X86-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %ecx
-; X86-CLZ-NEXT: xorl $31, %ecx
+; X86-CLZ-NEXT: bsrl {{[0-9]+}}(%esp), %ecx
; X86-CLZ-NEXT: movsbl (%eax,%ecx), %eax
; X86-CLZ-NEXT: retl
;
@@ -1566,18 +1568,14 @@ define i8 @ctlz_xor7_i8_true(i8 %x) {
; X86-CLZ-LABEL: ctlz_xor7_i8_true:
; X86-CLZ: # %bb.0:
; X86-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-CLZ-NEXT: lzcntl %eax, %eax
-; X86-CLZ-NEXT: addl $-24, %eax
-; X86-CLZ-NEXT: xorb $7, %al
+; X86-CLZ-NEXT: bsrl %eax, %eax
; X86-CLZ-NEXT: # kill: def $al killed $al killed $eax
; X86-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_xor7_i8_true:
; X64-CLZ: # %bb.0:
; X64-CLZ-NEXT: movzbl %dil, %eax
-; X64-CLZ-NEXT: lzcntl %eax, %eax
-; X64-CLZ-NEXT: addl $-24, %eax
-; X64-CLZ-NEXT: xorb $7, %al
+; X64-CLZ-NEXT: bsrl %eax, %eax
; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax
; X64-CLZ-NEXT: retq
;
@@ -1692,16 +1690,12 @@ define i16 @ctlz_xor15_i16_true(i16 %x) {
;
; X86-CLZ-LABEL: ctlz_xor15_i16_true:
; X86-CLZ: # %bb.0:
-; X86-CLZ-NEXT: lzcntw {{[0-9]+}}(%esp), %ax
-; X86-CLZ-NEXT: xorl $15, %eax
-; X86-CLZ-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-CLZ-NEXT: bsrw {{[0-9]+}}(%esp), %ax
; X86-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_xor15_i16_true:
; X64-CLZ: # %bb.0:
-; X64-CLZ-NEXT: lzcntw %di, %ax
-; X64-CLZ-NEXT: xorl $15, %eax
-; X64-CLZ-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-CLZ-NEXT: bsrw %di, %ax
; X64-CLZ-NEXT: retq
;
; X64-FASTLZCNT-LABEL: ctlz_xor15_i16_true:
@@ -1836,8 +1830,7 @@ define i64 @ctlz_xor63_i64_true(i64 %x) {
;
; X64-CLZ-LABEL: ctlz_xor63_i64_true:
; X64-CLZ: # %bb.0:
-; X64-CLZ-NEXT: lzcntq %rdi, %rax
-; X64-CLZ-NEXT: xorq $63, %rax
+; X64-CLZ-NEXT: bsrq %rdi, %rax
; X64-CLZ-NEXT: retq
;
; X64-FASTLZCNT-LABEL: ctlz_xor63_i64_true:
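As the ctlz_xor7_i8_true checks above show, the i8 case is handled by
widening: there is no 8-bit `bsr`, so the input is zero-extended to
i32, `bsrl` computes the bit index, and the result is truncated back to
i8. A small IR reproducer for that path (illustrative):

  declare i8 @llvm.ctlz.i8(i8, i1)

  ; Selects movzbl + bsrl on targets without fast-lzcnt; the xor with 7
  ; folds into the bsr, since bsr on the zero-extended value directly
  ; yields 7 - ctlz.i8(%x) for nonzero %x.
  define i8 @msb_index_i8(i8 %x) {
    %clz = call i8 @llvm.ctlz.i8(i8 %x, i1 true)
    %msb = xor i8 %clz, 7    ; 7 == TSize - 1 for i8
    ret i8 %msb
  }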