[llvm] d775fc3 - [InstCombine] Generate better code for std::bit_floor from libstdc++
Kazu Hirata via llvm-commits
llvm-commits at lists.llvm.org
Sat Apr 15 11:32:46 PDT 2023
Author: Kazu Hirata
Date: 2023-04-15T11:32:33-07:00
New Revision: d775fc390d3c78cc81872e276c4b1314f19af577
URL: https://github.com/llvm/llvm-project/commit/d775fc390d3c78cc81872e276c4b1314f19af577
DIFF: https://github.com/llvm/llvm-project/commit/d775fc390d3c78cc81872e276c4b1314f19af577.diff
LOG: [InstCombine] Generate better code for std::bit_floor from libstdc++
Without this patch, std::bit_floor<uint32_t> in libstdc++ is compiled
as:
%eq0 = icmp eq i32 %x, 0
%lshr = lshr i32 %x, 1
%ctlz = tail call i32 @llvm.ctlz.i32(i32 %lshr, i1 false)
%sub = sub i32 32, %ctlz
%shl = shl i32 1, %sub
%sel = select i1 %eq0, i32 0, i32 %shl
With this patch:
%eq0 = icmp eq i32 %x, 0
%ctlz = call i32 @llvm.ctlz.i32(i32 %x, i1 false)
%lshr = lshr i32 -2147483648, %ctlz
%sel = select i1 %eq0, i32 0, i32 %lshr
This patch recognizes the specific pattern emitted for std::bit_floor
in libstdc++.
https://alive2.llvm.org/ce/z/piMdFX
This patch fixes:
https://github.com/llvm/llvm-project/issues/61183
Differential Revision: https://reviews.llvm.org/D145890
Added:
Modified:
llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
llvm/test/Transforms/InstCombine/bit_floor.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 3d1dbdd6270d5..0a746e25b31a4 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -3291,6 +3291,79 @@ static Instruction *foldBitCeil(SelectInst &SI, IRBuilderBase &Builder) {
Masked);
}
+// Transform N of the form
+//
+// 1 << (C - ctlz(X >> 1))
+//
+// into
+//
+// (1 << (C - 1)) >> ctlz(X)
+//
+// and return the new exact lshr, or nullptr if N does not match. The caller
+// must guarantee that X is nonzero.
+// TODO: Relax the requirement that X be nonzero. We just need to require X to
+// be nonzero or the second argument of CTLZ to be true (that is, returning
+// poison on zero).
+static Instruction *foldBitFloorNonzero(Value *N, Value *X,
+ InstCombiner::BuilderTy &Builder) {
+ Type *NType = N->getType();
+ unsigned BitWidth = NType->getScalarSizeInBits();
+
+ // Match the one-use chain 1 << (C - ctlz(X >> 1)) with C in (0, BitWidth].
+ // TODO: Handle C in [0, BitWidth] (with 0 included in the range), in which
+ // case 1 << (C - ctlz(X >> 1)) is equivalent to
+ // (1 << ((C - 1) & (BitWidth - 1))) >> ctlz(X).
+ const APInt *C = nullptr;
+ Value *CTLZ;
+ if (!match(N, m_OneUse(m_Shl(m_One(),
+ m_OneUse(m_Sub(m_APInt(C), m_Value(CTLZ)))))) ||
+ !(C->ugt(0) && C->ule(BitWidth)) ||
+ !match(CTLZ, m_OneUse(m_Intrinsic<Intrinsic::ctlz>(
+ m_OneUse(m_LShr(m_Specific(X), m_One())), m_Zero()))))
+ return nullptr;
+
+ APInt ShiftedBit = APInt::getOneBitSet(BitWidth, C->getZExtValue() - 1);
+
+ // Build ShiftedBit >> ctlz(X), reusing the matched ctlz's second operand.
+ Value *NewCTLZ =
+ Builder.CreateIntrinsic(Intrinsic::ctlz, {CTLZ->getType()},
+ {X, cast<Instruction>(CTLZ)->getOperand(1)});
+ auto *Shift = cast<Instruction>(
+ Builder.CreateLShr(ConstantInt::get(NType, ShiftedBit), NewCTLZ));
+ Shift->setIsExact();
+ return Shift;
+}
+
+// Transform:
+//
+// X == 0 ? 0 : (1 << (C1 - ctlz(X >> 1)))
+//
+// into
+//
+// X == 0 ? 0 : (C2 >> ctlz(X))
+//
+// where C2 is computed by foldBitFloorNonzero; X != 0 with swapped arms also
+// matches. Returns the new shift (or nullptr); the caller updates the select.
+static Instruction *foldBitFloor(SelectInst &SI,
+ InstCombiner::BuilderTy &Builder) {
+ Value *TrueVal = SI.getTrueValue();
+ Value *FalseVal = SI.getFalseValue();
+
+ ICmpInst::Predicate Pred;
+ Value *Cond0;
+ if (!match(SI.getCondition(), m_ICmp(Pred, m_Value(Cond0), m_Zero())) ||
+ !ICmpInst::isEquality(Pred))
+ return nullptr;
+
+ if (Pred == ICmpInst::ICMP_NE)
+ std::swap(TrueVal, FalseVal);
+
+ if (!match(TrueVal, m_Zero()))
+ return nullptr;
+
+ return foldBitFloorNonzero(FalseVal, Cond0, Builder);
+}
+
Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
Value *CondVal = SI.getCondition();
Value *TrueVal = SI.getTrueValue();
@@ -3721,5 +3794,8 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
if (Instruction *I = foldBitCeil(SI, Builder))
return I;
+ if (Instruction *I = foldBitFloor(SI, Builder))
+ return replaceOperand(SI, match(SI.getTrueValue(), m_Zero()) ? 2 : 1, I);
+
return nullptr;
}
diff --git a/llvm/test/Transforms/InstCombine/bit_floor.ll b/llvm/test/Transforms/InstCombine/bit_floor.ll
index 9daa8eee8969c..f9452b27e6c30 100644
--- a/llvm/test/Transforms/InstCombine/bit_floor.ll
+++ b/llvm/test/Transforms/InstCombine/bit_floor.ll
@@ -4,11 +4,9 @@
define i32 @bit_floor_32(i32 %x) {
; CHECK-LABEL: @bit_floor_32(
; CHECK-NEXT: [[EQ0:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK-NEXT: [[LSHR:%.*]] = lshr i32 [[X]], 1
-; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[LSHR]], i1 false), !range [[RNG0:![0-9]+]]
-; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 32, [[CTLZ]]
-; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[SUB]]
-; CHECK-NEXT: [[SEL:%.*]] = select i1 [[EQ0]], i32 0, i32 [[SHL]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[X]], i1 false), !range [[RNG0:![0-9]+]]
+; CHECK-NEXT: [[TMP2:%.*]] = lshr exact i32 -2147483648, [[TMP1]]
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[EQ0]], i32 0, i32 [[TMP2]]
; CHECK-NEXT: ret i32 [[SEL]]
;
%eq0 = icmp eq i32 %x, 0
@@ -23,11 +21,9 @@ define i32 @bit_floor_32(i32 %x) {
define i64 @bit_floor_64(i64 %x) {
; CHECK-LABEL: @bit_floor_64(
; CHECK-NEXT: [[EQ0:%.*]] = icmp eq i64 [[X:%.*]], 0
-; CHECK-NEXT: [[LSHR:%.*]] = lshr i64 [[X]], 1
-; CHECK-NEXT: [[CTLZ:%.*]] = tail call i64 @llvm.ctlz.i64(i64 [[LSHR]], i1 false), !range [[RNG1:![0-9]+]]
-; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i64 64, [[CTLZ]]
-; CHECK-NEXT: [[SHL:%.*]] = shl nuw i64 1, [[SUB]]
-; CHECK-NEXT: [[SEL:%.*]] = select i1 [[EQ0]], i64 0, i64 [[SHL]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[X]], i1 false), !range [[RNG1:![0-9]+]]
+; CHECK-NEXT: [[TMP2:%.*]] = lshr exact i64 -9223372036854775808, [[TMP1]]
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[EQ0]], i64 0, i64 [[TMP2]]
; CHECK-NEXT: ret i64 [[SEL]]
;
%eq0 = icmp eq i64 %x, 0
@@ -43,11 +39,9 @@ define i64 @bit_floor_64(i64 %x) {
define i32 @bit_floor_commuted_operands(i32 %x) {
; CHECK-LABEL: @bit_floor_commuted_operands(
; CHECK-NEXT: [[NE0_NOT:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK-NEXT: [[LSHR:%.*]] = lshr i32 [[X]], 1
-; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[LSHR]], i1 false), !range [[RNG0]]
-; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 32, [[CTLZ]]
-; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[SUB]]
-; CHECK-NEXT: [[SEL:%.*]] = select i1 [[NE0_NOT]], i32 0, i32 [[SHL]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[X]], i1 false), !range [[RNG0]]
+; CHECK-NEXT: [[TMP2:%.*]] = lshr exact i32 -2147483648, [[TMP1]]
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[NE0_NOT]], i32 0, i32 [[TMP2]]
; CHECK-NEXT: ret i32 [[SEL]]
;
%ne0 = icmp ne i32 %x, 0
@@ -64,7 +58,7 @@ define i32 @bit_floor_lshr_used_twice(i32 %x, ptr %p) {
; CHECK-LABEL: @bit_floor_lshr_used_twice(
; CHECK-NEXT: [[EQ0:%.*]] = icmp eq i32 [[X:%.*]], 0
; CHECK-NEXT: [[LSHR:%.*]] = lshr i32 [[X]], 1
-; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[LSHR]], i1 false), !range [[RNG0]]
+; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[LSHR]], i1 false), !range [[RNG2:![0-9]+]]
; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 32, [[CTLZ]]
; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[SUB]]
; CHECK-NEXT: [[SEL:%.*]] = select i1 [[EQ0]], i32 0, i32 [[SHL]]
@@ -86,7 +80,7 @@ define i32 @bit_floor_ctlz_used_twice(i32 %x, ptr %p) {
; CHECK-LABEL: @bit_floor_ctlz_used_twice(
; CHECK-NEXT: [[EQ0:%.*]] = icmp eq i32 [[X:%.*]], 0
; CHECK-NEXT: [[LSHR:%.*]] = lshr i32 [[X]], 1
-; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[LSHR]], i1 false), !range [[RNG0]]
+; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[LSHR]], i1 false), !range [[RNG2]]
; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 32, [[CTLZ]]
; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[SUB]]
; CHECK-NEXT: [[SEL:%.*]] = select i1 [[EQ0]], i32 0, i32 [[SHL]]
@@ -108,7 +102,7 @@ define i32 @bit_floor_sub_used_twice(i32 %x, ptr %p) {
; CHECK-LABEL: @bit_floor_sub_used_twice(
; CHECK-NEXT: [[EQ0:%.*]] = icmp eq i32 [[X:%.*]], 0
; CHECK-NEXT: [[LSHR:%.*]] = lshr i32 [[X]], 1
-; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[LSHR]], i1 false), !range [[RNG0]]
+; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[LSHR]], i1 false), !range [[RNG2]]
; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 32, [[CTLZ]]
; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[SUB]]
; CHECK-NEXT: [[SEL:%.*]] = select i1 [[EQ0]], i32 0, i32 [[SHL]]
@@ -130,7 +124,7 @@ define i32 @bit_floor_shl_used_twice(i32 %x, ptr %p) {
; CHECK-LABEL: @bit_floor_shl_used_twice(
; CHECK-NEXT: [[EQ0:%.*]] = icmp eq i32 [[X:%.*]], 0
; CHECK-NEXT: [[LSHR:%.*]] = lshr i32 [[X]], 1
-; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[LSHR]], i1 false), !range [[RNG0]]
+; CHECK-NEXT: [[CTLZ:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[LSHR]], i1 false), !range [[RNG2]]
; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 32, [[CTLZ]]
; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, [[SUB]]
; CHECK-NEXT: [[SEL:%.*]] = select i1 [[EQ0]], i32 0, i32 [[SHL]]
@@ -151,11 +145,9 @@ define i32 @bit_floor_shl_used_twice(i32 %x, ptr %p) {
define <4 x i32> @bit_floor_v4i32(<4 x i32> %x) {
; CHECK-LABEL: @bit_floor_v4i32(
; CHECK-NEXT: [[EQ0:%.*]] = icmp eq <4 x i32> [[X:%.*]], zeroinitializer
-; CHECK-NEXT: [[LSHR:%.*]] = lshr <4 x i32> [[X]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[CTLZ:%.*]] = tail call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[LSHR]], i1 false), !range [[RNG0]]
-; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw <4 x i32> <i32 32, i32 32, i32 32, i32 32>, [[CTLZ]]
-; CHECK-NEXT: [[SHL:%.*]] = shl nuw <4 x i32> <i32 1, i32 1, i32 1, i32 1>, [[SUB]]
-; CHECK-NEXT: [[SEL:%.*]] = select <4 x i1> [[EQ0]], <4 x i32> zeroinitializer, <4 x i32> [[SHL]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[X]], i1 false), !range [[RNG0]]
+; CHECK-NEXT: [[TMP2:%.*]] = lshr exact <4 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, [[TMP1]]
+; CHECK-NEXT: [[SEL:%.*]] = select <4 x i1> [[EQ0]], <4 x i32> zeroinitializer, <4 x i32> [[TMP2]]
; CHECK-NEXT: ret <4 x i32> [[SEL]]
;
%eq0 = icmp eq <4 x i32> %x, <i32 0, i32 0, i32 0, i32 0>
More information about the llvm-commits
mailing list