[llvm] [DAGCombine] Fold `ctlz_zero_undef(X << C) -> ctlz_zero_undef(X) - C` (PR #100932)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Jul 28 06:25:53 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Yingwei Zheng (dtcxzyw)
<details>
<summary>Changes</summary>
Alive2: https://alive2.llvm.org/ce/z/Dv26cE
Fixes a codegen regression introduced by https://github.com/llvm/llvm-project/commit/69192e0193e60c169c7776f444362dffba31eb7d.
Closes https://github.com/dtcxzyw/llvm-codegen-benchmark/issues/75.
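For illustration, here is a hand-written src/tgt pair in the spirit of the Alive2 proof (the functions at the link may differ; `@src`, `@tgt`, and the i8 width are chosen here for exposition). Once `%x` is zero-extended to i32, the top 24 bits of the shift operand are known zero, so `shl ... 24` cannot discard set bits and the fold is sound:

```llvm
declare i32 @llvm.ctlz.i32(i32, i1)

define i32 @src(i8 %x) {
  %ext = zext i8 %x to i32
  %shl = shl i32 %ext, 24
  ; The "i1 true" argument makes the result poison for a zero input,
  ; mirroring CTLZ_ZERO_UNDEF semantics in the DAG.
  %ctlz = call i32 @llvm.ctlz.i32(i32 %shl, i1 true)
  ret i32 %ctlz
}

define i32 @tgt(i8 %x) {
  %ext = zext i8 %x to i32
  %ctlz = call i32 @llvm.ctlz.i32(i32 %ext, i1 true)
  ; ctlz of the unshifted value is exactly 24 larger, so subtract
  ; the shift amount.
  %sub = sub i32 %ctlz, 24
  ret i32 %sub
}
```

Note that the combine emits the subtraction as an ADD of the negated constant (`NegC`), which is why the EG checks in the diff below pick up an `ADD_INT` with literal `-24`/`-16` rather than a SUB.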
---
Full diff: https://github.com/llvm/llvm-project/pull/100932.diff
4 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+12)
- (modified) llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll (+10-12)
- (modified) llvm/test/CodeGen/RISCV/rv32zbb.ll (+47)
- (modified) llvm/test/CodeGen/RISCV/rv64zbb.ll (+47)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 060e66175d965..a446643f52052 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11133,6 +11133,18 @@ SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) {
if (SDValue C =
DAG.FoldConstantArithmetic(ISD::CTLZ_ZERO_UNDEF, DL, VT, {N0}))
return C;
+
+ // Fold ctlz_zero_undef(X << C) --> ctlz_zero_undef(X) - C
+ SDValue A;
+ APInt C;
+ if (sd_match(N0, m_Shl(m_Value(A), m_ConstInt(C))) &&
+ DAG.computeKnownBits(A).countMinLeadingZeros() >= C.getLimitedValue()) {
+ SDValue NegC =
+ DAG.getConstant(-C.zextOrTrunc(VT.getScalarSizeInBits()), DL, VT);
+ return DAG.getNode(ISD::ADD, DL, VT,
+ DAG.getNode(ISD::CTLZ_ZERO_UNDEF, DL, VT, A), NegC);
+ }
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 2168e7fe1dd28..7b2e5c8f6cca4 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -620,7 +620,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -629,11 +629,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: FFBH_UINT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: FFBH_UINT * T0.W, T0.X,
+; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: -24(nan), 3(4.203895e-45)
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
@@ -724,7 +723,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -733,11 +732,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: FFBH_UINT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: FFBH_UINT * T0.W, T0.X,
+; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: -16(nan), 3(4.203895e-45)
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index cb9fc6c16333e..4478111c35f08 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -1417,3 +1417,50 @@ define i64 @orc_b_i64(i64 %a) {
%2 = mul nuw i64 %1, 255
ret i64 %2
}
+
+define i16 @count_activebits(i16 %x) nounwind {
+; RV32I-LABEL: count_activebits:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: andi a1, a0, 255
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: srli a0, a0, 25
+; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: srli a1, a0, 2
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: srli a1, a0, 4
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: not a0, a0
+; RV32I-NEXT: srli a1, a0, 1
+; RV32I-NEXT: lui a2, 5
+; RV32I-NEXT: addi a2, a2, 1365
+; RV32I-NEXT: and a1, a1, a2
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: lui a1, 3
+; RV32I-NEXT: addi a1, a1, 819
+; RV32I-NEXT: and a2, a0, a1
+; RV32I-NEXT: srli a0, a0, 2
+; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: add a0, a2, a0
+; RV32I-NEXT: srli a1, a0, 4
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: andi a1, a0, 15
+; RV32I-NEXT: slli a0, a0, 20
+; RV32I-NEXT: srli a0, a0, 28
+; RV32I-NEXT: li a2, 16
+; RV32I-NEXT: sub a2, a2, a1
+; RV32I-NEXT: sub a0, a2, a0
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: count_activebits:
+; RV32ZBB: # %bb.0: # %entry
+; RV32ZBB-NEXT: andi a0, a0, 255
+; RV32ZBB-NEXT: clz a0, a0
+; RV32ZBB-NEXT: li a1, 32
+; RV32ZBB-NEXT: sub a0, a1, a0
+; RV32ZBB-NEXT: ret
+entry:
+ %ext = and i16 %x, 255
+ %ctlz = call i16 @llvm.ctlz.i16(i16 %ext, i1 true)
+ %sub = sub nuw nsw i16 16, %ctlz
+ ret i16 %sub
+}
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
index 6c354cc1b446b..3da9ee958221e 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -1560,3 +1560,50 @@ define i64 @orc_b_i64(i64 %a) {
%2 = mul nuw i64 %1, 255
ret i64 %2
}
+
+define i16 @count_activebits(i16 %x) nounwind {
+; RV64I-LABEL: count_activebits:
+; RV64I: # %bb.0: # %entry
+; RV64I-NEXT: andi a1, a0, 255
+; RV64I-NEXT: slli a0, a0, 56
+; RV64I-NEXT: srli a0, a0, 57
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: srli a1, a0, 2
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: not a0, a0
+; RV64I-NEXT: srli a1, a0, 1
+; RV64I-NEXT: lui a2, 5
+; RV64I-NEXT: addiw a2, a2, 1365
+; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: lui a1, 3
+; RV64I-NEXT: addiw a1, a1, 819
+; RV64I-NEXT: and a2, a0, a1
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: add a0, a2, a0
+; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: andi a1, a0, 15
+; RV64I-NEXT: slli a0, a0, 52
+; RV64I-NEXT: srli a0, a0, 60
+; RV64I-NEXT: li a2, 16
+; RV64I-NEXT: sub a2, a2, a1
+; RV64I-NEXT: sub a0, a2, a0
+; RV64I-NEXT: ret
+;
+; RV64ZBB-LABEL: count_activebits:
+; RV64ZBB: # %bb.0: # %entry
+; RV64ZBB-NEXT: andi a0, a0, 255
+; RV64ZBB-NEXT: clz a0, a0
+; RV64ZBB-NEXT: li a1, 64
+; RV64ZBB-NEXT: sub a0, a1, a0
+; RV64ZBB-NEXT: ret
+entry:
+ %ext = and i16 %x, 255
+ %ctlz = call i16 @llvm.ctlz.i16(i16 %ext, i1 true)
+ %sub = sub nuw nsw i16 16, %ctlz
+ ret i16 %sub
+}
``````````
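The new `count_activebits` tests exercise the fold after type legalization: `@llvm.ctlz.i16(i16 %ext, i1 true)` is promoted to a wider `ctlz_zero_undef` of the value shifted into the high bits, which the fold rewrites to `clz` of the zero-extended value minus 16 on RV32 (minus 48 on RV64). The surrounding `sub nuw nsw i16 16, %ctlz` then absorbs the constant, 16 - (clz32 - 16) = 32 - clz32 (and 64 - clz64 on RV64), matching the `li a1, 32` / `li a1, 64` plus `sub` in the ZBB CHECK lines.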
</details>
https://github.com/llvm/llvm-project/pull/100932