[llvm] [DAGCombine] Fold `ctlz_zero_undef(X << C) -> ctlz_zero_undef(X) - C` (PR #100932)
Yingwei Zheng via llvm-commits
llvm-commits at lists.llvm.org
Sun Jul 28 06:25:23 PDT 2024
https://github.com/dtcxzyw created https://github.com/llvm/llvm-project/pull/100932
Alive2: https://alive2.llvm.org/ce/z/Dv26cE
Fixes a codegen regression introduced by https://github.com/llvm/llvm-project/commit/69192e0193e60c169c7776f444362dffba31eb7d.
Closes https://github.com/dtcxzyw/llvm-codegen-benchmark/issues/75.
>From 0cae58b573f8452c459072b7b9110261eba408e3 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333 at gmail.com>
Date: Sun, 28 Jul 2024 20:54:53 +0800
Subject: [PATCH 1/2] [RISCV] Add pre-commit tests. NFC.
---
llvm/test/CodeGen/RISCV/rv32zbb.ll | 48 ++++++++++++++++++++++++++++++
llvm/test/CodeGen/RISCV/rv64zbb.ll | 48 ++++++++++++++++++++++++++++++
2 files changed, 96 insertions(+)
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index cb9fc6c16333e..e3407e3da5165 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -1417,3 +1417,51 @@ define i64 @orc_b_i64(i64 %a) {
%2 = mul nuw i64 %1, 255
ret i64 %2
}
+
+define i16 @count_activebits(i16 %x) nounwind {
+; RV32I-LABEL: count_activebits:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: andi a1, a0, 255
+; RV32I-NEXT: slli a0, a0, 24
+; RV32I-NEXT: srli a0, a0, 25
+; RV32I-NEXT: or a0, a1, a0
+; RV32I-NEXT: srli a1, a0, 2
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: srli a1, a0, 4
+; RV32I-NEXT: or a0, a0, a1
+; RV32I-NEXT: not a0, a0
+; RV32I-NEXT: srli a1, a0, 1
+; RV32I-NEXT: lui a2, 5
+; RV32I-NEXT: addi a2, a2, 1365
+; RV32I-NEXT: and a1, a1, a2
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: lui a1, 3
+; RV32I-NEXT: addi a1, a1, 819
+; RV32I-NEXT: and a2, a0, a1
+; RV32I-NEXT: srli a0, a0, 2
+; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: add a0, a2, a0
+; RV32I-NEXT: srli a1, a0, 4
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: andi a1, a0, 15
+; RV32I-NEXT: slli a0, a0, 20
+; RV32I-NEXT: srli a0, a0, 28
+; RV32I-NEXT: li a2, 16
+; RV32I-NEXT: sub a2, a2, a1
+; RV32I-NEXT: sub a0, a2, a0
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: count_activebits:
+; RV32ZBB: # %bb.0: # %entry
+; RV32ZBB-NEXT: andi a0, a0, 255
+; RV32ZBB-NEXT: slli a0, a0, 16
+; RV32ZBB-NEXT: clz a0, a0
+; RV32ZBB-NEXT: li a1, 16
+; RV32ZBB-NEXT: sub a0, a1, a0
+; RV32ZBB-NEXT: ret
+entry:
+ %ext = and i16 %x, 255
+ %ctlz = call i16 @llvm.ctlz.i16(i16 %ext, i1 true)
+ %sub = sub nuw nsw i16 16, %ctlz
+ ret i16 %sub
+}
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
index 6c354cc1b446b..b8aa09831117b 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -1560,3 +1560,51 @@ define i64 @orc_b_i64(i64 %a) {
%2 = mul nuw i64 %1, 255
ret i64 %2
}
+
+define i16 @count_activebits(i16 %x) nounwind {
+; RV64I-LABEL: count_activebits:
+; RV64I: # %bb.0: # %entry
+; RV64I-NEXT: andi a1, a0, 255
+; RV64I-NEXT: slli a0, a0, 56
+; RV64I-NEXT: srli a0, a0, 57
+; RV64I-NEXT: or a0, a1, a0
+; RV64I-NEXT: srli a1, a0, 2
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: or a0, a0, a1
+; RV64I-NEXT: not a0, a0
+; RV64I-NEXT: srli a1, a0, 1
+; RV64I-NEXT: lui a2, 5
+; RV64I-NEXT: addiw a2, a2, 1365
+; RV64I-NEXT: and a1, a1, a2
+; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: lui a1, 3
+; RV64I-NEXT: addiw a1, a1, 819
+; RV64I-NEXT: and a2, a0, a1
+; RV64I-NEXT: srli a0, a0, 2
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: add a0, a2, a0
+; RV64I-NEXT: srli a1, a0, 4
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: andi a1, a0, 15
+; RV64I-NEXT: slli a0, a0, 52
+; RV64I-NEXT: srli a0, a0, 60
+; RV64I-NEXT: li a2, 16
+; RV64I-NEXT: sub a2, a2, a1
+; RV64I-NEXT: sub a0, a2, a0
+; RV64I-NEXT: ret
+;
+; RV64ZBB-LABEL: count_activebits:
+; RV64ZBB: # %bb.0: # %entry
+; RV64ZBB-NEXT: andi a0, a0, 255
+; RV64ZBB-NEXT: slli a0, a0, 48
+; RV64ZBB-NEXT: clz a0, a0
+; RV64ZBB-NEXT: li a1, 16
+; RV64ZBB-NEXT: sub a0, a1, a0
+; RV64ZBB-NEXT: ret
+entry:
+ %ext = and i16 %x, 255
+ %ctlz = call i16 @llvm.ctlz.i16(i16 %ext, i1 true)
+ %sub = sub nuw nsw i16 16, %ctlz
+ ret i16 %sub
+}
>From fc9ee387e9a08f5d3d96f7f3b81b9e0deb5c5633 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333 at gmail.com>
Date: Sun, 28 Jul 2024 21:03:47 +0800
Subject: [PATCH 2/2] [DAGCombine] Fold `ctlz_zero_undef(X << C) -> ctlz_zero_undef(X) - C`
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 12 ++++++++++
llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 22 +++++++++----------
llvm/test/CodeGen/RISCV/rv32zbb.ll | 3 +--
llvm/test/CodeGen/RISCV/rv64zbb.ll | 3 +--
4 files changed, 24 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 060e66175d965..a446643f52052 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11133,6 +11133,18 @@ SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) {
if (SDValue C =
DAG.FoldConstantArithmetic(ISD::CTLZ_ZERO_UNDEF, DL, VT, {N0}))
return C;
+
+ // Fold ctlz_zero_undef(X << C) --> ctlz_zero_undef(X) - C
+ SDValue A;
+ APInt C;
+ if (sd_match(N0, m_Shl(m_Value(A), m_ConstInt(C))) &&
+ DAG.computeKnownBits(A).countMinLeadingZeros() >= C.getLimitedValue()) {
+ SDValue NegC =
+ DAG.getConstant(-C.zextOrTrunc(VT.getScalarSizeInBits()), DL, VT);
+ return DAG.getNode(ISD::ADD, DL, VT,
+ DAG.getNode(ISD::CTLZ_ZERO_UNDEF, DL, VT, A), NegC);
+ }
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 2168e7fe1dd28..7b2e5c8f6cca4 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -620,7 +620,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -629,11 +629,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: FFBH_UINT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: FFBH_UINT * T0.W, T0.X,
+; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: -24(nan), 3(4.203895e-45)
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
@@ -724,7 +723,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -733,11 +732,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: FFBH_UINT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: FFBH_UINT * T0.W, T0.X,
+; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: -16(nan), 3(4.203895e-45)
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index e3407e3da5165..4478111c35f08 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -1454,9 +1454,8 @@ define i16 @count_activebits(i16 %x) nounwind {
; RV32ZBB-LABEL: count_activebits:
; RV32ZBB: # %bb.0: # %entry
; RV32ZBB-NEXT: andi a0, a0, 255
-; RV32ZBB-NEXT: slli a0, a0, 16
; RV32ZBB-NEXT: clz a0, a0
-; RV32ZBB-NEXT: li a1, 16
+; RV32ZBB-NEXT: li a1, 32
; RV32ZBB-NEXT: sub a0, a1, a0
; RV32ZBB-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
index b8aa09831117b..3da9ee958221e 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -1597,9 +1597,8 @@ define i16 @count_activebits(i16 %x) nounwind {
; RV64ZBB-LABEL: count_activebits:
; RV64ZBB: # %bb.0: # %entry
; RV64ZBB-NEXT: andi a0, a0, 255
-; RV64ZBB-NEXT: slli a0, a0, 48
; RV64ZBB-NEXT: clz a0, a0
-; RV64ZBB-NEXT: li a1, 16
+; RV64ZBB-NEXT: li a1, 64
; RV64ZBB-NEXT: sub a0, a1, a0
; RV64ZBB-NEXT: ret
entry:
More information about the llvm-commits mailing list