[llvm] [DAGCombine] Count leading ones: refine post DAG/Type Legalisation in case of promotion (PR #102877)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Aug 17 01:58:01 PDT 2024
https://github.com/v01dXYZ updated https://github.com/llvm/llvm-project/pull/102877
From bcdeb900bcc17673c47a8496bf9b981d221a6f80 Mon Sep 17 00:00:00 2001
From: v01dxyz <v01dxyz at v01d.xyz>
Date: Mon, 12 Aug 2024 13:14:57 +0200
Subject: [PATCH 1/3] [VP] Define Base Opcode for VP_CTLZ_ZERO_UNDEF
---
llvm/include/llvm/IR/VPIntrinsics.def | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def
index a4a1000d37259e..5366092d1740c5 100644
--- a/llvm/include/llvm/IR/VPIntrinsics.def
+++ b/llvm/include/llvm/IR/VPIntrinsics.def
@@ -269,6 +269,7 @@ VP_PROPERTY_FUNCTIONAL_INTRINSIC(ctlz)
VP_PROPERTY_FUNCTIONAL_SDOPC(CTLZ)
END_REGISTER_VP_SDNODE(VP_CTLZ)
BEGIN_REGISTER_VP_SDNODE(VP_CTLZ_ZERO_UNDEF, -1, vp_ctlz_zero_undef, 1, 2)
+VP_PROPERTY_FUNCTIONAL_SDOPC(CTLZ_ZERO_UNDEF)
END_REGISTER_VP_SDNODE(VP_CTLZ_ZERO_UNDEF)
END_REGISTER_VP_INTRINSIC(vp_ctlz)
From ce49eca0e703cd909874d31d7cc3ddd0b351005b Mon Sep 17 00:00:00 2001
From: v01dxyz <v01dxyz at v01d.xyz>
Date: Mon, 12 Aug 2024 12:59:10 +0200
Subject: [PATCH 2/3] Count leading ones: pre-commit tests
---
llvm/test/CodeGen/AArch64/ctlo.ll | 118 ++++++++++++++++++
llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll | 165 +++++++++++++++++++++++++
2 files changed, 283 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/ctlo.ll
diff --git a/llvm/test/CodeGen/AArch64/ctlo.ll b/llvm/test/CodeGen/AArch64/ctlo.ll
new file mode 100644
index 00000000000000..d2d0ca1ecf47f8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ctlo.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mtriple=aarch64 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s --mtriple=aarch64 -global-isel -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+declare i8 @llvm.ctlz.i8(i8, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+define i8 @ctlo_i8(i8 %x) {
+; CHECK-LABEL: ctlo_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #255 // =0xff
+; CHECK-NEXT: bic w8, w8, w0
+; CHECK-NEXT: clz w8, w8
+; CHECK-NEXT: sub w0, w8, #24
+; CHECK-NEXT: ret
+ %tmp1 = xor i8 %x, -1
+ %tmp2 = call i8 @llvm.ctlz.i8( i8 %tmp1, i1 false )
+ ret i8 %tmp2
+}
+
+define i8 @ctlo_i8_undef(i8 %x) {
+; CHECK-SD-LABEL: ctlo_i8_undef:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: mvn w8, w0
+; CHECK-SD-NEXT: lsl w8, w8, #24
+; CHECK-SD-NEXT: clz w0, w8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ctlo_i8_undef:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov w8, #255 // =0xff
+; CHECK-GI-NEXT: bic w8, w8, w0
+; CHECK-GI-NEXT: clz w8, w8
+; CHECK-GI-NEXT: sub w0, w8, #24
+; CHECK-GI-NEXT: ret
+ %tmp1 = xor i8 %x, -1
+ %tmp2 = call i8 @llvm.ctlz.i8( i8 %tmp1, i1 true )
+ ret i8 %tmp2
+}
+
+define i16 @ctlo_i16(i16 %x) {
+; CHECK-LABEL: ctlo_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #65535 // =0xffff
+; CHECK-NEXT: bic w8, w8, w0
+; CHECK-NEXT: clz w8, w8
+; CHECK-NEXT: sub w0, w8, #16
+; CHECK-NEXT: ret
+ %tmp1 = xor i16 %x, -1
+ %tmp2 = call i16 @llvm.ctlz.i16( i16 %tmp1, i1 false )
+ ret i16 %tmp2
+}
+
+define i16 @ctlo_i16_undef(i16 %x) {
+; CHECK-SD-LABEL: ctlo_i16_undef:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: mvn w8, w0
+; CHECK-SD-NEXT: lsl w8, w8, #16
+; CHECK-SD-NEXT: clz w0, w8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ctlo_i16_undef:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov w8, #65535 // =0xffff
+; CHECK-GI-NEXT: bic w8, w8, w0
+; CHECK-GI-NEXT: clz w8, w8
+; CHECK-GI-NEXT: sub w0, w8, #16
+; CHECK-GI-NEXT: ret
+ %tmp1 = xor i16 %x, -1
+ %tmp2 = call i16 @llvm.ctlz.i16( i16 %tmp1, i1 true )
+ ret i16 %tmp2
+}
+
+define i32 @ctlo_i32(i32 %x) {
+; CHECK-LABEL: ctlo_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn w8, w0
+; CHECK-NEXT: clz w0, w8
+; CHECK-NEXT: ret
+ %tmp1 = xor i32 %x, -1
+ %tmp2 = call i32 @llvm.ctlz.i32( i32 %tmp1, i1 false )
+ ret i32 %tmp2
+}
+
+define i32 @ctlo_i32_undef(i32 %x) {
+; CHECK-LABEL: ctlo_i32_undef:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn w8, w0
+; CHECK-NEXT: clz w0, w8
+; CHECK-NEXT: ret
+ %tmp1 = xor i32 %x, -1
+ %tmp2 = call i32 @llvm.ctlz.i32( i32 %tmp1, i1 true )
+ ret i32 %tmp2
+}
+
+define i64 @ctlo_i64(i64 %x) {
+; CHECK-LABEL: ctlo_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn x8, x0
+; CHECK-NEXT: clz x0, x8
+; CHECK-NEXT: ret
+ %tmp1 = xor i64 %x, -1
+ %tmp2 = call i64 @llvm.ctlz.i64( i64 %tmp1, i1 false )
+ ret i64 %tmp2
+}
+
+define i64 @ctlo_i64_undef(i64 %x) {
+; CHECK-LABEL: ctlo_i64_undef:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn x8, x0
+; CHECK-NEXT: clz x0, x8
+; CHECK-NEXT: ret
+ %tmp1 = xor i64 %x, -1
+ %tmp2 = call i64 @llvm.ctlz.i64( i64 %tmp1, i1 true )
+ ret i64 %tmp2
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
index 58882525e55c4c..c0694072f518ef 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
@@ -2624,6 +2624,171 @@ define <vscale x 1 x i9> @vp_ctlz_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vsca
%v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va, i1 true, <vscale x 1 x i1> %m, i32 %evl)
ret <vscale x 1 x i9> %v
}
+define <vscale x 1 x i9> @vp_ctlo_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ctlo_nxv1i9:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 511
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vxor.vx v8, v8, a1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vand.vx v8, v8, a1, v0.t
+; CHECK-NEXT: vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsrl.vi v8, v9, 23, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT: li a0, 142
+; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: vminu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: li a0, 7
+; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vp_ctlo_nxv1i9:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: li a1, 511
+; CHECK-ZVBB-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vxor.vx v8, v8, a1
+; CHECK-ZVBB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vand.vx v8, v8, a1, v0.t
+; CHECK-ZVBB-NEXT: vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT: li a0, 7
+; CHECK-ZVBB-NEXT: vsub.vx v8, v8, a0, v0.t
+; CHECK-ZVBB-NEXT: ret
+ %va.not = xor <vscale x 1 x i9> %va, splat (i9 -1)
+ %v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 false, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i9> %v
+}
+define <vscale x 1 x i9> @vp_ctlo_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ctlo_zero_undef_nxv1i9:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 511
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vxor.vx v8, v8, a1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-NEXT: vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsrl.vi v8, v9, 23, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT: li a0, 142
+; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vp_ctlo_zero_undef_nxv1i9:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: li a1, 511
+; CHECK-ZVBB-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vxor.vx v8, v8, a1
+; CHECK-ZVBB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-ZVBB-NEXT: vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT: ret
+ %va.not = xor <vscale x 1 x i9> %va, splat (i9 -1)
+ %v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 true, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i9> %v
+}
+
+define <vscale x 1 x i9> @vp_ctlo_nxv1i9_vp_xor(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ctlo_nxv1i9_vp_xor:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 511
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vxor.vx v8, v8, a1, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a1, v0.t
+; CHECK-NEXT: vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsrl.vi v8, v9, 23, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT: li a0, 142
+; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: vminu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: li a0, 7
+; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vp_ctlo_nxv1i9_vp_xor:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: li a1, 511
+; CHECK-ZVBB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vxor.vx v8, v8, a1, v0.t
+; CHECK-ZVBB-NEXT: vand.vx v8, v8, a1, v0.t
+; CHECK-ZVBB-NEXT: vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT: li a0, 7
+; CHECK-ZVBB-NEXT: vsub.vx v8, v8, a0, v0.t
+; CHECK-ZVBB-NEXT: ret
+ %va.not = call <vscale x 1 x i9> @llvm.vp.xor.nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i9> splat (i9 -1), <vscale x 1 x i1> %m, i32 %evl)
+ %v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 false, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i9> %v
+}
+
+define <vscale x 1 x i9> @vp_ctlo_zero_undef_nxv1i9_vp_xor(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ctlo_zero_undef_nxv1i9_vp_xor:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 511
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vxor.vx v8, v8, a1, v0.t
+; CHECK-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-NEXT: vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsrl.vi v8, v9, 23, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT: li a0, 142
+; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vp_ctlo_zero_undef_nxv1i9_vp_xor:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: li a1, 511
+; CHECK-ZVBB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vxor.vx v8, v8, a1, v0.t
+; CHECK-ZVBB-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-ZVBB-NEXT: vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT: ret
+ %va.not = call <vscale x 1 x i9> @llvm.vp.xor.nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i9> splat (i9 -1), <vscale x 1 x i1> %m, i32 %evl)
+ %v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 true, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i9> %v
+}
+
+define <vscale x 1 x i9> @vp_ctlo_zero_nxv1i9_unpredicated_ctlz_with_vp_xor(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ctlo_zero_nxv1i9_unpredicated_ctlz_with_vp_xor:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 511
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vxor.vx v8, v8, a1, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vand.vx v8, v8, a1
+; CHECK-NEXT: vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT: vnsrl.wi v8, v9, 23
+; CHECK-NEXT: li a0, 142
+; CHECK-NEXT: vrsub.vx v8, v8, a0
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: vminu.vx v8, v8, a0
+; CHECK-NEXT: li a0, 7
+; CHECK-NEXT: vsub.vx v8, v8, a0
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vp_ctlo_zero_nxv1i9_unpredicated_ctlz_with_vp_xor:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: li a1, 511
+; CHECK-ZVBB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vxor.vx v8, v8, a1, v0.t
+; CHECK-ZVBB-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vand.vx v8, v8, a1
+; CHECK-ZVBB-NEXT: vclz.v v8, v8
+; CHECK-ZVBB-NEXT: li a0, 7
+; CHECK-ZVBB-NEXT: vsub.vx v8, v8, a0
+; CHECK-ZVBB-NEXT: ret
+ %va.not = call <vscale x 1 x i9> @llvm.vp.xor.nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i9> splat (i9 -1), <vscale x 1 x i1> %m, i32 %evl)
+ %v = call <vscale x 1 x i9> @llvm.ctlz(<vscale x 1 x i9> %va.not, i1 false)
+ ret <vscale x 1 x i9> %v
+}
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; RV32: {{.*}}
; RV64: {{.*}}
From 4e159981e9a8904ac16e207b1330a5db990e00fe Mon Sep 17 00:00:00 2001
From: v01dxyz <v01dxyz at v01d.xyz>
Date: Mon, 12 Aug 2024 13:23:46 +0200
Subject: [PATCH 3/3] [DAGCombine] Count leading ones: refine post-legalisation
Detect and rewrite the patterns created by DAG/Type Legalisation when CTLZ is
used to count leading ones: replace SUB + CTLZ + ZERO_EXTEND with
SHL + NOT + CTLZ_ZERO_UNDEF. The VP path is supported as well.
DAG Legalisation Pattern:

    (sub (ctlz (zero_extend (not Src)))
         BitWidthDiff)

    if BitWidthDiff == BitWidth(Node) - BitWidth(Src)
    -->
    (ctlz_zero_undef (not (shl (any_extend Src) BitWidthDiff)))
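
As a sanity check (illustrative only, not part of the patch), a minimal
standalone C++20 sketch of this identity for the i8-promoted-to-i32 instance,
where BitWidthDiff is assumed to be 24:

    #include <bit>
    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned v = 0; v < 256; ++v) {
        uint8_t x = static_cast<uint8_t>(v);
        // Legalised form: zero-extend the NOT, count, correct by the width gap.
        int Legalised =
            std::countl_zero(static_cast<uint32_t>(static_cast<uint8_t>(~x))) - 24;
        // Combined form: the low 24 bits of the NOT are all ones, so the
        // operand is never zero and CTLZ_ZERO_UNDEF is safe.
        int Combined = std::countl_zero(~(static_cast<uint32_t>(x) << 24));
        assert(Legalised == Combined);
      }
      return 0;
    }
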
Type Legalisation Pattern:

    (sub (ctlz (and (xor Src XorMask)
                    AndMask))
         BitWidthDiff)

    if AndMask has only trailing ones
    and MaskBitWidth(AndMask) == BitWidth(Node) - BitWidthDiff
    and XorMask has at least as many trailing ones as AndMask
    -->
    (ctlz_zero_undef (not (shl Src BitWidthDiff)))
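
The same kind of sanity check (again illustrative only) for the i9-in-i16
instance exercised by the RISC-V tests below, where AndMask and XorMask are
assumed to be 511 (0x1FF) and BitWidthDiff to be 7:

    #include <bit>
    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned v = 0; v < (1u << 16); ++v) {
        uint16_t Src = static_cast<uint16_t>(v);
        // Type-legalised form: invert and keep the low 9 bits, count, correct.
        uint16_t Masked = static_cast<uint16_t>((Src ^ 0x1FF) & 0x1FF);
        int Legalised = std::countl_zero(Masked) - 7;
        // Combined form: the low 7 bits of the NOT are all ones, so the
        // operand is never zero and CTLZ_ZERO_UNDEF is safe.
        int Combined = std::countl_zero(static_cast<uint16_t>(~(Src << 7)));
        assert(Legalised == Combined);
      }
      return 0;
    }
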
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 50 +++++++++++++++++++
llvm/test/CodeGen/AArch64/ctlo.ll | 42 ++++++++++------
.../test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll | 24 +++------
llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll | 34 +++----------
llvm/test/CodeGen/X86/ctlo.ll | 14 +++---
5 files changed, 100 insertions(+), 64 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 8064cc7963b7ac..ee1816ec57c30b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3755,6 +3755,51 @@ SDValue DAGCombiner::foldSubToUSubSat(EVT DstVT, SDNode *N, const SDLoc &DL) {
return SDValue();
}
+template <class MatchContextClass>
+static SDValue foldSubCtlzNot(SDNode *N, SelectionDAG &DAG) {
+ const SDLoc DL(N);
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N0.getValueType();
+ unsigned BitWidth = VT.getScalarSizeInBits();
+
+ MatchContextClass Matcher(DAG, DAG.getTargetLoweringInfo(), N);
+
+ APInt AndMask;
+ APInt XorMask;
+ APInt BitWidthDiff;
+
+ SDValue CtlzOp;
+ SDValue Src;
+
+ if (!sd_context_match(
+ N, Matcher,
+ m_Sub(m_Node(ISD::CTLZ, m_Value(CtlzOp)), m_ConstInt(BitWidthDiff))))
+ return SDValue();
+
+ if (sd_context_match(CtlzOp, Matcher, m_ZExt(m_Not(m_Value(Src))))) {
+ // (sub (ctlz (zero_extend (not Op))) BitWidthDiff)
+ if ((BitWidth - Src.getValueType().getScalarSizeInBits()) != BitWidthDiff)
+ return SDValue();
+
+ Src = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Src);
+ } else if (sd_context_match(CtlzOp, Matcher,
+ m_And(m_Xor(m_Value(Src), m_ConstInt(XorMask)),
+ m_ConstInt(AndMask)))) {
+ // (sub (ctlz (and (xor Op XorMask) AndMask)) BitWidthDiff)
+ unsigned AndMaskWidth = BitWidth - BitWidthDiff.getZExtValue();
+ if (!(AndMask.isMask(AndMaskWidth) && XorMask.countr_one() >= AndMaskWidth))
+ return SDValue();
+ } else
+ return SDValue();
+
+ SDValue ShiftConst = DAG.getShiftAmountConstant(BitWidthDiff, VT, DL);
+ SDValue LShift = Matcher.getNode(ISD::SHL, DL, VT, Src, ShiftConst);
+ SDValue Not =
+ Matcher.getNode(ISD::XOR, DL, VT, LShift, DAG.getAllOnesConstant(DL, VT));
+
+ return Matcher.getNode(ISD::CTLZ_ZERO_UNDEF, DL, VT, Not);
+}
+
// Since it may not be valid to emit a fold to zero for vector initializers
// check if we can before folding.
static SDValue tryFoldToZero(const SDLoc &DL, const TargetLowering &TLI, EVT VT,
@@ -3779,6 +3824,9 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
return N;
};
+ if (SDValue V = foldSubCtlzNot<EmptyMatchContext>(N, DAG))
+ return V;
+
// fold (sub x, x) -> 0
// FIXME: Refactor this and xor and other similar operations together.
if (PeekThroughFreeze(N0) == PeekThroughFreeze(N1))
@@ -26900,6 +26948,8 @@ SDValue DAGCombiner::visitVPOp(SDNode *N) {
return visitVP_SELECT(N);
case ISD::VP_MUL:
return visitMUL<VPMatchContext>(N);
+ case ISD::VP_SUB:
+ return foldSubCtlzNot<VPMatchContext>(N, DAG);
default:
break;
}
diff --git a/llvm/test/CodeGen/AArch64/ctlo.ll b/llvm/test/CodeGen/AArch64/ctlo.ll
index d2d0ca1ecf47f8..e047545b38cfa5 100644
--- a/llvm/test/CodeGen/AArch64/ctlo.ll
+++ b/llvm/test/CodeGen/AArch64/ctlo.ll
@@ -8,13 +8,20 @@ declare i32 @llvm.ctlz.i32(i32, i1)
declare i64 @llvm.ctlz.i64(i64, i1)
define i8 @ctlo_i8(i8 %x) {
-; CHECK-LABEL: ctlo_i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #255 // =0xff
-; CHECK-NEXT: bic w8, w8, w0
-; CHECK-NEXT: clz w8, w8
-; CHECK-NEXT: sub w0, w8, #24
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ctlo_i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: mov w8, #-1 // =0xffffffff
+; CHECK-SD-NEXT: eor w8, w8, w0, lsl #24
+; CHECK-SD-NEXT: clz w0, w8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ctlo_i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov w8, #255 // =0xff
+; CHECK-GI-NEXT: bic w8, w8, w0
+; CHECK-GI-NEXT: clz w8, w8
+; CHECK-GI-NEXT: sub w0, w8, #24
+; CHECK-GI-NEXT: ret
%tmp1 = xor i8 %x, -1
%tmp2 = call i8 @llvm.ctlz.i8( i8 %tmp1, i1 false )
ret i8 %tmp2
@@ -41,13 +48,20 @@ define i8 @ctlo_i8_undef(i8 %x) {
}
define i16 @ctlo_i16(i16 %x) {
-; CHECK-LABEL: ctlo_i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #65535 // =0xffff
-; CHECK-NEXT: bic w8, w8, w0
-; CHECK-NEXT: clz w8, w8
-; CHECK-NEXT: sub w0, w8, #16
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ctlo_i16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: mov w8, #-1 // =0xffffffff
+; CHECK-SD-NEXT: eor w8, w8, w0, lsl #16
+; CHECK-SD-NEXT: clz w0, w8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ctlo_i16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov w8, #65535 // =0xffff
+; CHECK-GI-NEXT: bic w8, w8, w0
+; CHECK-GI-NEXT: clz w8, w8
+; CHECK-GI-NEXT: sub w0, w8, #16
+; CHECK-GI-NEXT: ret
%tmp1 = xor i16 %x, -1
%tmp2 = call i16 @llvm.ctlz.i16( i16 %tmp1, i1 false )
ret i16 %tmp2
diff --git a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
index f17cec231f3236..e993ecfcdf3b81 100644
--- a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
@@ -89,18 +89,14 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
define i8 @test_not_ctlz_i8(i8 %a) nounwind {
; LA32-LABEL: test_not_ctlz_i8:
; LA32: # %bb.0:
-; LA32-NEXT: ori $a1, $zero, 255
-; LA32-NEXT: andn $a0, $a1, $a0
-; LA32-NEXT: clz.w $a0, $a0
-; LA32-NEXT: addi.w $a0, $a0, -24
+; LA32-NEXT: slli.w $a0, $a0, 24
+; LA32-NEXT: clo.w $a0, $a0
; LA32-NEXT: ret
;
; LA64-LABEL: test_not_ctlz_i8:
; LA64: # %bb.0:
-; LA64-NEXT: ori $a1, $zero, 255
-; LA64-NEXT: andn $a0, $a1, $a0
-; LA64-NEXT: clz.d $a0, $a0
-; LA64-NEXT: addi.d $a0, $a0, -56
+; LA64-NEXT: slli.d $a0, $a0, 56
+; LA64-NEXT: clo.d $a0, $a0
; LA64-NEXT: ret
%neg = xor i8 %a, -1
%tmp = call i8 @llvm.ctlz.i8(i8 %neg, i1 false)
@@ -110,18 +106,14 @@ define i8 @test_not_ctlz_i8(i8 %a) nounwind {
define i16 @test_not_ctlz_i16(i16 %a) nounwind {
; LA32-LABEL: test_not_ctlz_i16:
; LA32: # %bb.0:
-; LA32-NEXT: nor $a0, $a0, $zero
-; LA32-NEXT: bstrpick.w $a0, $a0, 15, 0
-; LA32-NEXT: clz.w $a0, $a0
-; LA32-NEXT: addi.w $a0, $a0, -16
+; LA32-NEXT: slli.w $a0, $a0, 16
+; LA32-NEXT: clo.w $a0, $a0
; LA32-NEXT: ret
;
; LA64-LABEL: test_not_ctlz_i16:
; LA64: # %bb.0:
-; LA64-NEXT: nor $a0, $a0, $zero
-; LA64-NEXT: bstrpick.d $a0, $a0, 15, 0
-; LA64-NEXT: clz.d $a0, $a0
-; LA64-NEXT: addi.d $a0, $a0, -48
+; LA64-NEXT: slli.d $a0, $a0, 48
+; LA64-NEXT: clo.d $a0, $a0
; LA64-NEXT: ret
%neg = xor i16 %a, -1
%tmp = call i16 @llvm.ctlz.i16(i16 %neg, i1 false)
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
index c0694072f518ef..9ea1394a1dd2c4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
@@ -2627,11 +2627,9 @@ define <vscale x 1 x i9> @vp_ctlz_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vsca
define <vscale x 1 x i9> @vp_ctlo_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ctlo_nxv1i9:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a1, 511
-; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-NEXT: vxor.vx v8, v8, a1
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vand.vx v8, v8, a1, v0.t
+; CHECK-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-NEXT: vnot.v v8, v8, v0.t
; CHECK-NEXT: vfwcvt.f.xu.v v9, v8, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; CHECK-NEXT: vsrl.vi v8, v9, 23, v0.t
@@ -2639,22 +2637,14 @@ define <vscale x 1 x i9> @vp_ctlo_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1
; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
; CHECK-NEXT: li a0, 142
; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t
-; CHECK-NEXT: li a0, 16
-; CHECK-NEXT: vminu.vx v8, v8, a0, v0.t
-; CHECK-NEXT: li a0, 7
-; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t
; CHECK-NEXT: ret
;
; CHECK-ZVBB-LABEL: vp_ctlo_nxv1i9:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: li a1, 511
-; CHECK-ZVBB-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT: vxor.vx v8, v8, a1
; CHECK-ZVBB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT: vand.vx v8, v8, a1, v0.t
+; CHECK-ZVBB-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-ZVBB-NEXT: vnot.v v8, v8, v0.t
; CHECK-ZVBB-NEXT: vclz.v v8, v8, v0.t
-; CHECK-ZVBB-NEXT: li a0, 7
-; CHECK-ZVBB-NEXT: vsub.vx v8, v8, a0, v0.t
; CHECK-ZVBB-NEXT: ret
%va.not = xor <vscale x 1 x i9> %va, splat (i9 -1)
%v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 false, <vscale x 1 x i1> %m, i32 %evl)
@@ -2694,10 +2684,9 @@ define <vscale x 1 x i9> @vp_ctlo_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vsca
define <vscale x 1 x i9> @vp_ctlo_nxv1i9_vp_xor(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ctlo_nxv1i9_vp_xor:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a1, 511
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vxor.vx v8, v8, a1, v0.t
-; CHECK-NEXT: vand.vx v8, v8, a1, v0.t
+; CHECK-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-NEXT: vnot.v v8, v8, v0.t
; CHECK-NEXT: vfwcvt.f.xu.v v9, v8, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; CHECK-NEXT: vsrl.vi v8, v9, 23, v0.t
@@ -2705,21 +2694,14 @@ define <vscale x 1 x i9> @vp_ctlo_nxv1i9_vp_xor(<vscale x 1 x i9> %va, <vscale x
; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
; CHECK-NEXT: li a0, 142
; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t
-; CHECK-NEXT: li a0, 16
-; CHECK-NEXT: vminu.vx v8, v8, a0, v0.t
-; CHECK-NEXT: li a0, 7
-; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t
; CHECK-NEXT: ret
;
; CHECK-ZVBB-LABEL: vp_ctlo_nxv1i9_vp_xor:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: li a1, 511
; CHECK-ZVBB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT: vxor.vx v8, v8, a1, v0.t
-; CHECK-ZVBB-NEXT: vand.vx v8, v8, a1, v0.t
+; CHECK-ZVBB-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-ZVBB-NEXT: vnot.v v8, v8, v0.t
; CHECK-ZVBB-NEXT: vclz.v v8, v8, v0.t
-; CHECK-ZVBB-NEXT: li a0, 7
-; CHECK-ZVBB-NEXT: vsub.vx v8, v8, a0, v0.t
; CHECK-ZVBB-NEXT: ret
%va.not = call <vscale x 1 x i9> @llvm.vp.xor.nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i9> splat (i9 -1), <vscale x 1 x i1> %m, i32 %evl)
%v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 false, <vscale x 1 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/X86/ctlo.ll b/llvm/test/CodeGen/X86/ctlo.ll
index bb80279e28f3d3..e4f7c666f9ea37 100644
--- a/llvm/test/CodeGen/X86/ctlo.ll
+++ b/llvm/test/CodeGen/X86/ctlo.ll
@@ -46,20 +46,18 @@ define i8 @ctlo_i8(i8 %x) {
;
; X86-CLZ-LABEL: ctlo_i8:
; X86-CLZ: # %bb.0:
-; X86-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-CLZ-NEXT: notb %al
-; X86-CLZ-NEXT: movzbl %al, %eax
+; X86-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-CLZ-NEXT: shll $24, %eax
+; X86-CLZ-NEXT: notl %eax
; X86-CLZ-NEXT: lzcntl %eax, %eax
-; X86-CLZ-NEXT: addl $-24, %eax
; X86-CLZ-NEXT: # kill: def $al killed $al killed $eax
; X86-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlo_i8:
; X64-CLZ: # %bb.0:
-; X64-CLZ-NEXT: notb %dil
-; X64-CLZ-NEXT: movzbl %dil, %eax
-; X64-CLZ-NEXT: lzcntl %eax, %eax
-; X64-CLZ-NEXT: addl $-24, %eax
+; X64-CLZ-NEXT: shll $24, %edi
+; X64-CLZ-NEXT: notl %edi
+; X64-CLZ-NEXT: lzcntl %edi, %eax
; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax
; X64-CLZ-NEXT: retq
%tmp1 = xor i8 %x, -1