[llvm] Optimize count leading ones if promoted type (PR #99591)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 8 00:17:23 PDT 2024
https://github.com/v01dXYZ updated https://github.com/llvm/llvm-project/pull/99591
>From 3f8322e46609d40cafa5c24e904348d41caf9c6b Mon Sep 17 00:00:00 2001
From: v01dxyz <v01dxyz at v01d.xyz>
Date: Tue, 23 Jul 2024 15:36:19 +0200
Subject: [PATCH 1/5] [CodeGenPrepare] Do not despeculate count
leading/trailing ones if promotion
For count leading/trailing ones, i.e. (CTLZ/CTTZ (XOR Op -1)),
legalisation should be able to optimise this case when a promotion is
necessary.
Despeculation should not be applied in this case as it would separate
the XOR and the CTLZ/CTTZ into two different basic blocks. This is
particularly problematic with SelectionDAG.
---
llvm/lib/CodeGen/CodeGenPrepare.cpp | 20 +++
.../CodeGenPrepare/RISCV/cttz-ctlz.ll | 133 +++++++++++++-
.../CodeGenPrepare/X86/cttz-ctlz.ll | 169 ++++++++++++++++++
3 files changed, 320 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 22d0708f54786..a3dd712a6db83 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2310,6 +2310,26 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros,
if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSizeInBits())
return false;
+ // Do not despeculate if we have (ctlz/cttz (xor op -1)) if the operand is
+ // promoted as legalisation should be later able to transform it to:
+ //
+ // ctlz:
+ // (ctlz_zero_undef (lshift (xor (extend op) -1)
+ // lshiftamount))
+ //
+ // cttz:
+ // (cttz_zero_undef (xor (zeroextend op) -1))
+ //
+ // Despeculation is not only useless but also not wanted with SelectionDAG
+ // as XOR and CTLZ/CTTZ would be in different basic blocks.
+ EVT VTy = TLI->getValueType(*DL, Ty);
+ int ISDOpcode = IntrinsicID == Intrinsic::ctlz ? ISD::CTLZ : ISD::CTTZ;
+ if (match(CountZeros->getOperand(0), m_Not(m_Value())) &&
+ (TLI->getTypeAction(CountZeros->getContext(), VTy) ==
+ TargetLowering::TypePromoteInteger ||
+ TLI->getOperationAction(ISDOpcode, VTy) == TargetLowering::Promote))
+ return false;
+
// Bail if the value is never zero.
Use &Op = CountZeros->getOperandUse(0);
if (isKnownNonZero(Op, *DL))
diff --git a/llvm/test/Transforms/CodeGenPrepare/RISCV/cttz-ctlz.ll b/llvm/test/Transforms/CodeGenPrepare/RISCV/cttz-ctlz.ll
index 00ad32e967489..6c85b3cd34f69 100644
--- a/llvm/test/Transforms/CodeGenPrepare/RISCV/cttz-ctlz.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/RISCV/cttz-ctlz.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' < %s | FileCheck %s
-
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' < %s | FileCheck %s --check-prefixes=CHECK,SLOW
+; RUN: opt -S -mattr=+rva22u64 -passes='require<profile-summary>,function(codegenprepare)' < %s | FileCheck %s --check-prefixes=CHECK,FAST
target triple = "riscv64-unknown-unknown"
; Check that despeculating count-zeros intrinsics doesn't crash when those
@@ -24,5 +24,134 @@ define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %x) {
ret <vscale x 4 x i64> %z
}
+; If the intrinsic is cheap, nothing should change.
+; If the intrinsic is expensive, check if the input is zero to avoid the call.
+; This is undoing speculation that may have been created by SimplifyCFG + InstCombine.
+
+define i64 @cttz(i64 %A) {
+; SLOW-LABEL: @cttz(
+; SLOW-NEXT: entry:
+; SLOW-NEXT: [[A_FR:%.*]] = freeze i64 [[A:%.*]]
+; SLOW-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0
+; SLOW-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]]
+; SLOW: cond.false:
+; SLOW-NEXT: [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A_FR]], i1 true)
+; SLOW-NEXT: br label [[COND_END]]
+; SLOW: cond.end:
+; SLOW-NEXT: [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ]
+; SLOW-NEXT: ret i64 [[CTZ]]
+;
+; FAST-LABEL: @cttz(
+; FAST-NEXT: entry:
+; FAST-NEXT: [[Z:%.*]] = call i64 @llvm.cttz.i64(i64 [[A:%.*]], i1 false)
+; FAST-NEXT: ret i64 [[Z]]
+;
+entry:
+ %z = call i64 @llvm.cttz.i64(i64 %A, i1 false)
+ ret i64 %z
+}
+
+define i64 @ctlz(i64 %A) {
+; SLOW-LABEL: @ctlz(
+; SLOW-NEXT: entry:
+; SLOW-NEXT: [[A_FR:%.*]] = freeze i64 [[A:%.*]]
+; SLOW-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[A_FR]], 0
+; SLOW-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]]
+; SLOW: cond.false:
+; SLOW-NEXT: [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A_FR]], i1 true)
+; SLOW-NEXT: br label [[COND_END]]
+; SLOW: cond.end:
+; SLOW-NEXT: [[CTZ:%.*]] = phi i64 [ 64, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ]
+; SLOW-NEXT: ret i64 [[CTZ]]
+;
+; FAST-LABEL: @ctlz(
+; FAST-NEXT: entry:
+; FAST-NEXT: [[Z:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A:%.*]], i1 false)
+; FAST-NEXT: ret i64 [[Z]]
+;
+entry:
+ %z = call i64 @llvm.ctlz.i64(i64 %A, i1 false)
+ ret i64 %z
+}
+
+define i8 @cttz_i8(i8 %A) {
+; SLOW-LABEL: @cttz_i8(
+; SLOW-NEXT: entry:
+; SLOW-NEXT: [[A_FR:%.*]] = freeze i8 [[A:%.*]]
+; SLOW-NEXT: [[CMPZ:%.*]] = icmp eq i8 [[A_FR]], 0
+; SLOW-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]]
+; SLOW: cond.false:
+; SLOW-NEXT: [[Z:%.*]] = call i8 @llvm.cttz.i8(i8 [[A_FR]], i1 true)
+; SLOW-NEXT: br label [[COND_END]]
+; SLOW: cond.end:
+; SLOW-NEXT: [[CTZ:%.*]] = phi i8 [ 8, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ]
+; SLOW-NEXT: ret i8 [[CTZ]]
+;
+; FAST-LABEL: @cttz_i8(
+; FAST-NEXT: entry:
+; FAST-NEXT: [[Z:%.*]] = call i8 @llvm.cttz.i8(i8 [[A:%.*]], i1 false)
+; FAST-NEXT: ret i8 [[Z]]
+;
+ entry:
+ %z = call i8 @llvm.cttz.i8(i8 %A, i1 false)
+ ret i8 %z
+}
+
+define i8 @ctlz_i8(i8 %A) {
+; SLOW-LABEL: @ctlz_i8(
+; SLOW-NEXT: entry:
+; SLOW-NEXT: [[A_FR:%.*]] = freeze i8 [[A:%.*]]
+; SLOW-NEXT: [[CMPZ:%.*]] = icmp eq i8 [[A_FR]], 0
+; SLOW-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]]
+; SLOW: cond.false:
+; SLOW-NEXT: [[Z:%.*]] = call i8 @llvm.ctlz.i8(i8 [[A_FR]], i1 true)
+; SLOW-NEXT: br label [[COND_END]]
+; SLOW: cond.end:
+; SLOW-NEXT: [[CTZ:%.*]] = phi i8 [ 8, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ]
+; SLOW-NEXT: ret i8 [[CTZ]]
+;
+; FAST-LABEL: @ctlz_i8(
+; FAST-NEXT: entry:
+; FAST-NEXT: [[Z:%.*]] = call i8 @llvm.ctlz.i8(i8 [[A:%.*]], i1 false)
+; FAST-NEXT: ret i8 [[Z]]
+;
+ entry:
+ %z = call i8 @llvm.ctlz.i8(i8 %A, i1 false)
+ ret i8 %z
+}
+
+; As the operand will be promoted by the type legalizer, no despeculation when counting
+; ones.
+
+define i8 @ctto_i8_no_despeculation(i8 %A) {
+; CHECK-LABEL: @ctto_i8_no_despeculation(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_NOT:%.*]] = xor i8 [[A:%.*]], -1
+; CHECK-NEXT: [[Z:%.*]] = call i8 @llvm.cttz.i8(i8 [[A_NOT]], i1 false)
+; CHECK-NEXT: ret i8 [[Z]]
+;
+ entry:
+ %A.not = xor i8 %A, -1
+ %z = call i8 @llvm.cttz.i8(i8 %A.not, i1 false)
+ ret i8 %z
+}
+
+define i8 @ctlo_i8_no_despeculation(i8 %A) {
+; CHECK-LABEL: @ctlo_i8_no_despeculation(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_NOT:%.*]] = xor i8 [[A:%.*]], -1
+; CHECK-NEXT: [[Z:%.*]] = call i8 @llvm.ctlz.i8(i8 [[A_NOT]], i1 false)
+; CHECK-NEXT: ret i8 [[Z]]
+;
+ entry:
+ %A.not = xor i8 %A, -1
+ %z = call i8 @llvm.ctlz.i8(i8 %A.not, i1 false)
+ ret i8 %z
+}
+
declare <vscale x 4 x i64> @llvm.cttz.nxv4i64(<vscale x 4 x i64>, i1)
declare <vscale x 4 x i64> @llvm.ctlz.nxv4i64(<vscale x 4 x i64>, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+declare i8 @llvm.cttz.i8(i8, i1)
+declare i8 @llvm.ctlz.i8(i8, i1)
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/cttz-ctlz.ll b/llvm/test/Transforms/CodeGenPrepare/X86/cttz-ctlz.ll
index 06909d950addb..b4f72ea474e1e 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/cttz-ctlz.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/cttz-ctlz.ll
@@ -109,6 +109,175 @@ entry:
ret i64 %z
}
+define i8 @cttz_i8(i8 %A) {
+; SLOW-LABEL: @cttz_i8(
+; SLOW-NEXT: entry:
+; SLOW-NEXT: [[Z:%.*]] = call i8 @llvm.cttz.i8(i8 [[A:%.*]], i1 false)
+; SLOW-NEXT: ret i8 [[Z]]
+;
+; FAST_TZ-LABEL: @cttz_i8(
+; FAST_TZ-NEXT: entry:
+; FAST_TZ-NEXT: [[Z:%.*]] = call i8 @llvm.cttz.i8(i8 [[A:%.*]], i1 false)
+; FAST_TZ-NEXT: ret i8 [[Z]]
+;
+; FAST_LZ-LABEL: @cttz_i8(
+; FAST_LZ-NEXT: entry:
+; FAST_LZ-NEXT: [[Z:%.*]] = call i8 @llvm.cttz.i8(i8 [[A:%.*]], i1 false)
+; FAST_LZ-NEXT: ret i8 [[Z]]
+;
+; DEBUGINFO-LABEL: @cttz_i8(
+; DEBUGINFO-NEXT: entry:
+; DEBUGINFO-NEXT: [[Z:%.*]] = call i8 @llvm.cttz.i8(i8 [[A:%.*]], i1 false), !dbg [[DBG22:![0-9]+]]
+; DEBUGINFO-NEXT: #dbg_value(i8 [[Z]], [[META20:![0-9]+]], !DIExpression(), [[DBG22]])
+; DEBUGINFO-NEXT: ret i8 [[Z]], !dbg [[DBG23:![0-9]+]]
+;
+ entry:
+ %z = call i8 @llvm.cttz.i8(i8 %A, i1 false)
+ ret i8 %z
+}
+
+define i8 @ctlz_i8(i8 %A) {
+; SLOW-LABEL: @ctlz_i8(
+; SLOW-NEXT: entry:
+; SLOW-NEXT: [[A_FR:%.*]] = freeze i8 [[A:%.*]]
+; SLOW-NEXT: [[CMPZ:%.*]] = icmp eq i8 [[A_FR]], 0
+; SLOW-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]]
+; SLOW: cond.false:
+; SLOW-NEXT: [[Z:%.*]] = call i8 @llvm.ctlz.i8(i8 [[A_FR]], i1 true)
+; SLOW-NEXT: br label [[COND_END]]
+; SLOW: cond.end:
+; SLOW-NEXT: [[CTZ:%.*]] = phi i8 [ 8, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ]
+; SLOW-NEXT: ret i8 [[CTZ]]
+;
+; FAST_TZ-LABEL: @ctlz_i8(
+; FAST_TZ-NEXT: entry:
+; FAST_TZ-NEXT: [[A_FR:%.*]] = freeze i8 [[A:%.*]]
+; FAST_TZ-NEXT: [[CMPZ:%.*]] = icmp eq i8 [[A_FR]], 0
+; FAST_TZ-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]]
+; FAST_TZ: cond.false:
+; FAST_TZ-NEXT: [[Z:%.*]] = call i8 @llvm.ctlz.i8(i8 [[A_FR]], i1 true)
+; FAST_TZ-NEXT: br label [[COND_END]]
+; FAST_TZ: cond.end:
+; FAST_TZ-NEXT: [[CTZ:%.*]] = phi i8 [ 8, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ]
+; FAST_TZ-NEXT: ret i8 [[CTZ]]
+;
+; FAST_LZ-LABEL: @ctlz_i8(
+; FAST_LZ-NEXT: entry:
+; FAST_LZ-NEXT: [[Z:%.*]] = call i8 @llvm.ctlz.i8(i8 [[A:%.*]], i1 false)
+; FAST_LZ-NEXT: ret i8 [[Z]]
+;
+; DEBUGINFO-LABEL: @ctlz_i8(
+; DEBUGINFO-NEXT: entry:
+; DEBUGINFO-NEXT: [[A_FR:%.*]] = freeze i8 [[A:%.*]], !dbg [[DBG27:![0-9]+]]
+; DEBUGINFO-NEXT: [[CMPZ:%.*]] = icmp eq i8 [[A_FR]], 0, !dbg [[DBG27]]
+; DEBUGINFO-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG27]]
+; DEBUGINFO: cond.false:
+; DEBUGINFO-NEXT: [[Z:%.*]] = call i8 @llvm.ctlz.i8(i8 [[A_FR]], i1 true), !dbg [[DBG27]]
+; DEBUGINFO-NEXT: br label [[COND_END]], !dbg [[DBG28:![0-9]+]]
+; DEBUGINFO: cond.end:
+; DEBUGINFO-NEXT: [[CTZ:%.*]] = phi i8 [ 8, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ], !dbg [[DBG28]]
+; DEBUGINFO-NEXT: #dbg_value(i8 [[CTZ]], [[META26:![0-9]+]], !DIExpression(), [[DBG27]])
+; DEBUGINFO-NEXT: ret i8 [[CTZ]], !dbg [[DBG28]]
+;
+ entry:
+ %z = call i8 @llvm.ctlz.i8(i8 %A, i1 false)
+ ret i8 %z
+}
+
+; As the operand will be promoted by the DAG legalizer, no despeculation when counting
+; ones.
+
+define i8 @ctto_i8_no_despeculation(i8 %A) {
+; SLOW-LABEL: @ctto_i8_no_despeculation(
+; SLOW-NEXT: entry:
+; SLOW-NEXT: [[A_NOT:%.*]] = xor i8 [[A:%.*]], -1
+; SLOW-NEXT: [[Z:%.*]] = call i8 @llvm.cttz.i8(i8 [[A_NOT]], i1 false)
+; SLOW-NEXT: ret i8 [[Z]]
+;
+; FAST_TZ-LABEL: @ctto_i8_no_despeculation(
+; FAST_TZ-NEXT: entry:
+; FAST_TZ-NEXT: [[A_NOT:%.*]] = xor i8 [[A:%.*]], -1
+; FAST_TZ-NEXT: [[Z:%.*]] = call i8 @llvm.cttz.i8(i8 [[A_NOT]], i1 false)
+; FAST_TZ-NEXT: ret i8 [[Z]]
+;
+; FAST_LZ-LABEL: @ctto_i8_no_despeculation(
+; FAST_LZ-NEXT: entry:
+; FAST_LZ-NEXT: [[A_NOT:%.*]] = xor i8 [[A:%.*]], -1
+; FAST_LZ-NEXT: [[Z:%.*]] = call i8 @llvm.cttz.i8(i8 [[A_NOT]], i1 false)
+; FAST_LZ-NEXT: ret i8 [[Z]]
+;
+; DEBUGINFO-LABEL: @ctto_i8_no_despeculation(
+; DEBUGINFO-NEXT: entry:
+; DEBUGINFO-NEXT: [[A_NOT:%.*]] = xor i8 [[A:%.*]], -1, !dbg [[DBG33:![0-9]+]]
+; DEBUGINFO-NEXT: #dbg_value(i8 [[A_NOT]], [[META31:![0-9]+]], !DIExpression(), [[DBG33]])
+; DEBUGINFO-NEXT: [[Z:%.*]] = call i8 @llvm.cttz.i8(i8 [[A_NOT]], i1 false), !dbg [[DBG34:![0-9]+]]
+; DEBUGINFO-NEXT: #dbg_value(i8 [[Z]], [[META32:![0-9]+]], !DIExpression(), [[DBG34]])
+; DEBUGINFO-NEXT: ret i8 [[Z]], !dbg [[DBG35:![0-9]+]]
+;
+ entry:
+ %A.not = xor i8 %A, -1
+ %z = call i8 @llvm.cttz.i8(i8 %A.not, i1 false)
+ ret i8 %z
+}
+
+; despeculation occurs because with CTLZ i8, the DAG legalization is Custom.
+
+define i8 @ctlo_i8(i8 %A) {
+; SLOW-LABEL: @ctlo_i8(
+; SLOW-NEXT: entry:
+; SLOW-NEXT: [[A_NOT:%.*]] = xor i8 [[A:%.*]], -1
+; SLOW-NEXT: [[A_NOT_FR:%.*]] = freeze i8 [[A_NOT]]
+; SLOW-NEXT: [[CMPZ:%.*]] = icmp eq i8 [[A_NOT_FR]], 0
+; SLOW-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]]
+; SLOW: cond.false:
+; SLOW-NEXT: [[Z:%.*]] = call i8 @llvm.ctlz.i8(i8 [[A_NOT_FR]], i1 true)
+; SLOW-NEXT: br label [[COND_END]]
+; SLOW: cond.end:
+; SLOW-NEXT: [[CTZ:%.*]] = phi i8 [ 8, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ]
+; SLOW-NEXT: ret i8 [[CTZ]]
+;
+; FAST_TZ-LABEL: @ctlo_i8(
+; FAST_TZ-NEXT: entry:
+; FAST_TZ-NEXT: [[A_NOT:%.*]] = xor i8 [[A:%.*]], -1
+; FAST_TZ-NEXT: [[A_NOT_FR:%.*]] = freeze i8 [[A_NOT]]
+; FAST_TZ-NEXT: [[CMPZ:%.*]] = icmp eq i8 [[A_NOT_FR]], 0
+; FAST_TZ-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]]
+; FAST_TZ: cond.false:
+; FAST_TZ-NEXT: [[Z:%.*]] = call i8 @llvm.ctlz.i8(i8 [[A_NOT_FR]], i1 true)
+; FAST_TZ-NEXT: br label [[COND_END]]
+; FAST_TZ: cond.end:
+; FAST_TZ-NEXT: [[CTZ:%.*]] = phi i8 [ 8, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ]
+; FAST_TZ-NEXT: ret i8 [[CTZ]]
+;
+; FAST_LZ-LABEL: @ctlo_i8(
+; FAST_LZ-NEXT: entry:
+; FAST_LZ-NEXT: [[A_NOT:%.*]] = xor i8 [[A:%.*]], -1
+; FAST_LZ-NEXT: [[Z:%.*]] = call i8 @llvm.ctlz.i8(i8 [[A_NOT]], i1 false)
+; FAST_LZ-NEXT: ret i8 [[Z]]
+;
+; DEBUGINFO-LABEL: @ctlo_i8(
+; DEBUGINFO-NEXT: entry:
+; DEBUGINFO-NEXT: [[A_NOT:%.*]] = xor i8 [[A:%.*]], -1, !dbg [[DBG40:![0-9]+]]
+; DEBUGINFO-NEXT: #dbg_value(i8 [[A_NOT]], [[META38:![0-9]+]], !DIExpression(), [[DBG40]])
+; DEBUGINFO-NEXT: [[A_NOT_FR:%.*]] = freeze i8 [[A_NOT]], !dbg [[DBG41:![0-9]+]]
+; DEBUGINFO-NEXT: [[CMPZ:%.*]] = icmp eq i8 [[A_NOT_FR]], 0, !dbg [[DBG41]]
+; DEBUGINFO-NEXT: br i1 [[CMPZ]], label [[COND_END:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG41]]
+; DEBUGINFO: cond.false:
+; DEBUGINFO-NEXT: [[Z:%.*]] = call i8 @llvm.ctlz.i8(i8 [[A_NOT_FR]], i1 true), !dbg [[DBG41]]
+; DEBUGINFO-NEXT: br label [[COND_END]], !dbg [[DBG42:![0-9]+]]
+; DEBUGINFO: cond.end:
+; DEBUGINFO-NEXT: [[CTZ:%.*]] = phi i8 [ 8, [[ENTRY:%.*]] ], [ [[Z]], [[COND_FALSE]] ], !dbg [[DBG42]]
+; DEBUGINFO-NEXT: #dbg_value(i8 [[CTZ]], [[META39:![0-9]+]], !DIExpression(), [[DBG41]])
+; DEBUGINFO-NEXT: ret i8 [[CTZ]], !dbg [[DBG42]]
+;
+ entry:
+ %A.not = xor i8 %A, -1
+ %z = call i8 @llvm.ctlz.i8(i8 %A.not, i1 false)
+ ret i8 %z
+}
+
declare i64 @llvm.cttz.i64(i64, i1)
declare i64 @llvm.ctlz.i64(i64, i1)
+declare i8 @llvm.cttz.i8(i8, i1)
+declare i8 @llvm.ctlz.i8(i8, i1)
>From 8b95ef83f2ade9f7070ae153b8f5915f1fc58695 Mon Sep 17 00:00:00 2001
From: v01dxyz <v01dxyz at v01d.xyz>
Date: Tue, 9 Jul 2024 16:42:39 +0200
Subject: [PATCH 2/5] [CodeGen] Legalisation with promotion: optimise count
leading ones
(CTLZ (XOR Op -1))
-->
(CTLZ_ZERO_UNDEF (XOR (SHIFT (ANYEXTEND Op) ShiftAmount) -1))
The optimisation also applies for CTLZ_ZERO_UNDEF, VP_CTLZ, VP_CTLZ_ZERO_UNDEF.
Fixes https://github.com/llvm/llvm-project/issues/96455
---
.../CodeGen/GlobalISel/LegalizerHelper.cpp | 26 +++++
llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 33 ++++++
.../SelectionDAG/LegalizeIntegerTypes.cpp | 41 +++++++
llvm/test/CodeGen/AArch64/ctlo.ll | 100 ++++++++++++++++++
.../test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll | 24 ++---
llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll | 52 +++++++++
llvm/test/CodeGen/X86/ctlo.ll | 26 ++---
7 files changed, 271 insertions(+), 31 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/ctlo.ll
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 6c7885c491f41..b7d75e27ac1ff 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -2371,6 +2371,25 @@ LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
return Legalized;
}
+static bool extendCtlzNot(const MachineInstr &MI, MachineIRBuilder &MIRBuilder,
+ MachineRegisterInfo &MRI, LLT WideTy) {
+ Register Src;
+ if (!mi_match(MI.getOperand(1).getReg(), MRI, m_Not(m_Reg(Src))))
+ return false;
+
+ auto ExtSrc = MIRBuilder.buildAnyExt(WideTy, Src);
+
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT CurTy = MRI.getType(SrcReg);
+ unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
+ auto LShift = MIRBuilder.buildShl(WideTy, ExtSrc,
+ MIRBuilder.buildConstant(WideTy, SizeDiff));
+ auto Not = MIRBuilder.buildNot(WideTy, LShift);
+ MIRBuilder.buildCTLZ_ZERO_UNDEF(MI.getOperand(0), Not);
+
+ return true;
+}
+
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
switch (MI.getOpcode()) {
@@ -2464,6 +2483,13 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
LLT CurTy = MRI.getType(SrcReg);
unsigned NewOpc = MI.getOpcode();
+
+ if ((MI.getOpcode() == TargetOpcode::G_CTLZ ||
+ MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) &&
+ extendCtlzNot(MI, MIRBuilder, MRI, WideTy)) {
+ MI.eraseFromParent();
+ return Legalized;
+ }
if (NewOpc == TargetOpcode::G_CTTZ) {
// The count is the same in the larger type except if the original
// value was zero. This can be handled by setting the bit just off
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 3a39f6a4d2b4a..a5a62bc82d381 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -26,6 +26,7 @@
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/RuntimeLibcallUtil.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
@@ -54,6 +55,7 @@
#include <utility>
using namespace llvm;
+using namespace llvm::SDPatternMatch;
#define DEBUG_TYPE "legalizedag"
@@ -5089,6 +5091,26 @@ static MVT getPromotedVectorElementType(const TargetLowering &TLI,
return MidVT;
}
+// (CTLZ (XOR Op -1)) --> (TRUNCATE (CTLZ_ZERO_UNDEF
+// (XOR (SHIFT (ANYEXTEND Op1)
+// ShiftAmount)
+// -1)))
+static SDValue ExtendCtlzNot(SDNode *Node, SDLoc &dl, MVT OVT, MVT NVT,
+ SelectionDAG &DAG) {
+ SDValue SrcOp;
+ if (!sd_match(Node->getOperand(0), m_Not(m_Value(SrcOp))))
+ return SDValue();
+
+ SDValue ExtSrc = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, SrcOp);
+ unsigned SHLAmount = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
+ SDValue ShiftConst =
+ DAG.getShiftAmountConstant(SHLAmount, ExtSrc.getValueType(), dl);
+ SDValue LShift = DAG.getNode(ISD::SHL, dl, NVT, ExtSrc, ShiftConst);
+ SDValue Not = DAG.getNOT(dl, LShift, NVT);
+ SDValue Ctlz = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, NVT, Not);
+ return DAG.getNode(ISD::TRUNCATE, dl, OVT, Ctlz);
+}
+
void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
LLVM_DEBUG(dbgs() << "Trying to promote node\n");
SmallVector<SDValue, 8> Results;
@@ -5124,6 +5146,13 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
case ISD::CTTZ_ZERO_UNDEF:
case ISD::CTLZ:
case ISD::CTPOP: {
+ // If the operand of CTLZ is NOT, push the extend in the NOT.
+ if (Node->getOpcode() == ISD::CTLZ &&
+ (Tmp1 = ExtendCtlzNot(Node, dl, OVT, NVT, DAG))) {
+ Results.push_back(Tmp1);
+ break;
+ }
+
// Zero extend the argument unless its cttz, then use any_extend.
if (Node->getOpcode() == ISD::CTTZ ||
Node->getOpcode() == ISD::CTTZ_ZERO_UNDEF)
@@ -5155,6 +5184,10 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
break;
}
case ISD::CTLZ_ZERO_UNDEF: {
+ if (Tmp1 = ExtendCtlzNot(Node, dl, OVT, NVT, DAG)) {
+ Results.push_back(Tmp1);
+ break;
+ }
// We know that the argument is unlikely to be zero, hence we can take a
// different approach as compared to ISD::CTLZ
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index af77b0070df0a..d96e485a6e5d3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -19,6 +19,7 @@
#include "LegalizeTypes.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DerivedTypes.h"
@@ -27,6 +28,7 @@
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
using namespace llvm;
+using namespace llvm::SDPatternMatch;
#define DEBUG_TYPE "legalize-types"
@@ -646,6 +648,37 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Constant(SDNode *N) {
return Result;
}
+// (CTLZ (XOR Op -1)) --> (CTLZ_ZERO_UNDEF (XOR (SHIFT (ANYEXTEND Op1)
+// ShiftAmount)
+// -1))
+static SDValue ExtendCtlzNot(SDNode *Node, SDLoc &dl, EVT OVT, EVT NVT,
+ SelectionDAG &DAG) {
+ SDValue SrcOp;
+ if (!sd_match(Node->getOperand(0), m_Not(m_Value(SrcOp))))
+ return SDValue();
+
+ SDValue ExtSrc = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, SrcOp);
+ unsigned SHLAmount = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
+ SDValue ShiftConst =
+ DAG.getShiftAmountConstant(SHLAmount, ExtSrc.getValueType(), dl);
+
+ SDValue NCstOp =
+ DAG.getConstant(APInt::getAllOnes(NVT.getScalarSizeInBits()), dl, NVT);
+ if (!Node->isVPOpcode()) {
+ SDValue LShift = DAG.getNode(ISD::SHL, dl, NVT, ExtSrc, ShiftConst);
+ SDValue Not = DAG.getNOT(dl, LShift, NVT);
+ return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, NVT, Not);
+ }
+
+ SDValue Mask = Node->getOperand(1);
+ SDValue EVL = Node->getOperand(2);
+
+ SDValue LShift =
+ DAG.getNode(ISD::VP_SHL, dl, NVT, ExtSrc, ShiftConst, Mask, EVL);
+ SDValue Not = DAG.getNode(ISD::VP_XOR, dl, NVT, LShift, NCstOp, Mask, EVL);
+ return DAG.getNode(ISD::VP_CTLZ_ZERO_UNDEF, dl, NVT, Not, Mask, EVL);
+}
+
SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
EVT OVT = N->getValueType(0);
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
@@ -664,6 +697,14 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
}
unsigned CtlzOpcode = N->getOpcode();
+ // If the operand of CTLZ is NOT, push the extend in the NOT.
+ if (SDValue Res;
+ (CtlzOpcode == ISD::CTLZ || CtlzOpcode == ISD::CTLZ_ZERO_UNDEF ||
+ CtlzOpcode == ISD::VP_CTLZ || CtlzOpcode == ISD::VP_CTLZ_ZERO_UNDEF) &&
+ (Res = ExtendCtlzNot(N, dl, OVT, NVT, DAG))) {
+ return Res;
+ }
+
if (CtlzOpcode == ISD::CTLZ || CtlzOpcode == ISD::VP_CTLZ) {
// Subtract off the extra leading bits in the bigger type.
SDValue ExtractLeadingBits = DAG.getConstant(
diff --git a/llvm/test/CodeGen/AArch64/ctlo.ll b/llvm/test/CodeGen/AArch64/ctlo.ll
new file mode 100644
index 0000000000000..5f15f540f458d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ctlo.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mtriple=aarch64 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s --mtriple=aarch64 -global-isel -verify-machineinstrs | FileCheck %s
+
+declare i8 @llvm.ctlz.i8(i8, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+define i8 @ctlo_i8(i8 %x) {
+; CHECK-LABEL: ctlo_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #-1 // =0xffffffff
+; CHECK-NEXT: eor w8, w8, w0, lsl #24
+; CHECK-NEXT: clz w0, w8
+; CHECK-NEXT: ret
+ %tmp1 = xor i8 %x, -1
+ %tmp2 = call i8 @llvm.ctlz.i8( i8 %tmp1, i1 false )
+ ret i8 %tmp2
+}
+
+define i8 @ctlo_i8_undef(i8 %x) {
+; CHECK-LABEL: ctlo_i8_undef:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #-1 // =0xffffffff
+; CHECK-NEXT: eor w8, w8, w0, lsl #24
+; CHECK-NEXT: clz w0, w8
+; CHECK-NEXT: ret
+ %tmp1 = xor i8 %x, -1
+ %tmp2 = call i8 @llvm.ctlz.i8( i8 %tmp1, i1 true )
+ ret i8 %tmp2
+}
+
+define i16 @ctlo_i16(i16 %x) {
+; CHECK-LABEL: ctlo_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #-1 // =0xffffffff
+; CHECK-NEXT: eor w8, w8, w0, lsl #16
+; CHECK-NEXT: clz w0, w8
+; CHECK-NEXT: ret
+ %tmp1 = xor i16 %x, -1
+ %tmp2 = call i16 @llvm.ctlz.i16( i16 %tmp1, i1 false )
+ ret i16 %tmp2
+}
+
+define i16 @ctlo_i16_undef(i16 %x) {
+; CHECK-LABEL: ctlo_i16_undef:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #-1 // =0xffffffff
+; CHECK-NEXT: eor w8, w8, w0, lsl #16
+; CHECK-NEXT: clz w0, w8
+; CHECK-NEXT: ret
+ %tmp1 = xor i16 %x, -1
+ %tmp2 = call i16 @llvm.ctlz.i16( i16 %tmp1, i1 true )
+ ret i16 %tmp2
+}
+
+define i32 @ctlo_i32(i32 %x) {
+; CHECK-LABEL: ctlo_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn w8, w0
+; CHECK-NEXT: clz w0, w8
+; CHECK-NEXT: ret
+ %tmp1 = xor i32 %x, -1
+ %tmp2 = call i32 @llvm.ctlz.i32( i32 %tmp1, i1 false )
+ ret i32 %tmp2
+}
+
+define i32 @ctlo_i32_undef(i32 %x) {
+; CHECK-LABEL: ctlo_i32_undef:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn w8, w0
+; CHECK-NEXT: clz w0, w8
+; CHECK-NEXT: ret
+ %tmp1 = xor i32 %x, -1
+ %tmp2 = call i32 @llvm.ctlz.i32( i32 %tmp1, i1 true )
+ ret i32 %tmp2
+}
+
+define i64 @ctlo_i64(i64 %x) {
+; CHECK-LABEL: ctlo_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn x8, x0
+; CHECK-NEXT: clz x0, x8
+; CHECK-NEXT: ret
+ %tmp1 = xor i64 %x, -1
+ %tmp2 = call i64 @llvm.ctlz.i64( i64 %tmp1, i1 false )
+ ret i64 %tmp2
+}
+
+define i64 @ctlo_i64_undef(i64 %x) {
+; CHECK-LABEL: ctlo_i64_undef:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn x8, x0
+; CHECK-NEXT: clz x0, x8
+; CHECK-NEXT: ret
+ %tmp1 = xor i64 %x, -1
+ %tmp2 = call i64 @llvm.ctlz.i64( i64 %tmp1, i1 true )
+ ret i64 %tmp2
+}
diff --git a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
index f17cec231f323..e993ecfcdf3b8 100644
--- a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
@@ -89,18 +89,14 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
define i8 @test_not_ctlz_i8(i8 %a) nounwind {
; LA32-LABEL: test_not_ctlz_i8:
; LA32: # %bb.0:
-; LA32-NEXT: ori $a1, $zero, 255
-; LA32-NEXT: andn $a0, $a1, $a0
-; LA32-NEXT: clz.w $a0, $a0
-; LA32-NEXT: addi.w $a0, $a0, -24
+; LA32-NEXT: slli.w $a0, $a0, 24
+; LA32-NEXT: clo.w $a0, $a0
; LA32-NEXT: ret
;
; LA64-LABEL: test_not_ctlz_i8:
; LA64: # %bb.0:
-; LA64-NEXT: ori $a1, $zero, 255
-; LA64-NEXT: andn $a0, $a1, $a0
-; LA64-NEXT: clz.d $a0, $a0
-; LA64-NEXT: addi.d $a0, $a0, -56
+; LA64-NEXT: slli.d $a0, $a0, 56
+; LA64-NEXT: clo.d $a0, $a0
; LA64-NEXT: ret
%neg = xor i8 %a, -1
%tmp = call i8 @llvm.ctlz.i8(i8 %neg, i1 false)
@@ -110,18 +106,14 @@ define i8 @test_not_ctlz_i8(i8 %a) nounwind {
define i16 @test_not_ctlz_i16(i16 %a) nounwind {
; LA32-LABEL: test_not_ctlz_i16:
; LA32: # %bb.0:
-; LA32-NEXT: nor $a0, $a0, $zero
-; LA32-NEXT: bstrpick.w $a0, $a0, 15, 0
-; LA32-NEXT: clz.w $a0, $a0
-; LA32-NEXT: addi.w $a0, $a0, -16
+; LA32-NEXT: slli.w $a0, $a0, 16
+; LA32-NEXT: clo.w $a0, $a0
; LA32-NEXT: ret
;
; LA64-LABEL: test_not_ctlz_i16:
; LA64: # %bb.0:
-; LA64-NEXT: nor $a0, $a0, $zero
-; LA64-NEXT: bstrpick.d $a0, $a0, 15, 0
-; LA64-NEXT: clz.d $a0, $a0
-; LA64-NEXT: addi.d $a0, $a0, -48
+; LA64-NEXT: slli.d $a0, $a0, 48
+; LA64-NEXT: clo.d $a0, $a0
; LA64-NEXT: ret
%neg = xor i16 %a, -1
%tmp = call i16 @llvm.ctlz.i16(i16 %neg, i1 false)
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
index 58882525e55c4..6f89489bb39d6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
@@ -2624,6 +2624,58 @@ define <vscale x 1 x i9> @vp_ctlz_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vsca
%v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va, i1 true, <vscale x 1 x i1> %m, i32 %evl)
ret <vscale x 1 x i9> %v
}
+define <vscale x 1 x i9> @vp_ctlo_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ctlo_nxv1i9:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-NEXT: vnot.v v8, v8, v0.t
+; CHECK-NEXT: vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsrl.vi v8, v9, 23, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT: li a0, 142
+; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vp_ctlo_nxv1i9:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-ZVBB-NEXT: vnot.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT: vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT: ret
+ %va.not = xor <vscale x 1 x i9> %va, splat (i9 -1)
+ %v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 false, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i9> %v
+}
+define <vscale x 1 x i9> @vp_ctlo_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ctlo_zero_undef_nxv1i9:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-NEXT: vnot.v v8, v8, v0.t
+; CHECK-NEXT: vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsrl.vi v8, v9, 23, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT: li a0, 142
+; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vp_ctlo_zero_undef_nxv1i9:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-ZVBB-NEXT: vnot.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT: vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT: ret
+ %va.not = xor <vscale x 1 x i9> %va, splat (i9 -1)
+ %v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 true, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i9> %v
+}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; RV32: {{.*}}
; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/X86/ctlo.ll b/llvm/test/CodeGen/X86/ctlo.ll
index 7431f94f0fdf2..020d6d1b80136 100644
--- a/llvm/test/CodeGen/X86/ctlo.ll
+++ b/llvm/test/CodeGen/X86/ctlo.ll
@@ -46,20 +46,18 @@ define i8 @ctlo_i8(i8 %x) {
;
; X86-CLZ-LABEL: ctlo_i8:
; X86-CLZ: # %bb.0:
-; X86-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-CLZ-NEXT: notb %al
-; X86-CLZ-NEXT: movzbl %al, %eax
+; X86-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-CLZ-NEXT: shll $24, %eax
+; X86-CLZ-NEXT: notl %eax
; X86-CLZ-NEXT: lzcntl %eax, %eax
-; X86-CLZ-NEXT: addl $-24, %eax
; X86-CLZ-NEXT: # kill: def $al killed $al killed $eax
; X86-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlo_i8:
; X64-CLZ: # %bb.0:
-; X64-CLZ-NEXT: notb %dil
-; X64-CLZ-NEXT: movzbl %dil, %eax
-; X64-CLZ-NEXT: lzcntl %eax, %eax
-; X64-CLZ-NEXT: addl $-24, %eax
+; X64-CLZ-NEXT: shll $24, %edi
+; X64-CLZ-NEXT: notl %edi
+; X64-CLZ-NEXT: lzcntl %edi, %eax
; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax
; X64-CLZ-NEXT: retq
%tmp1 = xor i8 %x, -1
@@ -89,20 +87,18 @@ define i8 @ctlo_i8_undef(i8 %x) {
;
; X86-CLZ-LABEL: ctlo_i8_undef:
; X86-CLZ: # %bb.0:
-; X86-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-CLZ-NEXT: notb %al
-; X86-CLZ-NEXT: movzbl %al, %eax
+; X86-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-CLZ-NEXT: shll $24, %eax
+; X86-CLZ-NEXT: notl %eax
; X86-CLZ-NEXT: lzcntl %eax, %eax
; X86-CLZ-NEXT: # kill: def $al killed $al killed $eax
; X86-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlo_i8_undef:
; X64-CLZ: # %bb.0:
-; X64-CLZ-NEXT: notb %dil
-; X64-CLZ-NEXT: movzbl %dil, %eax
-; X64-CLZ-NEXT: shll $24, %eax
-; X64-CLZ-NEXT: lzcntl %eax, %eax
+; X64-CLZ-NEXT: shll $24, %edi
+; X64-CLZ-NEXT: notl %edi
+; X64-CLZ-NEXT: lzcntl %edi, %eax
; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax
; X64-CLZ-NEXT: retq
%tmp1 = xor i8 %x, -1
>From 265457921cc9691628a5556287aca13ff29d43bd Mon Sep 17 00:00:00 2001
From: v01dxyz <v01dxyz at v01d.xyz>
Date: Fri, 2 Aug 2024 15:57:31 +0200
Subject: [PATCH 3/5] Count leading ones promoted type optimisation: Support
VP_XOR
---
.../SelectionDAG/LegalizeIntegerTypes.cpp | 33 ++++++-
llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll | 89 +++++++++++++++++++
2 files changed, 118 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index d96e485a6e5d3..7df131cbe0b53 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -651,10 +651,36 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Constant(SDNode *N) {
// (CTLZ (XOR Op -1)) --> (CTLZ_ZERO_UNDEF (XOR (SHIFT (ANYEXTEND Op1)
// ShiftAmount)
// -1))
+//
+// The following Vector Predicated patterns will also be transformed
+// similarly to above using VP_CTLZ_ZERO_UNDEF and VP_XOR:
+//
+// - (VP_CTLZ (XOR Op -1) Mask VecLen)
+// - (VP_CTLZ (VP_XOR Op -1 Mask VecLen) Mask VecLen))
static SDValue ExtendCtlzNot(SDNode *Node, SDLoc &dl, EVT OVT, EVT NVT,
SelectionDAG &DAG) {
SDValue SrcOp;
- if (!sd_match(Node->getOperand(0), m_Not(m_Value(SrcOp))))
+ if (sd_match(Node->getOperand(0), m_Not(m_Value(SrcOp)))) {
+ } else if (Node->isVPOpcode() &&
+ Node->getOperand(0).getOpcode() == ISD::VP_XOR) {
+ SDValue VPXor = Node->getOperand(0);
+
+ SDValue Mask = Node->getOperand(1);
+ SDValue EVL = Node->getOperand(2);
+
+ SDValue VPXorMask = VPXor->getOperand(2);
+ SDValue VPXorEVL = VPXor->getOperand(3);
+
+ if (VPXorMask != Mask || VPXorEVL != EVL)
+ return SDValue();
+
+ if (isAllOnesOrAllOnesSplat(VPXor->getOperand(1))) {
+ SrcOp = VPXor->getOperand(0);
+ } else if (isAllOnesOrAllOnesSplat(VPXor->getOperand(0))) {
+ SrcOp = VPXor->getOperand(1);
+ } else
+ return SDValue();
+ } else
return SDValue();
SDValue ExtSrc = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, SrcOp);
@@ -662,8 +688,6 @@ static SDValue ExtendCtlzNot(SDNode *Node, SDLoc &dl, EVT OVT, EVT NVT,
SDValue ShiftConst =
DAG.getShiftAmountConstant(SHLAmount, ExtSrc.getValueType(), dl);
- SDValue NCstOp =
- DAG.getConstant(APInt::getAllOnes(NVT.getScalarSizeInBits()), dl, NVT);
if (!Node->isVPOpcode()) {
SDValue LShift = DAG.getNode(ISD::SHL, dl, NVT, ExtSrc, ShiftConst);
SDValue Not = DAG.getNOT(dl, LShift, NVT);
@@ -675,7 +699,8 @@ static SDValue ExtendCtlzNot(SDNode *Node, SDLoc &dl, EVT OVT, EVT NVT,
SDValue LShift =
DAG.getNode(ISD::VP_SHL, dl, NVT, ExtSrc, ShiftConst, Mask, EVL);
- SDValue Not = DAG.getNode(ISD::VP_XOR, dl, NVT, LShift, NCstOp, Mask, EVL);
+ SDValue Not = DAG.getNode(ISD::VP_XOR, dl, NVT, LShift,
+ DAG.getAllOnesConstant(dl, NVT), Mask, EVL);
return DAG.getNode(ISD::VP_CTLZ_ZERO_UNDEF, dl, NVT, Not, Mask, EVL);
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
index 6f89489bb39d6..a0c3c77283a62 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
@@ -2676,6 +2676,95 @@ define <vscale x 1 x i9> @vp_ctlo_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vsca
%v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 true, <vscale x 1 x i1> %m, i32 %evl)
ret <vscale x 1 x i9> %v
}
+
+define <vscale x 1 x i9> @vp_ctlo_nxv1i9_vp_xor(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ctlo_nxv1i9_vp_xor:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-NEXT: vnot.v v8, v8, v0.t
+; CHECK-NEXT: vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsrl.vi v8, v9, 23, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT: li a0, 142
+; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vp_ctlo_nxv1i9_vp_xor:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-ZVBB-NEXT: vnot.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT: vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT: ret
+ %va.not = call <vscale x 1 x i9> @llvm.vp.xor.nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i9> splat (i9 -1), <vscale x 1 x i1> %m, i32 %evl)
+ %v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 false, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i9> %v
+}
+
+define <vscale x 1 x i9> @vp_ctlo_zero_undef_nxv1i9_vp_xor(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ctlo_zero_undef_nxv1i9_vp_xor:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-NEXT: vnot.v v8, v8, v0.t
+; CHECK-NEXT: vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsrl.vi v8, v9, 23, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT: li a0, 142
+; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vp_ctlo_zero_undef_nxv1i9_vp_xor:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-ZVBB-NEXT: vnot.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT: vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT: ret
+ %va.not = call <vscale x 1 x i9> @llvm.vp.xor.nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i9> splat (i9 -1), <vscale x 1 x i1> %m, i32 %evl)
+ %v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 true, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i9> %v
+}
+
+define <vscale x 1 x i9> @vp_ctlo_zero_nxv1i9_unpredicated_ctlz_with_vp_xor(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ctlo_zero_nxv1i9_unpredicated_ctlz_with_vp_xor:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 511
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vxor.vx v8, v8, a1, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vand.vx v8, v8, a1
+; CHECK-NEXT: vfwcvt.f.xu.v v9, v8
+; CHECK-NEXT: vnsrl.wi v8, v9, 23
+; CHECK-NEXT: li a0, 142
+; CHECK-NEXT: vrsub.vx v8, v8, a0
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: vminu.vx v8, v8, a0
+; CHECK-NEXT: li a0, 7
+; CHECK-NEXT: vsub.vx v8, v8, a0
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vp_ctlo_zero_nxv1i9_unpredicated_ctlz_with_vp_xor:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: li a1, 511
+; CHECK-ZVBB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vxor.vx v8, v8, a1, v0.t
+; CHECK-ZVBB-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vand.vx v8, v8, a1
+; CHECK-ZVBB-NEXT: vclz.v v8, v8
+; CHECK-ZVBB-NEXT: li a0, 7
+; CHECK-ZVBB-NEXT: vsub.vx v8, v8, a0
+; CHECK-ZVBB-NEXT: ret
+ %va.not = call <vscale x 1 x i9> @llvm.vp.xor.nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i9> splat (i9 -1), <vscale x 1 x i1> %m, i32 %evl)
+ %v = call <vscale x 1 x i9> @llvm.ctlz(<vscale x 1 x i9> %va.not, i1 false)
+ ret <vscale x 1 x i9> %v
+}
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; RV32: {{.*}}
; RV64: {{.*}}
>From 35551b7af63fdfef09461388f0ffaae99acd11ec Mon Sep 17 00:00:00 2001
From: v01dxyz <v01dxyz at v01d.xyz>
Date: Thu, 8 Aug 2024 06:59:47 +0200
Subject: [PATCH 4/5] Counting leading ones: Optimise only for CTLZ
Do not optimise for CTLZ_ZERO_UNDEF
---
.../CodeGen/GlobalISel/LegalizerHelper.cpp | 3 +-
.../SelectionDAG/LegalizeIntegerTypes.cpp | 6 +--
llvm/test/CodeGen/AArch64/ctlo.ll | 42 ++++++++++++-------
llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll | 14 +++++--
4 files changed, 41 insertions(+), 24 deletions(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index b7d75e27ac1ff..06ebea25e0a8e 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -2484,8 +2484,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
LLT CurTy = MRI.getType(SrcReg);
unsigned NewOpc = MI.getOpcode();
- if ((MI.getOpcode() == TargetOpcode::G_CTLZ ||
- MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) &&
+ if ((MI.getOpcode() == TargetOpcode::G_CTLZ) &&
extendCtlzNot(MI, MIRBuilder, MRI, WideTy)) {
MI.eraseFromParent();
return Legalized;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 7df131cbe0b53..c9d7ba86834d1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -723,10 +723,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
unsigned CtlzOpcode = N->getOpcode();
// If the operand of CTLZ is NOT, push the extend in the NOT.
- if (SDValue Res;
- (CtlzOpcode == ISD::CTLZ || CtlzOpcode == ISD::CTLZ_ZERO_UNDEF ||
- CtlzOpcode == ISD::VP_CTLZ || CtlzOpcode == ISD::VP_CTLZ_ZERO_UNDEF) &&
- (Res = ExtendCtlzNot(N, dl, OVT, NVT, DAG))) {
+ if (SDValue Res; (CtlzOpcode == ISD::CTLZ || CtlzOpcode == ISD::VP_CTLZ) &&
+ (Res = ExtendCtlzNot(N, dl, OVT, NVT, DAG))) {
return Res;
}
diff --git a/llvm/test/CodeGen/AArch64/ctlo.ll b/llvm/test/CodeGen/AArch64/ctlo.ll
index 5f15f540f458d..f61a10ccaf7ca 100644
--- a/llvm/test/CodeGen/AArch64/ctlo.ll
+++ b/llvm/test/CodeGen/AArch64/ctlo.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s --mtriple=aarch64 -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s --mtriple=aarch64 -global-isel -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s --mtriple=aarch64 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s --mtriple=aarch64 -global-isel -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK-GI
declare i8 @llvm.ctlz.i8(i8, i1)
declare i16 @llvm.ctlz.i16(i16, i1)
@@ -20,12 +20,19 @@ define i8 @ctlo_i8(i8 %x) {
}
define i8 @ctlo_i8_undef(i8 %x) {
-; CHECK-LABEL: ctlo_i8_undef:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #-1 // =0xffffffff
-; CHECK-NEXT: eor w8, w8, w0, lsl #24
-; CHECK-NEXT: clz w0, w8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ctlo_i8_undef:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: mvn w8, w0
+; CHECK-SD-NEXT: lsl w8, w8, #24
+; CHECK-SD-NEXT: clz w0, w8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ctlo_i8_undef:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff
+; CHECK-GI-NEXT: eor w8, w8, w0, lsl #24
+; CHECK-GI-NEXT: clz w0, w8
+; CHECK-GI-NEXT: ret
%tmp1 = xor i8 %x, -1
%tmp2 = call i8 @llvm.ctlz.i8( i8 %tmp1, i1 true )
ret i8 %tmp2
@@ -44,12 +51,19 @@ define i16 @ctlo_i16(i16 %x) {
}
define i16 @ctlo_i16_undef(i16 %x) {
-; CHECK-LABEL: ctlo_i16_undef:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #-1 // =0xffffffff
-; CHECK-NEXT: eor w8, w8, w0, lsl #16
-; CHECK-NEXT: clz w0, w8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: ctlo_i16_undef:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: mvn w8, w0
+; CHECK-SD-NEXT: lsl w8, w8, #16
+; CHECK-SD-NEXT: clz w0, w8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: ctlo_i16_undef:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff
+; CHECK-GI-NEXT: eor w8, w8, w0, lsl #16
+; CHECK-GI-NEXT: clz w0, w8
+; CHECK-GI-NEXT: ret
%tmp1 = xor i16 %x, -1
%tmp2 = call i16 @llvm.ctlz.i16( i16 %tmp1, i1 true )
ret i16 %tmp2
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
index a0c3c77283a62..9ea1394a1dd2c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
@@ -2653,9 +2653,11 @@ define <vscale x 1 x i9> @vp_ctlo_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1
define <vscale x 1 x i9> @vp_ctlo_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ctlo_zero_undef_nxv1i9:
; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 511
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vxor.vx v8, v8, a1
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; CHECK-NEXT: vsll.vi v8, v8, 7, v0.t
-; CHECK-NEXT: vnot.v v8, v8, v0.t
; CHECK-NEXT: vfwcvt.f.xu.v v9, v8, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; CHECK-NEXT: vsrl.vi v8, v9, 23, v0.t
@@ -2667,9 +2669,11 @@ define <vscale x 1 x i9> @vp_ctlo_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vsca
;
; CHECK-ZVBB-LABEL: vp_ctlo_zero_undef_nxv1i9:
; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: li a1, 511
+; CHECK-ZVBB-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vxor.vx v8, v8, a1
; CHECK-ZVBB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
; CHECK-ZVBB-NEXT: vsll.vi v8, v8, 7, v0.t
-; CHECK-ZVBB-NEXT: vnot.v v8, v8, v0.t
; CHECK-ZVBB-NEXT: vclz.v v8, v8, v0.t
; CHECK-ZVBB-NEXT: ret
%va.not = xor <vscale x 1 x i9> %va, splat (i9 -1)
@@ -2707,9 +2711,10 @@ define <vscale x 1 x i9> @vp_ctlo_nxv1i9_vp_xor(<vscale x 1 x i9> %va, <vscale x
define <vscale x 1 x i9> @vp_ctlo_zero_undef_nxv1i9_vp_xor(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ctlo_zero_undef_nxv1i9_vp_xor:
; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 511
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vxor.vx v8, v8, a1, v0.t
; CHECK-NEXT: vsll.vi v8, v8, 7, v0.t
-; CHECK-NEXT: vnot.v v8, v8, v0.t
; CHECK-NEXT: vfwcvt.f.xu.v v9, v8, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; CHECK-NEXT: vsrl.vi v8, v9, 23, v0.t
@@ -2721,9 +2726,10 @@ define <vscale x 1 x i9> @vp_ctlo_zero_undef_nxv1i9_vp_xor(<vscale x 1 x i9> %va
;
; CHECK-ZVBB-LABEL: vp_ctlo_zero_undef_nxv1i9_vp_xor:
; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: li a1, 511
; CHECK-ZVBB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vxor.vx v8, v8, a1, v0.t
; CHECK-ZVBB-NEXT: vsll.vi v8, v8, 7, v0.t
-; CHECK-ZVBB-NEXT: vnot.v v8, v8, v0.t
; CHECK-ZVBB-NEXT: vclz.v v8, v8, v0.t
; CHECK-ZVBB-NEXT: ret
%va.not = call <vscale x 1 x i9> @llvm.vp.xor.nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i9> splat (i9 -1), <vscale x 1 x i1> %m, i32 %evl)
>From 915c34e763baa92a6fbe232ec4d560168a853dd0 Mon Sep 17 00:00:00 2001
From: v01dxyz <v01dxyz at v01d.xyz>
Date: Thu, 8 Aug 2024 09:05:30 +0200
Subject: [PATCH 5/5] Counting leading ones: Optimise VP_CTLZ only with VP_XOR
Do not optimise for VP_CTLZ + XOR. That way, the non-VP and VP cases
are more clearly separated, which makes the code easier to follow.
---
.../SelectionDAG/LegalizeIntegerTypes.cpp | 58 +++++++++++--------
llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll | 18 ++++--
2 files changed, 48 insertions(+), 28 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index c9d7ba86834d1..d3aa0f6b38015 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -659,44 +659,54 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Constant(SDNode *N) {
// - (VP_CTLZ (VP_XOR Op -1 Mask VecLen) Mask VecLen))
static SDValue ExtendCtlzNot(SDNode *Node, SDLoc &dl, EVT OVT, EVT NVT,
SelectionDAG &DAG) {
- SDValue SrcOp;
- if (sd_match(Node->getOperand(0), m_Not(m_Value(SrcOp)))) {
- } else if (Node->isVPOpcode() &&
- Node->getOperand(0).getOpcode() == ISD::VP_XOR) {
- SDValue VPXor = Node->getOperand(0);
+ // helper to create both the any extend Src and the shift amount
+ auto ExtSrcAndShiftConst = [&](SDValue SrcOp) {
+ SDValue ExtSrc = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, SrcOp);
+ unsigned SHLAmount = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
+ SDValue ShiftConst =
+ DAG.getShiftAmountConstant(SHLAmount, ExtSrc.getValueType(), dl);
- SDValue Mask = Node->getOperand(1);
- SDValue EVL = Node->getOperand(2);
+ return std::make_pair(ExtSrc, ShiftConst);
+ };
- SDValue VPXorMask = VPXor->getOperand(2);
- SDValue VPXorEVL = VPXor->getOperand(3);
+ if (!Node->isVPOpcode()) {
+ SDValue SrcOp;
- if (VPXorMask != Mask || VPXorEVL != EVL)
+ if (!sd_match(Node->getOperand(0), m_Not(m_Value(SrcOp))))
return SDValue();
- if (isAllOnesOrAllOnesSplat(VPXor->getOperand(1))) {
- SrcOp = VPXor->getOperand(0);
- } else if (isAllOnesOrAllOnesSplat(VPXor->getOperand(0))) {
- SrcOp = VPXor->getOperand(1);
- } else
- return SDValue();
- } else
- return SDValue();
-
- SDValue ExtSrc = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, SrcOp);
- unsigned SHLAmount = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
- SDValue ShiftConst =
- DAG.getShiftAmountConstant(SHLAmount, ExtSrc.getValueType(), dl);
+ auto [ExtSrc, ShiftConst] = ExtSrcAndShiftConst(SrcOp);
- if (!Node->isVPOpcode()) {
SDValue LShift = DAG.getNode(ISD::SHL, dl, NVT, ExtSrc, ShiftConst);
SDValue Not = DAG.getNOT(dl, LShift, NVT);
return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, NVT, Not);
}
+ if (Node->getOperand(0).getOpcode() != ISD::VP_XOR) {
+ return SDValue();
+ }
+
+ SDValue VPXor = Node->getOperand(0);
+
SDValue Mask = Node->getOperand(1);
SDValue EVL = Node->getOperand(2);
+ SDValue VPXorMask = VPXor->getOperand(2);
+ SDValue VPXorEVL = VPXor->getOperand(3);
+
+ if (VPXorMask != Mask || VPXorEVL != EVL)
+ return SDValue();
+
+ SDValue SrcOp;
+ if (isAllOnesOrAllOnesSplat(VPXor->getOperand(1))) {
+ SrcOp = VPXor->getOperand(0);
+ } else if (isAllOnesOrAllOnesSplat(VPXor->getOperand(0))) {
+ SrcOp = VPXor->getOperand(1);
+ } else
+ return SDValue();
+
+ auto [ExtSrc, ShiftConst] = ExtSrcAndShiftConst(SrcOp);
+
SDValue LShift =
DAG.getNode(ISD::VP_SHL, dl, NVT, ExtSrc, ShiftConst, Mask, EVL);
SDValue Not = DAG.getNode(ISD::VP_XOR, dl, NVT, LShift,
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
index 9ea1394a1dd2c..c991fd66e86ec 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
@@ -2627,9 +2627,11 @@ define <vscale x 1 x i9> @vp_ctlz_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vsca
define <vscale x 1 x i9> @vp_ctlo_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ctlo_nxv1i9:
; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 511
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vxor.vx v8, v8, a1
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vsll.vi v8, v8, 7, v0.t
-; CHECK-NEXT: vnot.v v8, v8, v0.t
+; CHECK-NEXT: vand.vx v8, v8, a1, v0.t
; CHECK-NEXT: vfwcvt.f.xu.v v9, v8, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; CHECK-NEXT: vsrl.vi v8, v9, 23, v0.t
@@ -2637,14 +2639,22 @@ define <vscale x 1 x i9> @vp_ctlo_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1
; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
; CHECK-NEXT: li a0, 142
; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: vminu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: li a0, 7
+; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t
; CHECK-NEXT: ret
;
; CHECK-ZVBB-LABEL: vp_ctlo_nxv1i9:
; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: li a1, 511
+; CHECK-ZVBB-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vxor.vx v8, v8, a1
; CHECK-ZVBB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT: vsll.vi v8, v8, 7, v0.t
-; CHECK-ZVBB-NEXT: vnot.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT: vand.vx v8, v8, a1, v0.t
; CHECK-ZVBB-NEXT: vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT: li a0, 7
+; CHECK-ZVBB-NEXT: vsub.vx v8, v8, a0, v0.t
; CHECK-ZVBB-NEXT: ret
%va.not = xor <vscale x 1 x i9> %va, splat (i9 -1)
%v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 false, <vscale x 1 x i1> %m, i32 %evl)
More information about the llvm-commits
mailing list