[llvm-branch-commits] [llvm] [DAG] Fold (setcc ((x | x >> c0 | ...) & mask)) sequences (PR #146054)
Pierre van Houtryve via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Jun 30 01:27:52 PDT 2025
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/146054
From 17ac90ad1ee167f35321e01625a207f2b94ff523 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 26 Jun 2025 13:31:37 +0200
Subject: [PATCH 1/2] [DAG] Fold (setcc ((x | x >> c0 | ...) & mask)) sequences
Fold sequences where we extract a set of contiguous bits from a value,
merge them into the low bits, and then check whether the low bits are
zero. It seems like a strange sequence at first, but it's an idiom used
by the AMDGPU device libs to check workitem IDs.
I put this in DAGCombiner instead of the target combiner because this is
a generic, valid transform that's also fairly niche, so I think there is
little risk of a combine loop.
See #136727
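
For illustration (a standalone sketch, not part of the patch): the
equivalence the fold relies on for the 3x10-bit workitem-ID case can be
brute-forced with a small C++ program. The 0x3fffffff constant mirrors
the updated workitem tests; the function names are invented for the
example.

  #include <cassert>
  #include <cstdint>

  // Before: merge three 10-bit fields into the low bits, then test
  // the low 10 bits against zero.
  static bool beforeFold(uint32_t x) {
    return ((x | (x >> 10) | (x >> 20)) & 0x3ff) == 0;
  }

  // After: test all 30 bits at once with the widened mask
  // 0x3ff | (0x3ff << 10) | (0x3ff << 20) == 0x3fffffff.
  static bool afterFold(uint32_t x) {
    return (x & 0x3fffffff) == 0;
  }

  int main() {
    // A strided sweep plus boundary values is enough to sanity-check
    // the equivalence without a full 2^32 loop.
    for (uint64_t x = 0; x <= 0xffffffffu; x += 9973)
      assert(beforeFold((uint32_t)x) == afterFold((uint32_t)x));
    assert(beforeFold(0) && afterFold(0));
    assert(!beforeFold(0x20000000u) && !afterFold(0x20000000u));
    return 0;
  }

Bit i of (x | x >> 10 | x >> 20) is x[i] | x[i+10] | x[i+20], so masking
with 0x3ff tests exactly bits 0..29 of x, which is the single AND above.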
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 86 ++++++++++++++++++-
.../CodeGen/AMDGPU/workitem-intrinsic-opts.ll | 34 ++------
2 files changed, 91 insertions(+), 29 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 08dab7c697b99..a189208d3a62e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -28909,13 +28909,97 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
return SDValue();
}
+static SDValue matchMergedBFX(SDValue Root, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ // Match a pattern such as:
+ // (X | (X >> C0) | (X >> C1) | ...) & Mask
+ // This extracts contiguous parts of X and ORs them together before comparing.
+ // We can optimize this so that we directly check (X & SomeMask) instead,
+ // eliminating the shifts.
+
+ EVT VT = Root.getValueType();
+
+ if (Root.getOpcode() != ISD::AND)
+ return SDValue();
+
+ SDValue N0 = Root.getOperand(0);
+ SDValue N1 = Root.getOperand(1);
+
+ if (N0.getOpcode() != ISD::OR || !isa<ConstantSDNode>(N1))
+ return SDValue();
+
+ APInt RootMask = cast<ConstantSDNode>(N1)->getAsAPIntVal();
+ if (!RootMask.isMask())
+ return SDValue();
+
+ SDValue Src;
+ const auto IsSrc = [&](SDValue V) {
+ if (!Src) {
+ Src = V;
+ return true;
+ }
+
+ return Src == V;
+ };
+
+ SmallVector<SDValue> Worklist = {N0};
+ APInt PartsMask(VT.getSizeInBits(), 0);
+ while (!Worklist.empty()) {
+ SDValue V = Worklist.pop_back_val();
+ if (!V.hasOneUse() && Src != V)
+ return SDValue();
+
+ if (V.getOpcode() == ISD::OR) {
+ Worklist.push_back(V.getOperand(0));
+ Worklist.push_back(V.getOperand(1));
+ continue;
+ }
+
+ if (V.getOpcode() == ISD::SRL) {
+ SDValue ShiftSrc = V.getOperand(0);
+ SDValue ShiftAmt = V.getOperand(1);
+
+ if (!IsSrc(ShiftSrc) || !isa<ConstantSDNode>(ShiftAmt))
+ return SDValue();
+
+ PartsMask |= (RootMask << cast<ConstantSDNode>(ShiftAmt)->getAsZExtVal());
+ continue;
+ }
+
+ if (IsSrc(V)) {
+ PartsMask |= RootMask;
+ continue;
+ }
+
+ return SDValue();
+ }
+
+ if (!RootMask.isMask() || !Src)
+ return SDValue();
+
+ SDLoc DL(Root);
+ return DAG.getNode(ISD::AND, DL, VT,
+ {Src, DAG.getConstant(PartsMask, DL, VT)});
+}
+
/// This is a stub for TargetLowering::SimplifySetCC.
SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
ISD::CondCode Cond, const SDLoc &DL,
bool foldBooleans) {
TargetLowering::DAGCombinerInfo
DagCombineInfo(DAG, Level, false, this);
- return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL);
+ if (SDValue C =
+ TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL))
+ return C;
+
+ if ((Cond == ISD::SETNE || Cond == ISD::SETEQ) &&
+ N0.getOpcode() == ISD::AND && isNullConstant(N1)) {
+
+ if (SDValue Res = matchMergedBFX(N0, DAG, TLI))
+ return DAG.getSetCC(DL, VT, Res, N1, Cond);
+ }
+
+ return SDValue();
}
/// Given an ISD::SDIV node expressing a divide by constant, return
diff --git a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
index 07c4aeb1ac7df..64d055bc40e98 100644
--- a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
+++ b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
@@ -12,11 +12,7 @@ define i1 @workitem_zero() {
; DAGISEL-GFX8-LABEL: workitem_zero:
; DAGISEL-GFX8: ; %bb.0: ; %entry
; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX8-NEXT: v_lshrrev_b32_e32 v1, 10, v31
-; DAGISEL-GFX8-NEXT: v_lshrrev_b32_e32 v0, 20, v31
-; DAGISEL-GFX8-NEXT: v_or_b32_e32 v1, v31, v1
-; DAGISEL-GFX8-NEXT: v_or_b32_e32 v0, v1, v0
-; DAGISEL-GFX8-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; DAGISEL-GFX8-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
; DAGISEL-GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -24,10 +20,7 @@ define i1 @workitem_zero() {
; DAGISEL-GFX942-LABEL: workitem_zero:
; DAGISEL-GFX942: ; %bb.0: ; %entry
; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v0, 20, v31
-; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v1, 10, v31
-; DAGISEL-GFX942-NEXT: v_or3_b32 v0, v31, v1, v0
-; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
; DAGISEL-GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; DAGISEL-GFX942-NEXT: s_nop 1
; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -40,11 +33,7 @@ define i1 @workitem_zero() {
; DAGISEL-GFX12-NEXT: s_wait_samplecnt 0x0
; DAGISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
; DAGISEL-GFX12-NEXT: s_wait_kmcnt 0x0
-; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v0, 20, v31
-; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v1, 10, v31
-; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; DAGISEL-GFX12-NEXT: v_or3_b32 v0, v31, v1, v0
-; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; DAGISEL-GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffd
@@ -106,11 +95,7 @@ define i1 @workitem_nonzero() {
; DAGISEL-GFX8-LABEL: workitem_nonzero:
; DAGISEL-GFX8: ; %bb.0: ; %entry
; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX8-NEXT: v_lshrrev_b32_e32 v1, 10, v31
-; DAGISEL-GFX8-NEXT: v_lshrrev_b32_e32 v0, 20, v31
-; DAGISEL-GFX8-NEXT: v_or_b32_e32 v1, v31, v1
-; DAGISEL-GFX8-NEXT: v_or_b32_e32 v0, v1, v0
-; DAGISEL-GFX8-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; DAGISEL-GFX8-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
; DAGISEL-GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -118,10 +103,7 @@ define i1 @workitem_nonzero() {
; DAGISEL-GFX942-LABEL: workitem_nonzero:
; DAGISEL-GFX942: ; %bb.0: ; %entry
; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v0, 20, v31
-; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v1, 10, v31
-; DAGISEL-GFX942-NEXT: v_or3_b32 v0, v31, v1, v0
-; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
; DAGISEL-GFX942-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; DAGISEL-GFX942-NEXT: s_nop 1
; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -134,11 +116,7 @@ define i1 @workitem_nonzero() {
; DAGISEL-GFX12-NEXT: s_wait_samplecnt 0x0
; DAGISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
; DAGISEL-GFX12-NEXT: s_wait_kmcnt 0x0
-; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v0, 20, v31
-; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v1, 10, v31
-; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; DAGISEL-GFX12-NEXT: v_or3_b32 v0, v31, v1, v0
-; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31
; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; DAGISEL-GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffd
From bb21ba46b2d079da52bee2c3c4fb108c51f4804e Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Mon, 30 Jun 2025 10:19:44 +0200
Subject: [PATCH 2/2] Address review comments
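
The new shift-amount guard matters because (IIRC) APInt::operator<<=
asserts on shift amounts greater than the bit width; shifting by exactly
the bit width is defined and clears the value, which is why the patch
bails only on ShiftAmtVal > getBitWidth(). A minimal standalone sketch
of the PartsMask accumulation, with plain uint64_t standing in for APInt
and all names invented for the example:

  #include <cstdint>
  #include <optional>
  #include <vector>

  // Stand-in for the PartsMask accumulation in matchMergedBFX.
  static std::optional<uint64_t>
  buildPartsMask(uint64_t RootMask, unsigned BitWidth,
                 const std::vector<unsigned> &ShiftAmts) {
    // APInt silently truncates at BitWidth; emulate that here.
    uint64_t WidthMask =
        BitWidth >= 64 ? ~0ULL : ((1ULL << BitWidth) - 1);
    uint64_t PartsMask = RootMask;
    for (unsigned Amt : ShiftAmts) {
      // Mirror the patch's bail-out. With plain integers a shift of
      // 64 or more would be UB, so this sketch is slightly stricter
      // than the APInt version.
      if (Amt >= BitWidth)
        return std::nullopt;
      PartsMask |= (RootMask << Amt) & WidthMask;
    }
    return PartsMask;
  }

For example, buildPartsMask(0x3ff, 32, {10, 20}) yields 0x3fffffff, the
constant seen in the updated workitem tests.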
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 14 +--
llvm/test/CodeGen/AMDGPU/merged-bfx-opt.ll | 102 ++++++++++++++++++
2 files changed, 110 insertions(+), 6 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/merged-bfx-opt.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a189208d3a62e..04ac68bc69fae 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -28919,7 +28919,7 @@ static SDValue matchMergedBFX(SDValue Root, SelectionDAG &DAG,
EVT VT = Root.getValueType();
- if (Root.getOpcode() != ISD::AND)
+ if (!VT.isScalarInteger() || Root.getOpcode() != ISD::AND)
return SDValue();
SDValue N0 = Root.getOperand(0);
@@ -28929,8 +28929,6 @@ static SDValue matchMergedBFX(SDValue Root, SelectionDAG &DAG,
return SDValue();
APInt RootMask = cast<ConstantSDNode>(N1)->getAsAPIntVal();
- if (!RootMask.isMask())
- return SDValue();
SDValue Src;
const auto IsSrc = [&](SDValue V) {
@@ -28946,7 +28944,7 @@ static SDValue matchMergedBFX(SDValue Root, SelectionDAG &DAG,
APInt PartsMask(VT.getSizeInBits(), 0);
while (!Worklist.empty()) {
SDValue V = Worklist.pop_back_val();
- if (!V.hasOneUse() && Src != V)
+ if (!V.hasOneUse() && (Src && Src != V))
return SDValue();
if (V.getOpcode() == ISD::OR) {
@@ -28962,7 +28960,11 @@ static SDValue matchMergedBFX(SDValue Root, SelectionDAG &DAG,
if (!IsSrc(ShiftSrc) || !isa<ConstantSDNode>(ShiftAmt))
return SDValue();
- PartsMask |= (RootMask << cast<ConstantSDNode>(ShiftAmt)->getAsZExtVal());
+ auto ShiftAmtVal = cast<ConstantSDNode>(ShiftAmt)->getAsZExtVal();
+ if (ShiftAmtVal > RootMask.getBitWidth())
+ return SDValue();
+
+ PartsMask |= (RootMask << ShiftAmtVal);
continue;
}
@@ -28974,7 +28976,7 @@ static SDValue matchMergedBFX(SDValue Root, SelectionDAG &DAG,
return SDValue();
}
- if (!RootMask.isMask() || !Src)
+ if (!Src)
return SDValue();
SDLoc DL(Root);
diff --git a/llvm/test/CodeGen/AMDGPU/merged-bfx-opt.ll b/llvm/test/CodeGen/AMDGPU/merged-bfx-opt.ll
new file mode 100644
index 0000000000000..9d415484d4f9c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/merged-bfx-opt.ll
@@ -0,0 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O3 -mtriple=amdgcn -mcpu=fiji %s -o - | FileCheck %s
+
+define i1 @basic_eq_i16_3x5(i16 %arg) {
+; CHECK-LABEL: basic_eq_i16_3x5:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %a = and i16 %arg, 31
+ %sh5 = lshr i16 %arg, 5
+ %b = and i16 %sh5, 31
+ %or = or i16 %a, %b
+ %sh10 = lshr i16 %arg, 10
+ %c = and i16 %sh10, 31
+ %or1 = or i16 %or, %c
+ %cmp = icmp eq i16 %or1, 0
+ ret i1 %cmp
+}
+
+define i1 @basic_eq_i32_3x5(i32 %arg) {
+; CHECK-LABEL: basic_eq_i32_3x5:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %a = and i32 %arg, 31
+ %sh5 = lshr i32 %arg, 5
+ %b = and i32 %sh5, 31
+ %or = or i32 %a, %b
+ %sh10 = lshr i32 %arg, 10
+ %c = and i32 %sh10, 31
+ %or1 = or i32 %or, %c
+ %cmp = icmp eq i32 %or1, 0
+ ret i1 %cmp
+}
+
+define i1 @basic_eq_i64_3x5(i64 %arg) {
+; CHECK-LABEL: basic_eq_i64_3x5:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %a = and i64 %arg, 31
+ %sh5 = lshr i64 %arg, 5
+ %b = and i64 %sh5, 31
+ %or = or i64 %a, %b
+ %sh10 = lshr i64 %arg, 10
+ %c = and i64 %sh10, 31
+ %or1 = or i64 %or, %c
+ %cmp = icmp eq i64 %or1, 0
+ ret i1 %cmp
+}
+
+define i1 @basic_ne_i32_3x5(i32 %arg) {
+; CHECK-LABEL: basic_ne_i32_3x5:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %a = and i32 %arg, 31
+ %sh5 = lshr i32 %arg, 5
+ %b = and i32 %sh5, 31
+ %or = or i32 %a, %b
+ %sh10 = lshr i32 %arg, 10
+ %c = and i32 %sh10, 31
+ %or1 = or i32 %or, %c
+ %cmp = icmp ne i32 %or1, 0
+ ret i1 %cmp
+}
+
+define i1 @eq_i32_3x5_holes_in_mask(i32 %arg) {
+; CHECK-LABEL: eq_i32_3x5_holes_in_mask:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v0, 0x7f9f, v0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %a = and i32 %arg, 31
+ %sh5 = lshr i32 %arg, 7
+ %b = and i32 %sh5, 31
+ %or = or i32 %a, %b
+ %sh10 = lshr i32 %arg, 10
+ %c = and i32 %sh10, 31
+ %or1 = or i32 %or, %c
+ %cmp = icmp ne i32 %or1, 0
+ ret i1 %cmp
+}