[llvm-branch-commits] [llvm] [DAG] Fold (setcc ((x | x >> c0 | ...) & mask)) sequences (PR #146054)

Pierre van Houtryve via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Fri Jun 27 03:46:03 PDT 2025


https://github.com/Pierre-vh created https://github.com/llvm/llvm-project/pull/146054

Fold sequences where we extract a bunch of contiguous bits from a value,
merge them into the low bits, and then check whether those low bits are zero or not.

It seems like a strange sequence at first, but it's an idiom used in the device
libs to check workitem IDs for AMDGPU.

The reason I put this in DAGCombiner instead of the target combiner is
that this is a generic, valid transform that's also fairly niche, so I
don't think there is much risk of a combine loop.

See #136727
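
As a standalone illustration (not part of the patch), here is a small C++
sketch of the equivalence the fold relies on, using the shift amounts (10
and 20) and the 0x3ff mask from the workitem tests below; the function
names are just for illustration:

  #include <cstdint>

  // Shift-and-OR idiom: true iff bits [0, 30) of X are all zero.
  static bool viaShifts(uint32_t X) {
    return ((X | (X >> 10) | (X >> 20)) & 0x3ffu) == 0;
  }

  // Folded form the combine produces: a single mask test against the
  // union of the extracted bit ranges (0x3fffffff covers bits [0, 30)).
  static bool viaMask(uint32_t X) {
    return (X & 0x3fffffffu) == 0;
  }

Both functions agree for every X, which is what lets the combine drop the
shifts and ORs and test X against the merged mask directly.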

From 7ff08069adcd328f4356c28cfffea8aa004a4c50 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 26 Jun 2025 13:31:37 +0200
Subject: [PATCH] [DAG] Fold (setcc ((x | x >> c0 | ...) & mask)) sequences

Fold sequences where we extract a bunch of contiguous bits from a value,
merge them into the low bits, and then check whether those low bits are zero or not.

It seems like a strange sequence at first, but it's an idiom used in the device
libs to check workitem IDs for AMDGPU.

The reason I put this in DAGCombiner instead of the target combiner is
that this is a generic, valid transform that's also fairly niche, so I
don't think there is much risk of a combine loop.

See #136727
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 86 ++++++++++++++++++-
 .../AMDGPU/workitems-intrinsics-opts.ll       | 34 ++------
 2 files changed, 91 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 08dab7c697b99..a189208d3a62e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -28909,13 +28909,97 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
   return SDValue();
 }
 
+static SDValue matchMergedBFX(SDValue Root, SelectionDAG &DAG,
+                              const TargetLowering &TLI) {
+  // Match a pattern such as:
+  //  (X | (X >> C0) | (X >> C1) | ...) & Mask
+  // This extracts contiguous parts of X and ORs them together before comparing.
+  // We can optimize this so that we directly check (X & SomeMask) instead,
+  // eliminating the shifts.
+
+  EVT VT = Root.getValueType();
+
+  if (Root.getOpcode() != ISD::AND)
+    return SDValue();
+
+  SDValue N0 = Root.getOperand(0);
+  SDValue N1 = Root.getOperand(1);
+
+  if (N0.getOpcode() != ISD::OR || !isa<ConstantSDNode>(N1))
+    return SDValue();
+
+  APInt RootMask = cast<ConstantSDNode>(N1)->getAsAPIntVal();
+  if (!RootMask.isMask())
+    return SDValue();
+
+  SDValue Src;
+  const auto IsSrc = [&](SDValue V) {
+    if (!Src) {
+      Src = V;
+      return true;
+    }
+
+    return Src == V;
+  };
+
+  SmallVector<SDValue> Worklist = {N0};
+  APInt PartsMask(VT.getSizeInBits(), 0);
+  while (!Worklist.empty()) {
+    SDValue V = Worklist.pop_back_val();
+    if (!V.hasOneUse() && Src != V)
+      return SDValue();
+
+    if (V.getOpcode() == ISD::OR) {
+      Worklist.push_back(V.getOperand(0));
+      Worklist.push_back(V.getOperand(1));
+      continue;
+    }
+
+    if (V.getOpcode() == ISD::SRL) {
+      SDValue ShiftSrc = V.getOperand(0);
+      SDValue ShiftAmt = V.getOperand(1);
+
+      if (!IsSrc(ShiftSrc) || !isa<ConstantSDNode>(ShiftAmt))
+        return SDValue();
+
+      PartsMask |= (RootMask << cast<ConstantSDNode>(ShiftAmt)->getAsZExtVal());
+      continue;
+    }
+
+    if (IsSrc(V)) {
+      PartsMask |= RootMask;
+      continue;
+    }
+
+    return SDValue();
+  }
+
+  if (!Src)
+    return SDValue();
+
+  SDLoc DL(Root);
+  return DAG.getNode(ISD::AND, DL, VT,
+                     {Src, DAG.getConstant(PartsMask, DL, VT)});
+}
+
 /// This is a stub for TargetLowering::SimplifySetCC.
 SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
                                    ISD::CondCode Cond, const SDLoc &DL,
                                    bool foldBooleans) {
   TargetLowering::DAGCombinerInfo
     DagCombineInfo(DAG, Level, false, this);
-  return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL);
+  if (SDValue C =
+          TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL))
+    return C;
+
+  if ((Cond == ISD::SETNE || Cond == ISD::SETEQ) &&
+      N0.getOpcode() == ISD::AND && isNullConstant(N1)) {
+
+    if (SDValue Res = matchMergedBFX(N0, DAG, TLI))
+      return DAG.getSetCC(DL, VT, Res, N1, Cond);
+  }
+
+  return SDValue();
 }
 
 /// Given an ISD::SDIV node expressing a divide by constant, return
diff --git a/llvm/test/CodeGen/AMDGPU/workitems-intrinsics-opts.ll b/llvm/test/CodeGen/AMDGPU/workitems-intrinsics-opts.ll
index 14120680216fc..5a25ec29af481 100644
--- a/llvm/test/CodeGen/AMDGPU/workitems-intrinsics-opts.ll
+++ b/llvm/test/CodeGen/AMDGPU/workitems-intrinsics-opts.ll
@@ -12,11 +12,7 @@ define i1 @workitem_zero() {
 ; DAGISEL-GFX9-LABEL: workitem_zero:
 ; DAGISEL-GFX9:       ; %bb.0: ; %entry
 ; DAGISEL-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX9-NEXT:    v_lshrrev_b32_e32 v1, 10, v31
-; DAGISEL-GFX9-NEXT:    v_lshrrev_b32_e32 v0, 20, v31
-; DAGISEL-GFX9-NEXT:    v_or_b32_e32 v1, v31, v1
-; DAGISEL-GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
-; DAGISEL-GFX9-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; DAGISEL-GFX9-NEXT:    v_and_b32_e32 v0, 0x3fffffff, v31
 ; DAGISEL-GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; DAGISEL-GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; DAGISEL-GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -24,10 +20,7 @@ define i1 @workitem_zero() {
 ; DAGISEL-GFX942-LABEL: workitem_zero:
 ; DAGISEL-GFX942:       ; %bb.0: ; %entry
 ; DAGISEL-GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX942-NEXT:    v_lshrrev_b32_e32 v0, 20, v31
-; DAGISEL-GFX942-NEXT:    v_lshrrev_b32_e32 v1, 10, v31
-; DAGISEL-GFX942-NEXT:    v_or3_b32 v0, v31, v1, v0
-; DAGISEL-GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; DAGISEL-GFX942-NEXT:    v_and_b32_e32 v0, 0x3fffffff, v31
 ; DAGISEL-GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; DAGISEL-GFX942-NEXT:    s_nop 1
 ; DAGISEL-GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -40,11 +33,7 @@ define i1 @workitem_zero() {
 ; DAGISEL-GFX12-NEXT:    s_wait_samplecnt 0x0
 ; DAGISEL-GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; DAGISEL-GFX12-NEXT:    s_wait_kmcnt 0x0
-; DAGISEL-GFX12-NEXT:    v_lshrrev_b32_e32 v0, 20, v31
-; DAGISEL-GFX12-NEXT:    v_lshrrev_b32_e32 v1, 10, v31
-; DAGISEL-GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; DAGISEL-GFX12-NEXT:    v_or3_b32 v0, v31, v1, v0
-; DAGISEL-GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; DAGISEL-GFX12-NEXT:    v_and_b32_e32 v0, 0x3fffffff, v31
 ; DAGISEL-GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; DAGISEL-GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; DAGISEL-GFX12-NEXT:    s_wait_alu 0xfffd
@@ -106,11 +95,7 @@ define i1 @workitem_nonzero() {
 ; DAGISEL-GFX9-LABEL: workitem_nonzero:
 ; DAGISEL-GFX9:       ; %bb.0: ; %entry
 ; DAGISEL-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX9-NEXT:    v_lshrrev_b32_e32 v1, 10, v31
-; DAGISEL-GFX9-NEXT:    v_lshrrev_b32_e32 v0, 20, v31
-; DAGISEL-GFX9-NEXT:    v_or_b32_e32 v1, v31, v1
-; DAGISEL-GFX9-NEXT:    v_or_b32_e32 v0, v1, v0
-; DAGISEL-GFX9-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; DAGISEL-GFX9-NEXT:    v_and_b32_e32 v0, 0x3fffffff, v31
 ; DAGISEL-GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; DAGISEL-GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; DAGISEL-GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -118,10 +103,7 @@ define i1 @workitem_nonzero() {
 ; DAGISEL-GFX942-LABEL: workitem_nonzero:
 ; DAGISEL-GFX942:       ; %bb.0: ; %entry
 ; DAGISEL-GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX942-NEXT:    v_lshrrev_b32_e32 v0, 20, v31
-; DAGISEL-GFX942-NEXT:    v_lshrrev_b32_e32 v1, 10, v31
-; DAGISEL-GFX942-NEXT:    v_or3_b32 v0, v31, v1, v0
-; DAGISEL-GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; DAGISEL-GFX942-NEXT:    v_and_b32_e32 v0, 0x3fffffff, v31
 ; DAGISEL-GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; DAGISEL-GFX942-NEXT:    s_nop 1
 ; DAGISEL-GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -134,11 +116,7 @@ define i1 @workitem_nonzero() {
 ; DAGISEL-GFX12-NEXT:    s_wait_samplecnt 0x0
 ; DAGISEL-GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; DAGISEL-GFX12-NEXT:    s_wait_kmcnt 0x0
-; DAGISEL-GFX12-NEXT:    v_lshrrev_b32_e32 v0, 20, v31
-; DAGISEL-GFX12-NEXT:    v_lshrrev_b32_e32 v1, 10, v31
-; DAGISEL-GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; DAGISEL-GFX12-NEXT:    v_or3_b32 v0, v31, v1, v0
-; DAGISEL-GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; DAGISEL-GFX12-NEXT:    v_and_b32_e32 v0, 0x3fffffff, v31
 ; DAGISEL-GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; DAGISEL-GFX12-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; DAGISEL-GFX12-NEXT:    s_wait_alu 0xfffd



More information about the llvm-branch-commits mailing list