[llvm] [AMDGPU] Implement hasAndNot for scalar bitwise AND-NOT operations. (PR #112647)
Harrison Hao via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 17 22:28:12 PDT 2024
https://github.com/harrisonGPU updated https://github.com/llvm/llvm-project/pull/112647
>From 1822dab3f3a4b3634bece0edd095c14b8f502a56 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Thu, 17 Oct 2024 09:59:03 +0800
Subject: [PATCH 1/6] [AMDGPU] Implement hasAndNot for scalar bitwise AND-NOT
operations.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 8 ++++++++
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 2 ++
2 files changed, 10 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 0f65df0763cc83..b746b94a60be21 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3721,6 +3721,14 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
return DAG.getBuildVector(VT, DL, Args);
}
+bool AMDGPUTargetLowering::hasAndNot(SDValue Op) const {
+ if (Op->isDivergent())
+ return false;
+
+ EVT VT = Op.getValueType();
+ return VT == MVT::i32 || VT == MVT::i64;
+}
+
//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index b2fd31cb2346eb..1289458570358b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -99,6 +99,8 @@ class AMDGPUTargetLowering : public TargetLowering {
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+ bool hasAndNot(SDValue Y) const override;
+
protected:
bool shouldCombineMemoryType(EVT VT) const;
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
>From 43a26e7424299296ec1aaa1067d99a3185c4d294 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Thu, 17 Oct 2024 10:31:26 +0800
Subject: [PATCH 2/6] [AMDGPU] Update value name.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 1289458570358b..d05b8901c0cb6e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -99,7 +99,7 @@ class AMDGPUTargetLowering : public TargetLowering {
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
- bool hasAndNot(SDValue Y) const override;
+ bool hasAndNot(SDValue Op) const override;
protected:
bool shouldCombineMemoryType(EVT VT) const;
>From b873ad212233965cf7054fe13ca43696957cd2cb Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Thu, 17 Oct 2024 12:07:09 +0800
Subject: [PATCH 3/6] [AMDGPU] Update patch.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 16 ++++++++--------
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 9 +++++++--
2 files changed, 15 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index b746b94a60be21..09860cfff96fc2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3721,14 +3721,6 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
return DAG.getBuildVector(VT, DL, Args);
}
-bool AMDGPUTargetLowering::hasAndNot(SDValue Op) const {
- if (Op->isDivergent())
- return false;
-
- EVT VT = Op.getValueType();
- return VT == MVT::i32 || VT == MVT::i64;
-}
-
//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//
@@ -6051,3 +6043,11 @@ bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
Register N0, Register N1) const {
return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
}
+
+bool AMDGPUTargetLowering::hasAndNot(SDValue Op) const {
+ if (Op->isDivergent())
+ return false;
+
+ EVT VT = Op.getValueType();
+ return VT == MVT::i32 || VT == MVT::i64;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index d05b8901c0cb6e..952c95eba63760 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -99,8 +99,6 @@ class AMDGPUTargetLowering : public TargetLowering {
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
- bool hasAndNot(SDValue Op) const override;
-
protected:
bool shouldCombineMemoryType(EVT VT) const;
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -389,6 +387,13 @@ class AMDGPUTargetLowering : public TargetLowering {
MVT getFenceOperandTy(const DataLayout &DL) const override {
return MVT::i32;
}
+
+ /// Return true if the target supports a bitwise and-not operation:
+ /// X = ~A & B
+ /// This function checks if the operation can be directly mapped to the
+ /// target's native instructions, potentially simplifying select or other
+ /// related instructions by using more efficient hardware-specific operations.
+ bool hasAndNot(SDValue Op) const override;
};
namespace AMDGPUISD {
>From e75dde769d705388720e6d967d51ed319d87cc13 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Thu, 17 Oct 2024 12:54:13 +0800
Subject: [PATCH 4/6] [AMDGPU] Move to SIISelLowering.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 8 --------
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 7 -------
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 8 ++++++++
llvm/lib/Target/AMDGPU/SIISelLowering.h | 7 +++++++
4 files changed, 15 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 09860cfff96fc2..0f65df0763cc83 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -6043,11 +6043,3 @@ bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
Register N0, Register N1) const {
return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
}
-
-bool AMDGPUTargetLowering::hasAndNot(SDValue Op) const {
- if (Op->isDivergent())
- return false;
-
- EVT VT = Op.getValueType();
- return VT == MVT::i32 || VT == MVT::i64;
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 952c95eba63760..b2fd31cb2346eb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -387,13 +387,6 @@ class AMDGPUTargetLowering : public TargetLowering {
MVT getFenceOperandTy(const DataLayout &DL) const override {
return MVT::i32;
}
-
- /// Return true if the target supports a bitwise and-not operation:
- /// X = ~A & B
- /// This function checks if the operation can be directly mapped to the
- /// target's native instructions, potentially simplifying select or other
- /// related instructions by using more efficient hardware-specific operations.
- bool hasAndNot(SDValue Op) const override;
};
namespace AMDGPUISD {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index de9173e923ab5c..f59f25c7117491 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16890,3 +16890,11 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
AI->eraseFromParent();
return LI;
}
+
+bool SITargetLowering::hasAndNot(SDValue Op) const {
+ if (Op->isDivergent())
+ return false;
+
+ EVT VT = Op.getValueType();
+ return VT == MVT::i32 || VT == MVT::i64;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 6c3edf37945e24..b2094ad7993316 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -598,6 +598,13 @@ class SITargetLowering final : public AMDGPUTargetLowering {
MachineMemOperand::Flags
getTargetMMOFlags(const Instruction &I) const override;
+
+ /// Return true if the target supports a bitwise and-not operation:
+ /// X = ~A & B
+ /// This function checks if the operation can be directly mapped to the
+ /// target's native instructions, potentially simplifying select or other
+ /// related instructions by using more efficient hardware-specific operations.
+ bool hasAndNot(SDValue Op) const override;
};
// Returns true if argument is a boolean value which is not serialized into
>From 1417d7158afe057e2f1e1a00d2ff2ca2302a90f6 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Thu, 17 Oct 2024 14:22:34 +0800
Subject: [PATCH 5/6] [AMDGPU] Update comments.
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 ++
llvm/lib/Target/AMDGPU/SIISelLowering.h | 6 ------
2 files changed, 2 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f59f25c7117491..660731e9ce446d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16892,6 +16892,8 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
}
bool SITargetLowering::hasAndNot(SDValue Op) const {
+ // Return false if the operation is divergent, as AND-NOT optimization
+ // requires uniform behavior across threads.
if (Op->isDivergent())
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index b2094ad7993316..a07ee1641d9ae2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -598,12 +598,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
MachineMemOperand::Flags
getTargetMMOFlags(const Instruction &I) const override;
-
- /// Return true if the target supports a bitwise and-not operation:
- /// X = ~A & B
- /// This function checks if the operation can be directly mapped to the
- /// target's native instructions, potentially simplifying select or other
- /// related instructions by using more efficient hardware-specific operations.
bool hasAndNot(SDValue Op) const override;
};
>From 2c9aa28f7548634817ba18871bbcf42d6885c4b3 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Fri, 18 Oct 2024 13:21:18 +0800
Subject: [PATCH 6/6] [AMDGPU] Add a lit test for hasAndNot.
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 87 +++++++++++++++++++++--
llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 +
llvm/test/CodeGen/AMDGPU/andornot.ll | 39 ++++++++++
3 files changed, 122 insertions(+), 5 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/andornot.ll
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 660731e9ce446d..b78168131edc13 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6822,6 +6822,81 @@ static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
}
}
+SDValue SITargetLowering::combineAnd(SDValue Op,
+ DAGCombinerInfo &DCI) const {
+ const unsigned Opc = Op.getOpcode();
+ assert(Opc == ISD::AND);
+
+ auto &DAG = DCI.DAG;
+ SDLoc DL(Op);
+
+ if(hasAndNot(Op)) {
+ SDValue LHS = Op->getOperand(0);
+ SDValue RHS = Op->getOperand(1);
+
+ // (and LHS, (or Y, ~Z))
+ if (RHS.getOpcode() == ISD::OR && RHS.hasOneUse()) {
+ SDValue Y = RHS->getOperand(0);
+ SDValue NotZ = RHS->getOperand(1);
+
+ if (NotZ.getOpcode() == ISD::XOR && isAllOnesConstant(NotZ->getOperand(1))) {
+ SDValue Z = NotZ->getOperand(0);
+
+ if (!isa<ConstantSDNode>(Y)) {
+ SDValue NotY = DAG.getNOT(DL, Y, Y.getValueType());
+ SDValue AndNotYZ = DAG.getNode(ISD::AND, DL, Y.getValueType(), NotY, Z);
+ SDValue NotAndNotYZ = DAG.getNOT(DL, AndNotYZ, AndNotYZ.getValueType());
+ SDValue NewAnd = DAG.getNode(ISD::AND, DL, Op.getValueType(), LHS, NotAndNotYZ);
+ return NewAnd;
+ }
+ }
+ }
+ }
+
+ EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
+ : Op->getOperand(0).getValueType();
+ auto ExtTy = OpTy.changeElementType(MVT::i32);
+
+ if (DCI.isBeforeLegalizeOps() ||
+ isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
+ return SDValue();
+
+ SDValue LHS;
+ SDValue RHS;
+ if (Opc == ISD::SELECT) {
+ LHS = Op->getOperand(1);
+ RHS = Op->getOperand(2);
+ } else {
+ LHS = Op->getOperand(0);
+ RHS = Op->getOperand(1);
+ }
+
+ const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
+ LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
+
+ // Special case: for shifts, the RHS always needs a zext.
+ if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
+ RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
+ else
+ RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
+
+  // setcc always returns an i1 (or i1 vector), so there is no need to truncate after.
+ if (Opc == ISD::SETCC) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
+ }
+
+ // For other ops, we extend the operation's return type as well so we need to
+ // truncate back to the original type.
+ SDValue NewVal;
+ if (Opc == ISD::SELECT)
+ NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
+ else
+ NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
+
+ return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
+}
+
SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
DAGCombinerInfo &DCI) const {
const unsigned Opc = Op.getOpcode();
@@ -14797,16 +14872,19 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
return SDValue(CSrc, 0);
}
-
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
+ case ISD::AND:
+ if (auto Res = combineAnd(SDValue(N, 0), DCI))
+ return Res;
+ break;
case ISD::ADD:
case ISD::SUB:
case ISD::SHL:
case ISD::SRL:
case ISD::SRA:
- case ISD::AND:
case ISD::OR:
case ISD::XOR:
case ISD::MUL:
@@ -14910,7 +14988,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case AMDGPUISD::CLAMP:
return performClampCombine(N, DCI);
case ISD::SCALAR_TO_VECTOR: {
- SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
// v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
@@ -16892,8 +16969,8 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
}
bool SITargetLowering::hasAndNot(SDValue Op) const {
- // Return false if the operation is divergent, as AND-NOT optimization
- // requires uniform behavior across threads.
+ // Return false if the operation is divergent, as AND-NOT is a scalar-only
+ // instruction.
if (Op->isDivergent())
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index a07ee1641d9ae2..4c90f4da42006b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -147,6 +147,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue combineAnd(SDValue Op, DAGCombinerInfo &DCI) const;
SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const;
SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AMDGPU/andornot.ll b/llvm/test/CodeGen/AMDGPU/andornot.ll
new file mode 100644
index 00000000000000..821709847ab8d5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/andornot.ll
@@ -0,0 +1,39 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}scalar_and_or_not_i16
+; GCN: s_not_b32
+; GCN-NEXT: s_lshr_b32
+; GCN-NEXT: s_and_b32
+; GCN-NEXT: s_andn2_b32
+define amdgpu_kernel void @scalar_and_or_not_i16(ptr addrspace(1) %out, i16 %x, i16 %y, i16 %z) {
+entry:
+ %not_z = xor i16 %z, -1
+ %or_y_not_z = or i16 %y, %not_z
+ %and_result = and i16 %x, %or_y_not_z
+ store i16 %and_result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}scalar_and_or_not_i32
+; GCN: s_andn2_b32
+; GCN-NEXT: s_andn2_b32
+define amdgpu_kernel void @scalar_and_or_not_i32(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) {
+entry:
+ %not_z = xor i32 %z, -1
+ %or_y_not_z = or i32 %y, %not_z
+ %and_result = and i32 %x, %or_y_not_z
+ store i32 %and_result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}scalar_and_or_not_i64
+; GCN: s_andn2_b64
+; GCN-NEXT: s_andn2_b64
+define amdgpu_kernel void @scalar_and_or_not_i64(ptr addrspace(1) %out, i64 %x, i64 %y, i64 %z) {
+entry:
+ %not_z = xor i64 %z, -1
+ %or_y_not_z = or i64 %y, %not_z
+ %and_result = and i64 %x, %or_y_not_z
+ store i64 %and_result, ptr addrspace(1) %out, align 4
+ ret void
+}
More information about the llvm-commits
mailing list