[llvm] Reduce shl64 to shl32 if shift range is [63-32] (PR #125574)

via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 5 09:04:07 PST 2025


https://github.com/LU-JOHN updated https://github.com/llvm/llvm-project/pull/125574

>From 16742f006dabc82ab2cd799d2e52830255f96d90 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Wed, 5 Feb 2025 11:01:30 -0600
Subject: [PATCH] Reduce shl64 to shl32 if shift range is [63-32]

Signed-off-by: John Lu <John.Lu at amd.com>
---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 32 ++++++---
 llvm/test/CodeGen/AMDGPU/shl64_reduce.ll      | 67 +++++++++++++++++++
 2 files changed, 91 insertions(+), 8 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/shl64_reduce.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 792e17eeedab141..d7c004e1308c7f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4040,19 +4040,35 @@ SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
   EVT VT = N->getValueType(0);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
+  SDLoc SL(N);
+  SelectionDAG &DAG = DCI.DAG;
 
-  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
-  if (!RHS)
+  if (!CRHS) {
+    // shl i64 X, Y -> [0, shl i32 X, (Y - 32)]
+    if (VT == MVT::i64) {
+      KnownBits Known = DAG.computeKnownBits(RHS);
+      if (Known.getMinValue().getZExtValue() >= 32) {
+        SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, RHS);
+        const SDValue C32 = DAG.getConstant(32, SL, MVT::i32);
+        SDValue ShiftAmt =
+            DAG.getNode(ISD::SUB, SL, MVT::i32, TruncShiftAmt, C32);
+        SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+        SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
+        const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+        SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
+        return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
+      }
+    }
     return SDValue();
+  }
 
-  SDValue LHS = N->getOperand(0);
-  unsigned RHSVal = RHS->getZExtValue();
+  unsigned RHSVal = CRHS->getZExtValue();
   if (!RHSVal)
     return LHS;
 
-  SDLoc SL(N);
-  SelectionDAG &DAG = DCI.DAG;
-
   switch (LHS->getOpcode()) {
   default:
     break;
@@ -4078,7 +4094,7 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
     if (LZ < RHSVal)
       break;
     EVT XVT = X.getValueType();
-    SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
+    SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(CRHS, 0));
     return DAG.getZExtOrTrunc(Shl, SL, VT);
   }
   }
diff --git a/llvm/test/CodeGen/AMDGPU/shl64_reduce.ll b/llvm/test/CodeGen/AMDGPU/shl64_reduce.ll
new file mode 100644
index 000000000000000..d18dc2bf3731011
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/shl64_reduce.ll
@@ -0,0 +1,67 @@
+;; Test reduction of:
+;;
+;;   DST = shl i64 X, Y
+;;
+;; where Y is in the range [63-32] to:
+;;
+;;   DST = [0, shl i32 X, (Y - 32)]
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s
+
+; FIXME: This case should be reduced, but SelectionDAG::computeKnownBits() cannot
+;        determine the minimum from metadata in this case.  Match current results
+;        for now.
+define i64 @shl_metadata(i64 noundef %arg0, ptr %arg1.ptr) {
+  %shift.amt = load i64, ptr %arg1.ptr, !range !0
+  %shl = shl i64 %arg0, %shift.amt
+  ret i64 %shl
+
+; CHECK: .globl  shl_metadata
+; CHECK: v_lshl_b64 v[0:1], v[0:1], v2
+}
+
+!0 = !{i64 32, i64 64}
+
+; This case is reduced because computeKnownBits() can calculate a minimum of 32
+; based on the OR with 32.
+define i64 @shl_or32(i64 noundef %arg0, ptr %arg1.ptr) {
+  %shift.amt = load i64, ptr %arg1.ptr
+  %or = or i64 %shift.amt, 32
+  %shl = shl i64 %arg0, %or
+  ret i64 %shl
+
+; CHECK: .globl  shl_or32
+; CHECK: v_or_b32_e32 v1, 32, v1
+; CHECK: v_subrev_i32_e32 v1, vcc, 32, v1
+; CHECK: v_lshlrev_b32_e32 v1, v1, v0
+; CHECK: v_mov_b32_e32 v0, 0
+}
+
+; This case must not be reduced because the known minimum, 16, is not in range.
+define i64 @shl_or16(i64 noundef %arg0, ptr %arg1.ptr) {
+  %shift.amt = load i64, ptr %arg1.ptr
+  %or = or i64 %shift.amt, 16
+  %shl = shl i64 %arg0, %or
+  ret i64 %shl
+
+; CHECK: .globl  shl_or16
+; CHECK: v_or_b32_e32 v2, 16, v2
+; CHECK: v_lshl_b64 v[0:1], v[0:1], v2
+}
+
+; FIXME: This case should be reduced too, but computeKnownBits() cannot
+;        determine the range.  Match current results for now.
+define i64 @shl_maxmin(i64 noundef %arg0, i64 noundef %arg1) {
+  %max = call i64 @llvm.umax.i64(i64 %arg1, i64 32)
+  %min = call i64 @llvm.umin.i64(i64 %max,  i64 63)
+  %shl = shl i64 %arg0, %min
+  ret i64 %shl
+
+; CHECK: .globl  shl_maxmin
+; CHECK: v_cmp_lt_u64_e32 vcc, 32, v[2:3]
+; CHECK: v_cndmask_b32_e32 v3, 0, v3, vcc
+; CHECK: v_cndmask_b32_e32 v2, 32, v2, vcc
+; CHECK: v_cmp_gt_u64_e32 vcc, 63, v[2:3]
+; CHECK: v_cndmask_b32_e32 v2, 63, v2, vcc
+; CHECK: v_lshl_b64 v[0:1], v[0:1], v2
+}
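
For reference (not part of the patch), below is a minimal standalone C++ sanity
check of the identity the combine relies on: when the shift amount is known to
lie in [32, 63], the low 32 bits of a 64-bit left shift are zero and the high
32 bits equal the low 32 bits of X shifted left by (amount - 32). The helper
name shl64_reduced and the test values are illustrative only.

#include <cassert>
#include <cstdint>

// Model of the reduced form: {lo, hi} = {0, (uint32_t)X << (Amt - 32)}.
static uint64_t shl64_reduced(uint64_t X, unsigned Amt) {
  // Precondition: 32 <= Amt <= 63, i.e. what computeKnownBits establishes
  // in the combine above.
  uint32_t Lo = 0;
  uint32_t Hi = static_cast<uint32_t>(X) << (Amt - 32);
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}

int main() {
  for (unsigned Amt = 32; Amt <= 63; ++Amt)
    for (uint64_t X : {0x0ULL, 0x1ULL, 0xDEADBEEFCAFEF00DULL, ~0ULL})
      assert((X << Amt) == shl64_reduced(X, Amt));
  return 0;
}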


