[llvm] Reduce shl64 to shl32 if shift range is [63-32] (PR #125574)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 5 09:04:07 PST 2025
https://github.com/LU-JOHN updated https://github.com/llvm/llvm-project/pull/125574
>From 16742f006dabc82ab2cd799d2e52830255f96d90 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Wed, 5 Feb 2025 11:01:30 -0600
Subject: [PATCH] Reduce shl64 to shl32 if shift range is [63-32]
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 32 ++++++---
llvm/test/CodeGen/AMDGPU/shl64_reduce.ll | 67 +++++++++++++++++++
2 files changed, 91 insertions(+), 8 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/shl64_reduce.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 792e17eeedab141..d7c004e1308c7f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4040,19 +4040,35 @@ SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
+ SDLoc SL(N);
+ SelectionDAG &DAG = DCI.DAG;
- ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
- if (!RHS)
+ if (!CRHS) {
+ // shl i64 X, Y -> [0, shl i32 X, (Y - 32)]
+ if (VT == MVT::i64) {
+ KnownBits Known = DAG.computeKnownBits(RHS);
+ if (Known.getMinValue().getZExtValue() >= 32) {
+ SDValue truncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, RHS);
+ const SDValue C32 = DAG.getConstant(32, SL, MVT::i32);
+ SDValue ShiftAmt =
+ DAG.getNode(ISD::SUB, SL, MVT::i32, truncShiftAmt, C32);
+ SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+ SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
+ const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+ SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
+ }
+ }
return SDValue();
+ }
- SDValue LHS = N->getOperand(0);
- unsigned RHSVal = RHS->getZExtValue();
+ unsigned RHSVal = CRHS->getZExtValue();
if (!RHSVal)
return LHS;
- SDLoc SL(N);
- SelectionDAG &DAG = DCI.DAG;
-
switch (LHS->getOpcode()) {
default:
break;
@@ -4078,7 +4094,7 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
if (LZ < RHSVal)
break;
EVT XVT = X.getValueType();
- SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
+ SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(CRHS, 0));
return DAG.getZExtOrTrunc(Shl, SL, VT);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/shl64_reduce.ll b/llvm/test/CodeGen/AMDGPU/shl64_reduce.ll
new file mode 100644
index 000000000000000..d18dc2bf3731011
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/shl64_reduce.ll
@@ -0,0 +1,67 @@
+;; Test reduction of:
+;;
+;; DST = shl i64 X, Y
+;;
+;; where Y is in the range [32, 63] to:
+;;
+;; DST = [0, shl i32 X, (Y - 32)]
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s
+
+; FIXME: This case should be reduced, but SelectionDAG::computeKnownBits() cannot
+; determine the minimum from metadata in this case. Match current results
+; for now.
+define i64 @shl_metadata(i64 noundef %arg0, ptr %arg1.ptr) {
+ %shift.amt = load i64, ptr %arg1.ptr, !range !0
+ %shl = shl i64 %arg0, %shift.amt
+ ret i64 %shl
+
+; CHECK: .globl shl_metadata
+; CHECK: v_lshl_b64 v[0:1], v[0:1], v2
+}
+
+!0 = !{i64 32, i64 64}
+
+; This case is reduced because computeKnownBits() can calculate a minimum of 32
+; based on the OR with 32.
+define i64 @shl_or32(i64 noundef %arg0, ptr %arg1.ptr) {
+ %shift.amt = load i64, ptr %arg1.ptr
+ %or = or i64 %shift.amt, 32
+ %shl = shl i64 %arg0, %or
+ ret i64 %shl
+
+; CHECK: .globl shl_or32
+; CHECK: v_or_b32_e32 v1, 32, v1
+; CHECK: v_subrev_i32_e32 v1, vcc, 32, v1
+; CHECK: v_lshlrev_b32_e32 v1, v1, v0
+; CHECK: v_mov_b32_e32 v0, 0
+}
+
+; This case must not be reduced because the known minimum, 16, is below 32.
+define i64 @shl_or16(i64 noundef %arg0, ptr %arg1.ptr) {
+ %shift.amt = load i64, ptr %arg1.ptr
+ %or = or i64 %shift.amt, 16
+ %shl = shl i64 %arg0, %or
+ ret i64 %shl
+
+; CHECK: .globl shl_or16
+; CHECK: v_or_b32_e32 v2, 16, v2
+; CHECK: v_lshl_b64 v[0:1], v[0:1], v2
+}
+
+; FIXME: This case should be reduced too, but computeKnownBits() cannot
+; determine the range. Match current results for now.
+define i64 @shl_maxmin(i64 noundef %arg0, i64 noundef %arg1) {
+ %max = call i64 @llvm.umax.i64(i64 %arg1, i64 32)
+ %min = call i64 @llvm.umin.i64(i64 %max, i64 63)
+ %shl = shl i64 %arg0, %min
+ ret i64 %shl
+
+; CHECK: .globl shl_maxmin
+; CHECK: v_cmp_lt_u64_e32 vcc, 32, v[2:3]
+; CHECK: v_cndmask_b32_e32 v3, 0, v3, vcc
+; CHECK: v_cndmask_b32_e32 v2, 32, v2, vcc
+; CHECK: v_cmp_gt_u64_e32 vcc, 63, v[2:3]
+; CHECK: v_cndmask_b32_e32 v2, 63, v2, vcc
+; CHECK: v_lshl_b64 v[0:1], v[0:1], v2
+}
More information about the llvm-commits
mailing list