[llvm] [DAGCombiner] Attempt to fold 'add' nodes to funnel-shift or rotate (PR #125612)
Alex MacLean via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 19 13:01:40 PST 2025
https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/125612
>From 605ac34aae8677fb09885e455fd6765c1dae31a7 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Tue, 4 Feb 2025 17:39:18 +0000
Subject: [PATCH 1/4] pre-commit tests
---
llvm/test/CodeGen/NVPTX/add-rotate.ll | 172 ++++++++++++++++++++++++++
1 file changed, 172 insertions(+)
create mode 100644 llvm/test/CodeGen/NVPTX/add-rotate.ll
diff --git a/llvm/test/CodeGen/NVPTX/add-rotate.ll b/llvm/test/CodeGen/NVPTX/add-rotate.ll
new file mode 100644
index 0000000000000..b7430f284ddcb
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/add-rotate.ll
@@ -0,0 +1,172 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_50 | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @test_rotl(i32 %x) {
+; CHECK-LABEL: test_rotl(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_rotl_param_0];
+; CHECK-NEXT: shf.l.wrap.b32 %r2, %r1, %r1, 7;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %shl = shl i32 %x, 7
+ %shr = lshr i32 %x, 25
+ %add = add i32 %shl, %shr
+ ret i32 %add
+}
+
+define i32 @test_rotr(i32 %x) {
+; CHECK-LABEL: test_rotr(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_rotr_param_0];
+; CHECK-NEXT: shf.l.wrap.b32 %r2, %r1, %r1, 25;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %shr = lshr i32 %x, 7
+ %shl = shl i32 %x, 25
+ %add = add i32 %shr, %shl
+ ret i32 %add
+}
+
+define i32 @test_rotl_var(i32 %x, i32 %y) {
+; CHECK-LABEL: test_rotl_var(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_rotl_var_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [test_rotl_var_param_1];
+; CHECK-NEXT: shl.b32 %r3, %r1, %r2;
+; CHECK-NEXT: sub.s32 %r4, 32, %r2;
+; CHECK-NEXT: shr.u32 %r5, %r1, %r4;
+; CHECK-NEXT: add.s32 %r6, %r3, %r5;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT: ret;
+ %shl = shl i32 %x, %y
+ %sub = sub i32 32, %y
+ %shr = lshr i32 %x, %sub
+ %add = add i32 %shl, %shr
+ ret i32 %add
+}
+
+define i32 @test_rotr_var(i32 %x, i32 %y) {
+; CHECK-LABEL: test_rotr_var(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_rotr_var_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [test_rotr_var_param_1];
+; CHECK-NEXT: shr.u32 %r3, %r1, %r2;
+; CHECK-NEXT: sub.s32 %r4, 32, %r2;
+; CHECK-NEXT: shl.b32 %r5, %r1, %r4;
+; CHECK-NEXT: add.s32 %r6, %r3, %r5;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT: ret;
+ %shr = lshr i32 %x, %y
+ %sub = sub i32 32, %y
+ %shl = shl i32 %x, %sub
+ %add = add i32 %shr, %shl
+ ret i32 %add
+}
+
+define i32 @test_rotl_var_and(i32 %x, i32 %y) {
+; CHECK-LABEL: test_rotl_var_and(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_rotl_var_and_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [test_rotl_var_and_param_1];
+; CHECK-NEXT: shl.b32 %r3, %r1, %r2;
+; CHECK-NEXT: neg.s32 %r4, %r2;
+; CHECK-NEXT: and.b32 %r5, %r4, 31;
+; CHECK-NEXT: shr.u32 %r6, %r1, %r5;
+; CHECK-NEXT: add.s32 %r7, %r6, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r7;
+; CHECK-NEXT: ret;
+ %shr = shl i32 %x, %y
+ %sub = sub nsw i32 0, %y
+ %and = and i32 %sub, 31
+ %shl = lshr i32 %x, %and
+ %add = add i32 %shl, %shr
+ ret i32 %add
+}
+
+define i32 @test_rotr_var_and(i32 %x, i32 %y) {
+; CHECK-LABEL: test_rotr_var_and(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_rotr_var_and_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [test_rotr_var_and_param_1];
+; CHECK-NEXT: shr.u32 %r3, %r1, %r2;
+; CHECK-NEXT: neg.s32 %r4, %r2;
+; CHECK-NEXT: and.b32 %r5, %r4, 31;
+; CHECK-NEXT: shl.b32 %r6, %r1, %r5;
+; CHECK-NEXT: add.s32 %r7, %r3, %r6;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r7;
+; CHECK-NEXT: ret;
+ %shr = lshr i32 %x, %y
+ %sub = sub nsw i32 0, %y
+ %and = and i32 %sub, 31
+ %shl = shl i32 %x, %and
+ %add = add i32 %shr, %shl
+ ret i32 %add
+}
+
+define i32 @test_fshl_special_case(i32 %x0, i32 %x1, i32 %y) {
+; CHECK-LABEL: test_fshl_special_case(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_fshl_special_case_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [test_fshl_special_case_param_2];
+; CHECK-NEXT: shl.b32 %r3, %r1, %r2;
+; CHECK-NEXT: ld.param.u32 %r4, [test_fshl_special_case_param_1];
+; CHECK-NEXT: shr.u32 %r5, %r4, 1;
+; CHECK-NEXT: xor.b32 %r6, %r2, 31;
+; CHECK-NEXT: shr.u32 %r7, %r5, %r6;
+; CHECK-NEXT: add.s32 %r8, %r3, %r7;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT: ret;
+ %shl = shl i32 %x0, %y
+ %srli = lshr i32 %x1, 1
+ %x = xor i32 %y, 31
+ %srlo = lshr i32 %srli, %x
+ %o = add i32 %shl, %srlo
+ ret i32 %o
+}
+
+define i32 @test_fshr_special_case(i32 %x0, i32 %x1, i32 %y) {
+; CHECK-LABEL: test_fshr_special_case(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_fshr_special_case_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [test_fshr_special_case_param_1];
+; CHECK-NEXT: ld.param.u32 %r3, [test_fshr_special_case_param_2];
+; CHECK-NEXT: shr.u32 %r4, %r2, %r3;
+; CHECK-NEXT: shl.b32 %r5, %r1, 1;
+; CHECK-NEXT: xor.b32 %r6, %r3, 31;
+; CHECK-NEXT: shl.b32 %r7, %r5, %r6;
+; CHECK-NEXT: add.s32 %r8, %r4, %r7;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT: ret;
+ %shl = lshr i32 %x1, %y
+ %srli = shl i32 %x0, 1
+ %x = xor i32 %y, 31
+ %srlo = shl i32 %srli, %x
+ %o = add i32 %shl, %srlo
+ ret i32 %o
+}
>From e128108a9f4fc1b85b52f7a79138f1ae62b092c8 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Tue, 4 Feb 2025 17:39:29 +0000
Subject: [PATCH 2/4] [DAGCombiner] Attempt to fold 'add' nodes to funnel-shift
or rotate
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 86 +++++++++----------
llvm/test/CodeGen/NVPTX/add-rotate.ll | 42 +++------
2 files changed, 57 insertions(+), 71 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f4caaf426de6a..793a174ebb1ea 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -662,14 +662,15 @@ namespace {
bool DemandHighBits = true);
SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
SDValue MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg,
- SDValue InnerPos, SDValue InnerNeg, bool HasPos,
- unsigned PosOpcode, unsigned NegOpcode,
- const SDLoc &DL);
+ SDValue InnerPos, SDValue InnerNeg, bool FromAdd,
+ bool HasPos, unsigned PosOpcode,
+ unsigned NegOpcode, const SDLoc &DL);
SDValue MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, SDValue Neg,
- SDValue InnerPos, SDValue InnerNeg, bool HasPos,
- unsigned PosOpcode, unsigned NegOpcode,
- const SDLoc &DL);
- SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
+ SDValue InnerPos, SDValue InnerNeg, bool FromAdd,
+ bool HasPos, unsigned PosOpcode,
+ unsigned NegOpcode, const SDLoc &DL);
+ SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL,
+ bool FromAdd);
SDValue MatchLoadCombine(SDNode *N);
SDValue mergeTruncStores(StoreSDNode *N);
SDValue reduceLoadWidth(SDNode *N);
@@ -2992,6 +2993,9 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
if (SDValue V = foldAddSubOfSignBit(N, DL, DAG))
return V;
+ if (SDValue V = MatchRotate(N0, N1, SDLoc(N), /*FromAdd=*/true))
+ return V;
+
// Try to match AVGFLOOR fixedwidth pattern
if (SDValue V = foldAddToAvg(N, DL))
return V;
@@ -8161,7 +8165,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
return V;
// See if this is some rotate idiom.
- if (SDValue Rot = MatchRotate(N0, N1, DL))
+ if (SDValue Rot = MatchRotate(N0, N1, DL, /*FromAdd=*/false))
return Rot;
if (SDValue Load = MatchLoadCombine(N))
@@ -8350,7 +8354,7 @@ static SDValue extractShiftForRotate(SelectionDAG &DAG, SDValue OppShift,
// The IsRotate flag should be set when the LHS of both shifts is the same.
// Otherwise if matching a general funnel shift, it should be clear.
static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize,
- SelectionDAG &DAG, bool IsRotate) {
+ SelectionDAG &DAG, bool IsRotate, bool FromAdd) {
const auto &TLI = DAG.getTargetLoweringInfo();
// If EltSize is a power of 2 then:
//
@@ -8389,7 +8393,7 @@ static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize,
// NOTE: We can only do this when matching operations which won't modify the
// least Log2(EltSize) significant bits and not a general funnel shift.
unsigned MaskLoBits = 0;
- if (IsRotate && isPowerOf2_64(EltSize)) {
+ if (IsRotate && !FromAdd && isPowerOf2_64(EltSize)) {
unsigned Bits = Log2_64(EltSize);
unsigned NegBits = Neg.getScalarValueSizeInBits();
if (NegBits >= Bits) {
@@ -8472,9 +8476,9 @@ static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize,
// Neg with outer conversions stripped away.
SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
SDValue Neg, SDValue InnerPos,
- SDValue InnerNeg, bool HasPos,
- unsigned PosOpcode, unsigned NegOpcode,
- const SDLoc &DL) {
+ SDValue InnerNeg, bool FromAdd,
+ bool HasPos, unsigned PosOpcode,
+ unsigned NegOpcode, const SDLoc &DL) {
// fold (or (shl x, (*ext y)),
// (srl x, (*ext (sub 32, y)))) ->
// (rotl x, y) or (rotr x, (sub 32, y))
@@ -8484,10 +8488,9 @@ SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
// (rotr x, y) or (rotl x, (sub 32, y))
EVT VT = Shifted.getValueType();
if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG,
- /*IsRotate*/ true)) {
+ /*IsRotate*/ true, FromAdd))
return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted,
HasPos ? Pos : Neg);
- }
return SDValue();
}
@@ -8500,9 +8503,9 @@ SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
// TODO: Merge with MatchRotatePosNeg.
SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
SDValue Neg, SDValue InnerPos,
- SDValue InnerNeg, bool HasPos,
- unsigned PosOpcode, unsigned NegOpcode,
- const SDLoc &DL) {
+ SDValue InnerNeg, bool FromAdd,
+ bool HasPos, unsigned PosOpcode,
+ unsigned NegOpcode, const SDLoc &DL) {
EVT VT = N0.getValueType();
unsigned EltBits = VT.getScalarSizeInBits();
@@ -8513,10 +8516,10 @@ SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
// fold (or (shl x0, (*ext (sub 32, y))),
// (srl x1, (*ext y))) ->
// (fshr x0, x1, y) or (fshl x0, x1, (sub 32, y))
- if (matchRotateSub(InnerPos, InnerNeg, EltBits, DAG, /*IsRotate*/ N0 == N1)) {
+ if (matchRotateSub(InnerPos, InnerNeg, EltBits, DAG, /*IsRotate*/ N0 == N1,
+ FromAdd))
return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, N0, N1,
HasPos ? Pos : Neg);
- }
// Matching the shift+xor cases, we can't easily use the xor'd shift amount
// so for now just use the PosOpcode case if its legal.
@@ -8561,11 +8564,12 @@ SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
return SDValue();
}
-// MatchRotate - Handle an 'or' of two operands. If this is one of the many
-// idioms for rotate, and if the target supports rotation instructions, generate
-// a rot[lr]. This also matches funnel shift patterns, similar to rotation but
-// with different shifted sources.
-SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
+// MatchRotate - Handle an 'or' or 'add' of two operands. If this is one of the
+// many idioms for rotate, and if the target supports rotation instructions,
+// generate a rot[lr]. This also matches funnel shift patterns, similar to
+// rotation but with different shifted sources.
+SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL,
+ bool FromAdd) {
EVT VT = LHS.getValueType();
// The target must have at least one rotate/funnel flavor.
@@ -8592,9 +8596,9 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
if (LHS.getOpcode() == ISD::TRUNCATE && RHS.getOpcode() == ISD::TRUNCATE &&
LHS.getOperand(0).getValueType() == RHS.getOperand(0).getValueType()) {
assert(LHS.getValueType() == RHS.getValueType());
- if (SDValue Rot = MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL)) {
+ if (SDValue Rot =
+ MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL, FromAdd))
return DAG.getNode(ISD::TRUNCATE, SDLoc(LHS), LHS.getValueType(), Rot);
- }
}
// Match "(X shl/srl V1) & V2" where V2 may not be present.
@@ -8774,29 +8778,25 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
}
if (IsRotate && (HasROTL || HasROTR)) {
- SDValue TryL =
- MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt, LExtOp0,
- RExtOp0, HasROTL, ISD::ROTL, ISD::ROTR, DL);
- if (TryL)
+ if (SDValue TryL = MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt,
+ LExtOp0, RExtOp0, FromAdd, HasROTL,
+ ISD::ROTL, ISD::ROTR, DL))
return TryL;
- SDValue TryR =
- MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt, RExtOp0,
- LExtOp0, HasROTR, ISD::ROTR, ISD::ROTL, DL);
- if (TryR)
+ if (SDValue TryR = MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
+ RExtOp0, LExtOp0, FromAdd, HasROTR,
+ ISD::ROTR, ISD::ROTL, DL))
return TryR;
}
- SDValue TryL =
- MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, LHSShiftAmt, RHSShiftAmt,
- LExtOp0, RExtOp0, HasFSHL, ISD::FSHL, ISD::FSHR, DL);
- if (TryL)
+ if (SDValue TryL = MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, LHSShiftAmt,
+ RHSShiftAmt, LExtOp0, RExtOp0, FromAdd,
+ HasFSHL, ISD::FSHL, ISD::FSHR, DL))
return TryL;
- SDValue TryR =
- MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
- RExtOp0, LExtOp0, HasFSHR, ISD::FSHR, ISD::FSHL, DL);
- if (TryR)
+ if (SDValue TryR = MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, RHSShiftAmt,
+ LHSShiftAmt, RExtOp0, LExtOp0, FromAdd,
+ HasFSHR, ISD::FSHR, ISD::FSHL, DL))
return TryR;
return SDValue();
diff --git a/llvm/test/CodeGen/NVPTX/add-rotate.ll b/llvm/test/CodeGen/NVPTX/add-rotate.ll
index b7430f284ddcb..b25d4a2bd8001 100644
--- a/llvm/test/CodeGen/NVPTX/add-rotate.ll
+++ b/llvm/test/CodeGen/NVPTX/add-rotate.ll
@@ -38,16 +38,13 @@ define i32 @test_rotr(i32 %x) {
define i32 @test_rotl_var(i32 %x, i32 %y) {
; CHECK-LABEL: test_rotl_var(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test_rotl_var_param_0];
; CHECK-NEXT: ld.param.u32 %r2, [test_rotl_var_param_1];
-; CHECK-NEXT: shl.b32 %r3, %r1, %r2;
-; CHECK-NEXT: sub.s32 %r4, 32, %r2;
-; CHECK-NEXT: shr.u32 %r5, %r1, %r4;
-; CHECK-NEXT: add.s32 %r6, %r3, %r5;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT: shf.l.wrap.b32 %r3, %r1, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-NEXT: ret;
%shl = shl i32 %x, %y
%sub = sub i32 32, %y
@@ -59,16 +56,13 @@ define i32 @test_rotl_var(i32 %x, i32 %y) {
define i32 @test_rotr_var(i32 %x, i32 %y) {
; CHECK-LABEL: test_rotr_var(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test_rotr_var_param_0];
; CHECK-NEXT: ld.param.u32 %r2, [test_rotr_var_param_1];
-; CHECK-NEXT: shr.u32 %r3, %r1, %r2;
-; CHECK-NEXT: sub.s32 %r4, 32, %r2;
-; CHECK-NEXT: shl.b32 %r5, %r1, %r4;
-; CHECK-NEXT: add.s32 %r6, %r3, %r5;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT: shf.r.wrap.b32 %r3, %r1, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-NEXT: ret;
%shr = lshr i32 %x, %y
%sub = sub i32 32, %y
@@ -126,18 +120,14 @@ define i32 @test_rotr_var_and(i32 %x, i32 %y) {
define i32 @test_fshl_special_case(i32 %x0, i32 %x1, i32 %y) {
; CHECK-LABEL: test_fshl_special_case(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test_fshl_special_case_param_0];
-; CHECK-NEXT: ld.param.u32 %r2, [test_fshl_special_case_param_2];
-; CHECK-NEXT: shl.b32 %r3, %r1, %r2;
-; CHECK-NEXT: ld.param.u32 %r4, [test_fshl_special_case_param_1];
-; CHECK-NEXT: shr.u32 %r5, %r4, 1;
-; CHECK-NEXT: xor.b32 %r6, %r2, 31;
-; CHECK-NEXT: shr.u32 %r7, %r5, %r6;
-; CHECK-NEXT: add.s32 %r8, %r3, %r7;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT: ld.param.u32 %r2, [test_fshl_special_case_param_1];
+; CHECK-NEXT: ld.param.u32 %r3, [test_fshl_special_case_param_2];
+; CHECK-NEXT: shf.l.wrap.b32 %r4, %r2, %r1, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
; CHECK-NEXT: ret;
%shl = shl i32 %x0, %y
%srli = lshr i32 %x1, 1
@@ -150,18 +140,14 @@ define i32 @test_fshl_special_case(i32 %x0, i32 %x1, i32 %y) {
define i32 @test_fshr_special_case(i32 %x0, i32 %x1, i32 %y) {
; CHECK-LABEL: test_fshr_special_case(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test_fshr_special_case_param_0];
; CHECK-NEXT: ld.param.u32 %r2, [test_fshr_special_case_param_1];
; CHECK-NEXT: ld.param.u32 %r3, [test_fshr_special_case_param_2];
-; CHECK-NEXT: shr.u32 %r4, %r2, %r3;
-; CHECK-NEXT: shl.b32 %r5, %r1, 1;
-; CHECK-NEXT: xor.b32 %r6, %r3, 31;
-; CHECK-NEXT: shl.b32 %r7, %r5, %r6;
-; CHECK-NEXT: add.s32 %r8, %r4, %r7;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT: shf.r.wrap.b32 %r4, %r2, %r1, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
; CHECK-NEXT: ret;
%shl = lshr i32 %x1, %y
%srli = shl i32 %x0, 1
>From cfc06a6b046f5a28b1080df7fc0299a238cbea55 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Fri, 14 Feb 2025 01:44:45 +0000
Subject: [PATCH 3/4] address comments
---
llvm/test/CodeGen/NVPTX/add-rotate.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/NVPTX/add-rotate.ll b/llvm/test/CodeGen/NVPTX/add-rotate.ll
index b25d4a2bd8001..091ba7bcba32a 100644
--- a/llvm/test/CodeGen/NVPTX/add-rotate.ll
+++ b/llvm/test/CodeGen/NVPTX/add-rotate.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_50 | FileCheck %s
+; RUN: llc < %s -mcpu=sm_50 | FileCheck %s
target triple = "nvptx64-nvidia-cuda"
>From 982d35bcf7091fd26719919eaee87f9f5625a405 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Wed, 19 Feb 2025 21:01:25 +0000
Subject: [PATCH 4/4] address review comments
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 30 +++++++++----------
1 file changed, 15 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 793a174ebb1ea..137047f3521e1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8479,12 +8479,12 @@ SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
SDValue InnerNeg, bool FromAdd,
bool HasPos, unsigned PosOpcode,
unsigned NegOpcode, const SDLoc &DL) {
- // fold (or (shl x, (*ext y)),
- // (srl x, (*ext (sub 32, y)))) ->
+ // fold (or/add (shl x, (*ext y)),
+ // (srl x, (*ext (sub 32, y)))) ->
// (rotl x, y) or (rotr x, (sub 32, y))
//
- // fold (or (shl x, (*ext (sub 32, y))),
- // (srl x, (*ext y))) ->
+ // fold (or/add (shl x, (*ext (sub 32, y))),
+ // (srl x, (*ext y))) ->
// (rotr x, y) or (rotl x, (sub 32, y))
EVT VT = Shifted.getValueType();
if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG,
@@ -8509,12 +8509,12 @@ SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
EVT VT = N0.getValueType();
unsigned EltBits = VT.getScalarSizeInBits();
- // fold (or (shl x0, (*ext y)),
- // (srl x1, (*ext (sub 32, y)))) ->
+ // fold (or/add (shl x0, (*ext y)),
+ // (srl x1, (*ext (sub 32, y)))) ->
// (fshl x0, x1, y) or (fshr x0, x1, (sub 32, y))
//
- // fold (or (shl x0, (*ext (sub 32, y))),
- // (srl x1, (*ext y))) ->
+ // fold (or/add (shl x0, (*ext (sub 32, y))),
+ // (srl x1, (*ext y))) ->
// (fshr x0, x1, y) or (fshl x0, x1, (sub 32, y))
if (matchRotateSub(InnerPos, InnerNeg, EltBits, DAG, /*IsRotate*/ N0 == N1,
FromAdd))
@@ -8532,7 +8532,7 @@ SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
return Cst && (Cst->getAPIntValue() == Imm);
};
- // fold (or (shl x0, y), (srl (srl x1, 1), (xor y, 31)))
+ // fold (or/add (shl x0, y), (srl (srl x1, 1), (xor y, 31)))
// -> (fshl x0, x1, y)
if (IsBinOpImm(N1, ISD::SRL, 1) &&
IsBinOpImm(InnerNeg, ISD::XOR, EltBits - 1) &&
@@ -8541,7 +8541,7 @@ SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
return DAG.getNode(ISD::FSHL, DL, VT, N0, N1.getOperand(0), Pos);
}
- // fold (or (shl (shl x0, 1), (xor y, 31)), (srl x1, y))
+ // fold (or/add (shl (shl x0, 1), (xor y, 31)), (srl x1, y))
// -> (fshr x0, x1, y)
if (IsBinOpImm(N0, ISD::SHL, 1) &&
IsBinOpImm(InnerPos, ISD::XOR, EltBits - 1) &&
@@ -8550,7 +8550,7 @@ SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
return DAG.getNode(ISD::FSHR, DL, VT, N0.getOperand(0), N1, Neg);
}
- // fold (or (shl (add x0, x0), (xor y, 31)), (srl x1, y))
+ // fold (or/add (shl (add x0, x0), (xor y, 31)), (srl x1, y))
// -> (fshr x0, x1, y)
// TODO: Should add(x,x) -> shl(x,1) be a general DAG canonicalization?
if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N0.getOperand(1) &&
@@ -8732,10 +8732,10 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL,
return SDValue(); // Requires funnel shift support.
}
- // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1)
- // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2)
- // fold (or (shl x, C1), (srl y, C2)) -> (fshl x, y, C1)
- // fold (or (shl x, C1), (srl y, C2)) -> (fshr x, y, C2)
+ // fold (or/add (shl x, C1), (srl x, C2)) -> (rotl x, C1)
+ // fold (or/add (shl x, C1), (srl x, C2)) -> (rotr x, C2)
+ // fold (or/add (shl x, C1), (srl y, C2)) -> (fshl x, y, C1)
+ // fold (or/add (shl x, C1), (srl y, C2)) -> (fshr x, y, C2)
// iff C1+C2 == EltSizeInBits
if (ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) {
SDValue Res;
More information about the llvm-commits
mailing list