[llvm] [DAGCombiner] Attempt to fold 'add' nodes to funnel-shift or rotate (PR #125612)
Alex MacLean via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 19 13:01:40 PST 2025
https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/125612
>From 605ac34aae8677fb09885e455fd6765c1dae31a7 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Tue, 4 Feb 2025 17:39:18 +0000
Subject: [PATCH 1/4] pre-commit tests
---
llvm/test/CodeGen/NVPTX/add-rotate.ll | 172 ++++++++++++++++++++++++++
1 file changed, 172 insertions(+)
create mode 100644 llvm/test/CodeGen/NVPTX/add-rotate.ll
diff --git a/llvm/test/CodeGen/NVPTX/add-rotate.ll b/llvm/test/CodeGen/NVPTX/add-rotate.ll
new file mode 100644
index 0000000000000..b7430f284ddcb
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/add-rotate.ll
@@ -0,0 +1,172 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_50 | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @test_rotl(i32 %x) {
+; CHECK-LABEL: test_rotl(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_rotl_param_0];
+; CHECK-NEXT: shf.l.wrap.b32 %r2, %r1, %r1, 7;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %shl = shl i32 %x, 7
+ %shr = lshr i32 %x, 25
+ %add = add i32 %shl, %shr
+ ret i32 %add
+}
+
+define i32 @test_rotr(i32 %x) {
+; CHECK-LABEL: test_rotr(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_rotr_param_0];
+; CHECK-NEXT: shf.l.wrap.b32 %r2, %r1, %r1, 25;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %shr = lshr i32 %x, 7
+ %shl = shl i32 %x, 25
+ %add = add i32 %shr, %shl
+ ret i32 %add
+}
+
+define i32 @test_rotl_var(i32 %x, i32 %y) {
+; CHECK-LABEL: test_rotl_var(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_rotl_var_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [test_rotl_var_param_1];
+; CHECK-NEXT: shl.b32 %r3, %r1, %r2;
+; CHECK-NEXT: sub.s32 %r4, 32, %r2;
+; CHECK-NEXT: shr.u32 %r5, %r1, %r4;
+; CHECK-NEXT: add.s32 %r6, %r3, %r5;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT: ret;
+ %shl = shl i32 %x, %y
+ %sub = sub i32 32, %y
+ %shr = lshr i32 %x, %sub
+ %add = add i32 %shl, %shr
+ ret i32 %add
+}
+
+define i32 @test_rotr_var(i32 %x, i32 %y) {
+; CHECK-LABEL: test_rotr_var(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_rotr_var_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [test_rotr_var_param_1];
+; CHECK-NEXT: shr.u32 %r3, %r1, %r2;
+; CHECK-NEXT: sub.s32 %r4, 32, %r2;
+; CHECK-NEXT: shl.b32 %r5, %r1, %r4;
+; CHECK-NEXT: add.s32 %r6, %r3, %r5;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT: ret;
+ %shr = lshr i32 %x, %y
+ %sub = sub i32 32, %y
+ %shl = shl i32 %x, %sub
+ %add = add i32 %shr, %shl
+ ret i32 %add
+}
+
+define i32 @test_rotl_var_and(i32 %x, i32 %y) {
+; CHECK-LABEL: test_rotl_var_and(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_rotl_var_and_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [test_rotl_var_and_param_1];
+; CHECK-NEXT: shl.b32 %r3, %r1, %r2;
+; CHECK-NEXT: neg.s32 %r4, %r2;
+; CHECK-NEXT: and.b32 %r5, %r4, 31;
+; CHECK-NEXT: shr.u32 %r6, %r1, %r5;
+; CHECK-NEXT: add.s32 %r7, %r6, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r7;
+; CHECK-NEXT: ret;
+ %shr = shl i32 %x, %y
+ %sub = sub nsw i32 0, %y
+ %and = and i32 %sub, 31
+ %shl = lshr i32 %x, %and
+ %add = add i32 %shl, %shr
+ ret i32 %add
+}
+
+define i32 @test_rotr_var_and(i32 %x, i32 %y) {
+; CHECK-LABEL: test_rotr_var_and(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_rotr_var_and_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [test_rotr_var_and_param_1];
+; CHECK-NEXT: shr.u32 %r3, %r1, %r2;
+; CHECK-NEXT: neg.s32 %r4, %r2;
+; CHECK-NEXT: and.b32 %r5, %r4, 31;
+; CHECK-NEXT: shl.b32 %r6, %r1, %r5;
+; CHECK-NEXT: add.s32 %r7, %r3, %r6;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r7;
+; CHECK-NEXT: ret;
+ %shr = lshr i32 %x, %y
+ %sub = sub nsw i32 0, %y
+ %and = and i32 %sub, 31
+ %shl = shl i32 %x, %and
+ %add = add i32 %shr, %shl
+ ret i32 %add
+}
+
+define i32 @test_fshl_special_case(i32 %x0, i32 %x1, i32 %y) {
+; CHECK-LABEL: test_fshl_special_case(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_fshl_special_case_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [test_fshl_special_case_param_2];
+; CHECK-NEXT: shl.b32 %r3, %r1, %r2;
+; CHECK-NEXT: ld.param.u32 %r4, [test_fshl_special_case_param_1];
+; CHECK-NEXT: shr.u32 %r5, %r4, 1;
+; CHECK-NEXT: xor.b32 %r6, %r2, 31;
+; CHECK-NEXT: shr.u32 %r7, %r5, %r6;
+; CHECK-NEXT: add.s32 %r8, %r3, %r7;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT: ret;
+ %shl = shl i32 %x0, %y
+ %srli = lshr i32 %x1, 1
+ %x = xor i32 %y, 31
+ %srlo = lshr i32 %srli, %x
+ %o = add i32 %shl, %srlo
+ ret i32 %o
+}
+
+define i32 @test_fshr_special_case(i32 %x0, i32 %x1, i32 %y) {
+; CHECK-LABEL: test_fshr_special_case(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_fshr_special_case_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [test_fshr_special_case_param_1];
+; CHECK-NEXT: ld.param.u32 %r3, [test_fshr_special_case_param_2];
+; CHECK-NEXT: shr.u32 %r4, %r2, %r3;
+; CHECK-NEXT: shl.b32 %r5, %r1, 1;
+; CHECK-NEXT: xor.b32 %r6, %r3, 31;
+; CHECK-NEXT: shl.b32 %r7, %r5, %r6;
+; CHECK-NEXT: add.s32 %r8, %r4, %r7;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT: ret;
+ %shl = lshr i32 %x1, %y
+ %srli = shl i32 %x0, 1
+ %x = xor i32 %y, 31
+ %srlo = shl i32 %srli, %x
+ %o = add i32 %shl, %srlo
+ ret i32 %o
+}
>From e128108a9f4fc1b85b52f7a79138f1ae62b092c8 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Tue, 4 Feb 2025 17:39:29 +0000
Subject: [PATCH 2/4] [DAGCombiner] Attempt to fold 'add' nodes to funnel-shift
or rotate
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 86 +++++++++----------
llvm/test/CodeGen/NVPTX/add-rotate.ll | 42 +++------
2 files changed, 57 insertions(+), 71 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f4caaf426de6a..793a174ebb1ea 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -662,14 +662,15 @@ namespace {
bool DemandHighBits = true);
SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
SDValue MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg,
- SDValue InnerPos, SDValue InnerNeg, bool HasPos,
- unsigned PosOpcode, unsigned NegOpcode,
- const SDLoc &DL);
+ SDValue InnerPos, SDValue InnerNeg, bool FromAdd,
+ bool HasPos, unsigned PosOpcode,
+ unsigned NegOpcode, const SDLoc &DL);
SDValue MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, SDValue Neg,
- SDValue InnerPos, SDValue InnerNeg, bool HasPos,
- unsigned PosOpcode, unsigned NegOpcode,
- const SDLoc &DL);
- SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
+ SDValue InnerPos, SDValue InnerNeg, bool FromAdd,
+ bool HasPos, unsigned PosOpcode,
+ unsigned NegOpcode, const SDLoc &DL);
+ SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL,
+ bool FromAdd);
SDValue MatchLoadCombine(SDNode *N);
SDValue mergeTruncStores(StoreSDNode *N);
SDValue reduceLoadWidth(SDNode *N);
@@ -2992,6 +2993,9 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
if (SDValue V = foldAddSubOfSignBit(N, DL, DAG))
return V;
+ if (SDValue V = MatchRotate(N0, N1, SDLoc(N), /*FromAdd=*/true))
+ return V;
+
// Try to match AVGFLOOR fixedwidth pattern
if (SDValue V = foldAddToAvg(N, DL))
return V;
@@ -8161,7 +8165,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
return V;
// See if this is some rotate idiom.
- if (SDValue Rot = MatchRotate(N0, N1, DL))
+ if (SDValue Rot = MatchRotate(N0, N1, DL, /*FromAdd=*/false))
return Rot;
if (SDValue Load = MatchLoadCombine(N))
@@ -8350,7 +8354,7 @@ static SDValue extractShiftForRotate(SelectionDAG &DAG, SDValue OppShift,
// The IsRotate flag should be set when the LHS of both shifts is the same.
// Otherwise if matching a general funnel shift, it should be clear.
static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize,
- SelectionDAG &DAG, bool IsRotate) {
+ SelectionDAG &DAG, bool IsRotate, bool FromAdd) {
const auto &TLI = DAG.getTargetLoweringInfo();
// If EltSize is a power of 2 then:
//
@@ -8389,7 +8393,7 @@ static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize,
// NOTE: We can only do this when matching operations which won't modify the
// least Log2(EltSize) significant bits and not a general funnel shift.
unsigned MaskLoBits = 0;
- if (IsRotate && isPowerOf2_64(EltSize)) {
+ if (IsRotate && !FromAdd && isPowerOf2_64(EltSize)) {
unsigned Bits = Log2_64(EltSize);
unsigned NegBits = Neg.getScalarValueSizeInBits();
if (NegBits >= Bits) {
@@ -8472,9 +8476,9 @@ static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize,
// Neg with outer conversions stripped away.
SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
SDValue Neg, SDValue InnerPos,
- SDValue InnerNeg, bool HasPos,
- unsigned PosOpcode, unsigned NegOpcode,
- const SDLoc &DL) {
+ SDValue InnerNeg, bool FromAdd,
+ bool HasPos, unsigned PosOpcode,
+ unsigned NegOpcode, const SDLoc &DL) {
// fold (or (shl x, (*ext y)),
// (srl x, (*ext (sub 32, y)))) ->
// (rotl x, y) or (rotr x, (sub 32, y))
@@ -8484,10 +8488,9 @@ SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
// (rotr x, y) or (rotl x, (sub 32, y))
EVT VT = Shifted.getValueType();
if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG,
- /*IsRotate*/ true)) {
+ /*IsRotate*/ true, FromAdd))
return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted,
HasPos ? Pos : Neg);
- }
return SDValue();
}
@@ -8500,9 +8503,9 @@ SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
// TODO: Merge with MatchRotatePosNeg.
SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
SDValue Neg, SDValue InnerPos,
- SDValue InnerNeg, bool HasPos,
- unsigned PosOpcode, unsigned NegOpcode,
- const SDLoc &DL) {
+ SDValue InnerNeg, bool FromAdd,
+ bool HasPos, unsigned PosOpcode,
+ unsigned NegOpcode, const SDLoc &DL) {
EVT VT = N0.getValueType();
unsigned EltBits = VT.getScalarSizeInBits();
@@ -8513,10 +8516,10 @@ SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
// fold (or (shl x0, (*ext (sub 32, y))),
// (srl x1, (*ext y))) ->
// (fshr x0, x1, y) or (fshl x0, x1, (sub 32, y))
- if (matchRotateSub(InnerPos, InnerNeg, EltBits, DAG, /*IsRotate*/ N0 == N1)) {
+ if (matchRotateSub(InnerPos, InnerNeg, EltBits, DAG, /*IsRotate*/ N0 == N1,
+ FromAdd))
return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, N0, N1,
HasPos ? Pos : Neg);
- }
// Matching the shift+xor cases, we can't easily use the xor'd shift amount
// so for now just use the PosOpcode case if its legal.
@@ -8561,11 +8564,12 @@ SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
return SDValue();
}
-// MatchRotate - Handle an 'or' of two operands. If this is one of the many
-// idioms for rotate, and if the target supports rotation instructions, generate
-// a rot[lr]. This also matches funnel shift patterns, similar to rotation but
-// with different shifted sources.
-SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
+// MatchRotate - Handle an 'or' or 'add' of two operands. If this is one of the
+// many idioms for rotate, and if the target supports rotation instructions,
+// generate a rot[lr]. This also matches funnel shift patterns, similar to
+// rotation but with different shifted sources.
+SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL,
+ bool FromAdd) {
EVT VT = LHS.getValueType();
// The target must have at least one rotate/funnel flavor.
@@ -8592,9 +8596,9 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
if (LHS.getOpcode() == ISD::TRUNCATE && RHS.getOpcode() == ISD::TRUNCATE &&
LHS.getOperand(0).getValueType() == RHS.getOperand(0).getValueType()) {
assert(LHS.getValueType() == RHS.getValueType());
- if (SDValue Rot = MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL)) {
+ if (SDValue Rot =
+ MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL, FromAdd))
return DAG.getNode(ISD::TRUNCATE, SDLoc(LHS), LHS.getValueType(), Rot);
- }
}
// Match "(X shl/srl V1) & V2" where V2 may not be present.
@@ -8774,29 +8778,25 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
}
if (IsRotate && (HasROTL || HasROTR)) {
- SDValue TryL =
- MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt, LExtOp0,
- RExtOp0, HasROTL, ISD::ROTL, ISD::ROTR, DL);
- if (TryL)
+ if (SDValue TryL = MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt,
+ LExtOp0, RExtOp0, FromAdd, HasROTL,
+ ISD::ROTL, ISD::ROTR, DL))
return TryL;
- SDValue TryR =
- MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt, RExtOp0,
- LExtOp0, HasROTR, ISD::ROTR, ISD::ROTL, DL);
- if (TryR)
+ if (SDValue TryR = MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
+ RExtOp0, LExtOp0, FromAdd, HasROTR,
+ ISD::ROTR, ISD::ROTL, DL))
return TryR;
}
- SDValue TryL =
- MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, LHSShiftAmt, RHSShiftAmt,
- LExtOp0, RExtOp0, HasFSHL, ISD::FSHL, ISD::FSHR, DL);
- if (TryL)
+ if (SDValue TryL = MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, LHSShiftAmt,
+ RHSShiftAmt, LExtOp0, RExtOp0, FromAdd,
+ HasFSHL, ISD::FSHL, ISD::FSHR, DL))
return TryL;
- SDValue TryR =
- MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
- RExtOp0, LExtOp0, HasFSHR, ISD::FSHR, ISD::FSHL, DL);
- if (TryR)
+ if (SDValue TryR = MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, RHSShiftAmt,
+ LHSShiftAmt, RExtOp0, LExtOp0, FromAdd,
+ HasFSHR, ISD::FSHR, ISD::FSHL, DL))
return TryR;
return SDValue();
diff --git a/llvm/test/CodeGen/NVPTX/add-rotate.ll b/llvm/test/CodeGen/NVPTX/add-rotate.ll
index b7430f284ddcb..b25d4a2bd8001 100644
--- a/llvm/test/CodeGen/NVPTX/add-rotate.ll
+++ b/llvm/test/CodeGen/NVPTX/add-rotate.ll
@@ -38,16 +38,13 @@ define i32 @test_rotr(i32 %x) {
define i32 @test_rotl_var(i32 %x, i32 %y) {
; CHECK-LABEL: test_rotl_var(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test_rotl_var_param_0];
; CHECK-NEXT: ld.param.u32 %r2, [test_rotl_var_param_1];
-; CHECK-NEXT: shl.b32 %r3, %r1, %r2;
-; CHECK-NEXT: sub.s32 %r4, 32, %r2;
-; CHECK-NEXT: shr.u32 %r5, %r1, %r4;
-; CHECK-NEXT: add.s32 %r6, %r3, %r5;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT: shf.l.wrap.b32 %r3, %r1, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-NEXT: ret;
%shl = shl i32 %x, %y
%sub = sub i32 32, %y
@@ -59,16 +56,13 @@ define i32 @test_rotl_var(i32 %x, i32 %y) {
define i32 @test_rotr_var(i32 %x, i32 %y) {
; CHECK-LABEL: test_rotr_var(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test_rotr_var_param_0];
; CHECK-NEXT: ld.param.u32 %r2, [test_rotr_var_param_1];
-; CHECK-NEXT: shr.u32 %r3, %r1, %r2;
-; CHECK-NEXT: sub.s32 %r4, 32, %r2;
-; CHECK-NEXT: shl.b32 %r5, %r1, %r4;
-; CHECK-NEXT: add.s32 %r6, %r3, %r5;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT: shf.r.wrap.b32 %r3, %r1, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-NEXT: ret;
%shr = lshr i32 %x, %y
%sub = sub i32 32, %y
@@ -126,18 +120,14 @@ define i32 @test_rotr_var_and(i32 %x, i32 %y) {
define i32 @test_fshl_special_case(i32 %x0, i32 %x1, i32 %y) {
; CHECK-LABEL: test_fshl_special_case(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test_fshl_special_case_param_0];
-; CHECK-NEXT: ld.param.u32 %r2, [test_fshl_special_case_param_2];
-; CHECK-NEXT: shl.b32 %r3, %r1, %r2;
-; CHECK-NEXT: ld.param.u32 %r4, [test_fshl_special_case_param_1];
-; CHECK-NEXT: shr.u32 %r5, %r4, 1;
-; CHECK-NEXT: xor.b32 %r6, %r2, 31;
-; CHECK-NEXT: shr.u32 %r7, %r5, %r6;
-; CHECK-NEXT: add.s32 %r8, %r3, %r7;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT: ld.param.u32 %r2, [test_fshl_special_case_param_1];
+; CHECK-NEXT: ld.param.u32 %r3, [test_fshl_special_case_param_2];
+; CHECK-NEXT: shf.l.wrap.b32 %r4, %r2, %r1, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
; CHECK-NEXT: ret;
%shl = shl i32 %x0, %y
%srli = lshr i32 %x1, 1
@@ -150,18 +140,14 @@ define i32 @test_fshl_special_case(i32 %x0, i32 %x1, i32 %y) {
define i32 @test_fshr_special_case(i32 %x0, i32 %x1, i32 %y) {
; CHECK-LABEL: test_fshr_special_case(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test_fshr_special_case_param_0];
; CHECK-NEXT: ld.param.u32 %r2, [test_fshr_special_case_param_1];
; CHECK-NEXT: ld.param.u32 %r3, [test_fshr_special_case_param_2];
-; CHECK-NEXT: shr.u32 %r4, %r2, %r3;
-; CHECK-NEXT: shl.b32 %r5, %r1, 1;
-; CHECK-NEXT: xor.b32 %r6, %r3, 31;
-; CHECK-NEXT: shl.b32 %r7, %r5, %r6;
-; CHECK-NEXT: add.s32 %r8, %r4, %r7;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT: shf.r.wrap.b32 %r4, %r2, %r1, %r3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
; CHECK-NEXT: ret;
%shl = lshr i32 %x1, %y
%srli = shl i32 %x0, 1
>From cfc06a6b046f5a28b1080df7fc0299a238cbea55 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Fri, 14 Feb 2025 01:44:45 +0000
Subject: [PATCH 3/4] address comments
---
llvm/test/CodeGen/NVPTX/add-rotate.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/NVPTX/add-rotate.ll b/llvm/test/CodeGen/NVPTX/add-rotate.ll
index b25d4a2bd8001..091ba7bcba32a 100644
--- a/llvm/test/CodeGen/NVPTX/add-rotate.ll
+++ b/llvm/test/CodeGen/NVPTX/add-rotate.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_50 | FileCheck %s
+; RUN: llc < %s -mcpu=sm_50 | FileCheck %s
target triple = "nvptx64-nvidia-cuda"
>From 982d35bcf7091fd26719919eaee87f9f5625a405 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Wed, 19 Feb 2025 21:01:25 +0000
Subject: [PATCH 4/4] address review comments
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 30 +++++++++----------
1 file changed, 15 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 793a174ebb1ea..137047f3521e1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8479,12 +8479,12 @@ SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
SDValue InnerNeg, bool FromAdd,
bool HasPos, unsigned PosOpcode,
unsigned NegOpcode, const SDLoc &DL) {
- // fold (or (shl x, (*ext y)),
- // (srl x, (*ext (sub 32, y)))) ->
+ // fold (or/add (shl x, (*ext y)),
+ // (srl x, (*ext (sub 32, y)))) ->
// (rotl x, y) or (rotr x, (sub 32, y))
//
- // fold (or (shl x, (*ext (sub 32, y))),
- // (srl x, (*ext y))) ->
+ // fold (or/add (shl x, (*ext (sub 32, y))),
+ // (srl x, (*ext y))) ->
// (rotr x, y) or (rotl x, (sub 32, y))
EVT VT = Shifted.getValueType();
if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG,
@@ -8509,12 +8509,12 @@ SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
EVT VT = N0.getValueType();
unsigned EltBits = VT.getScalarSizeInBits();
- // fold (or (shl x0, (*ext y)),
- // (srl x1, (*ext (sub 32, y)))) ->
+ // fold (or/add (shl x0, (*ext y)),
+ // (srl x1, (*ext (sub 32, y)))) ->
// (fshl x0, x1, y) or (fshr x0, x1, (sub 32, y))
//
- // fold (or (shl x0, (*ext (sub 32, y))),
- // (srl x1, (*ext y))) ->
+ // fold (or/add (shl x0, (*ext (sub 32, y))),
+ // (srl x1, (*ext y))) ->
// (fshr x0, x1, y) or (fshl x0, x1, (sub 32, y))
if (matchRotateSub(InnerPos, InnerNeg, EltBits, DAG, /*IsRotate*/ N0 == N1,
FromAdd))
@@ -8532,7 +8532,7 @@ SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
return Cst && (Cst->getAPIntValue() == Imm);
};
- // fold (or (shl x0, y), (srl (srl x1, 1), (xor y, 31)))
+ // fold (or/add (shl x0, y), (srl (srl x1, 1), (xor y, 31)))
// -> (fshl x0, x1, y)
if (IsBinOpImm(N1, ISD::SRL, 1) &&
IsBinOpImm(InnerNeg, ISD::XOR, EltBits - 1) &&
@@ -8541,7 +8541,7 @@ SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
return DAG.getNode(ISD::FSHL, DL, VT, N0, N1.getOperand(0), Pos);
}
- // fold (or (shl (shl x0, 1), (xor y, 31)), (srl x1, y))
+ // fold (or/add (shl (shl x0, 1), (xor y, 31)), (srl x1, y))
// -> (fshr x0, x1, y)
if (IsBinOpImm(N0, ISD::SHL, 1) &&
IsBinOpImm(InnerPos, ISD::XOR, EltBits - 1) &&
@@ -8550,7 +8550,7 @@ SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
return DAG.getNode(ISD::FSHR, DL, VT, N0.getOperand(0), N1, Neg);
}
- // fold (or (shl (add x0, x0), (xor y, 31)), (srl x1, y))
+ // fold (or/add (shl (add x0, x0), (xor y, 31)), (srl x1, y))
// -> (fshr x0, x1, y)
// TODO: Should add(x,x) -> shl(x,1) be a general DAG canonicalization?
if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N0.getOperand(1) &&
@@ -8732,10 +8732,10 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL,
return SDValue(); // Requires funnel shift support.
}
- // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1)
- // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2)
- // fold (or (shl x, C1), (srl y, C2)) -> (fshl x, y, C1)
- // fold (or (shl x, C1), (srl y, C2)) -> (fshr x, y, C2)
+ // fold (or/add (shl x, C1), (srl x, C2)) -> (rotl x, C1)
+ // fold (or/add (shl x, C1), (srl x, C2)) -> (rotr x, C2)
+ // fold (or/add (shl x, C1), (srl y, C2)) -> (fshl x, y, C1)
+ // fold (or/add (shl x, C1), (srl y, C2)) -> (fshr x, y, C2)
// iff C1+C2 == EltSizeInBits
if (ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) {
SDValue Res;
More information about the llvm-commits
mailing list