[llvm] [NVPTX] Improve 64bit FSH/ROT lowering when shift amount is constant (PR #131371)
Alex MacLean via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 14 13:07:43 PDT 2025
https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/131371
>From 57b9c8108136867222211974e3d2acb17b721a10 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Fri, 14 Mar 2025 17:08:55 +0000
Subject: [PATCH] [NVPTX] Improve 64-bit FSH/ROT lowering when the shift
ammount is a constant (note: "ammount" should read "amount")
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 14 +
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 1 +
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 61 +++-
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 2 +
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 6 +
llvm/test/CodeGen/NVPTX/rotate.ll | 369 ++++++++++++++++++--
llvm/test/CodeGen/NVPTX/rotate_64.ll | 26 +-
7 files changed, 432 insertions(+), 47 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 4ce8c508c5f2b..f2757c5e49b33 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -113,6 +113,9 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
if (tryFence(N))
return;
break;
+ case NVPTXISD::UNPACK_VECTOR:
+ tryUNPACK_VECTOR(N);
+ return;
case ISD::EXTRACT_VECTOR_ELT:
if (tryEXTRACT_VECTOR_ELEMENT(N))
return;
@@ -445,6 +448,17 @@ bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
return true;
}
+bool NVPTXDAGToDAGISel::tryUNPACK_VECTOR(SDNode *N) {
+ SDValue Vector = N->getOperand(0);
+ MVT EltVT = N->getSimpleValueType(0);
+
+ MachineSDNode *N2 =
+ CurDAG->getMachineNode(NVPTX::I64toV2I32, SDLoc(N), EltVT, EltVT, Vector);
+
+ ReplaceNode(N, N2);
+ return true;
+}
+
// Find all instances of extract_vector_elt that use this v2f16 vector
// and coalesce them into a scattering move instruction.
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 42891b8ca8d8d..23cbd458571a0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -88,6 +88,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
bool tryConstantFP(SDNode *N);
bool SelectSETP_F16X2(SDNode *N);
bool SelectSETP_BF16X2(SDNode *N);
+ bool tryUNPACK_VECTOR(SDNode *N);
bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
void SelectV2I64toI128(SDNode *N);
void SelectI128toV2I64(SDNode *N);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index b768725b04256..d44ba72ff98c9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -66,6 +66,7 @@
#include <iterator>
#include <optional>
#include <string>
+#include <tuple>
#include <utility>
#include <vector>
@@ -668,8 +669,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
{MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64},
Expand);
- if (STI.hasHWROT32())
+ if (STI.hasHWROT32()) {
setOperationAction({ISD::FSHL, ISD::FSHR}, MVT::i32, Legal);
+ setOperationAction({ISD::ROTL, ISD::ROTR, ISD::FSHL, ISD::FSHR}, MVT::i64,
+ Custom);
+ }
setOperationAction(ISD::BSWAP, MVT::i16, Expand);
@@ -1056,6 +1060,8 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(NVPTXISD::StoreRetvalV2)
MAKE_CASE(NVPTXISD::StoreRetvalV4)
MAKE_CASE(NVPTXISD::PseudoUseParam)
+ MAKE_CASE(NVPTXISD::UNPACK_VECTOR)
+ MAKE_CASE(NVPTXISD::BUILD_VECTOR)
MAKE_CASE(NVPTXISD::RETURN)
MAKE_CASE(NVPTXISD::CallSeqBegin)
MAKE_CASE(NVPTXISD::CallSeqEnd)
@@ -2758,6 +2764,53 @@ static SDValue lowerCTLZCTPOP(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT, SDNodeFlags::NonNeg);
}
+static SDValue expandFSH64(SDValue A, SDValue B, SDValue AmtVal, SDLoc DL,
+ unsigned Opcode, SelectionDAG &DAG) {
+ assert(A.getValueType() == MVT::i64 && B.getValueType() == MVT::i64);
+
+ const auto *AmtConst = dyn_cast<ConstantSDNode>(AmtVal);
+ if (!AmtConst)
+ return SDValue();
+ const auto Amt = AmtConst->getZExtValue() & 63;
+
+ SDValue UnpackA =
+ DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, A);
+ SDValue UnpackB =
+ DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, B);
+
+ // Arch is little endian: 0 = low bits, 1 = high bits
+ SDValue ALo = UnpackA.getValue(0);
+ SDValue AHi = UnpackA.getValue(1);
+ SDValue BLo = UnpackB.getValue(0);
+ SDValue BHi = UnpackB.getValue(1);
+
+ // The bitfield consists of { AHi : ALo : BHi : BLo }
+ // FSHL, Amt < 32 - The window will contain { AHi : ALo : BHi }
+ // FSHL, Amt >= 32 - The window will contain { ALo : BHi : BLo }
+ // FSHR, Amt < 32 - The window will contain { ALo : BHi : BLo }
+ // FSHR, Amt >= 32 - The window will contain { AHi : ALo : BHi }
+ auto [High, Mid, Low] = ((Opcode == ISD::FSHL) == (Amt < 32))
+ ? std::make_tuple(AHi, ALo, BHi)
+ : std::make_tuple(ALo, BHi, BLo);
+
+ SDValue NewAmt = DAG.getConstant(Amt & 31, DL, MVT::i32);
+ SDValue RHi = DAG.getNode(Opcode, DL, MVT::i32, {High, Mid, NewAmt});
+ SDValue RLo = DAG.getNode(Opcode, DL, MVT::i32, {Mid, Low, NewAmt});
+
+ return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64, {RLo, RHi});
+}
+
+static SDValue lowerFSH(SDValue Op, SelectionDAG &DAG) {
+ return expandFSH64(Op->getOperand(0), Op->getOperand(1), Op->getOperand(2),
+ SDLoc(Op), Op->getOpcode(), DAG);
+}
+
+static SDValue lowerROT(SDValue Op, SelectionDAG &DAG) {
+ unsigned Opcode = Op->getOpcode() == ISD::ROTL ? ISD::FSHL : ISD::FSHR;
+ return expandFSH64(Op->getOperand(0), Op->getOperand(0), Op->getOperand(1),
+ SDLoc(Op), Opcode, DAG);
+}
+
SDValue
NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -2818,6 +2871,12 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerVAARG(Op, DAG);
case ISD::VASTART:
return LowerVASTART(Op, DAG);
+ case ISD::FSHL:
+ case ISD::FSHR:
+ return lowerFSH(Op, DAG);
+ case ISD::ROTL:
+ case ISD::ROTR:
+ return lowerROT(Op, DAG);
case ISD::ABS:
case ISD::SMIN:
case ISD::SMAX:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index ff0241886223b..152fe253eeed9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -61,6 +61,8 @@ enum NodeType : unsigned {
BFE,
BFI,
PRMT,
+ UNPACK_VECTOR,
+ BUILD_VECTOR,
FCOPYSIGN,
DYNAMIC_STACKALLOC,
STACKRESTORE,
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 3c88551d7b23c..83509b1078c57 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3222,6 +3222,12 @@ def : Pat<(v2i16 (build_vector i16:$a, i16:$b)),
def: Pat<(v2i16 (scalar_to_vector i16:$a)),
(CVT_u32_u16 $a, CvtNONE)>;
+
+def nvptx_build_vector : SDNode<"NVPTXISD::BUILD_VECTOR", SDTypeProfile<1, 2, []>, []>;
+
+def : Pat<(i64 (nvptx_build_vector i32:$a, i32:$b)),
+ (V2I32toI64 $a, $b)>;
+
//
// Funnel-Shift
//
diff --git a/llvm/test/CodeGen/NVPTX/rotate.ll b/llvm/test/CodeGen/NVPTX/rotate.ll
index 4174fd2f3ec2c..f77fb4115567b 100644
--- a/llvm/test/CodeGen/NVPTX/rotate.ll
+++ b/llvm/test/CodeGen/NVPTX/rotate.ll
@@ -15,8 +15,6 @@ declare i32 @llvm.fshl.i32(i32, i32, i32)
declare i32 @llvm.fshr.i32(i32, i32, i32)
-; SM20: rotate32
-; SM35: rotate32
define i32 @rotate32(i32 %a, i32 %b) {
; SM20-LABEL: rotate32(
; SM20: {
@@ -48,8 +46,6 @@ define i32 @rotate32(i32 %a, i32 %b) {
ret i32 %val
}
-; SM20: rotate64
-; SM35: rotate64
define i64 @rotate64(i64 %a, i32 %b) {
; SM20-LABEL: rotate64(
; SM20: {
@@ -88,8 +84,6 @@ define i64 @rotate64(i64 %a, i32 %b) {
ret i64 %val
}
-; SM20: rotateright64
-; SM35: rotateright64
define i64 @rotateright64(i64 %a, i32 %b) {
; SM20-LABEL: rotateright64(
; SM20: {
@@ -128,8 +122,6 @@ define i64 @rotateright64(i64 %a, i32 %b) {
ret i64 %val
}
-; SM20: rotl0
-; SM35: rotl0
define i32 @rotl0(i32 %x) {
; SM20-LABEL: rotl0(
; SM20: {
@@ -158,7 +150,6 @@ define i32 @rotl0(i32 %x) {
ret i32 %t2
}
-; SM35: rotl64
define i64 @rotl64(i64 %a, i64 %n) {
; SM20-LABEL: rotl64(
; SM20: {
@@ -197,36 +188,94 @@ define i64 @rotl64(i64 %a, i64 %n) {
ret i64 %val
}
-; SM35: rotl64_imm
-define i64 @rotl64_imm(i64 %a) {
-; SM20-LABEL: rotl64_imm(
+define i64 @rotl64_low_imm(i64 %a) {
+; SM20-LABEL: rotl64_low_imm(
; SM20: {
; SM20-NEXT: .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
-; SM20-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0];
+; SM20-NEXT: ld.param.u64 %rd1, [rotl64_low_imm_param_0];
; SM20-NEXT: shr.u64 %rd2, %rd1, 62;
; SM20-NEXT: shl.b64 %rd3, %rd1, 2;
; SM20-NEXT: or.b64 %rd4, %rd3, %rd2;
; SM20-NEXT: st.param.b64 [func_retval0], %rd4;
; SM20-NEXT: ret;
;
-; SM35-LABEL: rotl64_imm(
+; SM35-LABEL: rotl64_low_imm(
; SM35: {
-; SM35-NEXT: .reg .b64 %rd<5>;
+; SM35-NEXT: .reg .b32 %r<5>;
+; SM35-NEXT: .reg .b64 %rd<3>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
-; SM35-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0];
-; SM35-NEXT: shr.u64 %rd2, %rd1, 62;
-; SM35-NEXT: shl.b64 %rd3, %rd1, 2;
-; SM35-NEXT: or.b64 %rd4, %rd3, %rd2;
-; SM35-NEXT: st.param.b64 [func_retval0], %rd4;
+; SM35-NEXT: ld.param.u64 %rd1, [rotl64_low_imm_param_0];
+; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; SM35-NEXT: shf.l.wrap.b32 %r3, %r1, %r2, 2;
+; SM35-NEXT: shf.l.wrap.b32 %r4, %r2, %r1, 2;
+; SM35-NEXT: mov.b64 %rd2, {%r4, %r3};
+; SM35-NEXT: st.param.b64 [func_retval0], %rd2;
; SM35-NEXT: ret;
%val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 66)
ret i64 %val
}
-; SM35: rotr64
+define i64 @rotl64_high_imm(i64 %a) {
+; SM20-LABEL: rotl64_high_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<5>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotl64_high_imm_param_0];
+; SM20-NEXT: shr.u64 %rd2, %rd1, 1;
+; SM20-NEXT: shl.b64 %rd3, %rd1, 63;
+; SM20-NEXT: or.b64 %rd4, %rd3, %rd2;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd4;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotl64_high_imm(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<5>;
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotl64_high_imm_param_0];
+; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; SM35-NEXT: shf.l.wrap.b32 %r3, %r2, %r1, 31;
+; SM35-NEXT: shf.l.wrap.b32 %r4, %r1, %r2, 31;
+; SM35-NEXT: mov.b64 %rd2, {%r4, %r3};
+; SM35-NEXT: st.param.b64 [func_retval0], %rd2;
+; SM35-NEXT: ret;
+ %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 63)
+ ret i64 %val
+}
+
+define i64 @rotl64_32_imm(i64 %a) {
+; SM20-LABEL: rotl64_32_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<5>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotl64_32_imm_param_0];
+; SM20-NEXT: shr.u64 %rd2, %rd1, 32;
+; SM20-NEXT: shl.b64 %rd3, %rd1, 32;
+; SM20-NEXT: or.b64 %rd4, %rd3, %rd2;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd4;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotl64_32_imm(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<3>;
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotl64_32_imm_param_0];
+; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; SM35-NEXT: mov.b64 %rd2, {%r2, %r1};
+; SM35-NEXT: st.param.b64 [func_retval0], %rd2;
+; SM35-NEXT: ret;
+ %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 32)
+ ret i64 %val
+}
+
define i64 @rotr64(i64 %a, i64 %n) {
; SM20-LABEL: rotr64(
; SM20: {
@@ -265,32 +314,91 @@ define i64 @rotr64(i64 %a, i64 %n) {
ret i64 %val
}
-; SM35: rotr64_imm
-define i64 @rotr64_imm(i64 %a) {
-; SM20-LABEL: rotr64_imm(
+define i64 @rotr64_low_imm(i64 %a) {
+; SM20-LABEL: rotr64_low_imm(
; SM20: {
; SM20-NEXT: .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
-; SM20-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0];
-; SM20-NEXT: shl.b64 %rd2, %rd1, 62;
-; SM20-NEXT: shr.u64 %rd3, %rd1, 2;
+; SM20-NEXT: ld.param.u64 %rd1, [rotr64_low_imm_param_0];
+; SM20-NEXT: shl.b64 %rd2, %rd1, 52;
+; SM20-NEXT: shr.u64 %rd3, %rd1, 12;
; SM20-NEXT: or.b64 %rd4, %rd3, %rd2;
; SM20-NEXT: st.param.b64 [func_retval0], %rd4;
; SM20-NEXT: ret;
;
-; SM35-LABEL: rotr64_imm(
+; SM35-LABEL: rotr64_low_imm(
; SM35: {
-; SM35-NEXT: .reg .b64 %rd<5>;
+; SM35-NEXT: .reg .b32 %r<5>;
+; SM35-NEXT: .reg .b64 %rd<3>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
-; SM35-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0];
-; SM35-NEXT: shl.b64 %rd2, %rd1, 62;
-; SM35-NEXT: shr.u64 %rd3, %rd1, 2;
-; SM35-NEXT: or.b64 %rd4, %rd3, %rd2;
-; SM35-NEXT: st.param.b64 [func_retval0], %rd4;
+; SM35-NEXT: ld.param.u64 %rd1, [rotr64_low_imm_param_0];
+; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; SM35-NEXT: shf.r.wrap.b32 %r3, %r2, %r1, 12;
+; SM35-NEXT: shf.r.wrap.b32 %r4, %r1, %r2, 12;
+; SM35-NEXT: mov.b64 %rd2, {%r4, %r3};
+; SM35-NEXT: st.param.b64 [func_retval0], %rd2;
; SM35-NEXT: ret;
- %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 66)
+ %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 12)
+ ret i64 %val
+}
+
+define i64 @rotr64_high_imm(i64 %a) {
+; SM20-LABEL: rotr64_high_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<5>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotr64_high_imm_param_0];
+; SM20-NEXT: shl.b64 %rd2, %rd1, 21;
+; SM20-NEXT: shr.u64 %rd3, %rd1, 43;
+; SM20-NEXT: or.b64 %rd4, %rd3, %rd2;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd4;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotr64_high_imm(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<5>;
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotr64_high_imm_param_0];
+; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; SM35-NEXT: shf.r.wrap.b32 %r3, %r1, %r2, 11;
+; SM35-NEXT: shf.r.wrap.b32 %r4, %r2, %r1, 11;
+; SM35-NEXT: mov.b64 %rd2, {%r4, %r3};
+; SM35-NEXT: st.param.b64 [func_retval0], %rd2;
+; SM35-NEXT: ret;
+ %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 43)
+ ret i64 %val
+}
+
+define i64 @rotr64_32_imm(i64 %a) {
+; SM20-LABEL: rotr64_32_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<5>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotr64_32_imm_param_0];
+; SM20-NEXT: shl.b64 %rd2, %rd1, 32;
+; SM20-NEXT: shr.u64 %rd3, %rd1, 32;
+; SM20-NEXT: or.b64 %rd4, %rd3, %rd2;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd4;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotr64_32_imm(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<3>;
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotr64_32_imm_param_0];
+; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; SM35-NEXT: mov.b64 %rd2, {%r2, %r1};
+; SM35-NEXT: st.param.b64 [func_retval0], %rd2;
+; SM35-NEXT: ret;
+ %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 32)
ret i64 %val
}
@@ -446,3 +554,194 @@ define i64 @funnel_shift_left_64(i64 %a, i64 %b, i64 %c) {
ret i64 %val
}
+define i64 @fshl64_low_imm(i64 %a, i64 %b) {
+; SM20-LABEL: fshl64_low_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<6>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [fshl64_low_imm_param_0];
+; SM20-NEXT: ld.param.u64 %rd2, [fshl64_low_imm_param_1];
+; SM20-NEXT: shr.u64 %rd3, %rd2, 59;
+; SM20-NEXT: shl.b64 %rd4, %rd1, 5;
+; SM20-NEXT: or.b64 %rd5, %rd4, %rd3;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd5;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: fshl64_low_imm(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<7>;
+; SM35-NEXT: .reg .b64 %rd<4>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [fshl64_low_imm_param_0];
+; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; SM35-NEXT: ld.param.u64 %rd2, [fshl64_low_imm_param_1];
+; SM35-NEXT: mov.b64 {%r3, %r4}, %rd2;
+; SM35-NEXT: shf.l.wrap.b32 %r5, %r4, %r1, 5;
+; SM35-NEXT: shf.l.wrap.b32 %r6, %r1, %r2, 5;
+; SM35-NEXT: mov.b64 %rd3, {%r5, %r6};
+; SM35-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM35-NEXT: ret;
+ %val = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 5)
+ ret i64 %val
+}
+
+define i64 @fshl64_high_imm(i64 %a, i64 %b) {
+; SM20-LABEL: fshl64_high_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<6>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [fshl64_high_imm_param_0];
+; SM20-NEXT: ld.param.u64 %rd2, [fshl64_high_imm_param_1];
+; SM20-NEXT: shr.u64 %rd3, %rd2, 9;
+; SM20-NEXT: shl.b64 %rd4, %rd1, 55;
+; SM20-NEXT: or.b64 %rd5, %rd4, %rd3;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd5;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: fshl64_high_imm(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<7>;
+; SM35-NEXT: .reg .b64 %rd<4>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [fshl64_high_imm_param_0];
+; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; SM35-NEXT: ld.param.u64 %rd2, [fshl64_high_imm_param_1];
+; SM35-NEXT: mov.b64 {%r3, %r4}, %rd2;
+; SM35-NEXT: shf.l.wrap.b32 %r5, %r4, %r1, 23;
+; SM35-NEXT: shf.l.wrap.b32 %r6, %r3, %r4, 23;
+; SM35-NEXT: mov.b64 %rd3, {%r6, %r5};
+; SM35-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM35-NEXT: ret;
+ %val = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 55)
+ ret i64 %val
+}
+
+define i64 @fshl64_32_imm(i64 %a, i64 %b) {
+; SM20-LABEL: fshl64_32_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<5>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [fshl64_32_imm_param_0];
+; SM20-NEXT: shl.b64 %rd2, %rd1, 32;
+; SM20-NEXT: ld.param.u32 %rd3, [fshl64_32_imm_param_1+4];
+; SM20-NEXT: or.b64 %rd4, %rd2, %rd3;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd4;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: fshl64_32_imm(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<5>;
+; SM35-NEXT: .reg .b64 %rd<4>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [fshl64_32_imm_param_0];
+; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; SM35-NEXT: ld.param.u64 %rd2, [fshl64_32_imm_param_1];
+; SM35-NEXT: mov.b64 {%r3, %r4}, %rd2;
+; SM35-NEXT: mov.b64 %rd3, {%r4, %r1};
+; SM35-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM35-NEXT: ret;
+ %val = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 32)
+ ret i64 %val
+}
+
+define i64 @fshr64_low_imm(i64 %a, i64 %b) {
+; SM20-LABEL: fshr64_low_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<6>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [fshr64_low_imm_param_0];
+; SM20-NEXT: ld.param.u64 %rd2, [fshr64_low_imm_param_1];
+; SM20-NEXT: shr.u64 %rd3, %rd2, 31;
+; SM20-NEXT: shl.b64 %rd4, %rd1, 33;
+; SM20-NEXT: or.b64 %rd5, %rd4, %rd3;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd5;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: fshr64_low_imm(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<7>;
+; SM35-NEXT: .reg .b64 %rd<4>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [fshr64_low_imm_param_0];
+; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; SM35-NEXT: ld.param.u64 %rd2, [fshr64_low_imm_param_1];
+; SM35-NEXT: mov.b64 {%r3, %r4}, %rd2;
+; SM35-NEXT: shf.r.wrap.b32 %r5, %r4, %r1, 31;
+; SM35-NEXT: shf.r.wrap.b32 %r6, %r3, %r4, 31;
+; SM35-NEXT: mov.b64 %rd3, {%r6, %r5};
+; SM35-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM35-NEXT: ret;
+ %val = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 31)
+ ret i64 %val
+}
+
+define i64 @fshr64_high_imm(i64 %a, i64 %b) {
+; SM20-LABEL: fshr64_high_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<6>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [fshr64_high_imm_param_0];
+; SM20-NEXT: ld.param.u64 %rd2, [fshr64_high_imm_param_1];
+; SM20-NEXT: shr.u64 %rd3, %rd2, 33;
+; SM20-NEXT: shl.b64 %rd4, %rd1, 31;
+; SM20-NEXT: or.b64 %rd5, %rd4, %rd3;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd5;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: fshr64_high_imm(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<7>;
+; SM35-NEXT: .reg .b64 %rd<4>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [fshr64_high_imm_param_0];
+; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; SM35-NEXT: ld.param.u64 %rd2, [fshr64_high_imm_param_1];
+; SM35-NEXT: mov.b64 {%r3, %r4}, %rd2;
+; SM35-NEXT: shf.r.wrap.b32 %r5, %r4, %r1, 1;
+; SM35-NEXT: shf.r.wrap.b32 %r6, %r1, %r2, 1;
+; SM35-NEXT: mov.b64 %rd3, {%r5, %r6};
+; SM35-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM35-NEXT: ret;
+ %val = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 33)
+ ret i64 %val
+}
+
+define i64 @fshr64_32_imm(i64 %a, i64 %b) {
+; SM20-LABEL: fshr64_32_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<5>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [fshr64_32_imm_param_0];
+; SM20-NEXT: shl.b64 %rd2, %rd1, 32;
+; SM20-NEXT: ld.param.u32 %rd3, [fshr64_32_imm_param_1+4];
+; SM20-NEXT: or.b64 %rd4, %rd2, %rd3;
+; SM20-NEXT: st.param.b64 [func_retval0], %rd4;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: fshr64_32_imm(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<5>;
+; SM35-NEXT: .reg .b64 %rd<4>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [fshr64_32_imm_param_0];
+; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; SM35-NEXT: ld.param.u64 %rd2, [fshr64_32_imm_param_1];
+; SM35-NEXT: mov.b64 {%r3, %r4}, %rd2;
+; SM35-NEXT: mov.b64 %rd3, {%r4, %r1};
+; SM35-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM35-NEXT: ret;
+ %val = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 32)
+ ret i64 %val
+}
diff --git a/llvm/test/CodeGen/NVPTX/rotate_64.ll b/llvm/test/CodeGen/NVPTX/rotate_64.ll
index aa0d8efc0c700..841dc67c68640 100644
--- a/llvm/test/CodeGen/NVPTX/rotate_64.ll
+++ b/llvm/test/CodeGen/NVPTX/rotate_64.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=nvptx64 | FileCheck %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 | %ptxas-verify %}
declare i64 @llvm.nvvm.rotate.b64(i64, i32)
@@ -8,14 +8,16 @@ declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)
define i64 @rotate64(i64 %a, i32 %b) {
; CHECK-LABEL: rotate64(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [rotate64_param_0];
-; CHECK-NEXT: shr.u64 %rd2, %rd1, 61;
-; CHECK-NEXT: shl.b64 %rd3, %rd1, 3;
-; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2;
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd4;
+; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NEXT: shf.l.wrap.b32 %r3, %r1, %r2, 3;
+; CHECK-NEXT: shf.l.wrap.b32 %r4, %r2, %r1, 3;
+; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3};
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
; CHECK-NEXT: ret;
%val = tail call i64 @llvm.nvvm.rotate.b64(i64 %a, i32 3)
ret i64 %val
@@ -24,14 +26,16 @@ define i64 @rotate64(i64 %a, i32 %b) {
define i64 @rotateright64(i64 %a, i32 %b) {
; CHECK-LABEL: rotateright64(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [rotateright64_param_0];
-; CHECK-NEXT: shl.b64 %rd2, %rd1, 61;
-; CHECK-NEXT: shr.u64 %rd3, %rd1, 3;
-; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2;
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd4;
+; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NEXT: shf.r.wrap.b32 %r3, %r2, %r1, 3;
+; CHECK-NEXT: shf.r.wrap.b32 %r4, %r1, %r2, 3;
+; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3};
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
; CHECK-NEXT: ret;
%val = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %a, i32 3)
ret i64 %val
More information about the llvm-commits
mailing list