[llvm] 4cb61c2 - Revert "[NVPTX] deprecate nvvm.rotate.* intrinsics, cleanup funnel-shift handling (#107655)"
Dmitry Chernenkov via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 25 07:51:11 PDT 2024
Author: Dmitry Chernenkov
Date: 2024-09-25T14:50:26Z
New Revision: 4cb61c20ef38c6020389a15e739bac929b15425a
URL: https://github.com/llvm/llvm-project/commit/4cb61c20ef38c6020389a15e739bac929b15425a
DIFF: https://github.com/llvm/llvm-project/commit/4cb61c20ef38c6020389a15e739bac929b15425a.diff
LOG: Revert "[NVPTX] deprecate nvvm.rotate.* intrinsics, cleanup funnel-shift handling (#107655)"
This reverts commit 9ac00b85e05d21be658d6aa0c91cbe05bb5dbde2.
Added:
Modified:
llvm/docs/ReleaseNotes.rst
llvm/include/llvm/IR/IntrinsicsNVVM.td
llvm/lib/IR/AutoUpgrade.cpp
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
llvm/test/CodeGen/NVPTX/rotate.ll
llvm/test/CodeGen/NVPTX/rotate_64.ll
Removed:
################################################################################
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 0784d93f18da8f..05f5bd65fc5f6d 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -63,12 +63,6 @@ Changes to the LLVM IR
* ``llvm.nvvm.bitcast.d2ll``
* ``llvm.nvvm.bitcast.ll2d``
-* Remove the following intrinsics which can be replaced with a funnel-shift:
-
- * ``llvm.nvvm.rotate.b32``
- * ``llvm.nvvm.rotate.right.b64``
- * ``llvm.nvvm.rotate.b64``
-
Changes to LLVM infrastructure
------------------------------
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index aa5294f5f9c909..737dd6092e2183 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -4479,6 +4479,22 @@ def int_nvvm_sust_p_3d_v4i32_trap
"llvm.nvvm.sust.p.3d.v4i32.trap">,
ClangBuiltin<"__nvvm_sust_p_3d_v4i32_trap">;
+
+def int_nvvm_rotate_b32
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable], "llvm.nvvm.rotate.b32">,
+ ClangBuiltin<"__nvvm_rotate_b32">;
+
+def int_nvvm_rotate_b64
+ : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable], "llvm.nvvm.rotate.b64">,
+ ClangBuiltin<"__nvvm_rotate_b64">;
+
+def int_nvvm_rotate_right_b64
+ : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable], "llvm.nvvm.rotate.right.b64">,
+ ClangBuiltin<"__nvvm_rotate_right_b64">;
+
def int_nvvm_swap_lo_hi_b64
: DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty],
[IntrNoMem, IntrSpeculatable], "llvm.nvvm.swap.lo.hi.b64">,
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 3390d651d6c693..02d1d9d9f78984 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1272,9 +1272,6 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
// nvvm.bitcast.{f2i,i2f,ll2d,d2ll}
Expand =
Name == "f2i" || Name == "i2f" || Name == "ll2d" || Name == "d2ll";
- else if (Name.consume_front("rotate."))
- // nvvm.rotate.{b32,b64,right.b64}
- Expand = Name == "b32" || Name == "b64" || Name == "right.b64";
else
Expand = false;
@@ -2261,108 +2258,6 @@ void llvm::UpgradeInlineAsmString(std::string *AsmStr) {
}
}
-static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
- Function *F, IRBuilder<> &Builder) {
- Value *Rep = nullptr;
-
- if (Name == "abs.i" || Name == "abs.ll") {
- Value *Arg = CI->getArgOperand(0);
- Value *Neg = Builder.CreateNeg(Arg, "neg");
- Value *Cmp = Builder.CreateICmpSGE(
- Arg, llvm::Constant::getNullValue(Arg->getType()), "abs.cond");
- Rep = Builder.CreateSelect(Cmp, Arg, Neg, "abs");
- } else if (Name.starts_with("atomic.load.add.f32.p") ||
- Name.starts_with("atomic.load.add.f64.p")) {
- Value *Ptr = CI->getArgOperand(0);
- Value *Val = CI->getArgOperand(1);
- Rep = Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Val, MaybeAlign(),
- AtomicOrdering::SequentiallyConsistent);
- } else if (Name.consume_front("max.") &&
- (Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
- Name == "ui" || Name == "ull")) {
- Value *Arg0 = CI->getArgOperand(0);
- Value *Arg1 = CI->getArgOperand(1);
- Value *Cmp = Name.starts_with("u")
- ? Builder.CreateICmpUGE(Arg0, Arg1, "max.cond")
- : Builder.CreateICmpSGE(Arg0, Arg1, "max.cond");
- Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "max");
- } else if (Name.consume_front("min.") &&
- (Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
- Name == "ui" || Name == "ull")) {
- Value *Arg0 = CI->getArgOperand(0);
- Value *Arg1 = CI->getArgOperand(1);
- Value *Cmp = Name.starts_with("u")
- ? Builder.CreateICmpULE(Arg0, Arg1, "min.cond")
- : Builder.CreateICmpSLE(Arg0, Arg1, "min.cond");
- Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "min");
- } else if (Name == "clz.ll") {
- // llvm.nvvm.clz.ll returns an i32, but llvm.ctlz.i64 returns an i64.
- Value *Arg = CI->getArgOperand(0);
- Value *Ctlz = Builder.CreateCall(
- Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz,
- {Arg->getType()}),
- {Arg, Builder.getFalse()}, "ctlz");
- Rep = Builder.CreateTrunc(Ctlz, Builder.getInt32Ty(), "ctlz.trunc");
- } else if (Name == "popc.ll") {
- // llvm.nvvm.popc.ll returns an i32, but llvm.ctpop.i64 returns an
- // i64.
- Value *Arg = CI->getArgOperand(0);
- Value *Popc = Builder.CreateCall(
- Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctpop,
- {Arg->getType()}),
- Arg, "ctpop");
- Rep = Builder.CreateTrunc(Popc, Builder.getInt32Ty(), "ctpop.trunc");
- } else if (Name == "h2f") {
- Rep = Builder.CreateCall(
- Intrinsic::getDeclaration(F->getParent(), Intrinsic::convert_from_fp16,
- {Builder.getFloatTy()}),
- CI->getArgOperand(0), "h2f");
- } else if (Name.consume_front("bitcast.") &&
- (Name == "f2i" || Name == "i2f" || Name == "ll2d" ||
- Name == "d2ll")) {
- Rep = Builder.CreateBitCast(CI->getArgOperand(0), CI->getType());
- } else if (Name == "rotate.b32") {
- Value *Arg = CI->getOperand(0);
- Value *ShiftAmt = CI->getOperand(1);
- Rep = Builder.CreateIntrinsic(Builder.getInt32Ty(), Intrinsic::fshl,
- {Arg, Arg, ShiftAmt});
- } else if (Name == "rotate.b64") {
- Type *Int64Ty = Builder.getInt64Ty();
- Value *Arg = CI->getOperand(0);
- Value *ZExtShiftAmt = Builder.CreateZExt(CI->getOperand(1), Int64Ty);
- Rep = Builder.CreateIntrinsic(Int64Ty, Intrinsic::fshl,
- {Arg, Arg, ZExtShiftAmt});
- } else if (Name == "rotate.right.b64") {
- Type *Int64Ty = Builder.getInt64Ty();
- Value *Arg = CI->getOperand(0);
- Value *ZExtShiftAmt = Builder.CreateZExt(CI->getOperand(1), Int64Ty);
- Rep = Builder.CreateIntrinsic(Int64Ty, Intrinsic::fshr,
- {Arg, Arg, ZExtShiftAmt});
- } else {
- Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name);
- if (IID != Intrinsic::not_intrinsic &&
- !F->getReturnType()->getScalarType()->isBFloatTy()) {
- rename(F);
- Function *NewFn = Intrinsic::getDeclaration(F->getParent(), IID);
- SmallVector<Value *, 2> Args;
- for (size_t I = 0; I < NewFn->arg_size(); ++I) {
- Value *Arg = CI->getArgOperand(I);
- Type *OldType = Arg->getType();
- Type *NewType = NewFn->getArg(I)->getType();
- Args.push_back(
- (OldType->isIntegerTy() && NewType->getScalarType()->isBFloatTy())
- ? Builder.CreateBitCast(Arg, NewType)
- : Arg);
- }
- Rep = Builder.CreateCall(NewFn, Args);
- if (F->getReturnType()->isIntegerTy())
- Rep = Builder.CreateBitCast(Rep, F->getReturnType());
- }
- }
-
- return Rep;
-}
-
static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
IRBuilder<> &Builder) {
LLVMContext &C = F->getContext();
@@ -4313,8 +4208,85 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
if (!IsX86 && Name == "stackprotectorcheck") {
Rep = nullptr;
+ } else if (IsNVVM && (Name == "abs.i" || Name == "abs.ll")) {
+ Value *Arg = CI->getArgOperand(0);
+ Value *Neg = Builder.CreateNeg(Arg, "neg");
+ Value *Cmp = Builder.CreateICmpSGE(
+ Arg, llvm::Constant::getNullValue(Arg->getType()), "abs.cond");
+ Rep = Builder.CreateSelect(Cmp, Arg, Neg, "abs");
+ } else if (IsNVVM && (Name.starts_with("atomic.load.add.f32.p") ||
+ Name.starts_with("atomic.load.add.f64.p"))) {
+ Value *Ptr = CI->getArgOperand(0);
+ Value *Val = CI->getArgOperand(1);
+ Rep = Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Val, MaybeAlign(),
+ AtomicOrdering::SequentiallyConsistent);
+ } else if (IsNVVM && Name.consume_front("max.") &&
+ (Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
+ Name == "ui" || Name == "ull")) {
+ Value *Arg0 = CI->getArgOperand(0);
+ Value *Arg1 = CI->getArgOperand(1);
+ Value *Cmp = Name.starts_with("u")
+ ? Builder.CreateICmpUGE(Arg0, Arg1, "max.cond")
+ : Builder.CreateICmpSGE(Arg0, Arg1, "max.cond");
+ Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "max");
+ } else if (IsNVVM && Name.consume_front("min.") &&
+ (Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
+ Name == "ui" || Name == "ull")) {
+ Value *Arg0 = CI->getArgOperand(0);
+ Value *Arg1 = CI->getArgOperand(1);
+ Value *Cmp = Name.starts_with("u")
+ ? Builder.CreateICmpULE(Arg0, Arg1, "min.cond")
+ : Builder.CreateICmpSLE(Arg0, Arg1, "min.cond");
+ Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "min");
+ } else if (IsNVVM && Name == "clz.ll") {
+ // llvm.nvvm.clz.ll returns an i32, but llvm.ctlz.i64 returns an i64.
+ Value *Arg = CI->getArgOperand(0);
+ Value *Ctlz = Builder.CreateCall(
+ Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz,
+ {Arg->getType()}),
+ {Arg, Builder.getFalse()}, "ctlz");
+ Rep = Builder.CreateTrunc(Ctlz, Builder.getInt32Ty(), "ctlz.trunc");
+ } else if (IsNVVM && Name == "popc.ll") {
+ // llvm.nvvm.popc.ll returns an i32, but llvm.ctpop.i64 returns an
+ // i64.
+ Value *Arg = CI->getArgOperand(0);
+ Value *Popc = Builder.CreateCall(
+ Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctpop,
+ {Arg->getType()}),
+ Arg, "ctpop");
+ Rep = Builder.CreateTrunc(Popc, Builder.getInt32Ty(), "ctpop.trunc");
} else if (IsNVVM) {
- Rep = upgradeNVVMIntrinsicCall(Name, CI, F, Builder);
+ if (Name == "h2f") {
+ Rep =
+ Builder.CreateCall(Intrinsic::getDeclaration(
+ F->getParent(), Intrinsic::convert_from_fp16,
+ {Builder.getFloatTy()}),
+ CI->getArgOperand(0), "h2f");
+ } else if (Name.consume_front("bitcast.") &&
+ (Name == "f2i" || Name == "i2f" || Name == "ll2d" ||
+ Name == "d2ll")) {
+ Rep = Builder.CreateBitCast(CI->getArgOperand(0), CI->getType());
+ } else {
+ Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name);
+ if (IID != Intrinsic::not_intrinsic &&
+ !F->getReturnType()->getScalarType()->isBFloatTy()) {
+ rename(F);
+ NewFn = Intrinsic::getDeclaration(F->getParent(), IID);
+ SmallVector<Value *, 2> Args;
+ for (size_t I = 0; I < NewFn->arg_size(); ++I) {
+ Value *Arg = CI->getArgOperand(I);
+ Type *OldType = Arg->getType();
+ Type *NewType = NewFn->getArg(I)->getType();
+ Args.push_back((OldType->isIntegerTy() &&
+ NewType->getScalarType()->isBFloatTy())
+ ? Builder.CreateBitCast(Arg, NewType)
+ : Arg);
+ }
+ Rep = Builder.CreateCall(NewFn, Args);
+ if (F->getReturnType()->isIntegerTy())
+ Rep = Builder.CreateBitCast(Rep, F->getReturnType());
+ }
+ }
} else if (IsX86) {
Rep = upgradeX86IntrinsicCall(Name, CI, F, Builder);
} else if (IsARM) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 8812136733fb24..26888342210918 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -594,13 +594,20 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
- setOperationAction({ISD::ROTL, ISD::ROTR},
- {MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64},
- Expand);
-
- if (STI.hasHWROT32())
- setOperationAction({ISD::FSHL, ISD::FSHR}, MVT::i32, Legal);
-
+ // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
+ // that don't have h/w rotation we lower them to multi-instruction assembly.
+ // See ROT*_sw in NVPTXInstrInfo.td
+ setOperationAction(ISD::ROTL, MVT::i64, Legal);
+ setOperationAction(ISD::ROTR, MVT::i64, Legal);
+ setOperationAction(ISD::ROTL, MVT::i32, Legal);
+ setOperationAction(ISD::ROTR, MVT::i32, Legal);
+
+ setOperationAction(ISD::ROTL, MVT::i16, Expand);
+ setOperationAction(ISD::ROTL, MVT::v2i16, Expand);
+ setOperationAction(ISD::ROTR, MVT::i16, Expand);
+ setOperationAction(ISD::ROTR, MVT::v2i16, Expand);
+ setOperationAction(ISD::ROTL, MVT::i8, Expand);
+ setOperationAction(ISD::ROTR, MVT::i8, Expand);
setOperationAction(ISD::BSWAP, MVT::i16, Expand);
setOperationAction(ISD::BR_JT, MVT::Other, Custom);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index f6bbf4c2ffc02f..510e4b81003119 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1665,6 +1665,167 @@ def BREV64 :
"brev.b64 \t$dst, $a;",
[(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>;
+//
+// Rotate: Use ptx shf instruction if available.
+//
+
+// 32 bit r2 = rotl r1, n
+// =>
+// r2 = shf.l r1, r1, n
+def ROTL32imm_hw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 imm:$amt)))]>,
+ Requires<[hasHWROT32]>;
+
+def ROTL32reg_hw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
+ Requires<[hasHWROT32]>;
+
+// 32 bit r2 = rotr r1, n
+// =>
+// r2 = shf.r r1, r1, n
+def ROTR32imm_hw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
+ "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 imm:$amt)))]>,
+ Requires<[hasHWROT32]>;
+
+def ROTR32reg_hw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+ "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
+ Requires<[hasHWROT32]>;
+
+// 32-bit software rotate by immediate. $amt2 should equal 32 - $amt1.
+def ROT32imm_sw :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
+ "{{\n\t"
+ ".reg .b32 %lhs;\n\t"
+ ".reg .b32 %rhs;\n\t"
+ "shl.b32 \t%lhs, $src, $amt1;\n\t"
+ "shr.b32 \t%rhs, $src, $amt2;\n\t"
+ "add.u32 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ []>;
+
+def SUB_FRM_32 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+def : Pat<(rotl (i32 Int32Regs:$src), (i32 imm:$amt)),
+ (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
+ Requires<[noHWROT32]>;
+def : Pat<(rotr (i32 Int32Regs:$src), (i32 imm:$amt)),
+ (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
+ Requires<[noHWROT32]>;
+
+// 32-bit software rotate left by register.
+def ROTL32reg_sw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+ "{{\n\t"
+ ".reg .b32 %lhs;\n\t"
+ ".reg .b32 %rhs;\n\t"
+ ".reg .b32 %amt2;\n\t"
+ "shl.b32 \t%lhs, $src, $amt;\n\t"
+ "sub.s32 \t%amt2, 32, $amt;\n\t"
+ "shr.b32 \t%rhs, $src, %amt2;\n\t"
+ "add.u32 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ [(set Int32Regs:$dst, (rotl (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
+ Requires<[noHWROT32]>;
+
+// 32-bit software rotate right by register.
+def ROTR32reg_sw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+ "{{\n\t"
+ ".reg .b32 %lhs;\n\t"
+ ".reg .b32 %rhs;\n\t"
+ ".reg .b32 %amt2;\n\t"
+ "shr.b32 \t%lhs, $src, $amt;\n\t"
+ "sub.s32 \t%amt2, 32, $amt;\n\t"
+ "shl.b32 \t%rhs, $src, %amt2;\n\t"
+ "add.u32 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ [(set Int32Regs:$dst, (rotr (i32 Int32Regs:$src), (i32 Int32Regs:$amt)))]>,
+ Requires<[noHWROT32]>;
+
+// 64-bit software rotate by immediate. $amt2 should equal 64 - $amt1.
+def ROT64imm_sw :
+ NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
+ "{{\n\t"
+ ".reg .b64 %lhs;\n\t"
+ ".reg .b64 %rhs;\n\t"
+ "shl.b64 \t%lhs, $src, $amt1;\n\t"
+ "shr.b64 \t%rhs, $src, $amt2;\n\t"
+ "add.u64 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ []>;
+
+def SUB_FRM_64 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
+ (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
+def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
+ (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
+
+// 64-bit software rotate left by register.
+def ROTL64reg_sw :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
+ "{{\n\t"
+ ".reg .b64 %lhs;\n\t"
+ ".reg .b64 %rhs;\n\t"
+ ".reg .u32 %amt2;\n\t"
+ "and.b32 \t%amt2, $amt, 63;\n\t"
+ "shl.b64 \t%lhs, $src, %amt2;\n\t"
+ "sub.u32 \t%amt2, 64, %amt2;\n\t"
+ "shr.b64 \t%rhs, $src, %amt2;\n\t"
+ "add.u64 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ [(set Int64Regs:$dst, (rotl Int64Regs:$src, (i32 Int32Regs:$amt)))]>;
+
+def ROTR64reg_sw :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
+ "{{\n\t"
+ ".reg .b64 %lhs;\n\t"
+ ".reg .b64 %rhs;\n\t"
+ ".reg .u32 %amt2;\n\t"
+ "and.b32 \t%amt2, $amt, 63;\n\t"
+ "shr.b64 \t%lhs, $src, %amt2;\n\t"
+ "sub.u32 \t%amt2, 64, %amt2;\n\t"
+ "shl.b64 \t%rhs, $src, %amt2;\n\t"
+ "add.u64 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ [(set Int64Regs:$dst, (rotr Int64Regs:$src, (i32 Int32Regs:$amt)))]>;
+
+//
+// Funnel shift in clamp mode
+//
+
+// Create SDNodes so they can be used in the DAG code, e.g.
+// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
+def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
+def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
+
+def FUNSHFLCLAMP :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
+ [(set Int32Regs:$dst,
+ (FUN_SHFL_CLAMP (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt)))]>;
+
+def FUNSHFRCLAMP :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
+ [(set Int32Regs:$dst,
+ (FUN_SHFR_CLAMP (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt)))]>;
//
// BFE - bit-field extract
@@ -3496,42 +3657,6 @@ def : Pat<(v2i16 (build_vector (i16 Int16Regs:$a), (i16 Int16Regs:$b))),
def: Pat<(v2i16 (scalar_to_vector (i16 Int16Regs:$a))),
(CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
-//
-// Funnel-Shift
-//
-
-// Create SDNodes so they can be used in the DAG code, e.g.
-// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
-def fshl_clamp : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
-def fshr_clamp : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
-
-// Funnel shift, requires >= sm_32. Does not trap if amt is out of range, so
-// no side effects.
-let hasSideEffects = false in {
- multiclass ShfInst<string mode, SDNode op> {
- def _i
- : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
- "shf." # mode # ".b32 \t$dst, $lo, $hi, $amt;",
- [(set Int32Regs:$dst,
- (op (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 imm:$amt)))]>,
- Requires<[hasHWROT32]>;
-
- def _r
- : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
- "shf." # mode # ".b32 \t$dst, $lo, $hi, $amt;",
- [(set Int32Regs:$dst,
- (op (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt)))]>,
- Requires<[hasHWROT32]>;
- }
-
- defm SHF_L_CLAMP : ShfInst<"l.clamp", fshl_clamp>;
- defm SHF_R_CLAMP : ShfInst<"r.clamp", fshr_clamp>;
- defm SHF_L_WRAP : ShfInst<"l.wrap", fshl>;
- defm SHF_R_WRAP : ShfInst<"r.wrap", fshr>;
-}
-
// Count leading zeros
let hasSideEffects = false in {
def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 2688cfbe5e824f..56c551661151d7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2733,9 +2733,134 @@ def : Pat<(int_nvvm_read_ptx_sreg_envreg30), (MOV_SPECIAL ENVREG30)>;
def : Pat<(int_nvvm_read_ptx_sreg_envreg31), (MOV_SPECIAL ENVREG31)>;
+// rotate builtin support
+
+def ROTATE_B32_HW_IMM
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, i32imm:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst,
+ (int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)))]>,
+ Requires<[hasHWROT32]> ;
+
+def ROTATE_B32_HW_REG
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, Int32Regs:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst,
+ (int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[hasHWROT32]> ;
+
+def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, (i32 imm:$amt)),
+ (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
+ Requires<[noHWROT32]> ;
+
+def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt),
+ (ROTL32reg_sw Int32Regs:$src, Int32Regs:$amt)>,
+ Requires<[noHWROT32]> ;
+
+let hasSideEffects = false in {
+ def GET_LO_INT64 : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
+ !strconcat("{{\n\t",
+ ".reg .b32 %dummy;\n\t",
+ "mov.b64 \t{$dst,%dummy}, $src;\n\t",
+ "}}"),
+ []> ;
+
+ def GET_HI_INT64 : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
+ !strconcat("{{\n\t",
+ ".reg .b32 %dummy;\n\t",
+ "mov.b64 \t{%dummy,$dst}, $src;\n\t",
+ "}}"),
+ []> ;
+}
+
+let hasSideEffects = false in {
+ def PACK_TWO_INT32
+ : NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$lo, Int32Regs:$hi),
+ "mov.b64 \t$dst, {{$lo, $hi}};", []> ;
+}
+
def : Pat<(int_nvvm_swap_lo_hi_b64 Int64Regs:$src),
- (V2I32toI64 (I64toI32H Int64Regs:$src),
- (I64toI32L Int64Regs:$src))> ;
+ (PACK_TWO_INT32 (GET_HI_INT64 Int64Regs:$src),
+ (GET_LO_INT64 Int64Regs:$src))> ;
+
+// Funnel shift, requires >= sm_32. Does not trap if amt is out of range, so
+// no side effects.
+let hasSideEffects = false in {
+ def SHF_L_WRAP_B32_IMM
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
+ "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
+
+ def SHF_L_WRAP_B32_REG
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.l.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
+
+ def SHF_R_WRAP_B32_IMM
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
+ "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
+
+ def SHF_R_WRAP_B32_REG
+ : NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.r.wrap.b32 \t$dst, $lo, $hi, $amt;",[]>,
+ Requires<[hasHWROT32]>;
+}
+
+// HW version of rotate 64
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)),
+ (PACK_TWO_INT32
+ (SHF_L_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src),
+ (GET_LO_INT64 Int64Regs:$src), imm:$amt),
+ (SHF_L_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src),
+ (GET_HI_INT64 Int64Regs:$src), imm:$amt))>,
+ Requires<[hasHWROT32]>;
+
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt),
+ (PACK_TWO_INT32
+ (SHF_L_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src),
+ (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt),
+ (SHF_L_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src),
+ (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt))>,
+ Requires<[hasHWROT32]>;
+
+
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)),
+ (PACK_TWO_INT32
+ (SHF_R_WRAP_B32_IMM (GET_LO_INT64 Int64Regs:$src),
+ (GET_HI_INT64 Int64Regs:$src), imm:$amt),
+ (SHF_R_WRAP_B32_IMM (GET_HI_INT64 Int64Regs:$src),
+ (GET_LO_INT64 Int64Regs:$src), imm:$amt))>,
+ Requires<[hasHWROT32]>;
+
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt),
+ (PACK_TWO_INT32
+ (SHF_R_WRAP_B32_REG (GET_LO_INT64 Int64Regs:$src),
+ (GET_HI_INT64 Int64Regs:$src), Int32Regs:$amt),
+ (SHF_R_WRAP_B32_REG (GET_HI_INT64 Int64Regs:$src),
+ (GET_LO_INT64 Int64Regs:$src), Int32Regs:$amt))>,
+ Requires<[hasHWROT32]>;
+
+// SW version of rotate 64
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, (i32 imm:$amt)),
+ (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>,
+ Requires<[noHWROT32]>;
+def : Pat<(int_nvvm_rotate_b64 Int64Regs:$src, Int32Regs:$amt),
+ (ROTL64reg_sw Int64Regs:$src, Int32Regs:$amt)>,
+ Requires<[noHWROT32]>;
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, (i32 imm:$amt)),
+ (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>,
+ Requires<[noHWROT32]>;
+def : Pat<(int_nvvm_rotate_right_b64 Int64Regs:$src, Int32Regs:$amt),
+ (ROTR64reg_sw Int64Regs:$src, Int32Regs:$amt)>,
+ Requires<[noHWROT32]>;
+
//-----------------------------------
// Texture Intrinsics
diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
index 43ac246055da7b..7e4a4d527fc903 100644
--- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
+++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
@@ -31,10 +31,6 @@ declare float @llvm.nvvm.bitcast.i2f(i32)
declare i64 @llvm.nvvm.bitcast.d2ll(double)
declare double @llvm.nvvm.bitcast.ll2d(i64)
-declare i32 @llvm.nvvm.rotate.b32(i32, i32)
-declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)
-declare i64 @llvm.nvvm.rotate.b64(i64, i32)
-
; CHECK-LABEL: @simple_upgrade
define void @simple_upgrade(i32 %a, i64 %b, i16 %c) {
; CHECK: call i32 @llvm.bitreverse.i32(i32 %a)
@@ -143,16 +139,4 @@ define void @bitcast(i32 %a, i64 %b, float %c, double %d) {
%r4 = call double @llvm.nvvm.bitcast.ll2d(i64 %b)
ret void
-}
-
-; CHECK-LABEL: @rotate
-define void @rotate(i32 %a, i64 %b) {
-; CHECK: call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 6)
-; CHECK: call i64 @llvm.fshr.i64(i64 %b, i64 %b, i64 7)
-; CHECK: call i64 @llvm.fshl.i64(i64 %b, i64 %b, i64 8)
-;
- %r1 = call i32 @llvm.nvvm.rotate.b32(i32 %a, i32 6)
- %r2 = call i64 @llvm.nvvm.rotate.right.b64(i64 %b, i32 7)
- %r3 = call i64 @llvm.nvvm.rotate.b64(i64 %b, i32 8)
- ret void
-}
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/NVPTX/rotate.ll b/llvm/test/CodeGen/NVPTX/rotate.ll
index 9ec5bcd13403b9..20c7ae5908d29f 100644
--- a/llvm/test/CodeGen/NVPTX/rotate.ll
+++ b/llvm/test/CodeGen/NVPTX/rotate.ll
@@ -9,29 +9,26 @@ declare i32 @llvm.nvvm.rotate.b32(i32, i32)
declare i64 @llvm.nvvm.rotate.b64(i64, i32)
declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)
-declare i64 @llvm.fshl.i64(i64, i64, i64)
-declare i64 @llvm.fshr.i64(i64, i64, i64)
-declare i32 @llvm.fshl.i32(i32, i32, i32)
-declare i32 @llvm.fshr.i32(i32, i32, i32)
-
-
; SM20: rotate32
; SM35: rotate32
define i32 @rotate32(i32 %a, i32 %b) {
; SM20-LABEL: rotate32(
; SM20: {
-; SM20-NEXT: .reg .b32 %r<9>;
+; SM20-NEXT: .reg .b32 %r<4>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.u32 %r1, [rotate32_param_0];
; SM20-NEXT: ld.param.u32 %r2, [rotate32_param_1];
-; SM20-NEXT: and.b32 %r3, %r2, 31;
-; SM20-NEXT: shl.b32 %r4, %r1, %r3;
-; SM20-NEXT: neg.s32 %r5, %r2;
-; SM20-NEXT: and.b32 %r6, %r5, 31;
-; SM20-NEXT: shr.u32 %r7, %r1, %r6;
-; SM20-NEXT: or.b32 %r8, %r4, %r7;
-; SM20-NEXT: st.param.b32 [func_retval0+0], %r8;
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b32 %lhs;
+; SM20-NEXT: .reg .b32 %rhs;
+; SM20-NEXT: .reg .b32 %amt2;
+; SM20-NEXT: shl.b32 %lhs, %r1, %r2;
+; SM20-NEXT: sub.s32 %amt2, 32, %r2;
+; SM20-NEXT: shr.b32 %rhs, %r1, %amt2;
+; SM20-NEXT: add.u32 %r3, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b32 [func_retval0+0], %r3;
; SM20-NEXT: ret;
;
; SM35-LABEL: rotate32(
@@ -53,36 +50,45 @@ define i32 @rotate32(i32 %a, i32 %b) {
define i64 @rotate64(i64 %a, i32 %b) {
; SM20-LABEL: rotate64(
; SM20: {
-; SM20-NEXT: .reg .b32 %r<5>;
-; SM20-NEXT: .reg .b64 %rd<5>;
+; SM20-NEXT: .reg .b32 %r<2>;
+; SM20-NEXT: .reg .b64 %rd<3>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.u64 %rd1, [rotate64_param_0];
; SM20-NEXT: ld.param.u32 %r1, [rotate64_param_1];
-; SM20-NEXT: and.b32 %r2, %r1, 63;
-; SM20-NEXT: shl.b64 %rd2, %rd1, %r2;
-; SM20-NEXT: neg.s32 %r3, %r1;
-; SM20-NEXT: and.b32 %r4, %r3, 63;
-; SM20-NEXT: shr.u64 %rd3, %rd1, %r4;
-; SM20-NEXT: or.b64 %rd4, %rd2, %rd3;
-; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b64 %lhs;
+; SM20-NEXT: .reg .b64 %rhs;
+; SM20-NEXT: .reg .u32 %amt2;
+; SM20-NEXT: and.b32 %amt2, %r1, 63;
+; SM20-NEXT: shl.b64 %lhs, %rd1, %amt2;
+; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
+; SM20-NEXT: shr.b64 %rhs, %rd1, %amt2;
+; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
; SM20-NEXT: ret;
;
; SM35-LABEL: rotate64(
; SM35: {
-; SM35-NEXT: .reg .b32 %r<5>;
-; SM35-NEXT: .reg .b64 %rd<5>;
+; SM35-NEXT: .reg .b32 %r<6>;
+; SM35-NEXT: .reg .b64 %rd<3>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.u64 %rd1, [rotate64_param_0];
-; SM35-NEXT: ld.param.u32 %r1, [rotate64_param_1];
-; SM35-NEXT: and.b32 %r2, %r1, 63;
-; SM35-NEXT: shl.b64 %rd2, %rd1, %r2;
-; SM35-NEXT: neg.s32 %r3, %r1;
-; SM35-NEXT: and.b32 %r4, %r3, 63;
-; SM35-NEXT: shr.u64 %rd3, %rd1, %r4;
-; SM35-NEXT: or.b64 %rd4, %rd2, %rd3;
-; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b32 %dummy;
+; SM35-NEXT: mov.b64 {%dummy,%r1}, %rd1;
+; SM35-NEXT: }
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b32 %dummy;
+; SM35-NEXT: mov.b64 {%r2,%dummy}, %rd1;
+; SM35-NEXT: }
+; SM35-NEXT: ld.param.u32 %r3, [rotate64_param_1];
+; SM35-NEXT: shf.l.wrap.b32 %r4, %r2, %r1, %r3;
+; SM35-NEXT: shf.l.wrap.b32 %r5, %r1, %r2, %r3;
+; SM35-NEXT: mov.b64 %rd2, {%r5, %r4};
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
; SM35-NEXT: ret;
%val = tail call i64 @llvm.nvvm.rotate.b64(i64 %a, i32 %b)
ret i64 %val
@@ -93,36 +99,45 @@ define i64 @rotate64(i64 %a, i32 %b) {
define i64 @rotateright64(i64 %a, i32 %b) {
; SM20-LABEL: rotateright64(
; SM20: {
-; SM20-NEXT: .reg .b32 %r<5>;
-; SM20-NEXT: .reg .b64 %rd<5>;
+; SM20-NEXT: .reg .b32 %r<2>;
+; SM20-NEXT: .reg .b64 %rd<3>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.u64 %rd1, [rotateright64_param_0];
; SM20-NEXT: ld.param.u32 %r1, [rotateright64_param_1];
-; SM20-NEXT: and.b32 %r2, %r1, 63;
-; SM20-NEXT: shr.u64 %rd2, %rd1, %r2;
-; SM20-NEXT: neg.s32 %r3, %r1;
-; SM20-NEXT: and.b32 %r4, %r3, 63;
-; SM20-NEXT: shl.b64 %rd3, %rd1, %r4;
-; SM20-NEXT: or.b64 %rd4, %rd2, %rd3;
-; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b64 %lhs;
+; SM20-NEXT: .reg .b64 %rhs;
+; SM20-NEXT: .reg .u32 %amt2;
+; SM20-NEXT: and.b32 %amt2, %r1, 63;
+; SM20-NEXT: shr.b64 %lhs, %rd1, %amt2;
+; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
+; SM20-NEXT: shl.b64 %rhs, %rd1, %amt2;
+; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
; SM20-NEXT: ret;
;
; SM35-LABEL: rotateright64(
; SM35: {
-; SM35-NEXT: .reg .b32 %r<5>;
-; SM35-NEXT: .reg .b64 %rd<5>;
+; SM35-NEXT: .reg .b32 %r<6>;
+; SM35-NEXT: .reg .b64 %rd<3>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.u64 %rd1, [rotateright64_param_0];
-; SM35-NEXT: ld.param.u32 %r1, [rotateright64_param_1];
-; SM35-NEXT: and.b32 %r2, %r1, 63;
-; SM35-NEXT: shr.u64 %rd2, %rd1, %r2;
-; SM35-NEXT: neg.s32 %r3, %r1;
-; SM35-NEXT: and.b32 %r4, %r3, 63;
-; SM35-NEXT: shl.b64 %rd3, %rd1, %r4;
-; SM35-NEXT: or.b64 %rd4, %rd2, %rd3;
-; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b32 %dummy;
+; SM35-NEXT: mov.b64 {%r1,%dummy}, %rd1;
+; SM35-NEXT: }
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b32 %dummy;
+; SM35-NEXT: mov.b64 {%dummy,%r2}, %rd1;
+; SM35-NEXT: }
+; SM35-NEXT: ld.param.u32 %r3, [rotateright64_param_1];
+; SM35-NEXT: shf.r.wrap.b32 %r4, %r2, %r1, %r3;
+; SM35-NEXT: shf.r.wrap.b32 %r5, %r1, %r2, %r3;
+; SM35-NEXT: mov.b64 %rd2, {%r5, %r4};
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
; SM35-NEXT: ret;
%val = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %a, i32 %b)
ret i64 %val
@@ -133,14 +148,18 @@ define i64 @rotateright64(i64 %a, i32 %b) {
define i32 @rotl0(i32 %x) {
; SM20-LABEL: rotl0(
; SM20: {
-; SM20-NEXT: .reg .b32 %r<5>;
+; SM20-NEXT: .reg .b32 %r<3>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.u32 %r1, [rotl0_param_0];
-; SM20-NEXT: shr.u32 %r2, %r1, 24;
-; SM20-NEXT: shl.b32 %r3, %r1, 8;
-; SM20-NEXT: or.b32 %r4, %r3, %r2;
-; SM20-NEXT: st.param.b32 [func_retval0+0], %r4;
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b32 %lhs;
+; SM20-NEXT: .reg .b32 %rhs;
+; SM20-NEXT: shl.b32 %lhs, %r1, 8;
+; SM20-NEXT: shr.b32 %rhs, %r1, 24;
+; SM20-NEXT: add.u32 %r2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b32 [func_retval0+0], %r2;
; SM20-NEXT: ret;
;
; SM35-LABEL: rotl0(
@@ -158,40 +177,51 @@ define i32 @rotl0(i32 %x) {
ret i32 %t2
}
+declare i64 @llvm.fshl.i64(i64, i64, i64)
+declare i64 @llvm.fshr.i64(i64, i64, i64)
+
; SM35: rotl64
define i64 @rotl64(i64 %a, i64 %n) {
; SM20-LABEL: rotl64(
; SM20: {
-; SM20-NEXT: .reg .b32 %r<5>;
-; SM20-NEXT: .reg .b64 %rd<5>;
+; SM20-NEXT: .reg .b32 %r<2>;
+; SM20-NEXT: .reg .b64 %rd<3>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.u64 %rd1, [rotl64_param_0];
; SM20-NEXT: ld.param.u32 %r1, [rotl64_param_1];
-; SM20-NEXT: and.b32 %r2, %r1, 63;
-; SM20-NEXT: shl.b64 %rd2, %rd1, %r2;
-; SM20-NEXT: neg.s32 %r3, %r1;
-; SM20-NEXT: and.b32 %r4, %r3, 63;
-; SM20-NEXT: shr.u64 %rd3, %rd1, %r4;
-; SM20-NEXT: or.b64 %rd4, %rd2, %rd3;
-; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b64 %lhs;
+; SM20-NEXT: .reg .b64 %rhs;
+; SM20-NEXT: .reg .u32 %amt2;
+; SM20-NEXT: and.b32 %amt2, %r1, 63;
+; SM20-NEXT: shl.b64 %lhs, %rd1, %amt2;
+; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
+; SM20-NEXT: shr.b64 %rhs, %rd1, %amt2;
+; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
; SM20-NEXT: ret;
;
; SM35-LABEL: rotl64(
; SM35: {
-; SM35-NEXT: .reg .b32 %r<5>;
-; SM35-NEXT: .reg .b64 %rd<5>;
+; SM35-NEXT: .reg .b32 %r<2>;
+; SM35-NEXT: .reg .b64 %rd<3>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.u64 %rd1, [rotl64_param_0];
; SM35-NEXT: ld.param.u32 %r1, [rotl64_param_1];
-; SM35-NEXT: and.b32 %r2, %r1, 63;
-; SM35-NEXT: shl.b64 %rd2, %rd1, %r2;
-; SM35-NEXT: neg.s32 %r3, %r1;
-; SM35-NEXT: and.b32 %r4, %r3, 63;
-; SM35-NEXT: shr.u64 %rd3, %rd1, %r4;
-; SM35-NEXT: or.b64 %rd4, %rd2, %rd3;
-; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b64 %lhs;
+; SM35-NEXT: .reg .b64 %rhs;
+; SM35-NEXT: .reg .u32 %amt2;
+; SM35-NEXT: and.b32 %amt2, %r1, 63;
+; SM35-NEXT: shl.b64 %lhs, %rd1, %amt2;
+; SM35-NEXT: sub.u32 %amt2, 64, %amt2;
+; SM35-NEXT: shr.b64 %rhs, %rd1, %amt2;
+; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM35-NEXT: }
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
; SM35-NEXT: ret;
%val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %n)
ret i64 %val
@@ -201,26 +231,34 @@ define i64 @rotl64(i64 %a, i64 %n) {
define i64 @rotl64_imm(i64 %a) {
; SM20-LABEL: rotl64_imm(
; SM20: {
-; SM20-NEXT: .reg .b64 %rd<5>;
+; SM20-NEXT: .reg .b64 %rd<3>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0];
-; SM20-NEXT: shr.u64 %rd2, %rd1, 62;
-; SM20-NEXT: shl.b64 %rd3, %rd1, 2;
-; SM20-NEXT: or.b64 %rd4, %rd3, %rd2;
-; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b64 %lhs;
+; SM20-NEXT: .reg .b64 %rhs;
+; SM20-NEXT: shl.b64 %lhs, %rd1, 2;
+; SM20-NEXT: shr.b64 %rhs, %rd1, 62;
+; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
; SM20-NEXT: ret;
;
; SM35-LABEL: rotl64_imm(
; SM35: {
-; SM35-NEXT: .reg .b64 %rd<5>;
+; SM35-NEXT: .reg .b64 %rd<3>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0];
-; SM35-NEXT: shr.u64 %rd2, %rd1, 62;
-; SM35-NEXT: shl.b64 %rd3, %rd1, 2;
-; SM35-NEXT: or.b64 %rd4, %rd3, %rd2;
-; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b64 %lhs;
+; SM35-NEXT: .reg .b64 %rhs;
+; SM35-NEXT: shl.b64 %lhs, %rd1, 2;
+; SM35-NEXT: shr.b64 %rhs, %rd1, 62;
+; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM35-NEXT: }
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
; SM35-NEXT: ret;
%val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 66)
ret i64 %val
@@ -230,36 +268,44 @@ define i64 @rotl64_imm(i64 %a) {
define i64 @rotr64(i64 %a, i64 %n) {
; SM20-LABEL: rotr64(
; SM20: {
-; SM20-NEXT: .reg .b32 %r<5>;
-; SM20-NEXT: .reg .b64 %rd<5>;
+; SM20-NEXT: .reg .b32 %r<2>;
+; SM20-NEXT: .reg .b64 %rd<3>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.u64 %rd1, [rotr64_param_0];
; SM20-NEXT: ld.param.u32 %r1, [rotr64_param_1];
-; SM20-NEXT: and.b32 %r2, %r1, 63;
-; SM20-NEXT: shr.u64 %rd2, %rd1, %r2;
-; SM20-NEXT: neg.s32 %r3, %r1;
-; SM20-NEXT: and.b32 %r4, %r3, 63;
-; SM20-NEXT: shl.b64 %rd3, %rd1, %r4;
-; SM20-NEXT: or.b64 %rd4, %rd2, %rd3;
-; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b64 %lhs;
+; SM20-NEXT: .reg .b64 %rhs;
+; SM20-NEXT: .reg .u32 %amt2;
+; SM20-NEXT: and.b32 %amt2, %r1, 63;
+; SM20-NEXT: shr.b64 %lhs, %rd1, %amt2;
+; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
+; SM20-NEXT: shl.b64 %rhs, %rd1, %amt2;
+; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
; SM20-NEXT: ret;
;
; SM35-LABEL: rotr64(
; SM35: {
-; SM35-NEXT: .reg .b32 %r<5>;
-; SM35-NEXT: .reg .b64 %rd<5>;
+; SM35-NEXT: .reg .b32 %r<2>;
+; SM35-NEXT: .reg .b64 %rd<3>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.u64 %rd1, [rotr64_param_0];
; SM35-NEXT: ld.param.u32 %r1, [rotr64_param_1];
-; SM35-NEXT: and.b32 %r2, %r1, 63;
-; SM35-NEXT: shr.u64 %rd2, %rd1, %r2;
-; SM35-NEXT: neg.s32 %r3, %r1;
-; SM35-NEXT: and.b32 %r4, %r3, 63;
-; SM35-NEXT: shl.b64 %rd3, %rd1, %r4;
-; SM35-NEXT: or.b64 %rd4, %rd2, %rd3;
-; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b64 %lhs;
+; SM35-NEXT: .reg .b64 %rhs;
+; SM35-NEXT: .reg .u32 %amt2;
+; SM35-NEXT: and.b32 %amt2, %r1, 63;
+; SM35-NEXT: shr.b64 %lhs, %rd1, %amt2;
+; SM35-NEXT: sub.u32 %amt2, 64, %amt2;
+; SM35-NEXT: shl.b64 %rhs, %rd1, %amt2;
+; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM35-NEXT: }
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
; SM35-NEXT: ret;
%val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %n)
ret i64 %val
@@ -269,180 +315,35 @@ define i64 @rotr64(i64 %a, i64 %n) {
define i64 @rotr64_imm(i64 %a) {
; SM20-LABEL: rotr64_imm(
; SM20: {
-; SM20-NEXT: .reg .b64 %rd<5>;
+; SM20-NEXT: .reg .b64 %rd<3>;
; SM20-EMPTY:
; SM20-NEXT: // %bb.0:
; SM20-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0];
-; SM20-NEXT: shl.b64 %rd2, %rd1, 62;
-; SM20-NEXT: shr.u64 %rd3, %rd1, 2;
-; SM20-NEXT: or.b64 %rd4, %rd3, %rd2;
-; SM20-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b64 %lhs;
+; SM20-NEXT: .reg .b64 %rhs;
+; SM20-NEXT: shl.b64 %lhs, %rd1, 62;
+; SM20-NEXT: shr.b64 %rhs, %rd1, 2;
+; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
; SM20-NEXT: ret;
;
; SM35-LABEL: rotr64_imm(
; SM35: {
-; SM35-NEXT: .reg .b64 %rd<5>;
+; SM35-NEXT: .reg .b64 %rd<3>;
; SM35-EMPTY:
; SM35-NEXT: // %bb.0:
; SM35-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0];
-; SM35-NEXT: shl.b64 %rd2, %rd1, 62;
-; SM35-NEXT: shr.u64 %rd3, %rd1, 2;
-; SM35-NEXT: or.b64 %rd4, %rd3, %rd2;
-; SM35-NEXT: st.param.b64 [func_retval0+0], %rd4;
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b64 %lhs;
+; SM35-NEXT: .reg .b64 %rhs;
+; SM35-NEXT: shl.b64 %lhs, %rd1, 62;
+; SM35-NEXT: shr.b64 %rhs, %rd1, 2;
+; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM35-NEXT: }
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
; SM35-NEXT: ret;
%val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 66)
ret i64 %val
}
-
-define i32 @funnel_shift_right_32(i32 %a, i32 %b, i32 %c) {
-; SM20-LABEL: funnel_shift_right_32(
-; SM20: {
-; SM20-NEXT: .reg .b32 %r<11>;
-; SM20-EMPTY:
-; SM20-NEXT: // %bb.0:
-; SM20-NEXT: ld.param.u32 %r1, [funnel_shift_right_32_param_0];
-; SM20-NEXT: ld.param.u32 %r2, [funnel_shift_right_32_param_2];
-; SM20-NEXT: and.b32 %r3, %r2, 31;
-; SM20-NEXT: ld.param.u32 %r4, [funnel_shift_right_32_param_1];
-; SM20-NEXT: shr.u32 %r5, %r4, %r3;
-; SM20-NEXT: shl.b32 %r6, %r1, 1;
-; SM20-NEXT: not.b32 %r7, %r2;
-; SM20-NEXT: and.b32 %r8, %r7, 31;
-; SM20-NEXT: shl.b32 %r9, %r6, %r8;
-; SM20-NEXT: or.b32 %r10, %r9, %r5;
-; SM20-NEXT: st.param.b32 [func_retval0+0], %r10;
-; SM20-NEXT: ret;
-;
-; SM35-LABEL: funnel_shift_right_32(
-; SM35: {
-; SM35-NEXT: .reg .b32 %r<5>;
-; SM35-EMPTY:
-; SM35-NEXT: // %bb.0:
-; SM35-NEXT: ld.param.u32 %r1, [funnel_shift_right_32_param_0];
-; SM35-NEXT: ld.param.u32 %r2, [funnel_shift_right_32_param_1];
-; SM35-NEXT: ld.param.u32 %r3, [funnel_shift_right_32_param_2];
-; SM35-NEXT: shf.r.wrap.b32 %r4, %r1, %r2, %r3;
-; SM35-NEXT: st.param.b32 [func_retval0+0], %r4;
-; SM35-NEXT: ret;
- %val = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c)
- ret i32 %val
-}
-
-define i32 @funnel_shift_left_32(i32 %a, i32 %b, i32 %c) {
-; SM20-LABEL: funnel_shift_left_32(
-; SM20: {
-; SM20-NEXT: .reg .b32 %r<11>;
-; SM20-EMPTY:
-; SM20-NEXT: // %bb.0:
-; SM20-NEXT: ld.param.u32 %r1, [funnel_shift_left_32_param_0];
-; SM20-NEXT: ld.param.u32 %r2, [funnel_shift_left_32_param_2];
-; SM20-NEXT: and.b32 %r3, %r2, 31;
-; SM20-NEXT: shl.b32 %r4, %r1, %r3;
-; SM20-NEXT: ld.param.u32 %r5, [funnel_shift_left_32_param_1];
-; SM20-NEXT: shr.u32 %r6, %r5, 1;
-; SM20-NEXT: not.b32 %r7, %r2;
-; SM20-NEXT: and.b32 %r8, %r7, 31;
-; SM20-NEXT: shr.u32 %r9, %r6, %r8;
-; SM20-NEXT: or.b32 %r10, %r4, %r9;
-; SM20-NEXT: st.param.b32 [func_retval0+0], %r10;
-; SM20-NEXT: ret;
-;
-; SM35-LABEL: funnel_shift_left_32(
-; SM35: {
-; SM35-NEXT: .reg .b32 %r<5>;
-; SM35-EMPTY:
-; SM35-NEXT: // %bb.0:
-; SM35-NEXT: ld.param.u32 %r1, [funnel_shift_left_32_param_0];
-; SM35-NEXT: ld.param.u32 %r2, [funnel_shift_left_32_param_1];
-; SM35-NEXT: ld.param.u32 %r3, [funnel_shift_left_32_param_2];
-; SM35-NEXT: shf.l.wrap.b32 %r4, %r1, %r2, %r3;
-; SM35-NEXT: st.param.b32 [func_retval0+0], %r4;
-; SM35-NEXT: ret;
- %val = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c)
- ret i32 %val
-}
-
-define i64 @funnel_shift_right_64(i64 %a, i64 %b, i64 %c) {
-; SM20-LABEL: funnel_shift_right_64(
-; SM20: {
-; SM20-NEXT: .reg .b32 %r<5>;
-; SM20-NEXT: .reg .b64 %rd<7>;
-; SM20-EMPTY:
-; SM20-NEXT: // %bb.0:
-; SM20-NEXT: ld.param.u64 %rd1, [funnel_shift_right_64_param_0];
-; SM20-NEXT: ld.param.u32 %r1, [funnel_shift_right_64_param_2];
-; SM20-NEXT: and.b32 %r2, %r1, 63;
-; SM20-NEXT: ld.param.u64 %rd2, [funnel_shift_right_64_param_1];
-; SM20-NEXT: shr.u64 %rd3, %rd2, %r2;
-; SM20-NEXT: shl.b64 %rd4, %rd1, 1;
-; SM20-NEXT: not.b32 %r3, %r1;
-; SM20-NEXT: and.b32 %r4, %r3, 63;
-; SM20-NEXT: shl.b64 %rd5, %rd4, %r4;
-; SM20-NEXT: or.b64 %rd6, %rd5, %rd3;
-; SM20-NEXT: st.param.b64 [func_retval0+0], %rd6;
-; SM20-NEXT: ret;
-;
-; SM35-LABEL: funnel_shift_right_64(
-; SM35: {
-; SM35-NEXT: .reg .b32 %r<5>;
-; SM35-NEXT: .reg .b64 %rd<7>;
-; SM35-EMPTY:
-; SM35-NEXT: // %bb.0:
-; SM35-NEXT: ld.param.u64 %rd1, [funnel_shift_right_64_param_0];
-; SM35-NEXT: ld.param.u32 %r1, [funnel_shift_right_64_param_2];
-; SM35-NEXT: and.b32 %r2, %r1, 63;
-; SM35-NEXT: ld.param.u64 %rd2, [funnel_shift_right_64_param_1];
-; SM35-NEXT: shr.u64 %rd3, %rd2, %r2;
-; SM35-NEXT: shl.b64 %rd4, %rd1, 1;
-; SM35-NEXT: not.b32 %r3, %r1;
-; SM35-NEXT: and.b32 %r4, %r3, 63;
-; SM35-NEXT: shl.b64 %rd5, %rd4, %r4;
-; SM35-NEXT: or.b64 %rd6, %rd5, %rd3;
-; SM35-NEXT: st.param.b64 [func_retval0+0], %rd6;
-; SM35-NEXT: ret;
- %val = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c)
- ret i64 %val
-}
-
-define i64 @funnel_shift_left_64(i64 %a, i64 %b, i64 %c) {
-; SM20-LABEL: funnel_shift_left_64(
-; SM20: {
-; SM20-NEXT: .reg .b32 %r<5>;
-; SM20-NEXT: .reg .b64 %rd<7>;
-; SM20-EMPTY:
-; SM20-NEXT: // %bb.0:
-; SM20-NEXT: ld.param.u64 %rd1, [funnel_shift_left_64_param_0];
-; SM20-NEXT: ld.param.u32 %r1, [funnel_shift_left_64_param_2];
-; SM20-NEXT: and.b32 %r2, %r1, 63;
-; SM20-NEXT: shl.b64 %rd2, %rd1, %r2;
-; SM20-NEXT: ld.param.u64 %rd3, [funnel_shift_left_64_param_1];
-; SM20-NEXT: shr.u64 %rd4, %rd3, 1;
-; SM20-NEXT: not.b32 %r3, %r1;
-; SM20-NEXT: and.b32 %r4, %r3, 63;
-; SM20-NEXT: shr.u64 %rd5, %rd4, %r4;
-; SM20-NEXT: or.b64 %rd6, %rd2, %rd5;
-; SM20-NEXT: st.param.b64 [func_retval0+0], %rd6;
-; SM20-NEXT: ret;
-;
-; SM35-LABEL: funnel_shift_left_64(
-; SM35: {
-; SM35-NEXT: .reg .b32 %r<5>;
-; SM35-NEXT: .reg .b64 %rd<7>;
-; SM35-EMPTY:
-; SM35-NEXT: // %bb.0:
-; SM35-NEXT: ld.param.u64 %rd1, [funnel_shift_left_64_param_0];
-; SM35-NEXT: ld.param.u32 %r1, [funnel_shift_left_64_param_2];
-; SM35-NEXT: and.b32 %r2, %r1, 63;
-; SM35-NEXT: shl.b64 %rd2, %rd1, %r2;
-; SM35-NEXT: ld.param.u64 %rd3, [funnel_shift_left_64_param_1];
-; SM35-NEXT: shr.u64 %rd4, %rd3, 1;
-; SM35-NEXT: not.b32 %r3, %r1;
-; SM35-NEXT: and.b32 %r4, %r3, 63;
-; SM35-NEXT: shr.u64 %rd5, %rd4, %r4;
-; SM35-NEXT: or.b64 %rd6, %rd2, %rd5;
-; SM35-NEXT: st.param.b64 [func_retval0+0], %rd6;
-; SM35-NEXT: ret;
- %val = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %c)
- ret i64 %val
-}
-
diff --git a/llvm/test/CodeGen/NVPTX/rotate_64.ll b/llvm/test/CodeGen/NVPTX/rotate_64.ll
index 05fdb02ac74794..64659ce1b5c56d 100644
--- a/llvm/test/CodeGen/NVPTX/rotate_64.ll
+++ b/llvm/test/CodeGen/NVPTX/rotate_64.ll
@@ -1,38 +1,25 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -march=nvptx64 | %ptxas-verify %}
declare i64 @llvm.nvvm.rotate.b64(i64, i32)
declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)
+; CHECK: rotate64
define i64 @rotate64(i64 %a, i32 %b) {
-; CHECK-LABEL: rotate64(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [rotate64_param_0];
-; CHECK-NEXT: shr.u64 %rd2, %rd1, 61;
-; CHECK-NEXT: shl.b64 %rd3, %rd1, 3;
-; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2;
-; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd4;
-; CHECK-NEXT: ret;
+; CHECK: shl.b64 [[LHS:%.*]], [[RD1:%.*]], 3;
+; CHECK: shr.b64 [[RHS:%.*]], [[RD1]], 61;
+; CHECK: add.u64 [[RD2:%.*]], [[LHS]], [[RHS]];
+; CHECK: ret
%val = tail call i64 @llvm.nvvm.rotate.b64(i64 %a, i32 3)
ret i64 %val
}
+; CHECK: rotateright64
define i64 @rotateright64(i64 %a, i32 %b) {
-; CHECK-LABEL: rotateright64(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [rotateright64_param_0];
-; CHECK-NEXT: shl.b64 %rd2, %rd1, 61;
-; CHECK-NEXT: shr.u64 %rd3, %rd1, 3;
-; CHECK-NEXT: or.b64 %rd4, %rd3, %rd2;
-; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd4;
-; CHECK-NEXT: ret;
+; CHECK: shl.b64 [[LHS:%.*]], [[RD1:%.*]], 61;
+; CHECK: shr.b64 [[RHS:%.*]], [[RD1]], 3;
+; CHECK: add.u64 [[RD2:%.*]], [[LHS]], [[RHS]];
+; CHECK: ret
%val = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %a, i32 3)
ret i64 %val
}
More information about the llvm-commits
mailing list