[llvm] [NVPTX] Add support for clamped funnel shift intrinsics (PR #113228)

Alex MacLean via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 21 22:29:32 PDT 2024


https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/113228

>From 52b765bec63a2bbc249dc14815d1035891bde85f Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Sun, 20 Oct 2024 01:58:01 +0000
Subject: [PATCH 1/2] [NVPTX] Add support for clamped funnel shift intrinsics

---
 llvm/docs/NVPTXUsage.rst                      | 58 +++++++++++++++
 llvm/include/llvm/IR/IntrinsicsNVVM.td        | 10 +++
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td       |  9 +++
 .../Target/NVPTX/NVPTXTargetTransformInfo.cpp | 27 ++++++-
 llvm/test/CodeGen/NVPTX/funnel-shift-clamp.ll | 70 +++++++++++++++++++
 .../InstCombine/NVPTX/nvvm-intrins.ll         | 30 ++++++++
 6 files changed, 203 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/funnel-shift-clamp.ll

diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 8b0b05c0ea424e..b2af05d09a042a 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -319,6 +319,64 @@ used in the '``llvm.nvvm.idp4a.[us].u``' variants, while sign-extension is used
 with '``llvm.nvvm.idp4a.[us].s``' variants. The dot product of these 4-element
 vectors is added to ``%c`` to produce the return.
 
+Bit Manipulation Intrinsics
+---------------------------
+
+'``llvm.nvvm.fshl.clamp.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+    declare i32 @llvm.nvvm.fshl.clamp.i32(i32 %a, i32 %b, i32 %c)
+
+Overview:
+"""""""""
+
+The '``llvm.nvvm.fshl.clamp``' family of intrinsics performs a clamped funnel
+shift left. These intrinsics are very similar to '``llvm.fshl``', except the
+shift amount is clamped at the integer width (instead of taken modulo it).
+Currently, only ``i32`` is supported.
+
+Semantics:
+""""""""""
+
+The '``llvm.nvvm.fshl.clamp``' family of intrinsic functions performs a clamped
+funnel shift left: the first two values are concatenated as { %a : %b } (%a is
+the most significant bits of the wide value), the combined value is shifted
+left, and the most significant bits are extracted to produce a result that is
+the same size as the original arguments. The shift amount is the minimum of the
+value of %c and the bit width of the integer type.
+
+'``llvm.nvvm.fshr.clamp.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+    declare i32 @llvm.nvvm.fshr.clamp.i32(i32 %a, i32 %b, i32 %c)
+
+Overview:
+"""""""""
+
+The '``llvm.nvvm.fshr.clamp``' family of intrinsics performs a clamped funnel
+shift right. These intrinsics are very similar to '``llvm.fshr``', except the
+shift amount is clamped at the integer width (instead of taken modulo it).
+Currently, only ``i32`` is supported.
+
+Semantics:
+""""""""""
+
+The '``llvm.nvvm.fshr.clamp``' family of intrinsic functions performs a clamped
+funnel shift right: the first two values are concatenated as { %a : %b } (%a is
+the most significant bits of the wide value), the combined value is shifted
+right, and the least significant bits are extracted to produce a result that is
+the same size as the original arguments. The shift amount is the minimum of the
+value of %c and the bit width of the integer type.
 
 
 Other Intrinsics
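
As an illustrative sketch of the clamping behavior documented above (hypothetical
values, using the i32 variants this patch adds): an in-range amount behaves exactly
like the generic llvm.fshl/llvm.fshr intrinsics, while an over-wide amount saturates
at the bit width instead of wrapping.

    ; shift { %hi : %lo } left by 8 and keep the most significant 32 bits
    %r0 = call i32 @llvm.nvvm.fshl.clamp.i32(i32 %hi, i32 %lo, i32 8)
    ; the amount 40 is clamped to 32, so the result is simply %lo
    %r1 = call i32 @llvm.nvvm.fshl.clamp.i32(i32 %hi, i32 %lo, i32 40)
    ; likewise, a clamped right shift by 40 yields %hi
    %r2 = call i32 @llvm.nvvm.fshr.clamp.i32(i32 %hi, i32 %lo, i32 40)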
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 7b8ffe417fccdb..b4a06f583f2c91 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1080,6 +1080,16 @@ let TargetPrefix = "nvvm" in {
     }
   }
 
+//
+// Funnel-shift
+//
+  foreach direction = ["l", "r"] in
+    def int_nvvm_fsh # direction # _clamp :
+      DefaultAttrsIntrinsic<[llvm_anyint_ty],
+        [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+        [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
+
+
 //
 // Convert
 //
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index b5478b8f09ceb4..5f6cba397c5352 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3535,6 +3535,15 @@ let hasSideEffects = false in {
   defm SHF_R_WRAP  : ShfInst<"r.wrap", fshr>;
 }
 
+def : Pat<(i32 (int_nvvm_fshl_clamp (i32 Int32Regs:$hi), (i32 Int32Regs:$lo), (i32 Int32Regs:$amt))),
+          (SHF_L_CLAMP_r (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt))>;
+def : Pat<(i32 (int_nvvm_fshl_clamp (i32 Int32Regs:$hi), (i32 Int32Regs:$lo), (i32 imm:$amt))),
+          (SHF_L_CLAMP_i (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 imm:$amt))>;
+def : Pat<(i32 (int_nvvm_fshr_clamp (i32 Int32Regs:$hi), (i32 Int32Regs:$lo), (i32 Int32Regs:$amt))),
+          (SHF_R_CLAMP_r (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 Int32Regs:$amt))>;
+def : Pat<(i32 (int_nvvm_fshr_clamp (i32 Int32Regs:$hi), (i32 Int32Regs:$lo), (i32 imm:$amt))),
+          (SHF_R_CLAMP_i (i32 Int32Regs:$lo), (i32 Int32Regs:$hi), (i32 imm:$amt))>;
+
 // Count leading zeros
 let hasSideEffects = false in {
   def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index b141229dcfc733..1599fa2622b336 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -14,8 +14,12 @@
 #include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/CodeGen/CostTable.h"
 #include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
 #include <optional>
 using namespace llvm;
 
@@ -134,6 +138,7 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
   // simplify.
   enum SpecialCase {
     SPC_Reciprocal,
+    SCP_FunnelShiftClamp,
   };
 
   // SimplifyAction is a poor-man's variant (plus an additional flag) that
@@ -314,6 +319,10 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
     case Intrinsic::nvvm_rcp_rn_d:
       return {SPC_Reciprocal, FTZ_Any};
 
+    case Intrinsic::nvvm_fshl_clamp:
+    case Intrinsic::nvvm_fshr_clamp:
+      return {SCP_FunnelShiftClamp, FTZ_Any};
+
       // We do not currently simplify intrinsics that give an approximate
       // answer. These include:
       //
@@ -384,6 +393,22 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
     return BinaryOperator::Create(
         Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
         II->getArgOperand(0), II->getName());
+
+  case SCP_FunnelShiftClamp: {
+    // Canoncialize a clamping funnel shift to the generic llvm funnel shift
+    // when possible, as this is easier for llvm to optimize further.
+    if (const auto *ShiftConst = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
+      if (ShiftConst->getZExtValue() >= II->getType()->getIntegerBitWidth())
+        return IC.replaceInstUsesWith(*II, II->getArgOperand(1));
+
+      const bool IsLeft = II->getIntrinsicID() == Intrinsic::nvvm_fshl_clamp;
+      const unsigned FshIID = IsLeft ? Intrinsic::fshl : Intrinsic::fshr;
+      return CallInst::Create(Intrinsic::getOrInsertDeclaration(
+                                  II->getModule(), FshIID, II->getType()),
+                              SmallVector<Value *, 3>(II->args()));
+    }
+    return nullptr;
+  }
   }
   llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
 }
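
A minimal sketch of the canonicalization this case enables for constant shift
amounts (hypothetical IR; behavior per the tests added to nvvm-intrins.ll below):

    ; before InstCombine: a constant, in-range shift amount
    %r = call i32 @llvm.nvvm.fshl.clamp.i32(i32 %a, i32 %b, i32 3)
    ; after: the clamp is a no-op for amounts below the bit width, so the call
    ; becomes the generic funnel shift
    %r = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 3)
    ; a constant amount >= 32 folds the call away to one of its operands, and a
    ; non-constant amount is left untouched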
diff --git a/llvm/test/CodeGen/NVPTX/funnel-shift-clamp.ll b/llvm/test/CodeGen/NVPTX/funnel-shift-clamp.ll
new file mode 100644
index 00000000000000..c6bdf84c2aa7ba
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/funnel-shift-clamp.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx -mcpu=sm_61 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_61 | FileCheck %s
+
+target triple = "nvptx-nvidia-cuda"
+
+declare i32 @llvm.nvvm.fshr.clamp.i32(i32, i32, i32)
+declare i32 @llvm.nvvm.fshl.clamp.i32(i32, i32, i32)
+
+define i32 @fshr_clamp_r(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: fshr_clamp_r(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [fshr_clamp_r_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [fshr_clamp_r_param_1];
+; CHECK-NEXT:    ld.param.u32 %r3, [fshr_clamp_r_param_2];
+; CHECK-NEXT:    shf.r.clamp.b32 %r4, %r2, %r1, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    ret;
+  %call = call i32 @llvm.nvvm.fshr.clamp.i32(i32 %a, i32 %b, i32 %c)
+  ret i32 %call
+}
+
+define i32 @fshl_clamp_r(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: fshl_clamp_r(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [fshl_clamp_r_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [fshl_clamp_r_param_1];
+; CHECK-NEXT:    ld.param.u32 %r3, [fshl_clamp_r_param_2];
+; CHECK-NEXT:    shf.l.clamp.b32 %r4, %r2, %r1, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    ret;
+  %call = call i32 @llvm.nvvm.fshl.clamp.i32(i32 %a, i32 %b, i32 %c)
+  ret i32 %call
+}
+
+define i32 @fshr_clamp_i(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: fshr_clamp_i(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [fshr_clamp_i_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [fshr_clamp_i_param_1];
+; CHECK-NEXT:    shf.r.clamp.b32 %r3, %r2, %r1, 3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
+  %call = call i32 @llvm.nvvm.fshr.clamp.i32(i32 %a, i32 %b, i32 3)
+  ret i32 %call
+}
+
+define i32 @fshl_clamp_i(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: fshl_clamp_i(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [fshl_clamp_i_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [fshl_clamp_i_param_1];
+; CHECK-NEXT:    shf.l.clamp.b32 %r3, %r2, %r1, 3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
+  %call = call i32 @llvm.nvvm.fshl.clamp.i32(i32 %a, i32 %b, i32 3)
+  ret i32 %call
+}
diff --git a/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll b/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll
index 35a81fffac3a70..d88881bf3ed7f7 100644
--- a/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll
+++ b/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll
@@ -384,6 +384,34 @@ define float @test_sqrt_rn_f_ftz(float %a) #0 {
   ret float %ret
 }
 
+; CHECK-LABEL: @test_fshl_clamp_1
+define i32 @test_fshl_clamp_1(i32 %a, i32 %b, i32 %c) {
+; CHECK: call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 3)
+  %call = call i32 @llvm.nvvm.fshl.clamp.i32(i32 %a, i32 %b, i32 3)
+  ret i32 %call
+}
+
+; CHECK-LABEL: @test_fshl_clamp_2
+define i32 @test_fshl_clamp_2(i32 %a, i32 %b, i32 %c) {
+; CHECK: ret i32 %b
+  %call = call i32 @llvm.nvvm.fshl.clamp.i32(i32 %a, i32 %b, i32 300)
+  ret i32 %call
+}
+
+; CHECK-LABEL: @test_fshr_clamp_1
+define i32 @test_fshr_clamp_1(i32 %a, i32 %b, i32 %c) {
+; CHECK: call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 29)
+  %call = call i32 @llvm.nvvm.fshr.clamp.i32(i32 %a, i32 %b, i32 3)
+  ret i32 %call
+}
+
+; CHECK-LABEL: @test_fshr_clamp_2
+define i32 @test_fshr_clamp_2(i32 %a, i32 %b, i32 %c) {
+; CHECK: ret i32 %b
+  %call = call i32 @llvm.nvvm.fshr.clamp.i32(i32 %a, i32 %b, i32 300)
+  ret i32 %call
+}
+
 declare double @llvm.nvvm.add.rn.d(double, double)
 declare float @llvm.nvvm.add.rn.f(float, float)
 declare float @llvm.nvvm.add.rn.ftz.f(float, float)
@@ -454,3 +482,5 @@ declare double @llvm.nvvm.ui2d.rn(i32)
 declare float @llvm.nvvm.ui2f.rn(i32)
 declare double @llvm.nvvm.ull2d.rn(i64)
 declare float @llvm.nvvm.ull2f.rn(i64)
+declare i32 @llvm.nvvm.fshr.clamp.i32(i32, i32, i32)
+declare i32 @llvm.nvvm.fshl.clamp.i32(i32, i32, i32)
\ No newline at end of file

>From c2256b0c5861ee852404f4c821243ecb5d58fafb Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Tue, 22 Oct 2024 05:29:20 +0000
Subject: [PATCH 2/2] address comments

---
 llvm/docs/NVPTXUsage.rst                      | 16 ++++++-------
 .../Target/NVPTX/NVPTXTargetTransformInfo.cpp |  6 ++---
 .../InstCombine/NVPTX/nvvm-intrins.ll         | 24 +++++++++++++++----
 3 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index b2af05d09a042a..b7097308f6e890 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -330,7 +330,7 @@ Syntax:
 
 .. code-block:: llvm
 
-    declare i32 @llvm.nvvm.fshl.clamp.i32(i32 %a, i32 %b, i32 %c)
+    declare i32 @llvm.nvvm.fshl.clamp.i32(i32 %hi, i32 %lo, i32 %n)
 
 Overview:
 """""""""
@@ -344,11 +344,11 @@ Semantics:
 """"""""""
 
 The '``llvm.nvvm.fshl.clamp``' family of intrinsic functions performs a clamped
-funnel shift left: the first two values are concatenated as { %a : %b } (%a is
-the most significant bits of the wide value), the combined value is shifted
+funnel shift left: the first two values are concatenated as { %hi : %lo } (%hi
+is the most significant bits of the wide value), the combined value is shifted
 left, and the most significant bits are extracted to produce a result that is
 the same size as the original arguments. The shift amount is the minimum of the
-value of %c and the bit width of the integer type.
+value of %n and the bit width of the integer type.
 
 '``llvm.nvvm.fshr.clamp.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -358,7 +358,7 @@ Syntax:
 
 .. code-block:: llvm
 
-    declare i32 @llvm.nvvm.fshr.clamp.i32(i32 %a, i32 %b, i32 %c)
+    declare i32 @llvm.nvvm.fshr.clamp.i32(i32 %hi, i32 %lo, i32 %n)
 
 Overview:
 """""""""
@@ -372,11 +372,11 @@ Semantics:
 """"""""""
 
 The '``llvm.nvvm.fshr.clamp``' family of intrinsic functions performs a clamped
-funnel shift right: the first two values are concatenated as { %a : %b } (%a is
-the most significant bits of the wide value), the combined value is shifted
+funnel shift right: the first two values are concatenated as { %hi : %lo } (%hi
+is the most significant bits of the wide value), the combined value is shifted
 right, and the least significant bits are extracted to produce a result that is
 the same size as the original arguments. The shift amount is the minimum of the
-value of %c and the bit width of the integer type.
+value of %n and the bit width of the integer type.
 
 
 Other Intrinsics
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index 1599fa2622b336..e35ba25b47880f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -395,13 +395,13 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
         II->getArgOperand(0), II->getName());
 
   case SCP_FunnelShiftClamp: {
-    // Canoncialize a clamping funnel shift to the generic llvm funnel shift
+    // Canonicalize a clamping funnel shift to the generic llvm funnel shift
     // when possible, as this is easier for llvm to optimize further.
     if (const auto *ShiftConst = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
+      const bool IsLeft = II->getIntrinsicID() == Intrinsic::nvvm_fshl_clamp;
       if (ShiftConst->getZExtValue() >= II->getType()->getIntegerBitWidth())
-        return IC.replaceInstUsesWith(*II, II->getArgOperand(1));
+        return IC.replaceInstUsesWith(*II, II->getArgOperand(IsLeft ? 1 : 0));
 
-      const bool IsLeft = II->getIntrinsicID() == Intrinsic::nvvm_fshl_clamp;
       const unsigned FshIID = IsLeft ? Intrinsic::fshl : Intrinsic::fshr;
       return CallInst::Create(Intrinsic::getOrInsertDeclaration(
                                   II->getModule(), FshIID, II->getType()),
diff --git a/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll b/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll
index d88881bf3ed7f7..a1517409e1ee15 100644
--- a/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll
+++ b/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll
@@ -385,33 +385,47 @@ define float @test_sqrt_rn_f_ftz(float %a) #0 {
 }
 
 ; CHECK-LABEL: @test_fshl_clamp_1
-define i32 @test_fshl_clamp_1(i32 %a, i32 %b, i32 %c) {
+define i32 @test_fshl_clamp_1(i32 %a, i32 %b) {
 ; CHECK: call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 3)
   %call = call i32 @llvm.nvvm.fshl.clamp.i32(i32 %a, i32 %b, i32 3)
   ret i32 %call
 }
 
 ; CHECK-LABEL: @test_fshl_clamp_2
-define i32 @test_fshl_clamp_2(i32 %a, i32 %b, i32 %c) {
+define i32 @test_fshl_clamp_2(i32 %a, i32 %b) {
 ; CHECK: ret i32 %b
   %call = call i32 @llvm.nvvm.fshl.clamp.i32(i32 %a, i32 %b, i32 300)
   ret i32 %call
 }
 
+; CHECK-LABEL: @test_fshl_clamp_3
+define i32 @test_fshl_clamp_3(i32 %a, i32 %b, i32 %c) {
+; CHECK: call i32 @llvm.nvvm.fshl.clamp.i32(i32 %a, i32 %b, i32 %c)
+  %call = call i32 @llvm.nvvm.fshl.clamp.i32(i32 %a, i32 %b, i32 %c)
+  ret i32 %call
+}
+
 ; CHECK-LABEL: @test_fshr_clamp_1
-define i32 @test_fshr_clamp_1(i32 %a, i32 %b, i32 %c) {
+define i32 @test_fshr_clamp_1(i32 %a, i32 %b) {
 ; CHECK: call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 29)
   %call = call i32 @llvm.nvvm.fshr.clamp.i32(i32 %a, i32 %b, i32 3)
   ret i32 %call
 }
 
 ; CHECK-LABEL: @test_fshr_clamp_2
-define i32 @test_fshr_clamp_2(i32 %a, i32 %b, i32 %c) {
-; CHECK: ret i32 %b
+define i32 @test_fshr_clamp_2(i32 %a, i32 %b) {
+; CHECK: ret i32 %a
   %call = call i32 @llvm.nvvm.fshr.clamp.i32(i32 %a, i32 %b, i32 300)
   ret i32 %call
 }
 
+; CHECK-LABEL: @test_fshr_clamp_3
+define i32 @test_fshr_clamp_3(i32 %a, i32 %b, i32 %c) {
+; CHECK: call i32 @llvm.nvvm.fshr.clamp.i32(i32 %a, i32 %b, i32 %c)
+  %call = call i32 @llvm.nvvm.fshr.clamp.i32(i32 %a, i32 %b, i32 %c)
+  ret i32 %call
+}
+
 declare double @llvm.nvvm.add.rn.d(double, double)
 declare float @llvm.nvvm.add.rn.f(float, float)
 declare float @llvm.nvvm.add.rn.ftz.f(float, float)



More information about the llvm-commits mailing list