[llvm] 802a2e3 - [NVPTX] Add intrinsics for the szext instruction (#139126)
via llvm-commits
llvm-commits at lists.llvm.org
Fri May 9 07:28:59 PDT 2025
Author: Alex MacLean
Date: 2025-05-09T07:28:56-07:00
New Revision: 802a2e32ab7a91cd387cc4460d637fe922dc4f0a
URL: https://github.com/llvm/llvm-project/commit/802a2e32ab7a91cd387cc4460d637fe922dc4f0a
DIFF: https://github.com/llvm/llvm-project/commit/802a2e32ab7a91cd387cc4460d637fe922dc4f0a.diff
LOG: [NVPTX] Add intrinsics for the szext instruction (#139126)
This change adds support for `llvm.nvvm.{sext,zext}.{wrap,clamp}`
intrinsics.
Added:
llvm/test/CodeGen/NVPTX/szext.ll
Modified:
llvm/docs/NVPTXUsage.rst
llvm/include/llvm/IR/IntrinsicsNVVM.td
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
llvm/test/CodeGen/NVPTX/i128.ll
Removed:
################################################################################
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index c1426823d87af..b6222300e4d4a 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -568,6 +568,36 @@ to left-shift the found bit into the most-significant bit position, otherwise
the result is the shift amount needed to right-shift the found bit into the
least-significant bit position. 0xffffffff is returned if no 1 bit is found.
+'``llvm.nvvm.{zext,sext}.{wrap,clamp}``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare i32 @llvm.nvvm.zext.wrap(i32 %a, i32 %b)
+ declare i32 @llvm.nvvm.zext.clamp(i32 %a, i32 %b)
+ declare i32 @llvm.nvvm.sext.wrap(i32 %a, i32 %b)
+ declare i32 @llvm.nvvm.sext.clamp(i32 %a, i32 %b)
+
+Overview:
+"""""""""
+
+The '``llvm.nvvm.{zext,sext}.{wrap,clamp}``' family of intrinsics extracts the
+low bits of the input value, and zero- or sign-extends them back to the original
+width.
+
+Semantics:
+""""""""""
+
+The '``llvm.nvvm.{zext,sext}.{wrap,clamp}``' family of intrinsics returns the
+extension of the N lowest bits of operand %a. For the '``wrap``' variants, N is
+the value of operand %b modulo 32. For the '``clamp``' variants, N is the value
+of operand %b clamped to the range [0, 32]. The N lowest bits are then
+zero-extended in the case of the '``zext``' variants, or sign-extended in the
+case of the '``sext``' variants. If N is 0, the result is 0.
+
TMA family of Intrinsics
------------------------
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 8b87822d3fdda..2851206f2e84a 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1356,6 +1356,17 @@ let TargetPrefix = "nvvm" in {
[llvm_anyint_ty, llvm_i1_ty],
[IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
+
+//
+// szext
+//
+ foreach ext = ["sext", "zext"] in
+ foreach mode = ["wrap", "clamp"] in
+ def int_nvvm_ # ext # _ # mode :
+ DefaultAttrsIntrinsic<[llvm_i32_ty],
+ [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable]>;
+
//
// Convert
//
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 11d77599d4ac3..a384cb79d645a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -227,6 +227,7 @@ class RegTyInfo<ValueType ty, NVPTXRegClass rc, Operand imm, SDNode imm_node,
int Size = ty.Size;
}
+def I1RT : RegTyInfo<i1, Int1Regs, i1imm, imm>;
def I16RT : RegTyInfo<i16, Int16Regs, i16imm, imm>;
def I32RT : RegTyInfo<i32, Int32Regs, i32imm, imm>;
def I64RT : RegTyInfo<i64, Int64Regs, i64imm, imm>;
@@ -240,26 +241,33 @@ def F16X2RT : RegTyInfo<v2f16, Int32Regs, ?, ?, supports_imm = 0>;
def BF16X2RT : RegTyInfo<v2bf16, Int32Regs, ?, ?, supports_imm = 0>;
+multiclass I3Inst<string op_str, SDPatternOperator op_node, RegTyInfo t,
+ bit commutative, list<Predicate> requires = []> {
+ defvar asmstr = op_str # " \t$dst, $a, $b;";
+
+ def rr :
+ NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.RC:$b),
+ asmstr,
+ [(set t.Ty:$dst, (op_node t.Ty:$a, t.Ty:$b))]>,
+ Requires<requires>;
+ def ri :
+ NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.Imm:$b),
+ asmstr,
+ [(set t.Ty:$dst, (op_node t.Ty:$a, (t.Ty imm:$b)))]>,
+ Requires<requires>;
+ if !not(commutative) then
+ def ir :
+ NVPTXInst<(outs t.RC:$dst), (ins t.Imm:$a, t.RC:$b),
+ asmstr,
+ [(set t.Ty:$dst, (op_node (t.Ty imm:$a), t.Ty:$b))]>,
+ Requires<requires>;
+}
+
// Template for instructions which take three int64, int32, or int16 args.
// The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
-multiclass I3<string OpcStr, SDNode OpNode, bit commutative> {
- foreach t = [I16RT, I32RT, I64RT] in {
- defvar asmstr = OpcStr # t.Size # " \t$dst, $a, $b;";
-
- def t.Ty # rr :
- NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.RC:$b),
- asmstr,
- [(set t.Ty:$dst, (OpNode t.Ty:$a, t.Ty:$b))]>;
- def t.Ty # ri :
- NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.Imm:$b),
- asmstr,
- [(set t.Ty:$dst, (OpNode t.RC:$a, imm:$b))]>;
- if !not(commutative) then
- def t.Ty # ir :
- NVPTXInst<(outs t.RC:$dst), (ins t.Imm:$a, t.RC:$b),
- asmstr,
- [(set t.Ty:$dst, (OpNode imm:$a, t.RC:$b))]>;
- }
+multiclass I3<string op_str, SDPatternOperator op_node, bit commutative> {
+ foreach t = [I16RT, I32RT, I64RT] in
+ defm t.Ty# : I3Inst<op_str # t.Size, op_node, t, commutative>;
}
class I16x2<string OpcStr, SDNode OpNode> :
@@ -270,26 +278,11 @@ class I16x2<string OpcStr, SDNode OpNode> :
// Template for instructions which take 3 int args. The instructions are
// named "<OpcStr>.s32" (e.g. "addc.cc.s32").
-multiclass ADD_SUB_INT_CARRY<string OpcStr, SDNode OpNode> {
+multiclass ADD_SUB_INT_CARRY<string op_str, SDNode op_node, bit commutative> {
let hasSideEffects = 1 in {
- def i32rr :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
- [(set i32:$dst, (OpNode i32:$a, i32:$b))]>;
- def i32ri :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
- [(set i32:$dst, (OpNode i32:$a, imm:$b))]>;
- def i64rr :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
- !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"),
- [(set i64:$dst, (OpNode i64:$a, i64:$b))]>,
- Requires<[hasPTX<43>]>;
- def i64ri :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
- !strconcat(OpcStr, ".s64 \t$dst, $a, $b;"),
- [(set i64:$dst, (OpNode i64:$a, imm:$b))]>,
- Requires<[hasPTX<43>]>;
+ defm i32 : I3Inst<op_str # ".s32", op_node, I32RT, commutative>;
+ defm i64 : I3Inst<op_str # ".s64", op_node, I64RT, commutative,
+ requires = [hasPTX<43>]>;
}
}
@@ -841,31 +834,31 @@ defm SUB_i1 : ADD_SUB_i1<sub>;
// int16, int32, and int64 signed addition. Since nvptx is 2's complement, we
// also use these for unsigned arithmetic.
-defm ADD : I3<"add.s", add, /*commutative=*/ true>;
-defm SUB : I3<"sub.s", sub, /*commutative=*/ false>;
+defm ADD : I3<"add.s", add, commutative = true>;
+defm SUB : I3<"sub.s", sub, commutative = false>;
def ADD16x2 : I16x2<"add.s", add>;
// in32 and int64 addition and subtraction with carry-out.
-defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc>;
-defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc>;
+defm ADDCC : ADD_SUB_INT_CARRY<"add.cc", addc, commutative = true>;
+defm SUBCC : ADD_SUB_INT_CARRY<"sub.cc", subc, commutative = false>;
// int32 and int64 addition and subtraction with carry-in and carry-out.
-defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde>;
-defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube>;
+defm ADDCCC : ADD_SUB_INT_CARRY<"addc.cc", adde, commutative = true>;
+defm SUBCCC : ADD_SUB_INT_CARRY<"subc.cc", sube, commutative = false>;
-defm MULT : I3<"mul.lo.s", mul, /*commutative=*/ true>;
+defm MULT : I3<"mul.lo.s", mul, commutative = true>;
-defm MULTHS : I3<"mul.hi.s", mulhs, /*commutative=*/ true>;
-defm MULTHU : I3<"mul.hi.u", mulhu, /*commutative=*/ true>;
+defm MULTHS : I3<"mul.hi.s", mulhs, commutative = true>;
+defm MULTHU : I3<"mul.hi.u", mulhu, commutative = true>;
-defm SDIV : I3<"div.s", sdiv, /*commutative=*/ false>;
-defm UDIV : I3<"div.u", udiv, /*commutative=*/ false>;
+defm SDIV : I3<"div.s", sdiv, commutative = false>;
+defm UDIV : I3<"div.u", udiv, commutative = false>;
// The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
// will lower it.
-defm SREM : I3<"rem.s", srem, /*commutative=*/ false>;
-defm UREM : I3<"rem.u", urem, /*commutative=*/ false>;
+defm SREM : I3<"rem.s", srem, commutative = false>;
+defm UREM : I3<"rem.u", urem, commutative = false>;
// Integer absolute value. NumBits should be one minus the bit width of RC.
// This idiom implements the algorithm at
@@ -880,10 +873,10 @@ defm ABS_32 : ABS<i32, Int32Regs, ".s32">;
defm ABS_64 : ABS<i64, Int64Regs, ".s64">;
// Integer min/max.
-defm SMAX : I3<"max.s", smax, /*commutative=*/ true>;
-defm UMAX : I3<"max.u", umax, /*commutative=*/ true>;
-defm SMIN : I3<"min.s", smin, /*commutative=*/ true>;
-defm UMIN : I3<"min.u", umin, /*commutative=*/ true>;
+defm SMAX : I3<"max.s", smax, commutative = true>;
+defm UMAX : I3<"max.u", umax, commutative = true>;
+defm SMIN : I3<"min.s", smin, commutative = true>;
+defm UMIN : I3<"min.u", umin, commutative = true>;
def SMAX16x2 : I16x2<"max.s", smax>;
def UMAX16x2 : I16x2<"max.u", umax>;
@@ -1393,38 +1386,10 @@ def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
// Template for three-arg bitwise operations. Takes three args, Creates .b16,
// .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr.
multiclass BITWISE<string OpcStr, SDNode OpNode> {
- def b1rr :
- NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
- !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
- [(set i1:$dst, (OpNode i1:$a, i1:$b))]>;
- def b1ri :
- NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
- !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
- [(set i1:$dst, (OpNode i1:$a, imm:$b))]>;
- def b16rr :
- NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
- !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
- [(set i16:$dst, (OpNode i16:$a, i16:$b))]>;
- def b16ri :
- NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
- !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
- [(set i16:$dst, (OpNode i16:$a, imm:$b))]>;
- def b32rr :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
- [(set i32:$dst, (OpNode i32:$a, i32:$b))]>;
- def b32ri :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
- [(set i32:$dst, (OpNode i32:$a, imm:$b))]>;
- def b64rr :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
- !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
- [(set i64:$dst, (OpNode i64:$a, i64:$b))]>;
- def b64ri :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
- !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
- [(set i64:$dst, (OpNode i64:$a, imm:$b))]>;
+ defm b1 : I3Inst<OpcStr # ".pred", OpNode, I1RT, commutative = true>;
+ defm b16 : I3Inst<OpcStr # ".b16", OpNode, I16RT, commutative = true>;
+ defm b32 : I3Inst<OpcStr # ".b32", OpNode, I32RT, commutative = true>;
+ defm b64 : I3Inst<OpcStr # ".b64", OpNode, I64RT, commutative = true>;
}
defm OR : BITWISE<"or", or>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 3eedb43e4c81a..7b139d7b79e7d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1678,6 +1678,21 @@ foreach t = [I32RT, I64RT] in {
}
}
+//
+// szext
+//
+
+foreach sign = ["s", "u"] in {
+ foreach mode = ["wrap", "clamp"] in {
+ defvar ext = !if(!eq(sign, "s"), "sext", "zext");
+ defvar intrin = !cast<Intrinsic>("int_nvvm_" # ext # "_" # mode);
+ defm SZEXT_ # sign # _ # mode
+ : I3Inst<"szext." # mode # "." # sign # "32",
+ intrin, I32RT, commutative = false,
+ requires = [hasSM<70>, hasPTX<76>]>;
+ }
+}
+
//
// Convert
//
diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll
index 64786e601c4b5..f1ca19b30ac2a 100644
--- a/llvm/test/CodeGen/NVPTX/i128.ll
+++ b/llvm/test/CodeGen/NVPTX/i128.ll
@@ -13,37 +13,37 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: ld.param.v2.u64 {%rd45, %rd46}, [srem_i128_param_0];
; CHECK-NEXT: ld.param.v2.u64 {%rd49, %rd50}, [srem_i128_param_1];
; CHECK-NEXT: shr.s64 %rd2, %rd46, 63;
-; CHECK-NEXT: mov.b64 %rd117, 0;
-; CHECK-NEXT: sub.cc.s64 %rd52, %rd117, %rd45;
-; CHECK-NEXT: subc.cc.s64 %rd53, %rd117, %rd46;
+; CHECK-NEXT: sub.cc.s64 %rd51, 0, %rd45;
+; CHECK-NEXT: subc.cc.s64 %rd52, 0, %rd46;
; CHECK-NEXT: setp.lt.s64 %p1, %rd46, 0;
-; CHECK-NEXT: selp.b64 %rd4, %rd53, %rd46, %p1;
-; CHECK-NEXT: selp.b64 %rd3, %rd52, %rd45, %p1;
-; CHECK-NEXT: sub.cc.s64 %rd54, %rd117, %rd49;
-; CHECK-NEXT: subc.cc.s64 %rd55, %rd117, %rd50;
+; CHECK-NEXT: selp.b64 %rd4, %rd52, %rd46, %p1;
+; CHECK-NEXT: selp.b64 %rd3, %rd51, %rd45, %p1;
+; CHECK-NEXT: sub.cc.s64 %rd53, 0, %rd49;
+; CHECK-NEXT: subc.cc.s64 %rd54, 0, %rd50;
; CHECK-NEXT: setp.lt.s64 %p2, %rd50, 0;
-; CHECK-NEXT: selp.b64 %rd6, %rd55, %rd50, %p2;
-; CHECK-NEXT: selp.b64 %rd5, %rd54, %rd49, %p2;
-; CHECK-NEXT: or.b64 %rd56, %rd5, %rd6;
-; CHECK-NEXT: setp.eq.s64 %p3, %rd56, 0;
-; CHECK-NEXT: or.b64 %rd57, %rd3, %rd4;
-; CHECK-NEXT: setp.eq.s64 %p4, %rd57, 0;
+; CHECK-NEXT: selp.b64 %rd6, %rd54, %rd50, %p2;
+; CHECK-NEXT: selp.b64 %rd5, %rd53, %rd49, %p2;
+; CHECK-NEXT: or.b64 %rd55, %rd5, %rd6;
+; CHECK-NEXT: setp.eq.s64 %p3, %rd55, 0;
+; CHECK-NEXT: or.b64 %rd56, %rd3, %rd4;
+; CHECK-NEXT: setp.eq.s64 %p4, %rd56, 0;
; CHECK-NEXT: or.pred %p5, %p3, %p4;
; CHECK-NEXT: setp.ne.s64 %p6, %rd6, 0;
; CHECK-NEXT: clz.b64 %r1, %rd6;
-; CHECK-NEXT: cvt.u64.u32 %rd58, %r1;
+; CHECK-NEXT: cvt.u64.u32 %rd57, %r1;
; CHECK-NEXT: clz.b64 %r2, %rd5;
-; CHECK-NEXT: cvt.u64.u32 %rd59, %r2;
-; CHECK-NEXT: add.s64 %rd60, %rd59, 64;
-; CHECK-NEXT: selp.b64 %rd61, %rd58, %rd60, %p6;
+; CHECK-NEXT: cvt.u64.u32 %rd58, %r2;
+; CHECK-NEXT: add.s64 %rd59, %rd58, 64;
+; CHECK-NEXT: selp.b64 %rd60, %rd57, %rd59, %p6;
; CHECK-NEXT: setp.ne.s64 %p7, %rd4, 0;
; CHECK-NEXT: clz.b64 %r3, %rd4;
-; CHECK-NEXT: cvt.u64.u32 %rd62, %r3;
+; CHECK-NEXT: cvt.u64.u32 %rd61, %r3;
; CHECK-NEXT: clz.b64 %r4, %rd3;
-; CHECK-NEXT: cvt.u64.u32 %rd63, %r4;
-; CHECK-NEXT: add.s64 %rd64, %rd63, 64;
-; CHECK-NEXT: selp.b64 %rd65, %rd62, %rd64, %p7;
-; CHECK-NEXT: sub.cc.s64 %rd66, %rd61, %rd65;
+; CHECK-NEXT: cvt.u64.u32 %rd62, %r4;
+; CHECK-NEXT: add.s64 %rd63, %rd62, 64;
+; CHECK-NEXT: selp.b64 %rd64, %rd61, %rd63, %p7;
+; CHECK-NEXT: mov.b64 %rd117, 0;
+; CHECK-NEXT: sub.cc.s64 %rd66, %rd60, %rd64;
; CHECK-NEXT: subc.cc.s64 %rd67, %rd117, 0;
; CHECK-NEXT: setp.gt.u64 %p8, %rd66, 127;
; CHECK-NEXT: setp.eq.s64 %p9, %rd67, 0;
@@ -314,39 +314,39 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
; CHECK-NEXT: ld.param.v2.u64 {%rd45, %rd46}, [sdiv_i128_param_0];
; CHECK-NEXT: ld.param.v2.u64 {%rd49, %rd50}, [sdiv_i128_param_1];
-; CHECK-NEXT: mov.b64 %rd112, 0;
-; CHECK-NEXT: sub.cc.s64 %rd52, %rd112, %rd45;
-; CHECK-NEXT: subc.cc.s64 %rd53, %rd112, %rd46;
+; CHECK-NEXT: sub.cc.s64 %rd51, 0, %rd45;
+; CHECK-NEXT: subc.cc.s64 %rd52, 0, %rd46;
; CHECK-NEXT: setp.lt.s64 %p1, %rd46, 0;
-; CHECK-NEXT: selp.b64 %rd2, %rd53, %rd46, %p1;
-; CHECK-NEXT: selp.b64 %rd1, %rd52, %rd45, %p1;
-; CHECK-NEXT: sub.cc.s64 %rd54, %rd112, %rd49;
-; CHECK-NEXT: subc.cc.s64 %rd55, %rd112, %rd50;
+; CHECK-NEXT: selp.b64 %rd2, %rd52, %rd46, %p1;
+; CHECK-NEXT: selp.b64 %rd1, %rd51, %rd45, %p1;
+; CHECK-NEXT: sub.cc.s64 %rd53, 0, %rd49;
+; CHECK-NEXT: subc.cc.s64 %rd54, 0, %rd50;
; CHECK-NEXT: setp.lt.s64 %p2, %rd50, 0;
-; CHECK-NEXT: selp.b64 %rd4, %rd55, %rd50, %p2;
-; CHECK-NEXT: selp.b64 %rd3, %rd54, %rd49, %p2;
-; CHECK-NEXT: xor.b64 %rd56, %rd50, %rd46;
-; CHECK-NEXT: shr.s64 %rd5, %rd56, 63;
-; CHECK-NEXT: or.b64 %rd57, %rd3, %rd4;
-; CHECK-NEXT: setp.eq.s64 %p3, %rd57, 0;
-; CHECK-NEXT: or.b64 %rd58, %rd1, %rd2;
-; CHECK-NEXT: setp.eq.s64 %p4, %rd58, 0;
+; CHECK-NEXT: selp.b64 %rd4, %rd54, %rd50, %p2;
+; CHECK-NEXT: selp.b64 %rd3, %rd53, %rd49, %p2;
+; CHECK-NEXT: xor.b64 %rd55, %rd50, %rd46;
+; CHECK-NEXT: shr.s64 %rd5, %rd55, 63;
+; CHECK-NEXT: or.b64 %rd56, %rd3, %rd4;
+; CHECK-NEXT: setp.eq.s64 %p3, %rd56, 0;
+; CHECK-NEXT: or.b64 %rd57, %rd1, %rd2;
+; CHECK-NEXT: setp.eq.s64 %p4, %rd57, 0;
; CHECK-NEXT: or.pred %p5, %p3, %p4;
; CHECK-NEXT: setp.ne.s64 %p6, %rd4, 0;
; CHECK-NEXT: clz.b64 %r1, %rd4;
-; CHECK-NEXT: cvt.u64.u32 %rd59, %r1;
+; CHECK-NEXT: cvt.u64.u32 %rd58, %r1;
; CHECK-NEXT: clz.b64 %r2, %rd3;
-; CHECK-NEXT: cvt.u64.u32 %rd60, %r2;
-; CHECK-NEXT: add.s64 %rd61, %rd60, 64;
-; CHECK-NEXT: selp.b64 %rd62, %rd59, %rd61, %p6;
+; CHECK-NEXT: cvt.u64.u32 %rd59, %r2;
+; CHECK-NEXT: add.s64 %rd60, %rd59, 64;
+; CHECK-NEXT: selp.b64 %rd61, %rd58, %rd60, %p6;
; CHECK-NEXT: setp.ne.s64 %p7, %rd2, 0;
; CHECK-NEXT: clz.b64 %r3, %rd2;
-; CHECK-NEXT: cvt.u64.u32 %rd63, %r3;
+; CHECK-NEXT: cvt.u64.u32 %rd62, %r3;
; CHECK-NEXT: clz.b64 %r4, %rd1;
-; CHECK-NEXT: cvt.u64.u32 %rd64, %r4;
-; CHECK-NEXT: add.s64 %rd65, %rd64, 64;
-; CHECK-NEXT: selp.b64 %rd66, %rd63, %rd65, %p7;
-; CHECK-NEXT: sub.cc.s64 %rd67, %rd62, %rd66;
+; CHECK-NEXT: cvt.u64.u32 %rd63, %r4;
+; CHECK-NEXT: add.s64 %rd64, %rd63, 64;
+; CHECK-NEXT: selp.b64 %rd65, %rd62, %rd64, %p7;
+; CHECK-NEXT: mov.b64 %rd112, 0;
+; CHECK-NEXT: sub.cc.s64 %rd67, %rd61, %rd65;
; CHECK-NEXT: subc.cc.s64 %rd68, %rd112, 0;
; CHECK-NEXT: setp.gt.u64 %p8, %rd67, 127;
; CHECK-NEXT: setp.eq.s64 %p9, %rd68, 0;
diff --git a/llvm/test/CodeGen/NVPTX/szext.ll b/llvm/test/CodeGen/NVPTX/szext.ll
new file mode 100644
index 0000000000000..f159156c6b80f
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/szext.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -o - < %s -mcpu=sm_70 -mattr=+ptx76 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mcpu=sm_70 -mattr=+ptx76 | %ptxas-verify -arch=sm_70 %}
+
+target triple = "nvptx64-unknown-cuda"
+
+define i32 @szext_wrap_u32(i32 %a, i32 %b) {
+; CHECK-LABEL: szext_wrap_u32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [szext_wrap_u32_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [szext_wrap_u32_param_1];
+; CHECK-NEXT: szext.wrap.u32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %c = call i32 @llvm.nvvm.zext.wrap(i32 %a, i32 %b)
+ ret i32 %c
+}
+
+define i32 @szext_clamp_u32(i32 %a, i32 %b) {
+; CHECK-LABEL: szext_clamp_u32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [szext_clamp_u32_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [szext_clamp_u32_param_1];
+; CHECK-NEXT: szext.clamp.u32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %c = call i32 @llvm.nvvm.zext.clamp(i32 %a, i32 %b)
+ ret i32 %c
+}
+
+define i32 @szext_wrap_s32(i32 %a, i32 %b) {
+; CHECK-LABEL: szext_wrap_s32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [szext_wrap_s32_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [szext_wrap_s32_param_1];
+; CHECK-NEXT: szext.wrap.s32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %c = call i32 @llvm.nvvm.sext.wrap(i32 %a, i32 %b)
+ ret i32 %c
+}
+
+define i32 @szext_clamp_s32(i32 %a, i32 %b) {
+; CHECK-LABEL: szext_clamp_s32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [szext_clamp_s32_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [szext_clamp_s32_param_1];
+; CHECK-NEXT: szext.clamp.s32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %c = call i32 @llvm.nvvm.sext.clamp(i32 %a, i32 %b)
+ ret i32 %c
+}
+
+define i32 @szext_clamp_s32_ii() {
+; CHECK-LABEL: szext_clamp_s32_ii(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.b32 %r1, 3;
+; CHECK-NEXT: szext.clamp.s32 %r2, %r1, 4;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %c = call i32 @llvm.nvvm.sext.clamp(i32 3, i32 4)
+ ret i32 %c
+}
+
+define i32 @szext_wrap_s32_ir(i32 %a) {
+; CHECK-LABEL: szext_wrap_s32_ir(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [szext_wrap_s32_ir_param_0];
+; CHECK-NEXT: szext.wrap.s32 %r2, 5, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %c = call i32 @llvm.nvvm.sext.wrap(i32 5, i32 %a)
+ ret i32 %c
+}
+
+define i32 @szext_clamp_u32_ri(i32 %a) {
+; CHECK-LABEL: szext_clamp_u32_ri(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [szext_clamp_u32_ri_param_0];
+; CHECK-NEXT: szext.clamp.u32 %r2, %r1, 7;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %c = call i32 @llvm.nvvm.zext.clamp(i32 %a, i32 7)
+ ret i32 %c
+}
More information about the llvm-commits
mailing list