[llvm] [NVPTX] Cleanup ISel code after float register removal, use BasicNVPTXInst (PR #141711)
Alex MacLean via llvm-commits
llvm-commits at lists.llvm.org
Thu May 29 09:12:24 PDT 2025
https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/141711
>From 35fb8c8b64585b216c91d153f11766c93f8fc35e Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Wed, 21 May 2025 22:07:17 +0000
Subject: [PATCH 1/6] [NVPTX] untyped cleanup
---
llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp | 6 -
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 118 +--
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 74 +-
llvm/test/CodeGen/NVPTX/bf16-instructions.ll | 598 +++++------
.../test/CodeGen/NVPTX/bf16x2-instructions.ll | 50 +-
llvm/test/CodeGen/NVPTX/f16-instructions.ll | 6 +-
llvm/test/CodeGen/NVPTX/f16x2-instructions.ll | 108 +-
llvm/test/CodeGen/NVPTX/fexp2.ll | 120 +--
llvm/test/CodeGen/NVPTX/flog2.ll | 80 +-
llvm/test/CodeGen/NVPTX/fma-relu-contract.ll | 596 +++++------
.../CodeGen/NVPTX/fma-relu-fma-intrinsic.ll | 460 ++++-----
.../NVPTX/fma-relu-instruction-flag.ll | 956 +++++++++---------
llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 10 +-
llvm/test/CodeGen/NVPTX/inline-asm.ll | 6 +-
llvm/test/CodeGen/NVPTX/math-intrins.ll | 248 ++---
llvm/test/CodeGen/NVPTX/param-add.ll | 22 +-
16 files changed, 1679 insertions(+), 1779 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp b/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp
index 008209785a683..cd404819cb837 100644
--- a/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp
@@ -53,16 +53,10 @@ static bool traverseMoveUse(MachineInstr &U, const MachineRegisterInfo &MRI,
SmallVectorImpl<MachineInstr *> &RemoveList,
SmallVectorImpl<MachineInstr *> &LoadInsts) {
switch (U.getOpcode()) {
- case NVPTX::LD_f32:
- case NVPTX::LD_f64:
case NVPTX::LD_i16:
case NVPTX::LD_i32:
case NVPTX::LD_i64:
case NVPTX::LD_i8:
- case NVPTX::LDV_f32_v2:
- case NVPTX::LDV_f32_v4:
- case NVPTX::LDV_f64_v2:
- case NVPTX::LDV_f64_v4:
case NVPTX::LDV_i16_v2:
case NVPTX::LDV_i16_v4:
case NVPTX::LDV_i32_v2:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index b05a4713e6340..b1f653f9c3aed 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1015,33 +1015,29 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
// Helper function template to reduce amount of boilerplate code for
// opcode selection.
-static std::optional<unsigned> pickOpcodeForVT(
- MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i8,
- std::optional<unsigned> Opcode_i16, std::optional<unsigned> Opcode_i32,
- std::optional<unsigned> Opcode_i64, std::optional<unsigned> Opcode_f32,
- std::optional<unsigned> Opcode_f64) {
+static std::optional<unsigned>
+pickOpcodeForVT(MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i8,
+ std::optional<unsigned> Opcode_i16,
+ std::optional<unsigned> Opcode_i32,
+ std::optional<unsigned> Opcode_i64) {
switch (VT) {
case MVT::i1:
case MVT::i8:
return Opcode_i8;
- case MVT::i16:
- return Opcode_i16;
- case MVT::i32:
- return Opcode_i32;
- case MVT::i64:
- return Opcode_i64;
case MVT::f16:
+ case MVT::i16:
case MVT::bf16:
return Opcode_i16;
case MVT::v2f16:
case MVT::v2bf16:
case MVT::v2i16:
case MVT::v4i8:
- return Opcode_i32;
+ case MVT::i32:
case MVT::f32:
- return Opcode_f32;
+ return Opcode_i32;
+ case MVT::i64:
case MVT::f64:
- return Opcode_f64;
+ return Opcode_i64;
default:
return std::nullopt;
}
@@ -1101,9 +1097,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
Chain};
const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
- const std::optional<unsigned> Opcode =
- pickOpcodeForVT(TargetVT, NVPTX::LD_i8, NVPTX::LD_i16, NVPTX::LD_i32,
- NVPTX::LD_i64, NVPTX::LD_f32, NVPTX::LD_f64);
+ const std::optional<unsigned> Opcode = pickOpcodeForVT(
+ TargetVT, NVPTX::LD_i8, NVPTX::LD_i16, NVPTX::LD_i32, NVPTX::LD_i64);
if (!Opcode)
return false;
@@ -1203,22 +1198,19 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
default:
return false;
case NVPTXISD::LoadV2:
- Opcode =
- pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2,
- NVPTX::LDV_i16_v2, NVPTX::LDV_i32_v2, NVPTX::LDV_i64_v2,
- NVPTX::LDV_f32_v2, NVPTX::LDV_f64_v2);
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2,
+ NVPTX::LDV_i16_v2, NVPTX::LDV_i32_v2,
+ NVPTX::LDV_i64_v2);
break;
case NVPTXISD::LoadV4:
- Opcode =
- pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4,
- NVPTX::LDV_i16_v4, NVPTX::LDV_i32_v4, NVPTX::LDV_i64_v4,
- NVPTX::LDV_f32_v4, NVPTX::LDV_f64_v4);
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4,
+ NVPTX::LDV_i16_v4, NVPTX::LDV_i32_v4,
+ NVPTX::LDV_i64_v4);
break;
case NVPTXISD::LoadV8:
Opcode =
pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, {/* no v8i8 */},
- {/* no v8i16 */}, NVPTX::LDV_i32_v8, {/* no v8i64 */},
- NVPTX::LDV_f32_v8, {/* no v8f64 */});
+ {/* no v8i16 */}, NVPTX::LDV_i32_v8, {/* no v8i64 */});
break;
}
if (!Opcode)
@@ -1286,48 +1278,42 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
Opcode = pickOpcodeForVT(
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8,
NVPTX::INT_PTX_LDG_GLOBAL_i16, NVPTX::INT_PTX_LDG_GLOBAL_i32,
- NVPTX::INT_PTX_LDG_GLOBAL_i64, NVPTX::INT_PTX_LDG_GLOBAL_f32,
- NVPTX::INT_PTX_LDG_GLOBAL_f64);
+ NVPTX::INT_PTX_LDG_GLOBAL_i64);
break;
case ISD::INTRINSIC_W_CHAIN:
Opcode = pickOpcodeForVT(
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8,
NVPTX::INT_PTX_LDU_GLOBAL_i16, NVPTX::INT_PTX_LDU_GLOBAL_i32,
- NVPTX::INT_PTX_LDU_GLOBAL_i64, NVPTX::INT_PTX_LDU_GLOBAL_f32,
- NVPTX::INT_PTX_LDU_GLOBAL_f64);
+ NVPTX::INT_PTX_LDU_GLOBAL_i64);
break;
case NVPTXISD::LoadV2:
Opcode = pickOpcodeForVT(
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v2i8_ELE,
NVPTX::INT_PTX_LDG_G_v2i16_ELE, NVPTX::INT_PTX_LDG_G_v2i32_ELE,
- NVPTX::INT_PTX_LDG_G_v2i64_ELE, NVPTX::INT_PTX_LDG_G_v2f32_ELE,
- NVPTX::INT_PTX_LDG_G_v2f64_ELE);
+ NVPTX::INT_PTX_LDG_G_v2i64_ELE);
break;
case NVPTXISD::LDUV2:
Opcode = pickOpcodeForVT(
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v2i8_ELE,
NVPTX::INT_PTX_LDU_G_v2i16_ELE, NVPTX::INT_PTX_LDU_G_v2i32_ELE,
- NVPTX::INT_PTX_LDU_G_v2i64_ELE, NVPTX::INT_PTX_LDU_G_v2f32_ELE,
- NVPTX::INT_PTX_LDU_G_v2f64_ELE);
+ NVPTX::INT_PTX_LDU_G_v2i64_ELE);
break;
case NVPTXISD::LoadV4:
Opcode = pickOpcodeForVT(
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE,
NVPTX::INT_PTX_LDG_G_v4i16_ELE, NVPTX::INT_PTX_LDG_G_v4i32_ELE,
- NVPTX::INT_PTX_LDG_G_v4i64_ELE, NVPTX::INT_PTX_LDG_G_v4f32_ELE,
- NVPTX::INT_PTX_LDG_G_v4f64_ELE);
+ NVPTX::INT_PTX_LDG_G_v4i64_ELE);
break;
case NVPTXISD::LDUV4:
- Opcode = pickOpcodeForVT(
- EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE,
- NVPTX::INT_PTX_LDU_G_v4i16_ELE, NVPTX::INT_PTX_LDU_G_v4i32_ELE,
- {/* no v4i64 */}, NVPTX::INT_PTX_LDU_G_v4f32_ELE, {/* no v4f64 */});
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_G_v4i8_ELE,
+ NVPTX::INT_PTX_LDU_G_v4i16_ELE,
+ NVPTX::INT_PTX_LDU_G_v4i32_ELE, {/* no v4i64 */});
break;
case NVPTXISD::LoadV8:
Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, {/* no v8i8 */},
{/* no v8i16 */}, NVPTX::INT_PTX_LDG_G_v8i32_ELE,
- {/* no v8i64 */}, NVPTX::INT_PTX_LDG_G_v8f32_ELE,
- {/* no v8f64 */});
+ {/* no v8i64 */});
break;
}
if (!Opcode)
@@ -1421,9 +1407,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
const MVT::SimpleValueType SourceVT =
Value.getNode()->getSimpleValueType(0).SimpleTy;
- const std::optional<unsigned> Opcode =
- pickOpcodeForVT(SourceVT, NVPTX::ST_i8, NVPTX::ST_i16, NVPTX::ST_i32,
- NVPTX::ST_i64, NVPTX::ST_f32, NVPTX::ST_f64);
+ const std::optional<unsigned> Opcode = pickOpcodeForVT(
+ SourceVT, NVPTX::ST_i8, NVPTX::ST_i16, NVPTX::ST_i32, NVPTX::ST_i64);
if (!Opcode)
return false;
@@ -1486,22 +1471,19 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
default:
return false;
case NVPTXISD::StoreV2:
- Opcode =
- pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2,
- NVPTX::STV_i16_v2, NVPTX::STV_i32_v2, NVPTX::STV_i64_v2,
- NVPTX::STV_f32_v2, NVPTX::STV_f64_v2);
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2,
+ NVPTX::STV_i16_v2, NVPTX::STV_i32_v2,
+ NVPTX::STV_i64_v2);
break;
case NVPTXISD::StoreV4:
- Opcode =
- pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4,
- NVPTX::STV_i16_v4, NVPTX::STV_i32_v4, NVPTX::STV_i64_v4,
- NVPTX::STV_f32_v4, NVPTX::STV_f64_v4);
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4,
+ NVPTX::STV_i16_v4, NVPTX::STV_i32_v4,
+ NVPTX::STV_i64_v4);
break;
case NVPTXISD::StoreV8:
Opcode =
pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, {/* no v8i8 */},
- {/* no v8i16 */}, NVPTX::STV_i32_v8, {/* no v8i64 */},
- NVPTX::STV_f32_v8, {/* no v8f64 */});
+ {/* no v8i16 */}, NVPTX::STV_i32_v8, {/* no v8i64 */});
break;
}
@@ -1550,21 +1532,18 @@ bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
case 1:
Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
- NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
- NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
+ NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64);
break;
case 2:
Opcode =
pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
- NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F32,
- NVPTX::LoadParamMemV2F64);
+ NVPTX::LoadParamMemV2I64);
break;
case 4:
Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
NVPTX::LoadParamMemV4I8, NVPTX::LoadParamMemV4I16,
- NVPTX::LoadParamMemV4I32, {/* no v4i64 */},
- NVPTX::LoadParamMemV4F32, {/* no v4f64 */});
+ NVPTX::LoadParamMemV4I32, {/* no v4i64 */});
break;
}
if (!Opcode)
@@ -1628,8 +1607,7 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
case 1:
Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
- NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
- NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
+ NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64);
if (Opcode == NVPTX::StoreRetvalI8) {
// Fine tune the opcode depending on the size of the operand.
// This helps to avoid creating redundant COPY instructions in
@@ -1649,14 +1627,12 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
case 2:
Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
- NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
- NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
+ NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64);
break;
case 4:
Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
- NVPTX::StoreRetvalV4I32, {/* no v4i64 */},
- NVPTX::StoreRetvalV4F32, {/* no v4f64 */});
+ NVPTX::StoreRetvalV4I32, {/* no v4i64 */});
break;
}
if (!Opcode)
@@ -1827,14 +1803,12 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
// Use immediate version of store param
Opcode = pickOpcodeForVT(MemTy, NVPTX::StoreParamI8_i,
NVPTX::StoreParamI16_i, NVPTX::StoreParamI32_i,
- NVPTX::StoreParamI64_i, NVPTX::StoreParamF32_i,
- NVPTX::StoreParamF64_i);
+ NVPTX::StoreParamI64_i);
} else
Opcode =
pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
NVPTX::StoreParamI8_r, NVPTX::StoreParamI16_r,
- NVPTX::StoreParamI32_r, NVPTX::StoreParamI64_r,
- NVPTX::StoreParamF32_r, NVPTX::StoreParamF64_r);
+ NVPTX::StoreParamI32_r, NVPTX::StoreParamI64_r);
if (Opcode == NVPTX::StoreParamI8_r) {
// Fine tune the opcode depending on the size of the operand.
// This helps to avoid creating redundant COPY instructions in
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 98027c5aa9c22..27035064c1f03 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1833,9 +1833,7 @@ def IMOV64i : MOVi<Int64Regs, "b64", i64, i64imm, imm>;
def IMOV128r : MOVr<Int128Regs, "b128">;
def FMOV16i : MOVi<Int16Regs, "b16", f16, f16imm, fpimm>;
def BFMOV16i : MOVi<Int16Regs, "b16", bf16, bf16imm, fpimm>;
-def FMOV32r : MOVr<Float32Regs, "b32">;
def FMOV32i : MOVi<Float32Regs, "b32", f32, f32imm, fpimm>;
-def FMOV64r : MOVr<Float64Regs, "b64">;
def FMOV64i : MOVi<Float64Regs, "b64", f64, f64imm, fpimm>;
def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32i texternalsym:$dst)>;
@@ -2292,11 +2290,6 @@ def LoadParamMemV2I8 : LoadParamV2MemInst<Int16Regs, ".b8">;
def LoadParamMemV4I32 : LoadParamV4MemInst<Int32Regs, ".b32">;
def LoadParamMemV4I16 : LoadParamV4MemInst<Int16Regs, ".b16">;
def LoadParamMemV4I8 : LoadParamV4MemInst<Int16Regs, ".b8">;
-def LoadParamMemF32 : LoadParamMemInst<Float32Regs, ".b32">;
-def LoadParamMemF64 : LoadParamMemInst<Float64Regs, ".b64">;
-def LoadParamMemV2F32 : LoadParamV2MemInst<Float32Regs, ".b32">;
-def LoadParamMemV2F64 : LoadParamV2MemInst<Float64Regs, ".b64">;
-def LoadParamMemV4F32 : LoadParamV4MemInst<Float32Regs, ".b32">;
defm StoreParamI64 : StoreParamInst<Int64Regs, i64imm, ".b64">;
defm StoreParamI32 : StoreParamInst<Int32Regs, i32imm, ".b32">;
@@ -2337,12 +2330,6 @@ def StoreRetvalV4I32 : StoreRetvalV4Inst<Int32Regs, ".b32">;
def StoreRetvalV4I16 : StoreRetvalV4Inst<Int16Regs, ".b16">;
def StoreRetvalV4I8 : StoreRetvalV4Inst<Int16Regs, ".b8">;
-def StoreRetvalF64 : StoreRetvalInst<Float64Regs, ".b64">;
-def StoreRetvalF32 : StoreRetvalInst<Float32Regs, ".b32">;
-def StoreRetvalV2F64 : StoreRetvalV2Inst<Float64Regs, ".b64">;
-def StoreRetvalV2F32 : StoreRetvalV2Inst<Float32Regs, ".b32">;
-def StoreRetvalV4F32 : StoreRetvalV4Inst<Float32Regs, ".b32">;
-
def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
def CallArgEndInst0 : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
@@ -2444,8 +2431,6 @@ let mayLoad=1, hasSideEffects=0 in {
def LD_i16 : LD<Int16Regs>;
def LD_i32 : LD<Int32Regs>;
def LD_i64 : LD<Int64Regs>;
- def LD_f32 : LD<Float32Regs>;
- def LD_f64 : LD<Float64Regs>;
}
class ST<NVPTXRegClass regclass>
@@ -2461,8 +2446,6 @@ let mayStore=1, hasSideEffects=0 in {
def ST_i16 : ST<Int16Regs>;
def ST_i32 : ST<Int32Regs>;
def ST_i64 : ST<Int64Regs>;
- def ST_f32 : ST<Float32Regs>;
- def ST_f64 : ST<Float64Regs>;
}
// The following is used only in and after vector elementizations. Vector
@@ -2496,8 +2479,6 @@ let mayLoad=1, hasSideEffects=0 in {
defm LDV_i16 : LD_VEC<Int16Regs>;
defm LDV_i32 : LD_VEC<Int32Regs, support_v8 = true>;
defm LDV_i64 : LD_VEC<Int64Regs>;
- defm LDV_f32 : LD_VEC<Float32Regs, support_v8 = true>;
- defm LDV_f64 : LD_VEC<Float64Regs>;
}
multiclass ST_VEC<NVPTXRegClass regclass, bit support_v8 = false> {
@@ -2532,45 +2513,16 @@ let mayStore=1, hasSideEffects=0 in {
defm STV_i16 : ST_VEC<Int16Regs>;
defm STV_i32 : ST_VEC<Int32Regs, support_v8 = true>;
defm STV_i64 : ST_VEC<Int64Regs>;
- defm STV_f32 : ST_VEC<Float32Regs, support_v8 = true>;
- defm STV_f64 : ST_VEC<Float64Regs>;
}
//---- Conversion ----
-class F_BITCONVERT<string SzStr, ValueType TIn, ValueType TOut,
- NVPTXRegClass regclassIn = ValueToRegClass<TIn>.ret,
- NVPTXRegClass regclassOut = ValueToRegClass<TOut>.ret> :
- NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
- !strconcat("mov.b", SzStr, " \t$d, $a;"),
- [(set TOut:$d, (bitconvert TIn:$a))]>;
-
-def BITCONVERT_32_I2F : F_BITCONVERT<"32", i32, f32>;
-def BITCONVERT_32_F2I : F_BITCONVERT<"32", f32, i32>;
-def BITCONVERT_64_I2F : F_BITCONVERT<"64", i64, f64>;
-def BITCONVERT_64_F2I : F_BITCONVERT<"64", f64, i64>;
-
-foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
-def: Pat<(vt (bitconvert (f32 Float32Regs:$a))),
- (BITCONVERT_32_F2I $a)>;
-def: Pat<(f32 (bitconvert vt:$a)),
- (BITCONVERT_32_I2F $a)>;
-}
-foreach vt = [f16, bf16] in {
- def: Pat<(vt (bitconvert i16:$a)),
- (vt Int16Regs:$a)>;
- def: Pat<(i16 (bitconvert vt:$a)),
- (i16 Int16Regs:$a)>;
-}
-
-foreach ta = [v2f16, v2bf16, v2i16, v4i8, i32] in {
- foreach tb = [v2f16, v2bf16, v2i16, v4i8, i32] in {
- if !ne(ta, tb) then {
- def: Pat<(ta (bitconvert tb:$a)),
- (ta Int32Regs:$a)>;
- }
- }
-}
+foreach rc = [Int16Regs, Int32Regs, Int64Regs] in
+ foreach ta = rc.RegTypes in
+ foreach tb = rc.RegTypes in
+ if !ne(ta, tb) then
+ def : Pat<(ta (bitconvert tb:$a)),
+ (ta rc:$a)>;
// NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
// we cannot specify floating-point literals in isel patterns. Therefore, we
@@ -2699,7 +2651,7 @@ def : Pat<(i64 (fp_to_uint bf16:$a)),
(CVT_u64_bf16 $a, CvtRZI)>;
// f32 -> sint
def : Pat<(i1 (fp_to_sint f32:$a)),
- (SETP_b32ri (BITCONVERT_32_F2I $a), 0, CmpEQ)>;
+ (SETP_b32ri $a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint f32:$a)),
(CVT_s16_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i16 (fp_to_sint f32:$a)),
@@ -2715,7 +2667,7 @@ def : Pat<(i64 (fp_to_sint f32:$a)),
// f32 -> uint
def : Pat<(i1 (fp_to_uint f32:$a)),
- (SETP_b32ri (BITCONVERT_32_F2I $a), 0, CmpEQ)>;
+ (SETP_b32ri $a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint f32:$a)),
(CVT_u16_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i16 (fp_to_uint f32:$a)),
@@ -2731,7 +2683,7 @@ def : Pat<(i64 (fp_to_uint f32:$a)),
// f64 -> sint
def : Pat<(i1 (fp_to_sint f64:$a)),
- (SETP_b64ri (BITCONVERT_64_F2I $a), 0, CmpEQ)>;
+ (SETP_b64ri $a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint f64:$a)),
(CVT_s16_f64 $a, CvtRZI)>;
def : Pat<(i32 (fp_to_sint f64:$a)),
@@ -2741,7 +2693,7 @@ def : Pat<(i64 (fp_to_sint f64:$a)),
// f64 -> uint
def : Pat<(i1 (fp_to_uint f64:$a)),
- (SETP_b64ri (BITCONVERT_64_F2I $a), 0, CmpEQ)>;
+ (SETP_b64ri $a, 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint f64:$a)),
(CVT_u16_f64 $a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint f64:$a)),
@@ -2845,9 +2797,6 @@ let hasSideEffects = false in {
def V2I64toI128 : NVPTXInst<(outs Int128Regs:$d),
(ins Int64Regs:$s1, Int64Regs:$s2),
"mov.b128 \t$d, {{$s1, $s2}};", []>;
- def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
- (ins Float32Regs:$s1, Float32Regs:$s2),
- "mov.b64 \t$d, {{$s1, $s2}};", []>;
// unpack a larger int register to a set of smaller int registers
def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
@@ -2863,9 +2812,6 @@ let hasSideEffects = false in {
def I128toV2I64: NVPTXInst<(outs Int64Regs:$d1, Int64Regs:$d2),
(ins Int128Regs:$s),
"mov.b128 \t{{$d1, $d2}}, $s;", []>;
- def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
- (ins Float64Regs:$s),
- "mov.b64 \t{{$d1, $d2}}, $s;", []>;
def I32toI16H : NVPTXInst<(outs Int16Regs:$high),
(ins Int32Regs:$s),
diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
index 2854ea4b79302..32225ed04e2d9 100644
--- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -18,21 +18,21 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) {
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<17>;
+; SM70-NEXT: .reg .b32 %r<11>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %r1, [test_fadd_param_1];
-; SM70-NEXT: shl.b32 %r14, %r1, 16;
-; SM70-NEXT: ld.param.b16 %r4, [test_fadd_param_0];
-; SM70-NEXT: shl.b32 %r15, %r4, 16;
-; SM70-NEXT: add.rn.f32 %r16, %r15, %r14;
-; SM70-NEXT: bfe.u32 %r9, %r16, 16, 1;
-; SM70-NEXT: add.s32 %r10, %r9, %r16;
-; SM70-NEXT: add.s32 %r11, %r10, 32767;
-; SM70-NEXT: setp.nan.f32 %p1, %r16, %r16;
-; SM70-NEXT: or.b32 %r12, %r16, 4194304;
-; SM70-NEXT: selp.b32 %r13, %r12, %r11, %p1;
-; SM70-NEXT: mov.b32 {_, %rs1}, %r13;
+; SM70-NEXT: shl.b32 %r2, %r1, 16;
+; SM70-NEXT: ld.param.b16 %r3, [test_fadd_param_0];
+; SM70-NEXT: shl.b32 %r4, %r3, 16;
+; SM70-NEXT: add.rn.f32 %r5, %r4, %r2;
+; SM70-NEXT: bfe.u32 %r6, %r5, 16, 1;
+; SM70-NEXT: add.s32 %r7, %r6, %r5;
+; SM70-NEXT: add.s32 %r8, %r7, 32767;
+; SM70-NEXT: setp.nan.f32 %p1, %r5, %r5;
+; SM70-NEXT: or.b32 %r9, %r5, 4194304;
+; SM70-NEXT: selp.b32 %r10, %r9, %r8, %p1;
+; SM70-NEXT: mov.b32 {_, %rs1}, %r10;
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; SM70-NEXT: ret;
;
@@ -82,21 +82,21 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) {
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<17>;
+; SM70-NEXT: .reg .b32 %r<11>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %r1, [test_fsub_param_1];
-; SM70-NEXT: shl.b32 %r14, %r1, 16;
-; SM70-NEXT: ld.param.b16 %r4, [test_fsub_param_0];
-; SM70-NEXT: shl.b32 %r15, %r4, 16;
-; SM70-NEXT: sub.rn.f32 %r16, %r15, %r14;
-; SM70-NEXT: bfe.u32 %r9, %r16, 16, 1;
-; SM70-NEXT: add.s32 %r10, %r9, %r16;
-; SM70-NEXT: add.s32 %r11, %r10, 32767;
-; SM70-NEXT: setp.nan.f32 %p1, %r16, %r16;
-; SM70-NEXT: or.b32 %r12, %r16, 4194304;
-; SM70-NEXT: selp.b32 %r13, %r12, %r11, %p1;
-; SM70-NEXT: mov.b32 {_, %rs1}, %r13;
+; SM70-NEXT: shl.b32 %r2, %r1, 16;
+; SM70-NEXT: ld.param.b16 %r3, [test_fsub_param_0];
+; SM70-NEXT: shl.b32 %r4, %r3, 16;
+; SM70-NEXT: sub.rn.f32 %r5, %r4, %r2;
+; SM70-NEXT: bfe.u32 %r6, %r5, 16, 1;
+; SM70-NEXT: add.s32 %r7, %r6, %r5;
+; SM70-NEXT: add.s32 %r8, %r7, 32767;
+; SM70-NEXT: setp.nan.f32 %p1, %r5, %r5;
+; SM70-NEXT: or.b32 %r9, %r5, 4194304;
+; SM70-NEXT: selp.b32 %r10, %r9, %r8, %p1;
+; SM70-NEXT: mov.b32 {_, %rs1}, %r10;
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; SM70-NEXT: ret;
;
@@ -146,37 +146,37 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<5>;
-; SM70-NEXT: .reg .b32 %r<36>;
+; SM70-NEXT: .reg .b32 %r<24>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b32 %r1, [test_faddx2_param_0];
; SM70-NEXT: ld.param.b32 %r2, [test_faddx2_param_1];
; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2;
; SM70-NEXT: cvt.u32.u16 %r3, %rs2;
-; SM70-NEXT: shl.b32 %r30, %r3, 16;
+; SM70-NEXT: shl.b32 %r4, %r3, 16;
; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; SM70-NEXT: cvt.u32.u16 %r6, %rs4;
-; SM70-NEXT: shl.b32 %r31, %r6, 16;
-; SM70-NEXT: add.rn.f32 %r32, %r31, %r30;
-; SM70-NEXT: bfe.u32 %r11, %r32, 16, 1;
-; SM70-NEXT: add.s32 %r12, %r11, %r32;
-; SM70-NEXT: add.s32 %r13, %r12, 32767;
-; SM70-NEXT: setp.nan.f32 %p1, %r32, %r32;
-; SM70-NEXT: or.b32 %r14, %r32, 4194304;
-; SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1;
-; SM70-NEXT: cvt.u32.u16 %r16, %rs1;
-; SM70-NEXT: shl.b32 %r33, %r16, 16;
-; SM70-NEXT: cvt.u32.u16 %r19, %rs3;
-; SM70-NEXT: shl.b32 %r34, %r19, 16;
-; SM70-NEXT: add.rn.f32 %r35, %r34, %r33;
-; SM70-NEXT: bfe.u32 %r24, %r35, 16, 1;
-; SM70-NEXT: add.s32 %r25, %r24, %r35;
-; SM70-NEXT: add.s32 %r26, %r25, 32767;
-; SM70-NEXT: setp.nan.f32 %p2, %r35, %r35;
-; SM70-NEXT: or.b32 %r27, %r35, 4194304;
-; SM70-NEXT: selp.b32 %r28, %r27, %r26, %p2;
-; SM70-NEXT: prmt.b32 %r29, %r28, %r15, 0x7632U;
-; SM70-NEXT: st.param.b32 [func_retval0], %r29;
+; SM70-NEXT: cvt.u32.u16 %r5, %rs4;
+; SM70-NEXT: shl.b32 %r6, %r5, 16;
+; SM70-NEXT: add.rn.f32 %r7, %r6, %r4;
+; SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
+; SM70-NEXT: add.s32 %r9, %r8, %r7;
+; SM70-NEXT: add.s32 %r10, %r9, 32767;
+; SM70-NEXT: setp.nan.f32 %p1, %r7, %r7;
+; SM70-NEXT: or.b32 %r11, %r7, 4194304;
+; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: shl.b32 %r14, %r13, 16;
+; SM70-NEXT: cvt.u32.u16 %r15, %rs3;
+; SM70-NEXT: shl.b32 %r16, %r15, 16;
+; SM70-NEXT: add.rn.f32 %r17, %r16, %r14;
+; SM70-NEXT: bfe.u32 %r18, %r17, 16, 1;
+; SM70-NEXT: add.s32 %r19, %r18, %r17;
+; SM70-NEXT: add.s32 %r20, %r19, 32767;
+; SM70-NEXT: setp.nan.f32 %p2, %r17, %r17;
+; SM70-NEXT: or.b32 %r21, %r17, 4194304;
+; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
+; SM70-NEXT: prmt.b32 %r23, %r22, %r12, 0x7632U;
+; SM70-NEXT: st.param.b32 [func_retval0], %r23;
; SM70-NEXT: ret;
;
; SM80-LABEL: test_faddx2(
@@ -230,37 +230,37 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<5>;
-; SM70-NEXT: .reg .b32 %r<36>;
+; SM70-NEXT: .reg .b32 %r<24>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0];
; SM70-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1];
; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2;
; SM70-NEXT: cvt.u32.u16 %r3, %rs2;
-; SM70-NEXT: shl.b32 %r30, %r3, 16;
+; SM70-NEXT: shl.b32 %r4, %r3, 16;
; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; SM70-NEXT: cvt.u32.u16 %r6, %rs4;
-; SM70-NEXT: shl.b32 %r31, %r6, 16;
-; SM70-NEXT: sub.rn.f32 %r32, %r31, %r30;
-; SM70-NEXT: bfe.u32 %r11, %r32, 16, 1;
-; SM70-NEXT: add.s32 %r12, %r11, %r32;
-; SM70-NEXT: add.s32 %r13, %r12, 32767;
-; SM70-NEXT: setp.nan.f32 %p1, %r32, %r32;
-; SM70-NEXT: or.b32 %r14, %r32, 4194304;
-; SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1;
-; SM70-NEXT: cvt.u32.u16 %r16, %rs1;
-; SM70-NEXT: shl.b32 %r33, %r16, 16;
-; SM70-NEXT: cvt.u32.u16 %r19, %rs3;
-; SM70-NEXT: shl.b32 %r34, %r19, 16;
-; SM70-NEXT: sub.rn.f32 %r35, %r34, %r33;
-; SM70-NEXT: bfe.u32 %r24, %r35, 16, 1;
-; SM70-NEXT: add.s32 %r25, %r24, %r35;
-; SM70-NEXT: add.s32 %r26, %r25, 32767;
-; SM70-NEXT: setp.nan.f32 %p2, %r35, %r35;
-; SM70-NEXT: or.b32 %r27, %r35, 4194304;
-; SM70-NEXT: selp.b32 %r28, %r27, %r26, %p2;
-; SM70-NEXT: prmt.b32 %r29, %r28, %r15, 0x7632U;
-; SM70-NEXT: st.param.b32 [func_retval0], %r29;
+; SM70-NEXT: cvt.u32.u16 %r5, %rs4;
+; SM70-NEXT: shl.b32 %r6, %r5, 16;
+; SM70-NEXT: sub.rn.f32 %r7, %r6, %r4;
+; SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
+; SM70-NEXT: add.s32 %r9, %r8, %r7;
+; SM70-NEXT: add.s32 %r10, %r9, 32767;
+; SM70-NEXT: setp.nan.f32 %p1, %r7, %r7;
+; SM70-NEXT: or.b32 %r11, %r7, 4194304;
+; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: shl.b32 %r14, %r13, 16;
+; SM70-NEXT: cvt.u32.u16 %r15, %rs3;
+; SM70-NEXT: shl.b32 %r16, %r15, 16;
+; SM70-NEXT: sub.rn.f32 %r17, %r16, %r14;
+; SM70-NEXT: bfe.u32 %r18, %r17, 16, 1;
+; SM70-NEXT: add.s32 %r19, %r18, %r17;
+; SM70-NEXT: add.s32 %r20, %r19, 32767;
+; SM70-NEXT: setp.nan.f32 %p2, %r17, %r17;
+; SM70-NEXT: or.b32 %r21, %r17, 4194304;
+; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
+; SM70-NEXT: prmt.b32 %r23, %r22, %r12, 0x7632U;
+; SM70-NEXT: st.param.b32 [func_retval0], %r23;
; SM70-NEXT: ret;
;
; SM80-LABEL: test_fsubx2(
@@ -314,37 +314,37 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<5>;
-; SM70-NEXT: .reg .b32 %r<36>;
+; SM70-NEXT: .reg .b32 %r<24>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0];
; SM70-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1];
; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2;
; SM70-NEXT: cvt.u32.u16 %r3, %rs2;
-; SM70-NEXT: shl.b32 %r30, %r3, 16;
+; SM70-NEXT: shl.b32 %r4, %r3, 16;
; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; SM70-NEXT: cvt.u32.u16 %r6, %rs4;
-; SM70-NEXT: shl.b32 %r31, %r6, 16;
-; SM70-NEXT: mul.rn.f32 %r32, %r31, %r30;
-; SM70-NEXT: bfe.u32 %r11, %r32, 16, 1;
-; SM70-NEXT: add.s32 %r12, %r11, %r32;
-; SM70-NEXT: add.s32 %r13, %r12, 32767;
-; SM70-NEXT: setp.nan.f32 %p1, %r32, %r32;
-; SM70-NEXT: or.b32 %r14, %r32, 4194304;
-; SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1;
-; SM70-NEXT: cvt.u32.u16 %r16, %rs1;
-; SM70-NEXT: shl.b32 %r33, %r16, 16;
-; SM70-NEXT: cvt.u32.u16 %r19, %rs3;
-; SM70-NEXT: shl.b32 %r34, %r19, 16;
-; SM70-NEXT: mul.rn.f32 %r35, %r34, %r33;
-; SM70-NEXT: bfe.u32 %r24, %r35, 16, 1;
-; SM70-NEXT: add.s32 %r25, %r24, %r35;
-; SM70-NEXT: add.s32 %r26, %r25, 32767;
-; SM70-NEXT: setp.nan.f32 %p2, %r35, %r35;
-; SM70-NEXT: or.b32 %r27, %r35, 4194304;
-; SM70-NEXT: selp.b32 %r28, %r27, %r26, %p2;
-; SM70-NEXT: prmt.b32 %r29, %r28, %r15, 0x7632U;
-; SM70-NEXT: st.param.b32 [func_retval0], %r29;
+; SM70-NEXT: cvt.u32.u16 %r5, %rs4;
+; SM70-NEXT: shl.b32 %r6, %r5, 16;
+; SM70-NEXT: mul.rn.f32 %r7, %r6, %r4;
+; SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
+; SM70-NEXT: add.s32 %r9, %r8, %r7;
+; SM70-NEXT: add.s32 %r10, %r9, 32767;
+; SM70-NEXT: setp.nan.f32 %p1, %r7, %r7;
+; SM70-NEXT: or.b32 %r11, %r7, 4194304;
+; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: shl.b32 %r14, %r13, 16;
+; SM70-NEXT: cvt.u32.u16 %r15, %rs3;
+; SM70-NEXT: shl.b32 %r16, %r15, 16;
+; SM70-NEXT: mul.rn.f32 %r17, %r16, %r14;
+; SM70-NEXT: bfe.u32 %r18, %r17, 16, 1;
+; SM70-NEXT: add.s32 %r19, %r18, %r17;
+; SM70-NEXT: add.s32 %r20, %r19, 32767;
+; SM70-NEXT: setp.nan.f32 %p2, %r17, %r17;
+; SM70-NEXT: or.b32 %r21, %r17, 4194304;
+; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
+; SM70-NEXT: prmt.b32 %r23, %r22, %r12, 0x7632U;
+; SM70-NEXT: st.param.b32 [func_retval0], %r23;
; SM70-NEXT: ret;
;
; SM80-LABEL: test_fmulx2(
@@ -398,37 +398,37 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<5>;
-; SM70-NEXT: .reg .b32 %r<36>;
+; SM70-NEXT: .reg .b32 %r<24>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b32 %r1, [test_fdiv_param_0];
; SM70-NEXT: ld.param.b32 %r2, [test_fdiv_param_1];
; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2;
; SM70-NEXT: cvt.u32.u16 %r3, %rs2;
-; SM70-NEXT: shl.b32 %r30, %r3, 16;
+; SM70-NEXT: shl.b32 %r4, %r3, 16;
; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; SM70-NEXT: cvt.u32.u16 %r6, %rs4;
-; SM70-NEXT: shl.b32 %r31, %r6, 16;
-; SM70-NEXT: div.rn.f32 %r32, %r31, %r30;
-; SM70-NEXT: bfe.u32 %r11, %r32, 16, 1;
-; SM70-NEXT: add.s32 %r12, %r11, %r32;
-; SM70-NEXT: add.s32 %r13, %r12, 32767;
-; SM70-NEXT: setp.nan.f32 %p1, %r32, %r32;
-; SM70-NEXT: or.b32 %r14, %r32, 4194304;
-; SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1;
-; SM70-NEXT: cvt.u32.u16 %r16, %rs1;
-; SM70-NEXT: shl.b32 %r33, %r16, 16;
-; SM70-NEXT: cvt.u32.u16 %r19, %rs3;
-; SM70-NEXT: shl.b32 %r34, %r19, 16;
-; SM70-NEXT: div.rn.f32 %r35, %r34, %r33;
-; SM70-NEXT: bfe.u32 %r24, %r35, 16, 1;
-; SM70-NEXT: add.s32 %r25, %r24, %r35;
-; SM70-NEXT: add.s32 %r26, %r25, 32767;
-; SM70-NEXT: setp.nan.f32 %p2, %r35, %r35;
-; SM70-NEXT: or.b32 %r27, %r35, 4194304;
-; SM70-NEXT: selp.b32 %r28, %r27, %r26, %p2;
-; SM70-NEXT: prmt.b32 %r29, %r28, %r15, 0x7632U;
-; SM70-NEXT: st.param.b32 [func_retval0], %r29;
+; SM70-NEXT: cvt.u32.u16 %r5, %rs4;
+; SM70-NEXT: shl.b32 %r6, %r5, 16;
+; SM70-NEXT: div.rn.f32 %r7, %r6, %r4;
+; SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
+; SM70-NEXT: add.s32 %r9, %r8, %r7;
+; SM70-NEXT: add.s32 %r10, %r9, 32767;
+; SM70-NEXT: setp.nan.f32 %p1, %r7, %r7;
+; SM70-NEXT: or.b32 %r11, %r7, 4194304;
+; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: shl.b32 %r14, %r13, 16;
+; SM70-NEXT: cvt.u32.u16 %r15, %rs3;
+; SM70-NEXT: shl.b32 %r16, %r15, 16;
+; SM70-NEXT: div.rn.f32 %r17, %r16, %r14;
+; SM70-NEXT: bfe.u32 %r18, %r17, 16, 1;
+; SM70-NEXT: add.s32 %r19, %r18, %r17;
+; SM70-NEXT: add.s32 %r20, %r19, 32767;
+; SM70-NEXT: setp.nan.f32 %p2, %r17, %r17;
+; SM70-NEXT: or.b32 %r21, %r17, 4194304;
+; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
+; SM70-NEXT: prmt.b32 %r23, %r22, %r12, 0x7632U;
+; SM70-NEXT: st.param.b32 [func_retval0], %r23;
; SM70-NEXT: ret;
;
; SM80-LABEL: test_fdiv(
@@ -523,12 +523,12 @@ define bfloat @test_extract_1(<2 x bfloat> %a) #0 {
define float @test_fpext_float(bfloat %a) #0 {
; SM70-LABEL: test_fpext_float(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<5>;
+; SM70-NEXT: .reg .b32 %r<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %r1, [test_fpext_float_param_0];
-; SM70-NEXT: shl.b32 %r4, %r1, 16;
-; SM70-NEXT: st.param.b32 [func_retval0], %r4;
+; SM70-NEXT: shl.b32 %r2, %r1, 16;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
; SM70-NEXT: ret;
;
; SM80-LABEL: test_fpext_float(
@@ -572,17 +572,17 @@ define bfloat @test_fptrunc_float(float %a) #0 {
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<9>;
+; SM70-NEXT: .reg .b32 %r<7>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.b32 %r8, [test_fptrunc_float_param_0];
-; SM70-NEXT: bfe.u32 %r3, %r8, 16, 1;
-; SM70-NEXT: add.s32 %r4, %r3, %r8;
-; SM70-NEXT: add.s32 %r5, %r4, 32767;
-; SM70-NEXT: setp.nan.f32 %p1, %r8, %r8;
-; SM70-NEXT: or.b32 %r6, %r8, 4194304;
-; SM70-NEXT: selp.b32 %r7, %r6, %r5, %p1;
-; SM70-NEXT: mov.b32 {_, %rs1}, %r7;
+; SM70-NEXT: ld.param.b32 %r1, [test_fptrunc_float_param_0];
+; SM70-NEXT: bfe.u32 %r2, %r1, 16, 1;
+; SM70-NEXT: add.s32 %r3, %r2, %r1;
+; SM70-NEXT: add.s32 %r4, %r3, 32767;
+; SM70-NEXT: setp.nan.f32 %p1, %r1, %r1;
+; SM70-NEXT: or.b32 %r5, %r1, 4194304;
+; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1;
+; SM70-NEXT: mov.b32 {_, %rs1}, %r6;
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; SM70-NEXT: ret;
;
@@ -627,19 +627,19 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 {
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<13>;
+; SM70-NEXT: .reg .b32 %r<9>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %r1, [test_fadd_imm_1_param_0];
-; SM70-NEXT: shl.b32 %r11, %r1, 16;
-; SM70-NEXT: add.rn.f32 %r12, %r11, 0f3F800000;
-; SM70-NEXT: bfe.u32 %r6, %r12, 16, 1;
-; SM70-NEXT: add.s32 %r7, %r6, %r12;
-; SM70-NEXT: add.s32 %r8, %r7, 32767;
-; SM70-NEXT: setp.nan.f32 %p1, %r12, %r12;
-; SM70-NEXT: or.b32 %r9, %r12, 4194304;
-; SM70-NEXT: selp.b32 %r10, %r9, %r8, %p1;
-; SM70-NEXT: mov.b32 {_, %rs1}, %r10;
+; SM70-NEXT: shl.b32 %r2, %r1, 16;
+; SM70-NEXT: add.rn.f32 %r3, %r2, 0f3F800000;
+; SM70-NEXT: bfe.u32 %r4, %r3, 16, 1;
+; SM70-NEXT: add.s32 %r5, %r4, %r3;
+; SM70-NEXT: add.s32 %r6, %r5, 32767;
+; SM70-NEXT: setp.nan.f32 %p1, %r3, %r3;
+; SM70-NEXT: or.b32 %r7, %r3, 4194304;
+; SM70-NEXT: selp.b32 %r8, %r7, %r6, %p1;
+; SM70-NEXT: mov.b32 {_, %rs1}, %r8;
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; SM70-NEXT: ret;
;
@@ -706,7 +706,7 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
; SM70-LABEL: test_extload_bf16x8(
; SM70: {
; SM70-NEXT: .reg .b16 %rs<9>;
-; SM70-NEXT: .reg .b32 %r<37>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
@@ -717,23 +717,23 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r3;
; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r4;
; SM70-NEXT: cvt.u32.u16 %r5, %rs8;
-; SM70-NEXT: shl.b32 %r29, %r5, 16;
-; SM70-NEXT: cvt.u32.u16 %r8, %rs7;
-; SM70-NEXT: shl.b32 %r30, %r8, 16;
-; SM70-NEXT: cvt.u32.u16 %r11, %rs6;
-; SM70-NEXT: shl.b32 %r31, %r11, 16;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs5;
-; SM70-NEXT: shl.b32 %r32, %r14, 16;
-; SM70-NEXT: cvt.u32.u16 %r17, %rs4;
-; SM70-NEXT: shl.b32 %r33, %r17, 16;
-; SM70-NEXT: cvt.u32.u16 %r20, %rs3;
-; SM70-NEXT: shl.b32 %r34, %r20, 16;
-; SM70-NEXT: cvt.u32.u16 %r23, %rs2;
-; SM70-NEXT: shl.b32 %r35, %r23, 16;
-; SM70-NEXT: cvt.u32.u16 %r26, %rs1;
-; SM70-NEXT: shl.b32 %r36, %r26, 16;
-; SM70-NEXT: st.param.v4.b32 [func_retval0], {%r36, %r35, %r34, %r33};
-; SM70-NEXT: st.param.v4.b32 [func_retval0+16], {%r32, %r31, %r30, %r29};
+; SM70-NEXT: shl.b32 %r6, %r5, 16;
+; SM70-NEXT: cvt.u32.u16 %r7, %rs7;
+; SM70-NEXT: shl.b32 %r8, %r7, 16;
+; SM70-NEXT: cvt.u32.u16 %r9, %rs6;
+; SM70-NEXT: shl.b32 %r10, %r9, 16;
+; SM70-NEXT: cvt.u32.u16 %r11, %rs5;
+; SM70-NEXT: shl.b32 %r12, %r11, 16;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs4;
+; SM70-NEXT: shl.b32 %r14, %r13, 16;
+; SM70-NEXT: cvt.u32.u16 %r15, %rs3;
+; SM70-NEXT: shl.b32 %r16, %r15, 16;
+; SM70-NEXT: cvt.u32.u16 %r17, %rs2;
+; SM70-NEXT: shl.b32 %r18, %r17, 16;
+; SM70-NEXT: cvt.u32.u16 %r19, %rs1;
+; SM70-NEXT: shl.b32 %r20, %r19, 16;
+; SM70-NEXT: st.param.v4.b32 [func_retval0], {%r20, %r18, %r16, %r14};
+; SM70-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r10, %r8, %r6};
; SM70-NEXT: ret;
;
; SM80-LABEL: test_extload_bf16x8(
@@ -819,14 +819,14 @@ define i16 @test_fptosi_i16(bfloat %a) {
; SM70-LABEL: test_fptosi_i16(
; SM70: {
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<6>;
+; SM70-NEXT: .reg .b32 %r<4>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %r1, [test_fptosi_i16_param_0];
-; SM70-NEXT: shl.b32 %r5, %r1, 16;
-; SM70-NEXT: cvt.rzi.s16.f32 %rs1, %r5;
-; SM70-NEXT: cvt.u32.u16 %r4, %rs1;
-; SM70-NEXT: st.param.b32 [func_retval0], %r4;
+; SM70-NEXT: shl.b32 %r2, %r1, 16;
+; SM70-NEXT: cvt.rzi.s16.f32 %rs1, %r2;
+; SM70-NEXT: cvt.u32.u16 %r3, %rs1;
+; SM70-NEXT: st.param.b32 [func_retval0], %r3;
; SM70-NEXT: ret;
;
; SM80-LABEL: test_fptosi_i16(
@@ -874,14 +874,14 @@ define i16 @test_fptoui_i16(bfloat %a) {
; SM70-LABEL: test_fptoui_i16(
; SM70: {
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<6>;
+; SM70-NEXT: .reg .b32 %r<4>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %r1, [test_fptoui_i16_param_0];
-; SM70-NEXT: shl.b32 %r5, %r1, 16;
-; SM70-NEXT: cvt.rzi.u16.f32 %rs1, %r5;
-; SM70-NEXT: cvt.u32.u16 %r4, %rs1;
-; SM70-NEXT: st.param.b32 [func_retval0], %r4;
+; SM70-NEXT: shl.b32 %r2, %r1, 16;
+; SM70-NEXT: cvt.rzi.u16.f32 %rs1, %r2;
+; SM70-NEXT: cvt.u32.u16 %r3, %rs1;
+; SM70-NEXT: st.param.b32 [func_retval0], %r3;
; SM70-NEXT: ret;
;
; SM80-LABEL: test_fptoui_i16(
@@ -930,18 +930,18 @@ define bfloat @test_sitofp_i16(i16 %a) {
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<3>;
-; SM70-NEXT: .reg .b32 %r<9>;
+; SM70-NEXT: .reg .b32 %r<7>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %rs1, [test_sitofp_i16_param_0];
-; SM70-NEXT: cvt.rn.f32.s16 %r8, %rs1;
-; SM70-NEXT: bfe.u32 %r3, %r8, 16, 1;
-; SM70-NEXT: add.s32 %r4, %r3, %r8;
-; SM70-NEXT: add.s32 %r5, %r4, 32767;
-; SM70-NEXT: setp.nan.f32 %p1, %r8, %r8;
-; SM70-NEXT: or.b32 %r6, %r8, 4194304;
-; SM70-NEXT: selp.b32 %r7, %r6, %r5, %p1;
-; SM70-NEXT: mov.b32 {_, %rs2}, %r7;
+; SM70-NEXT: cvt.rn.f32.s16 %r1, %rs1;
+; SM70-NEXT: bfe.u32 %r2, %r1, 16, 1;
+; SM70-NEXT: add.s32 %r3, %r2, %r1;
+; SM70-NEXT: add.s32 %r4, %r3, 32767;
+; SM70-NEXT: setp.nan.f32 %p1, %r1, %r1;
+; SM70-NEXT: or.b32 %r5, %r1, 4194304;
+; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1;
+; SM70-NEXT: mov.b32 {_, %rs2}, %r6;
; SM70-NEXT: st.param.b16 [func_retval0], %rs2;
; SM70-NEXT: ret;
;
@@ -987,18 +987,18 @@ define bfloat @test_uitofp_i8(i8 %a) {
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<3>;
-; SM70-NEXT: .reg .b32 %r<9>;
+; SM70-NEXT: .reg .b32 %r<7>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b8 %rs1, [test_uitofp_i8_param_0];
-; SM70-NEXT: cvt.rn.f32.u16 %r8, %rs1;
-; SM70-NEXT: bfe.u32 %r3, %r8, 16, 1;
-; SM70-NEXT: add.s32 %r4, %r3, %r8;
-; SM70-NEXT: add.s32 %r5, %r4, 32767;
-; SM70-NEXT: setp.nan.f32 %p1, %r8, %r8;
-; SM70-NEXT: or.b32 %r6, %r8, 4194304;
-; SM70-NEXT: selp.b32 %r7, %r6, %r5, %p1;
-; SM70-NEXT: mov.b32 {_, %rs2}, %r7;
+; SM70-NEXT: cvt.rn.f32.u16 %r1, %rs1;
+; SM70-NEXT: bfe.u32 %r2, %r1, 16, 1;
+; SM70-NEXT: add.s32 %r3, %r2, %r1;
+; SM70-NEXT: add.s32 %r4, %r3, 32767;
+; SM70-NEXT: setp.nan.f32 %p1, %r1, %r1;
+; SM70-NEXT: or.b32 %r5, %r1, 4194304;
+; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1;
+; SM70-NEXT: mov.b32 {_, %rs2}, %r6;
; SM70-NEXT: st.param.b16 [func_retval0], %rs2;
; SM70-NEXT: ret;
;
@@ -1044,21 +1044,21 @@ define bfloat @test_uitofp_i1(i1 %a) {
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<4>;
-; SM70-NEXT: .reg .b32 %r<10>;
+; SM70-NEXT: .reg .b32 %r<8>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b8 %rs1, [test_uitofp_i1_param_0];
; SM70-NEXT: and.b16 %rs2, %rs1, 1;
; SM70-NEXT: setp.ne.b16 %p1, %rs2, 0;
; SM70-NEXT: selp.b32 %r1, 1, 0, %p1;
-; SM70-NEXT: cvt.rn.f32.u32 %r9, %r1;
-; SM70-NEXT: bfe.u32 %r4, %r9, 16, 1;
-; SM70-NEXT: add.s32 %r5, %r4, %r9;
-; SM70-NEXT: add.s32 %r6, %r5, 32767;
-; SM70-NEXT: setp.nan.f32 %p2, %r9, %r9;
-; SM70-NEXT: or.b32 %r7, %r9, 4194304;
-; SM70-NEXT: selp.b32 %r8, %r7, %r6, %p2;
-; SM70-NEXT: mov.b32 {_, %rs3}, %r8;
+; SM70-NEXT: cvt.rn.f32.u32 %r2, %r1;
+; SM70-NEXT: bfe.u32 %r3, %r2, 16, 1;
+; SM70-NEXT: add.s32 %r4, %r3, %r2;
+; SM70-NEXT: add.s32 %r5, %r4, 32767;
+; SM70-NEXT: setp.nan.f32 %p2, %r2, %r2;
+; SM70-NEXT: or.b32 %r6, %r2, 4194304;
+; SM70-NEXT: selp.b32 %r7, %r6, %r5, %p2;
+; SM70-NEXT: mov.b32 {_, %rs3}, %r7;
; SM70-NEXT: st.param.b16 [func_retval0], %rs3;
; SM70-NEXT: ret;
;
@@ -1117,18 +1117,18 @@ define bfloat @test_uitofp_i16(i16 %a) {
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<3>;
-; SM70-NEXT: .reg .b32 %r<9>;
+; SM70-NEXT: .reg .b32 %r<7>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %rs1, [test_uitofp_i16_param_0];
-; SM70-NEXT: cvt.rn.f32.u16 %r8, %rs1;
-; SM70-NEXT: bfe.u32 %r3, %r8, 16, 1;
-; SM70-NEXT: add.s32 %r4, %r3, %r8;
-; SM70-NEXT: add.s32 %r5, %r4, 32767;
-; SM70-NEXT: setp.nan.f32 %p1, %r8, %r8;
-; SM70-NEXT: or.b32 %r6, %r8, 4194304;
-; SM70-NEXT: selp.b32 %r7, %r6, %r5, %p1;
-; SM70-NEXT: mov.b32 {_, %rs2}, %r7;
+; SM70-NEXT: cvt.rn.f32.u16 %r1, %rs1;
+; SM70-NEXT: bfe.u32 %r2, %r1, 16, 1;
+; SM70-NEXT: add.s32 %r3, %r2, %r1;
+; SM70-NEXT: add.s32 %r4, %r3, 32767;
+; SM70-NEXT: setp.nan.f32 %p1, %r1, %r1;
+; SM70-NEXT: or.b32 %r5, %r1, 4194304;
+; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1;
+; SM70-NEXT: mov.b32 {_, %rs2}, %r6;
; SM70-NEXT: st.param.b16 [func_retval0], %rs2;
; SM70-NEXT: ret;
;
@@ -1174,18 +1174,18 @@ define bfloat @test_uitofp_i32(i32 %a) {
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<10>;
+; SM70-NEXT: .reg .b32 %r<8>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b32 %r1, [test_uitofp_i32_param_0];
-; SM70-NEXT: cvt.rn.f32.u32 %r9, %r1;
-; SM70-NEXT: bfe.u32 %r4, %r9, 16, 1;
-; SM70-NEXT: add.s32 %r5, %r4, %r9;
-; SM70-NEXT: add.s32 %r6, %r5, 32767;
-; SM70-NEXT: setp.nan.f32 %p1, %r9, %r9;
-; SM70-NEXT: or.b32 %r7, %r9, 4194304;
-; SM70-NEXT: selp.b32 %r8, %r7, %r6, %p1;
-; SM70-NEXT: mov.b32 {_, %rs1}, %r8;
+; SM70-NEXT: cvt.rn.f32.u32 %r2, %r1;
+; SM70-NEXT: bfe.u32 %r3, %r2, 16, 1;
+; SM70-NEXT: add.s32 %r4, %r3, %r2;
+; SM70-NEXT: add.s32 %r5, %r4, 32767;
+; SM70-NEXT: setp.nan.f32 %p1, %r2, %r2;
+; SM70-NEXT: or.b32 %r6, %r2, 4194304;
+; SM70-NEXT: selp.b32 %r7, %r6, %r5, %p1;
+; SM70-NEXT: mov.b32 {_, %rs1}, %r7;
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; SM70-NEXT: ret;
;
@@ -1232,19 +1232,19 @@ define bfloat @test_uitofp_i64(i64 %a) {
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<9>;
+; SM70-NEXT: .reg .b32 %r<7>;
; SM70-NEXT: .reg .b64 %rd<2>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b64 %rd1, [test_uitofp_i64_param_0];
-; SM70-NEXT: cvt.rn.f32.u64 %r8, %rd1;
-; SM70-NEXT: bfe.u32 %r3, %r8, 16, 1;
-; SM70-NEXT: add.s32 %r4, %r3, %r8;
-; SM70-NEXT: add.s32 %r5, %r4, 32767;
-; SM70-NEXT: setp.nan.f32 %p1, %r8, %r8;
-; SM70-NEXT: or.b32 %r6, %r8, 4194304;
-; SM70-NEXT: selp.b32 %r7, %r6, %r5, %p1;
-; SM70-NEXT: mov.b32 {_, %rs1}, %r7;
+; SM70-NEXT: cvt.rn.f32.u64 %r1, %rd1;
+; SM70-NEXT: bfe.u32 %r2, %r1, 16, 1;
+; SM70-NEXT: add.s32 %r3, %r2, %r1;
+; SM70-NEXT: add.s32 %r4, %r3, 32767;
+; SM70-NEXT: setp.nan.f32 %p1, %r1, %r1;
+; SM70-NEXT: or.b32 %r5, %r1, 4194304;
+; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1;
+; SM70-NEXT: mov.b32 {_, %rs1}, %r6;
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; SM70-NEXT: ret;
;
@@ -1293,19 +1293,19 @@ define bfloat @test_roundeven(bfloat %a) {
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<13>;
+; SM70-NEXT: .reg .b32 %r<9>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %r1, [test_roundeven_param_0];
-; SM70-NEXT: shl.b32 %r11, %r1, 16;
-; SM70-NEXT: cvt.rni.f32.f32 %r12, %r11;
-; SM70-NEXT: bfe.u32 %r6, %r12, 16, 1;
-; SM70-NEXT: add.s32 %r7, %r6, %r12;
-; SM70-NEXT: add.s32 %r8, %r7, 32767;
-; SM70-NEXT: setp.nan.f32 %p1, %r12, %r12;
-; SM70-NEXT: or.b32 %r9, %r12, 4194304;
-; SM70-NEXT: selp.b32 %r10, %r9, %r8, %p1;
-; SM70-NEXT: mov.b32 {_, %rs1}, %r10;
+; SM70-NEXT: shl.b32 %r2, %r1, 16;
+; SM70-NEXT: cvt.rni.f32.f32 %r3, %r2;
+; SM70-NEXT: bfe.u32 %r4, %r3, 16, 1;
+; SM70-NEXT: add.s32 %r5, %r4, %r3;
+; SM70-NEXT: add.s32 %r6, %r5, 32767;
+; SM70-NEXT: setp.nan.f32 %p1, %r3, %r3;
+; SM70-NEXT: or.b32 %r7, %r3, 4194304;
+; SM70-NEXT: selp.b32 %r8, %r7, %r6, %p1;
+; SM70-NEXT: mov.b32 {_, %rs1}, %r8;
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; SM70-NEXT: ret;
;
@@ -1353,26 +1353,26 @@ define bfloat @test_maximum(bfloat %a, bfloat %b) {
; SM70: {
; SM70-NEXT: .reg .pred %p<6>;
; SM70-NEXT: .reg .b16 %rs<8>;
-; SM70-NEXT: .reg .b32 %r<13>;
+; SM70-NEXT: .reg .b32 %r<7>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %rs1, [test_maximum_param_0];
; SM70-NEXT: ld.param.b16 %rs2, [test_maximum_param_1];
; SM70-NEXT: cvt.u32.u16 %r1, %rs2;
-; SM70-NEXT: shl.b32 %r10, %r1, 16;
-; SM70-NEXT: cvt.u32.u16 %r4, %rs1;
-; SM70-NEXT: shl.b32 %r11, %r4, 16;
-; SM70-NEXT: setp.gt.f32 %p1, %r11, %r10;
+; SM70-NEXT: shl.b32 %r2, %r1, 16;
+; SM70-NEXT: cvt.u32.u16 %r3, %rs1;
+; SM70-NEXT: shl.b32 %r4, %r3, 16;
+; SM70-NEXT: setp.gt.f32 %p1, %r4, %r2;
; SM70-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1;
-; SM70-NEXT: setp.nan.f32 %p2, %r11, %r10;
+; SM70-NEXT: setp.nan.f32 %p2, %r4, %r2;
; SM70-NEXT: selp.b16 %rs4, 0x7FC0, %rs3, %p2;
; SM70-NEXT: setp.eq.s16 %p3, %rs1, 0;
; SM70-NEXT: selp.b16 %rs5, %rs1, %rs4, %p3;
; SM70-NEXT: setp.eq.s16 %p4, %rs2, 0;
; SM70-NEXT: selp.b16 %rs6, %rs2, %rs5, %p4;
-; SM70-NEXT: cvt.u32.u16 %r7, %rs4;
-; SM70-NEXT: shl.b32 %r12, %r7, 16;
-; SM70-NEXT: setp.eq.f32 %p5, %r12, 0f00000000;
+; SM70-NEXT: cvt.u32.u16 %r5, %rs4;
+; SM70-NEXT: shl.b32 %r6, %r5, 16;
+; SM70-NEXT: setp.eq.f32 %p5, %r6, 0f00000000;
; SM70-NEXT: selp.b16 %rs7, %rs6, %rs4, %p5;
; SM70-NEXT: st.param.b16 [func_retval0], %rs7;
; SM70-NEXT: ret;
@@ -1418,21 +1418,21 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) {
; SM70: {
; SM70-NEXT: .reg .pred %p<2>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<17>;
+; SM70-NEXT: .reg .b32 %r<11>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b16 %r1, [test_maxnum_param_1];
-; SM70-NEXT: shl.b32 %r14, %r1, 16;
-; SM70-NEXT: ld.param.b16 %r4, [test_maxnum_param_0];
-; SM70-NEXT: shl.b32 %r15, %r4, 16;
-; SM70-NEXT: max.f32 %r16, %r15, %r14;
-; SM70-NEXT: bfe.u32 %r9, %r16, 16, 1;
-; SM70-NEXT: add.s32 %r10, %r9, %r16;
-; SM70-NEXT: add.s32 %r11, %r10, 32767;
-; SM70-NEXT: setp.nan.f32 %p1, %r16, %r16;
-; SM70-NEXT: or.b32 %r12, %r16, 4194304;
-; SM70-NEXT: selp.b32 %r13, %r12, %r11, %p1;
-; SM70-NEXT: mov.b32 {_, %rs1}, %r13;
+; SM70-NEXT: shl.b32 %r2, %r1, 16;
+; SM70-NEXT: ld.param.b16 %r3, [test_maxnum_param_0];
+; SM70-NEXT: shl.b32 %r4, %r3, 16;
+; SM70-NEXT: max.f32 %r5, %r4, %r2;
+; SM70-NEXT: bfe.u32 %r6, %r5, 16, 1;
+; SM70-NEXT: add.s32 %r7, %r6, %r5;
+; SM70-NEXT: add.s32 %r8, %r7, 32767;
+; SM70-NEXT: setp.nan.f32 %p1, %r5, %r5;
+; SM70-NEXT: or.b32 %r9, %r5, 4194304;
+; SM70-NEXT: selp.b32 %r10, %r9, %r8, %p1;
+; SM70-NEXT: mov.b32 {_, %rs1}, %r10;
; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; SM70-NEXT: ret;
;
@@ -1477,47 +1477,47 @@ define <2 x bfloat> @test_maximum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
; SM70: {
; SM70-NEXT: .reg .pred %p<11>;
; SM70-NEXT: .reg .b16 %rs<15>;
-; SM70-NEXT: .reg .b32 %r<28>;
+; SM70-NEXT: .reg .b32 %r<16>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b32 %r1, [test_maximum_v2_param_0];
; SM70-NEXT: ld.param.b32 %r2, [test_maximum_v2_param_1];
; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2;
; SM70-NEXT: cvt.u32.u16 %r3, %rs2;
-; SM70-NEXT: shl.b32 %r22, %r3, 16;
+; SM70-NEXT: shl.b32 %r4, %r3, 16;
; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; SM70-NEXT: cvt.u32.u16 %r6, %rs4;
-; SM70-NEXT: shl.b32 %r23, %r6, 16;
-; SM70-NEXT: setp.gt.f32 %p1, %r23, %r22;
+; SM70-NEXT: cvt.u32.u16 %r5, %rs4;
+; SM70-NEXT: shl.b32 %r6, %r5, 16;
+; SM70-NEXT: setp.gt.f32 %p1, %r6, %r4;
; SM70-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
-; SM70-NEXT: setp.nan.f32 %p2, %r23, %r22;
+; SM70-NEXT: setp.nan.f32 %p2, %r6, %r4;
; SM70-NEXT: selp.b16 %rs6, 0x7FC0, %rs5, %p2;
; SM70-NEXT: setp.eq.s16 %p3, %rs4, 0;
; SM70-NEXT: selp.b16 %rs7, %rs4, %rs6, %p3;
; SM70-NEXT: setp.eq.s16 %p4, %rs2, 0;
; SM70-NEXT: selp.b16 %rs8, %rs2, %rs7, %p4;
-; SM70-NEXT: cvt.u32.u16 %r9, %rs6;
-; SM70-NEXT: shl.b32 %r24, %r9, 16;
-; SM70-NEXT: setp.eq.f32 %p5, %r24, 0f00000000;
+; SM70-NEXT: cvt.u32.u16 %r7, %rs6;
+; SM70-NEXT: shl.b32 %r8, %r7, 16;
+; SM70-NEXT: setp.eq.f32 %p5, %r8, 0f00000000;
; SM70-NEXT: selp.b16 %rs9, %rs8, %rs6, %p5;
-; SM70-NEXT: cvt.u32.u16 %r12, %rs1;
-; SM70-NEXT: shl.b32 %r25, %r12, 16;
-; SM70-NEXT: cvt.u32.u16 %r15, %rs3;
-; SM70-NEXT: shl.b32 %r26, %r15, 16;
-; SM70-NEXT: setp.gt.f32 %p6, %r26, %r25;
+; SM70-NEXT: cvt.u32.u16 %r9, %rs1;
+; SM70-NEXT: shl.b32 %r10, %r9, 16;
+; SM70-NEXT: cvt.u32.u16 %r11, %rs3;
+; SM70-NEXT: shl.b32 %r12, %r11, 16;
+; SM70-NEXT: setp.gt.f32 %p6, %r12, %r10;
; SM70-NEXT: selp.b16 %rs10, %rs3, %rs1, %p6;
-; SM70-NEXT: setp.nan.f32 %p7, %r26, %r25;
+; SM70-NEXT: setp.nan.f32 %p7, %r12, %r10;
; SM70-NEXT: selp.b16 %rs11, 0x7FC0, %rs10, %p7;
; SM70-NEXT: setp.eq.s16 %p8, %rs3, 0;
; SM70-NEXT: selp.b16 %rs12, %rs3, %rs11, %p8;
; SM70-NEXT: setp.eq.s16 %p9, %rs1, 0;
; SM70-NEXT: selp.b16 %rs13, %rs1, %rs12, %p9;
-; SM70-NEXT: cvt.u32.u16 %r18, %rs11;
-; SM70-NEXT: shl.b32 %r27, %r18, 16;
-; SM70-NEXT: setp.eq.f32 %p10, %r27, 0f00000000;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs11;
+; SM70-NEXT: shl.b32 %r14, %r13, 16;
+; SM70-NEXT: setp.eq.f32 %p10, %r14, 0f00000000;
; SM70-NEXT: selp.b16 %rs14, %rs13, %rs11, %p10;
-; SM70-NEXT: mov.b32 %r21, {%rs14, %rs9};
-; SM70-NEXT: st.param.b32 [func_retval0], %r21;
+; SM70-NEXT: mov.b32 %r15, {%rs14, %rs9};
+; SM70-NEXT: st.param.b32 [func_retval0], %r15;
; SM70-NEXT: ret;
;
; SM80-LABEL: test_maximum_v2(
@@ -1561,37 +1561,37 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<5>;
-; SM70-NEXT: .reg .b32 %r<36>;
+; SM70-NEXT: .reg .b32 %r<24>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.b32 %r1, [test_maxnum_v2_param_0];
; SM70-NEXT: ld.param.b32 %r2, [test_maxnum_v2_param_1];
; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2;
; SM70-NEXT: cvt.u32.u16 %r3, %rs2;
-; SM70-NEXT: shl.b32 %r30, %r3, 16;
+; SM70-NEXT: shl.b32 %r4, %r3, 16;
; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; SM70-NEXT: cvt.u32.u16 %r6, %rs4;
-; SM70-NEXT: shl.b32 %r31, %r6, 16;
-; SM70-NEXT: max.f32 %r32, %r31, %r30;
-; SM70-NEXT: bfe.u32 %r11, %r32, 16, 1;
-; SM70-NEXT: add.s32 %r12, %r11, %r32;
-; SM70-NEXT: add.s32 %r13, %r12, 32767;
-; SM70-NEXT: setp.nan.f32 %p1, %r32, %r32;
-; SM70-NEXT: or.b32 %r14, %r32, 4194304;
-; SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1;
-; SM70-NEXT: cvt.u32.u16 %r16, %rs1;
-; SM70-NEXT: shl.b32 %r33, %r16, 16;
-; SM70-NEXT: cvt.u32.u16 %r19, %rs3;
-; SM70-NEXT: shl.b32 %r34, %r19, 16;
-; SM70-NEXT: max.f32 %r35, %r34, %r33;
-; SM70-NEXT: bfe.u32 %r24, %r35, 16, 1;
-; SM70-NEXT: add.s32 %r25, %r24, %r35;
-; SM70-NEXT: add.s32 %r26, %r25, 32767;
-; SM70-NEXT: setp.nan.f32 %p2, %r35, %r35;
-; SM70-NEXT: or.b32 %r27, %r35, 4194304;
-; SM70-NEXT: selp.b32 %r28, %r27, %r26, %p2;
-; SM70-NEXT: prmt.b32 %r29, %r28, %r15, 0x7632U;
-; SM70-NEXT: st.param.b32 [func_retval0], %r29;
+; SM70-NEXT: cvt.u32.u16 %r5, %rs4;
+; SM70-NEXT: shl.b32 %r6, %r5, 16;
+; SM70-NEXT: max.f32 %r7, %r6, %r4;
+; SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
+; SM70-NEXT: add.s32 %r9, %r8, %r7;
+; SM70-NEXT: add.s32 %r10, %r9, 32767;
+; SM70-NEXT: setp.nan.f32 %p1, %r7, %r7;
+; SM70-NEXT: or.b32 %r11, %r7, 4194304;
+; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: shl.b32 %r14, %r13, 16;
+; SM70-NEXT: cvt.u32.u16 %r15, %rs3;
+; SM70-NEXT: shl.b32 %r16, %r15, 16;
+; SM70-NEXT: max.f32 %r17, %r16, %r14;
+; SM70-NEXT: bfe.u32 %r18, %r17, 16, 1;
+; SM70-NEXT: add.s32 %r19, %r18, %r17;
+; SM70-NEXT: add.s32 %r20, %r19, 32767;
+; SM70-NEXT: setp.nan.f32 %p2, %r17, %r17;
+; SM70-NEXT: or.b32 %r21, %r17, 4194304;
+; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
+; SM70-NEXT: prmt.b32 %r23, %r22, %r12, 0x7632U;
+; SM70-NEXT: st.param.b32 [func_retval0], %r23;
; SM70-NEXT: ret;
;
; SM80-LABEL: test_maxnum_v2(
diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
index 3c91fbc9cde56..ec993aa15a85a 100644
--- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
@@ -711,35 +711,35 @@ define <2 x bfloat> @test_round(<2 x bfloat> %a) #0 {
; CHECK: {
; CHECK-NEXT: .reg .pred %p<5>;
; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<29>;
+; CHECK-NEXT: .reg .b32 %r<21>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_round_param_0];
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NEXT: cvt.f32.bf16 %r25, %rs1;
-; CHECK-NEXT: and.b32 %r4, %r25, -2147483648;
-; CHECK-NEXT: or.b32 %r26, %r4, 1056964608;
-; CHECK-NEXT: add.rn.f32 %r7, %r25, %r26;
-; CHECK-NEXT: cvt.rzi.f32.f32 %r8, %r7;
-; CHECK-NEXT: abs.f32 %r9, %r25;
-; CHECK-NEXT: setp.gt.f32 %p1, %r9, 0f4B000000;
-; CHECK-NEXT: selp.f32 %r10, %r25, %r8, %p1;
-; CHECK-NEXT: cvt.rzi.f32.f32 %r11, %r25;
-; CHECK-NEXT: setp.lt.f32 %p2, %r9, 0f3F000000;
-; CHECK-NEXT: selp.f32 %r12, %r11, %r10, %p2;
-; CHECK-NEXT: cvt.f32.bf16 %r27, %rs2;
-; CHECK-NEXT: and.b32 %r15, %r27, -2147483648;
-; CHECK-NEXT: or.b32 %r28, %r15, 1056964608;
-; CHECK-NEXT: add.rn.f32 %r18, %r27, %r28;
-; CHECK-NEXT: cvt.rzi.f32.f32 %r19, %r18;
-; CHECK-NEXT: abs.f32 %r20, %r27;
-; CHECK-NEXT: setp.gt.f32 %p3, %r20, 0f4B000000;
-; CHECK-NEXT: selp.f32 %r21, %r27, %r19, %p3;
-; CHECK-NEXT: cvt.rzi.f32.f32 %r22, %r27;
-; CHECK-NEXT: setp.lt.f32 %p4, %r20, 0f3F000000;
-; CHECK-NEXT: selp.f32 %r23, %r22, %r21, %p4;
-; CHECK-NEXT: cvt.rn.bf16x2.f32 %r24, %r23, %r12;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r24;
+; CHECK-NEXT: cvt.f32.bf16 %r2, %rs1;
+; CHECK-NEXT: and.b32 %r3, %r2, -2147483648;
+; CHECK-NEXT: or.b32 %r4, %r3, 1056964608;
+; CHECK-NEXT: add.rn.f32 %r5, %r2, %r4;
+; CHECK-NEXT: cvt.rzi.f32.f32 %r6, %r5;
+; CHECK-NEXT: abs.f32 %r7, %r2;
+; CHECK-NEXT: setp.gt.f32 %p1, %r7, 0f4B000000;
+; CHECK-NEXT: selp.f32 %r8, %r2, %r6, %p1;
+; CHECK-NEXT: cvt.rzi.f32.f32 %r9, %r2;
+; CHECK-NEXT: setp.lt.f32 %p2, %r7, 0f3F000000;
+; CHECK-NEXT: selp.f32 %r10, %r9, %r8, %p2;
+; CHECK-NEXT: cvt.f32.bf16 %r11, %rs2;
+; CHECK-NEXT: and.b32 %r12, %r11, -2147483648;
+; CHECK-NEXT: or.b32 %r13, %r12, 1056964608;
+; CHECK-NEXT: add.rn.f32 %r14, %r11, %r13;
+; CHECK-NEXT: cvt.rzi.f32.f32 %r15, %r14;
+; CHECK-NEXT: abs.f32 %r16, %r11;
+; CHECK-NEXT: setp.gt.f32 %p3, %r16, 0f4B000000;
+; CHECK-NEXT: selp.f32 %r17, %r11, %r15, %p3;
+; CHECK-NEXT: cvt.rzi.f32.f32 %r18, %r11;
+; CHECK-NEXT: setp.lt.f32 %p4, %r16, 0f3F000000;
+; CHECK-NEXT: selp.f32 %r19, %r18, %r17, %p4;
+; CHECK-NEXT: cvt.rn.bf16x2.f32 %r20, %r19, %r10;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r20;
; CHECK-NEXT: ret;
%r = call <2 x bfloat> @llvm.round.f16(<2 x bfloat> %a)
ret <2 x bfloat> %r
diff --git a/llvm/test/CodeGen/NVPTX/f16-instructions.ll b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
index 40f6557bbe1a2..c905fc04ce780 100644
--- a/llvm/test/CodeGen/NVPTX/f16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
@@ -1043,8 +1043,7 @@ define half @test_copysign(half %a, half %b) #0 {
; CHECK-LABEL: test_copysign_f32(
; CHECK-DAG: ld.param.b16 [[AH:%rs[0-9]+]], [test_copysign_f32_param_0];
-; CHECK-DAG: ld.param.b32 [[BF:%r[0-9]+]], [test_copysign_f32_param_1];
-; CHECK-DAG: mov.b32 [[B:%r[0-9]+]], [[BF]];
+; CHECK-DAG: ld.param.b32 [[B:%r[0-9]+]], [test_copysign_f32_param_1];
; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[AH]], 32767;
; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[B]], -2147483648;
; CHECK-DAG: mov.b32 {tmp, [[BX2:%rs[0-9]+]]}, [[BX0]];
@@ -1059,8 +1058,7 @@ define half @test_copysign_f32(half %a, float %b) #0 {
; CHECK-LABEL: test_copysign_f64(
; CHECK-DAG: ld.param.b16 [[AH:%rs[0-9]+]], [test_copysign_f64_param_0];
-; CHECK-DAG: ld.param.b64 [[BD:%rd[0-9]+]], [test_copysign_f64_param_1];
-; CHECK-DAG: mov.b64 [[B:%rd[0-9]+]], [[BD]];
+; CHECK-DAG: ld.param.b64 [[B:%rd[0-9]+]], [test_copysign_f64_param_1];
; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[AH]], 32767;
; CHECK-DAG: and.b64 [[BX0:%rd[0-9]+]], [[B]], -9223372036854775808;
; CHECK-DAG: shr.u64 [[BX1:%rd[0-9]+]], [[BX0]], 48;
diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
index 636ca801e97b7..8c89f82dbf9c1 100644
--- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -1612,12 +1612,11 @@ define <2 x half> @test_bitcast_2xi16_to_2xhalf(<2 x i16> %a) #0 {
define <2 x half> @test_bitcast_float_to_2xhalf(float %a) #0 {
; CHECK-LABEL: test_bitcast_float_to_2xhalf(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_bitcast_float_to_2xhalf_param_0];
-; CHECK-NEXT: mov.b32 %r2, %r1;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
%r = bitcast float %a to <2 x half>
ret <2 x half> %r
@@ -1626,12 +1625,11 @@ define <2 x half> @test_bitcast_float_to_2xhalf(float %a) #0 {
define float @test_bitcast_2xhalf_to_float(<2 x half> %a) #0 {
; CHECK-LABEL: test_bitcast_2xhalf_to_float(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_bitcast_2xhalf_to_float_param_0];
-; CHECK-NEXT: mov.b32 %r2, %r1;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
%r = bitcast <2 x half> %a to float
ret float %r
@@ -1961,24 +1959,22 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 {
; CHECK-NOF16-LABEL: test_copysign_f32(
; CHECK-NOF16: {
; CHECK-NOF16-NEXT: .reg .b16 %rs<9>;
-; CHECK-NOF16-NEXT: .reg .b32 %r<9>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<7>;
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1];
; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0];
-; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, 32767;
-; CHECK-NOF16-NEXT: mov.b32 %r4, %r3;
-; CHECK-NOF16-NEXT: and.b32 %r5, %r4, -2147483648;
-; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r5; }
-; CHECK-NOF16-NEXT: or.b16 %rs5, %rs3, %rs4;
-; CHECK-NOF16-NEXT: and.b16 %rs6, %rs1, 32767;
-; CHECK-NOF16-NEXT: mov.b32 %r6, %r2;
-; CHECK-NOF16-NEXT: and.b32 %r7, %r6, -2147483648;
-; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r7; }
-; CHECK-NOF16-NEXT: or.b16 %rs8, %rs6, %rs7;
-; CHECK-NOF16-NEXT: mov.b32 %r8, {%rs8, %rs5};
-; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NOF16-NEXT: and.b32 %r4, %r3, -2147483648;
+; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r4; }
+; CHECK-NOF16-NEXT: mov.b32 {%rs2, %rs3}, %r1;
+; CHECK-NOF16-NEXT: and.b16 %rs4, %rs3, 32767;
+; CHECK-NOF16-NEXT: or.b16 %rs5, %rs4, %rs1;
+; CHECK-NOF16-NEXT: and.b32 %r5, %r2, -2147483648;
+; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r5; }
+; CHECK-NOF16-NEXT: and.b16 %rs7, %rs2, 32767;
+; CHECK-NOF16-NEXT: or.b16 %rs8, %rs7, %rs6;
+; CHECK-NOF16-NEXT: mov.b32 %r6, {%rs8, %rs5};
+; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r6;
; CHECK-NOF16-NEXT: ret;
%tb = fptrunc <2 x float> %b to <2 x half>
%r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb)
@@ -2008,23 +2004,21 @@ define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 {
; CHECK-NOF16: {
; CHECK-NOF16-NEXT: .reg .b16 %rs<9>;
; CHECK-NOF16-NEXT: .reg .b32 %r<3>;
-; CHECK-NOF16-NEXT: .reg .b64 %rd<9>;
+; CHECK-NOF16-NEXT: .reg .b64 %rd<7>;
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_copysign_f64_param_1];
; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f64_param_0];
; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1;
; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, 32767;
-; CHECK-NOF16-NEXT: mov.b64 %rd3, %rd2;
-; CHECK-NOF16-NEXT: and.b64 %rd4, %rd3, -9223372036854775808;
-; CHECK-NOF16-NEXT: shr.u64 %rd5, %rd4, 48;
-; CHECK-NOF16-NEXT: cvt.u16.u64 %rs4, %rd5;
+; CHECK-NOF16-NEXT: and.b64 %rd3, %rd2, -9223372036854775808;
+; CHECK-NOF16-NEXT: shr.u64 %rd4, %rd3, 48;
+; CHECK-NOF16-NEXT: cvt.u16.u64 %rs4, %rd4;
; CHECK-NOF16-NEXT: or.b16 %rs5, %rs3, %rs4;
; CHECK-NOF16-NEXT: and.b16 %rs6, %rs1, 32767;
-; CHECK-NOF16-NEXT: mov.b64 %rd6, %rd1;
-; CHECK-NOF16-NEXT: and.b64 %rd7, %rd6, -9223372036854775808;
-; CHECK-NOF16-NEXT: shr.u64 %rd8, %rd7, 48;
-; CHECK-NOF16-NEXT: cvt.u16.u64 %rs7, %rd8;
+; CHECK-NOF16-NEXT: and.b64 %rd5, %rd1, -9223372036854775808;
+; CHECK-NOF16-NEXT: shr.u64 %rd6, %rd5, 48;
+; CHECK-NOF16-NEXT: cvt.u16.u64 %rs7, %rd6;
; CHECK-NOF16-NEXT: or.b16 %rs8, %rs6, %rs7;
; CHECK-NOF16-NEXT: mov.b32 %r2, {%rs8, %rs5};
; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r2;
@@ -2191,41 +2185,37 @@ define <2 x half> @test_round(<2 x half> %a) #0 {
; CHECK: {
; CHECK-NEXT: .reg .pred %p<5>;
; CHECK-NEXT: .reg .b16 %rs<5>;
-; CHECK-NEXT: .reg .b32 %r<25>;
+; CHECK-NEXT: .reg .b32 %r<21>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_round_param_0];
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
; CHECK-NEXT: cvt.f32.f16 %r2, %rs2;
-; CHECK-NEXT: mov.b32 %r3, %r2;
-; CHECK-NEXT: and.b32 %r4, %r3, -2147483648;
-; CHECK-NEXT: or.b32 %r5, %r4, 1056964608;
-; CHECK-NEXT: mov.b32 %r6, %r5;
-; CHECK-NEXT: add.rn.f32 %r7, %r2, %r6;
-; CHECK-NEXT: cvt.rzi.f32.f32 %r8, %r7;
-; CHECK-NEXT: abs.f32 %r9, %r2;
-; CHECK-NEXT: setp.gt.f32 %p1, %r9, 0f4B000000;
-; CHECK-NEXT: selp.f32 %r10, %r2, %r8, %p1;
-; CHECK-NEXT: cvt.rzi.f32.f32 %r11, %r2;
-; CHECK-NEXT: setp.lt.f32 %p2, %r9, 0f3F000000;
-; CHECK-NEXT: selp.f32 %r12, %r11, %r10, %p2;
-; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r12;
-; CHECK-NEXT: cvt.f32.f16 %r13, %rs1;
-; CHECK-NEXT: mov.b32 %r14, %r13;
-; CHECK-NEXT: and.b32 %r15, %r14, -2147483648;
-; CHECK-NEXT: or.b32 %r16, %r15, 1056964608;
-; CHECK-NEXT: mov.b32 %r17, %r16;
-; CHECK-NEXT: add.rn.f32 %r18, %r13, %r17;
-; CHECK-NEXT: cvt.rzi.f32.f32 %r19, %r18;
-; CHECK-NEXT: abs.f32 %r20, %r13;
-; CHECK-NEXT: setp.gt.f32 %p3, %r20, 0f4B000000;
-; CHECK-NEXT: selp.f32 %r21, %r13, %r19, %p3;
-; CHECK-NEXT: cvt.rzi.f32.f32 %r22, %r13;
-; CHECK-NEXT: setp.lt.f32 %p4, %r20, 0f3F000000;
-; CHECK-NEXT: selp.f32 %r23, %r22, %r21, %p4;
-; CHECK-NEXT: cvt.rn.f16.f32 %rs4, %r23;
-; CHECK-NEXT: mov.b32 %r24, {%rs4, %rs3};
-; CHECK-NEXT: st.param.b32 [func_retval0], %r24;
+; CHECK-NEXT: and.b32 %r3, %r2, -2147483648;
+; CHECK-NEXT: or.b32 %r4, %r3, 1056964608;
+; CHECK-NEXT: add.rn.f32 %r5, %r2, %r4;
+; CHECK-NEXT: cvt.rzi.f32.f32 %r6, %r5;
+; CHECK-NEXT: abs.f32 %r7, %r2;
+; CHECK-NEXT: setp.gt.f32 %p1, %r7, 0f4B000000;
+; CHECK-NEXT: selp.f32 %r8, %r2, %r6, %p1;
+; CHECK-NEXT: cvt.rzi.f32.f32 %r9, %r2;
+; CHECK-NEXT: setp.lt.f32 %p2, %r7, 0f3F000000;
+; CHECK-NEXT: selp.f32 %r10, %r9, %r8, %p2;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r10;
+; CHECK-NEXT: cvt.f32.f16 %r11, %rs1;
+; CHECK-NEXT: and.b32 %r12, %r11, -2147483648;
+; CHECK-NEXT: or.b32 %r13, %r12, 1056964608;
+; CHECK-NEXT: add.rn.f32 %r14, %r11, %r13;
+; CHECK-NEXT: cvt.rzi.f32.f32 %r15, %r14;
+; CHECK-NEXT: abs.f32 %r16, %r11;
+; CHECK-NEXT: setp.gt.f32 %p3, %r16, 0f4B000000;
+; CHECK-NEXT: selp.f32 %r17, %r11, %r15, %p3;
+; CHECK-NEXT: cvt.rzi.f32.f32 %r18, %r11;
+; CHECK-NEXT: setp.lt.f32 %p4, %r16, 0f3F000000;
+; CHECK-NEXT: selp.f32 %r19, %r18, %r17, %p4;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs4, %r19;
+; CHECK-NEXT: mov.b32 %r20, {%rs4, %rs3};
+; CHECK-NEXT: st.param.b32 [func_retval0], %r20;
; CHECK-NEXT: ret;
%r = call <2 x half> @llvm.round.f16(<2 x half> %a)
ret <2 x half> %r
diff --git a/llvm/test/CodeGen/NVPTX/fexp2.ll b/llvm/test/CodeGen/NVPTX/fexp2.ll
index c3212954668e2..ef2a788bb8267 100644
--- a/llvm/test/CodeGen/NVPTX/fexp2.ll
+++ b/llvm/test/CodeGen/NVPTX/fexp2.ll
@@ -254,19 +254,19 @@ define bfloat @exp2_bf16_test(bfloat %in) {
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .b16 %rs<2>;
-; CHECK-NEXT: .reg .b32 %r<13>;
+; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: ld.param.b16 %r1, [exp2_bf16_test_param_0];
-; CHECK-NEXT: shl.b32 %r11, %r1, 16;
-; CHECK-NEXT: ex2.approx.f32 %r12, %r11;
-; CHECK-NEXT: bfe.u32 %r6, %r12, 16, 1;
-; CHECK-NEXT: add.s32 %r7, %r6, %r12;
-; CHECK-NEXT: add.s32 %r8, %r7, 32767;
-; CHECK-NEXT: setp.nan.f32 %p1, %r12, %r12;
-; CHECK-NEXT: or.b32 %r9, %r12, 4194304;
-; CHECK-NEXT: selp.b32 %r10, %r9, %r8, %p1;
-; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r10; }
+; CHECK-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-NEXT: ex2.approx.f32 %r3, %r2;
+; CHECK-NEXT: bfe.u32 %r4, %r3, 16, 1;
+; CHECK-NEXT: add.s32 %r5, %r4, %r3;
+; CHECK-NEXT: add.s32 %r6, %r5, 32767;
+; CHECK-NEXT: setp.nan.f32 %p1, %r3, %r3;
+; CHECK-NEXT: or.b32 %r7, %r3, 4194304;
+; CHECK-NEXT: selp.b32 %r8, %r7, %r6, %p1;
+; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
; CHECK-NEXT: ret;
;
@@ -274,19 +274,19 @@ define bfloat @exp2_bf16_test(bfloat %in) {
; CHECK-FP16: {
; CHECK-FP16-NEXT: .reg .pred %p<2>;
; CHECK-FP16-NEXT: .reg .b16 %rs<2>;
-; CHECK-FP16-NEXT: .reg .b32 %r<13>;
+; CHECK-FP16-NEXT: .reg .b32 %r<9>;
; CHECK-FP16-EMPTY:
; CHECK-FP16-NEXT: // %bb.0: // %entry
; CHECK-FP16-NEXT: ld.param.b16 %r1, [exp2_bf16_test_param_0];
-; CHECK-FP16-NEXT: shl.b32 %r11, %r1, 16;
-; CHECK-FP16-NEXT: ex2.approx.f32 %r12, %r11;
-; CHECK-FP16-NEXT: bfe.u32 %r6, %r12, 16, 1;
-; CHECK-FP16-NEXT: add.s32 %r7, %r6, %r12;
-; CHECK-FP16-NEXT: add.s32 %r8, %r7, 32767;
-; CHECK-FP16-NEXT: setp.nan.f32 %p1, %r12, %r12;
-; CHECK-FP16-NEXT: or.b32 %r9, %r12, 4194304;
-; CHECK-FP16-NEXT: selp.b32 %r10, %r9, %r8, %p1;
-; CHECK-FP16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r10; }
+; CHECK-FP16-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-FP16-NEXT: ex2.approx.f32 %r3, %r2;
+; CHECK-FP16-NEXT: bfe.u32 %r4, %r3, 16, 1;
+; CHECK-FP16-NEXT: add.s32 %r5, %r4, %r3;
+; CHECK-FP16-NEXT: add.s32 %r6, %r5, 32767;
+; CHECK-FP16-NEXT: setp.nan.f32 %p1, %r3, %r3;
+; CHECK-FP16-NEXT: or.b32 %r7, %r3, 4194304;
+; CHECK-FP16-NEXT: selp.b32 %r8, %r7, %r6, %p1;
+; CHECK-FP16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
; CHECK-FP16-NEXT: st.param.b16 [func_retval0], %rs1;
; CHECK-FP16-NEXT: ret;
;
@@ -310,62 +310,62 @@ define <2 x bfloat> @exp2_bf16_test_v(<2 x bfloat> %in) {
; CHECK: {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<27>;
+; CHECK-NEXT: .reg .b32 %r<19>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: ld.param.b32 %r1, [exp2_bf16_test_v_param_0];
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
; CHECK-NEXT: cvt.u32.u16 %r2, %rs2;
-; CHECK-NEXT: shl.b32 %r23, %r2, 16;
-; CHECK-NEXT: ex2.approx.f32 %r24, %r23;
-; CHECK-NEXT: bfe.u32 %r7, %r24, 16, 1;
-; CHECK-NEXT: add.s32 %r8, %r7, %r24;
-; CHECK-NEXT: add.s32 %r9, %r8, 32767;
-; CHECK-NEXT: setp.nan.f32 %p1, %r24, %r24;
-; CHECK-NEXT: or.b32 %r10, %r24, 4194304;
-; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p1;
-; CHECK-NEXT: cvt.u32.u16 %r12, %rs1;
-; CHECK-NEXT: shl.b32 %r25, %r12, 16;
-; CHECK-NEXT: ex2.approx.f32 %r26, %r25;
-; CHECK-NEXT: bfe.u32 %r17, %r26, 16, 1;
-; CHECK-NEXT: add.s32 %r18, %r17, %r26;
-; CHECK-NEXT: add.s32 %r19, %r18, 32767;
-; CHECK-NEXT: setp.nan.f32 %p2, %r26, %r26;
-; CHECK-NEXT: or.b32 %r20, %r26, 4194304;
-; CHECK-NEXT: selp.b32 %r21, %r20, %r19, %p2;
-; CHECK-NEXT: prmt.b32 %r22, %r21, %r11, 0x7632U;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r22;
+; CHECK-NEXT: shl.b32 %r3, %r2, 16;
+; CHECK-NEXT: ex2.approx.f32 %r4, %r3;
+; CHECK-NEXT: bfe.u32 %r5, %r4, 16, 1;
+; CHECK-NEXT: add.s32 %r6, %r5, %r4;
+; CHECK-NEXT: add.s32 %r7, %r6, 32767;
+; CHECK-NEXT: setp.nan.f32 %p1, %r4, %r4;
+; CHECK-NEXT: or.b32 %r8, %r4, 4194304;
+; CHECK-NEXT: selp.b32 %r9, %r8, %r7, %p1;
+; CHECK-NEXT: cvt.u32.u16 %r10, %rs1;
+; CHECK-NEXT: shl.b32 %r11, %r10, 16;
+; CHECK-NEXT: ex2.approx.f32 %r12, %r11;
+; CHECK-NEXT: bfe.u32 %r13, %r12, 16, 1;
+; CHECK-NEXT: add.s32 %r14, %r13, %r12;
+; CHECK-NEXT: add.s32 %r15, %r14, 32767;
+; CHECK-NEXT: setp.nan.f32 %p2, %r12, %r12;
+; CHECK-NEXT: or.b32 %r16, %r12, 4194304;
+; CHECK-NEXT: selp.b32 %r17, %r16, %r15, %p2;
+; CHECK-NEXT: prmt.b32 %r18, %r17, %r9, 0x7632U;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r18;
; CHECK-NEXT: ret;
;
; CHECK-FP16-LABEL: exp2_bf16_test_v(
; CHECK-FP16: {
; CHECK-FP16-NEXT: .reg .pred %p<3>;
; CHECK-FP16-NEXT: .reg .b16 %rs<3>;
-; CHECK-FP16-NEXT: .reg .b32 %r<27>;
+; CHECK-FP16-NEXT: .reg .b32 %r<19>;
; CHECK-FP16-EMPTY:
; CHECK-FP16-NEXT: // %bb.0: // %entry
; CHECK-FP16-NEXT: ld.param.b32 %r1, [exp2_bf16_test_v_param_0];
; CHECK-FP16-NEXT: mov.b32 {%rs1, %rs2}, %r1;
; CHECK-FP16-NEXT: cvt.u32.u16 %r2, %rs2;
-; CHECK-FP16-NEXT: shl.b32 %r23, %r2, 16;
-; CHECK-FP16-NEXT: ex2.approx.f32 %r24, %r23;
-; CHECK-FP16-NEXT: bfe.u32 %r7, %r24, 16, 1;
-; CHECK-FP16-NEXT: add.s32 %r8, %r7, %r24;
-; CHECK-FP16-NEXT: add.s32 %r9, %r8, 32767;
-; CHECK-FP16-NEXT: setp.nan.f32 %p1, %r24, %r24;
-; CHECK-FP16-NEXT: or.b32 %r10, %r24, 4194304;
-; CHECK-FP16-NEXT: selp.b32 %r11, %r10, %r9, %p1;
-; CHECK-FP16-NEXT: cvt.u32.u16 %r12, %rs1;
-; CHECK-FP16-NEXT: shl.b32 %r25, %r12, 16;
-; CHECK-FP16-NEXT: ex2.approx.f32 %r26, %r25;
-; CHECK-FP16-NEXT: bfe.u32 %r17, %r26, 16, 1;
-; CHECK-FP16-NEXT: add.s32 %r18, %r17, %r26;
-; CHECK-FP16-NEXT: add.s32 %r19, %r18, 32767;
-; CHECK-FP16-NEXT: setp.nan.f32 %p2, %r26, %r26;
-; CHECK-FP16-NEXT: or.b32 %r20, %r26, 4194304;
-; CHECK-FP16-NEXT: selp.b32 %r21, %r20, %r19, %p2;
-; CHECK-FP16-NEXT: prmt.b32 %r22, %r21, %r11, 0x7632U;
-; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %r22;
+; CHECK-FP16-NEXT: shl.b32 %r3, %r2, 16;
+; CHECK-FP16-NEXT: ex2.approx.f32 %r4, %r3;
+; CHECK-FP16-NEXT: bfe.u32 %r5, %r4, 16, 1;
+; CHECK-FP16-NEXT: add.s32 %r6, %r5, %r4;
+; CHECK-FP16-NEXT: add.s32 %r7, %r6, 32767;
+; CHECK-FP16-NEXT: setp.nan.f32 %p1, %r4, %r4;
+; CHECK-FP16-NEXT: or.b32 %r8, %r4, 4194304;
+; CHECK-FP16-NEXT: selp.b32 %r9, %r8, %r7, %p1;
+; CHECK-FP16-NEXT: cvt.u32.u16 %r10, %rs1;
+; CHECK-FP16-NEXT: shl.b32 %r11, %r10, 16;
+; CHECK-FP16-NEXT: ex2.approx.f32 %r12, %r11;
+; CHECK-FP16-NEXT: bfe.u32 %r13, %r12, 16, 1;
+; CHECK-FP16-NEXT: add.s32 %r14, %r13, %r12;
+; CHECK-FP16-NEXT: add.s32 %r15, %r14, 32767;
+; CHECK-FP16-NEXT: setp.nan.f32 %p2, %r12, %r12;
+; CHECK-FP16-NEXT: or.b32 %r16, %r12, 4194304;
+; CHECK-FP16-NEXT: selp.b32 %r17, %r16, %r15, %p2;
+; CHECK-FP16-NEXT: prmt.b32 %r18, %r17, %r9, 0x7632U;
+; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %r18;
; CHECK-FP16-NEXT: ret;
;
; CHECK-BF16-LABEL: exp2_bf16_test_v(
diff --git a/llvm/test/CodeGen/NVPTX/flog2.ll b/llvm/test/CodeGen/NVPTX/flog2.ll
index c672af2893da1..7a5b1bb0ddef6 100644
--- a/llvm/test/CodeGen/NVPTX/flog2.ll
+++ b/llvm/test/CodeGen/NVPTX/flog2.ll
@@ -124,19 +124,19 @@ define bfloat @log2_bf16_test(bfloat %in) {
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .b16 %rs<2>;
-; CHECK-NEXT: .reg .b32 %r<13>;
+; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: ld.param.b16 %r1, [log2_bf16_test_param_0];
-; CHECK-NEXT: shl.b32 %r11, %r1, 16;
-; CHECK-NEXT: lg2.approx.f32 %r12, %r11;
-; CHECK-NEXT: bfe.u32 %r6, %r12, 16, 1;
-; CHECK-NEXT: add.s32 %r7, %r6, %r12;
-; CHECK-NEXT: add.s32 %r8, %r7, 32767;
-; CHECK-NEXT: setp.nan.f32 %p1, %r12, %r12;
-; CHECK-NEXT: or.b32 %r9, %r12, 4194304;
-; CHECK-NEXT: selp.b32 %r10, %r9, %r8, %p1;
-; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r10; }
+; CHECK-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-NEXT: lg2.approx.f32 %r3, %r2;
+; CHECK-NEXT: bfe.u32 %r4, %r3, 16, 1;
+; CHECK-NEXT: add.s32 %r5, %r4, %r3;
+; CHECK-NEXT: add.s32 %r6, %r5, 32767;
+; CHECK-NEXT: setp.nan.f32 %p1, %r3, %r3;
+; CHECK-NEXT: or.b32 %r7, %r3, 4194304;
+; CHECK-NEXT: selp.b32 %r8, %r7, %r6, %p1;
+; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
; CHECK-NEXT: ret;
entry:
@@ -150,19 +150,19 @@ define bfloat @log2_bf16_ftz_test(bfloat %in) #0 {
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .b16 %rs<2>;
-; CHECK-NEXT: .reg .b32 %r<13>;
+; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: ld.param.b16 %r1, [log2_bf16_ftz_test_param_0];
-; CHECK-NEXT: shl.b32 %r11, %r1, 16;
-; CHECK-NEXT: lg2.approx.ftz.f32 %r12, %r11;
-; CHECK-NEXT: bfe.u32 %r6, %r12, 16, 1;
-; CHECK-NEXT: add.s32 %r7, %r6, %r12;
-; CHECK-NEXT: add.s32 %r8, %r7, 32767;
-; CHECK-NEXT: setp.nan.ftz.f32 %p1, %r12, %r12;
-; CHECK-NEXT: or.b32 %r9, %r12, 4194304;
-; CHECK-NEXT: selp.b32 %r10, %r9, %r8, %p1;
-; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r10; }
+; CHECK-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-NEXT: lg2.approx.ftz.f32 %r3, %r2;
+; CHECK-NEXT: bfe.u32 %r4, %r3, 16, 1;
+; CHECK-NEXT: add.s32 %r5, %r4, %r3;
+; CHECK-NEXT: add.s32 %r6, %r5, 32767;
+; CHECK-NEXT: setp.nan.ftz.f32 %p1, %r3, %r3;
+; CHECK-NEXT: or.b32 %r7, %r3, 4194304;
+; CHECK-NEXT: selp.b32 %r8, %r7, %r6, %p1;
+; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
; CHECK-NEXT: ret;
entry:
@@ -176,31 +176,31 @@ define <2 x bfloat> @log2_bf16_test_v(<2 x bfloat> %in) {
; CHECK: {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<27>;
+; CHECK-NEXT: .reg .b32 %r<19>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: ld.param.b32 %r1, [log2_bf16_test_v_param_0];
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
; CHECK-NEXT: cvt.u32.u16 %r2, %rs2;
-; CHECK-NEXT: shl.b32 %r23, %r2, 16;
-; CHECK-NEXT: lg2.approx.f32 %r24, %r23;
-; CHECK-NEXT: bfe.u32 %r7, %r24, 16, 1;
-; CHECK-NEXT: add.s32 %r8, %r7, %r24;
-; CHECK-NEXT: add.s32 %r9, %r8, 32767;
-; CHECK-NEXT: setp.nan.f32 %p1, %r24, %r24;
-; CHECK-NEXT: or.b32 %r10, %r24, 4194304;
-; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p1;
-; CHECK-NEXT: cvt.u32.u16 %r12, %rs1;
-; CHECK-NEXT: shl.b32 %r25, %r12, 16;
-; CHECK-NEXT: lg2.approx.f32 %r26, %r25;
-; CHECK-NEXT: bfe.u32 %r17, %r26, 16, 1;
-; CHECK-NEXT: add.s32 %r18, %r17, %r26;
-; CHECK-NEXT: add.s32 %r19, %r18, 32767;
-; CHECK-NEXT: setp.nan.f32 %p2, %r26, %r26;
-; CHECK-NEXT: or.b32 %r20, %r26, 4194304;
-; CHECK-NEXT: selp.b32 %r21, %r20, %r19, %p2;
-; CHECK-NEXT: prmt.b32 %r22, %r21, %r11, 0x7632U;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r22;
+; CHECK-NEXT: shl.b32 %r3, %r2, 16;
+; CHECK-NEXT: lg2.approx.f32 %r4, %r3;
+; CHECK-NEXT: bfe.u32 %r5, %r4, 16, 1;
+; CHECK-NEXT: add.s32 %r6, %r5, %r4;
+; CHECK-NEXT: add.s32 %r7, %r6, 32767;
+; CHECK-NEXT: setp.nan.f32 %p1, %r4, %r4;
+; CHECK-NEXT: or.b32 %r8, %r4, 4194304;
+; CHECK-NEXT: selp.b32 %r9, %r8, %r7, %p1;
+; CHECK-NEXT: cvt.u32.u16 %r10, %rs1;
+; CHECK-NEXT: shl.b32 %r11, %r10, 16;
+; CHECK-NEXT: lg2.approx.f32 %r12, %r11;
+; CHECK-NEXT: bfe.u32 %r13, %r12, 16, 1;
+; CHECK-NEXT: add.s32 %r14, %r13, %r12;
+; CHECK-NEXT: add.s32 %r15, %r14, 32767;
+; CHECK-NEXT: setp.nan.f32 %p2, %r12, %r12;
+; CHECK-NEXT: or.b32 %r16, %r12, 4194304;
+; CHECK-NEXT: selp.b32 %r17, %r16, %r15, %p2;
+; CHECK-NEXT: prmt.b32 %r18, %r17, %r9, 0x7632U;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r18;
; CHECK-NEXT: ret;
entry:
%log2 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %in)
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
index c44512281f7be..9c11f169a89df 100644
--- a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
@@ -249,25 +249,25 @@ define bfloat @fma_bf16_expanded_unsafe_with_nans(bfloat %a, bfloat %b, bfloat %
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<3>;
; CHECK-SM70-NEXT: .reg .b16 %rs<3>;
-; CHECK-SM70-NEXT: .reg .b32 %r<24>;
+; CHECK-SM70-NEXT: .reg .b32 %r<14>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_expanded_unsafe_with_nans_param_2];
-; CHECK-SM70-NEXT: shl.b32 %r19, %r1, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r4, [fma_bf16_expanded_unsafe_with_nans_param_1];
-; CHECK-SM70-NEXT: shl.b32 %r20, %r4, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r7, [fma_bf16_expanded_unsafe_with_nans_param_0];
-; CHECK-SM70-NEXT: shl.b32 %r21, %r7, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r22, %r21, %r20, %r19;
-; CHECK-SM70-NEXT: bfe.u32 %r12, %r22, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r13, %r12, %r22;
-; CHECK-SM70-NEXT: add.s32 %r14, %r13, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r22, %r22;
-; CHECK-SM70-NEXT: or.b32 %r15, %r22, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r16, %r15, %r14, %p1;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r16; }
-; CHECK-SM70-NEXT: and.b32 %r23, %r16, -65536;
-; CHECK-SM70-NEXT: setp.gt.f32 %p2, %r23, 0f00000000;
+; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_expanded_unsafe_with_nans_param_1];
+; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_expanded_unsafe_with_nans_param_0];
+; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r7, %r6, %r4, %r2;
+; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r7, %r7;
+; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; }
+; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT: setp.gt.f32 %p2, %r13, 0f00000000;
; CHECK-SM70-NEXT: selp.b16 %rs2, %rs1, 0x0000, %p2;
; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs2;
; CHECK-SM70-NEXT: ret;
@@ -307,25 +307,25 @@ define bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 {
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<3>;
; CHECK-SM70-NEXT: .reg .b16 %rs<3>;
-; CHECK-SM70-NEXT: .reg .b32 %r<24>;
+; CHECK-SM70-NEXT: .reg .b32 %r<14>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_expanded_no_nans_param_2];
-; CHECK-SM70-NEXT: shl.b32 %r19, %r1, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r4, [fma_bf16_expanded_no_nans_param_1];
-; CHECK-SM70-NEXT: shl.b32 %r20, %r4, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r7, [fma_bf16_expanded_no_nans_param_0];
-; CHECK-SM70-NEXT: shl.b32 %r21, %r7, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r22, %r21, %r20, %r19;
-; CHECK-SM70-NEXT: bfe.u32 %r12, %r22, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r13, %r12, %r22;
-; CHECK-SM70-NEXT: add.s32 %r14, %r13, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r22, %r22;
-; CHECK-SM70-NEXT: or.b32 %r15, %r22, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r16, %r15, %r14, %p1;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r16; }
-; CHECK-SM70-NEXT: and.b32 %r23, %r16, -65536;
-; CHECK-SM70-NEXT: setp.gt.f32 %p2, %r23, 0f00000000;
+; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_expanded_no_nans_param_1];
+; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_expanded_no_nans_param_0];
+; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r7, %r6, %r4, %r2;
+; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r7, %r7;
+; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; }
+; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT: setp.gt.f32 %p2, %r13, 0f00000000;
; CHECK-SM70-NEXT: selp.b16 %rs2, %rs1, 0x0000, %p2;
; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs2;
; CHECK-SM70-NEXT: ret;
@@ -359,7 +359,7 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat
; CHECK-FTZ-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma(
; CHECK-FTZ: {
; CHECK-FTZ-NEXT: .reg .b16 %rs<9>;
-; CHECK-FTZ-NEXT: .reg .b32 %r<15>;
+; CHECK-FTZ-NEXT: .reg .b32 %r<9>;
; CHECK-FTZ-EMPTY:
; CHECK-FTZ-NEXT: // %bb.0:
; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0];
@@ -369,15 +369,15 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat
; CHECK-FTZ-NEXT: mov.b16 %rs5, 0x0000;
; CHECK-FTZ-NEXT: max.bf16 %rs6, %rs4, %rs5;
; CHECK-FTZ-NEXT: cvt.u32.u16 %r1, %rs4;
-; CHECK-FTZ-NEXT: shl.b32 %r12, %r1, 16;
-; CHECK-FTZ-NEXT: add.ftz.f32 %r4, %r12, 0f40E00000;
-; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs7, %r4;
-; CHECK-FTZ-NEXT: cvt.u32.u16 %r5, %rs6;
-; CHECK-FTZ-NEXT: shl.b32 %r13, %r5, 16;
-; CHECK-FTZ-NEXT: cvt.u32.u16 %r8, %rs7;
-; CHECK-FTZ-NEXT: shl.b32 %r14, %r8, 16;
-; CHECK-FTZ-NEXT: add.ftz.f32 %r11, %r13, %r14;
-; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs8, %r11;
+; CHECK-FTZ-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-FTZ-NEXT: add.ftz.f32 %r3, %r2, 0f40E00000;
+; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs7, %r3;
+; CHECK-FTZ-NEXT: cvt.u32.u16 %r4, %rs6;
+; CHECK-FTZ-NEXT: shl.b32 %r5, %r4, 16;
+; CHECK-FTZ-NEXT: cvt.u32.u16 %r6, %rs7;
+; CHECK-FTZ-NEXT: shl.b32 %r7, %r6, 16;
+; CHECK-FTZ-NEXT: add.ftz.f32 %r8, %r5, %r7;
+; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs8, %r8;
; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs8;
; CHECK-FTZ-NEXT: ret;
;
@@ -385,44 +385,44 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<5>;
; CHECK-SM70-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM70-NEXT: .reg .b32 %r<47>;
+; CHECK-SM70-NEXT: .reg .b32 %r<29>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2];
-; CHECK-SM70-NEXT: shl.b32 %r38, %r1, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r4, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1];
-; CHECK-SM70-NEXT: shl.b32 %r39, %r4, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r7, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0];
-; CHECK-SM70-NEXT: shl.b32 %r40, %r7, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r41, %r40, %r39, %r38;
-; CHECK-SM70-NEXT: bfe.u32 %r12, %r41, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r13, %r12, %r41;
-; CHECK-SM70-NEXT: add.s32 %r14, %r13, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r41, %r41;
-; CHECK-SM70-NEXT: or.b32 %r15, %r41, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r16, %r15, %r14, %p1;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r16; }
-; CHECK-SM70-NEXT: and.b32 %r42, %r16, -65536;
-; CHECK-SM70-NEXT: setp.gt.f32 %p2, %r42, 0f00000000;
+; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1];
+; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0];
+; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r7, %r6, %r4, %r2;
+; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r7, %r7;
+; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; }
+; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT: setp.gt.f32 %p2, %r13, 0f00000000;
; CHECK-SM70-NEXT: selp.b16 %rs2, %rs1, 0x0000, %p2;
-; CHECK-SM70-NEXT: add.f32 %r43, %r42, 0f40E00000;
-; CHECK-SM70-NEXT: bfe.u32 %r21, %r43, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r22, %r21, %r43;
-; CHECK-SM70-NEXT: add.s32 %r23, %r22, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p3, %r43, %r43;
-; CHECK-SM70-NEXT: or.b32 %r24, %r43, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r25, %r24, %r23, %p3;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r26, %rs2;
-; CHECK-SM70-NEXT: shl.b32 %r44, %r26, 16;
-; CHECK-SM70-NEXT: and.b32 %r45, %r25, -65536;
-; CHECK-SM70-NEXT: add.f32 %r46, %r44, %r45;
-; CHECK-SM70-NEXT: bfe.u32 %r33, %r46, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r34, %r33, %r46;
-; CHECK-SM70-NEXT: add.s32 %r35, %r34, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p4, %r46, %r46;
-; CHECK-SM70-NEXT: or.b32 %r36, %r46, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r37, %r36, %r35, %p4;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs3}, %r37; }
+; CHECK-SM70-NEXT: add.f32 %r14, %r13, 0f40E00000;
+; CHECK-SM70-NEXT: bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p3, %r14, %r14;
+; CHECK-SM70-NEXT: or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p3;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs2;
+; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16;
+; CHECK-SM70-NEXT: and.b32 %r22, %r19, -65536;
+; CHECK-SM70-NEXT: add.f32 %r23, %r21, %r22;
+; CHECK-SM70-NEXT: bfe.u32 %r24, %r23, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r25, %r24, %r23;
+; CHECK-SM70-NEXT: add.s32 %r26, %r25, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p4, %r23, %r23;
+; CHECK-SM70-NEXT: or.b32 %r27, %r23, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r28, %r27, %r26, %p4;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs3}, %r28; }
; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs3;
; CHECK-SM70-NEXT: ret;
%1 = fmul bfloat %a, %b
@@ -463,31 +463,31 @@ define bfloat @fma_bf16_expanded_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c)
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<3>;
; CHECK-SM70-NEXT: .reg .b16 %rs<2>;
-; CHECK-SM70-NEXT: .reg .b32 %r<32>;
+; CHECK-SM70-NEXT: .reg .b32 %r<20>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_expanded_maxnum_no_nans_param_2];
-; CHECK-SM70-NEXT: shl.b32 %r26, %r1, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r4, [fma_bf16_expanded_maxnum_no_nans_param_1];
-; CHECK-SM70-NEXT: shl.b32 %r27, %r4, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r7, [fma_bf16_expanded_maxnum_no_nans_param_0];
-; CHECK-SM70-NEXT: shl.b32 %r28, %r7, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r29, %r28, %r27, %r26;
-; CHECK-SM70-NEXT: bfe.u32 %r12, %r29, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r13, %r12, %r29;
-; CHECK-SM70-NEXT: add.s32 %r14, %r13, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r29, %r29;
-; CHECK-SM70-NEXT: or.b32 %r15, %r29, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r16, %r15, %r14, %p1;
-; CHECK-SM70-NEXT: and.b32 %r30, %r16, -65536;
-; CHECK-SM70-NEXT: max.f32 %r31, %r30, 0f00000000;
-; CHECK-SM70-NEXT: bfe.u32 %r21, %r31, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r22, %r21, %r31;
-; CHECK-SM70-NEXT: add.s32 %r23, %r22, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r31, %r31;
-; CHECK-SM70-NEXT: or.b32 %r24, %r31, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r25, %r24, %r23, %p2;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r25; }
+; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_expanded_maxnum_no_nans_param_1];
+; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_expanded_maxnum_no_nans_param_0];
+; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r7, %r6, %r4, %r2;
+; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r7, %r7;
+; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT: max.f32 %r14, %r13, 0f00000000;
+; CHECK-SM70-NEXT: bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r14, %r14;
+; CHECK-SM70-NEXT: or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; CHECK-SM70-NEXT: ret;
%1 = fmul bfloat %a, %b
@@ -753,7 +753,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_unsafe_with_nans(<2 x bfloat> %a, <2 x
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<5>;
; CHECK-SM70-NEXT: .reg .b16 %rs<11>;
-; CHECK-SM70-NEXT: .reg .b32 %r<51>;
+; CHECK-SM70-NEXT: .reg .b32 %r<31>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_unsafe_with_nans_param_0];
@@ -761,43 +761,43 @@ define <2 x bfloat> @fma_bf16x2_expanded_unsafe_with_nans(<2 x bfloat> %a, <2 x
; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_unsafe_with_nans_param_2];
; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3;
; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1;
-; CHECK-SM70-NEXT: shl.b32 %r41, %r4, 16;
+; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r7, %rs3;
-; CHECK-SM70-NEXT: shl.b32 %r42, %r7, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3;
+; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r10, %rs5;
-; CHECK-SM70-NEXT: shl.b32 %r43, %r10, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r44, %r43, %r42, %r41;
-; CHECK-SM70-NEXT: bfe.u32 %r15, %r44, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r44;
-; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r44, %r44;
-; CHECK-SM70-NEXT: or.b32 %r18, %r44, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p1;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r19; }
-; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs2;
-; CHECK-SM70-NEXT: shl.b32 %r45, %r20, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r23, %rs4;
-; CHECK-SM70-NEXT: shl.b32 %r46, %r23, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r26, %rs6;
-; CHECK-SM70-NEXT: shl.b32 %r47, %r26, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r48, %r47, %r46, %r45;
-; CHECK-SM70-NEXT: bfe.u32 %r31, %r48, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r32, %r31, %r48;
-; CHECK-SM70-NEXT: add.s32 %r33, %r32, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r48, %r48;
-; CHECK-SM70-NEXT: or.b32 %r34, %r48, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r35, %r34, %r33, %p2;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r35; }
-; CHECK-SM70-NEXT: and.b32 %r49, %r19, -65536;
-; CHECK-SM70-NEXT: setp.gt.f32 %p3, %r49, 0f00000000;
-; CHECK-SM70-NEXT: and.b32 %r50, %r35, -65536;
-; CHECK-SM70-NEXT: setp.gt.f32 %p4, %r50, 0f00000000;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5;
+; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r10, %r9, %r7, %r5;
+; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10;
+; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r10, %r10;
+; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; }
+; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2;
+; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4;
+; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6;
+; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r22, %r21, %r19, %r17;
+; CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22;
+; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r22, %r22;
+; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; }
+; CHECK-SM70-NEXT: and.b32 %r28, %r15, -65536;
+; CHECK-SM70-NEXT: setp.gt.f32 %p3, %r28, 0f00000000;
+; CHECK-SM70-NEXT: and.b32 %r29, %r27, -65536;
+; CHECK-SM70-NEXT: setp.gt.f32 %p4, %r29, 0f00000000;
; CHECK-SM70-NEXT: selp.b16 %rs9, %rs8, 0x0000, %p4;
; CHECK-SM70-NEXT: selp.b16 %rs10, %rs7, 0x0000, %p3;
-; CHECK-SM70-NEXT: mov.b32 %r40, {%rs10, %rs9};
-; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r40;
+; CHECK-SM70-NEXT: mov.b32 %r30, {%rs10, %rs9};
+; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r30;
; CHECK-SM70-NEXT: ret;
%1 = fmul <2 x bfloat> %a, %b
%2 = fadd <2 x bfloat> %1, %c
@@ -835,7 +835,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> %
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<5>;
; CHECK-SM70-NEXT: .reg .b16 %rs<11>;
-; CHECK-SM70-NEXT: .reg .b32 %r<51>;
+; CHECK-SM70-NEXT: .reg .b32 %r<31>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_param_0];
@@ -843,43 +843,43 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> %
; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_param_2];
; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3;
; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1;
-; CHECK-SM70-NEXT: shl.b32 %r41, %r4, 16;
+; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r7, %rs3;
-; CHECK-SM70-NEXT: shl.b32 %r42, %r7, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3;
+; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r10, %rs5;
-; CHECK-SM70-NEXT: shl.b32 %r43, %r10, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r44, %r43, %r42, %r41;
-; CHECK-SM70-NEXT: bfe.u32 %r15, %r44, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r44;
-; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r44, %r44;
-; CHECK-SM70-NEXT: or.b32 %r18, %r44, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p1;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r19; }
-; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs2;
-; CHECK-SM70-NEXT: shl.b32 %r45, %r20, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r23, %rs4;
-; CHECK-SM70-NEXT: shl.b32 %r46, %r23, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r26, %rs6;
-; CHECK-SM70-NEXT: shl.b32 %r47, %r26, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r48, %r47, %r46, %r45;
-; CHECK-SM70-NEXT: bfe.u32 %r31, %r48, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r32, %r31, %r48;
-; CHECK-SM70-NEXT: add.s32 %r33, %r32, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r48, %r48;
-; CHECK-SM70-NEXT: or.b32 %r34, %r48, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r35, %r34, %r33, %p2;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r35; }
-; CHECK-SM70-NEXT: and.b32 %r49, %r19, -65536;
-; CHECK-SM70-NEXT: setp.gt.f32 %p3, %r49, 0f00000000;
-; CHECK-SM70-NEXT: and.b32 %r50, %r35, -65536;
-; CHECK-SM70-NEXT: setp.gt.f32 %p4, %r50, 0f00000000;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5;
+; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r10, %r9, %r7, %r5;
+; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10;
+; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r10, %r10;
+; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; }
+; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2;
+; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4;
+; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6;
+; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r22, %r21, %r19, %r17;
+; CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22;
+; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r22, %r22;
+; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; }
+; CHECK-SM70-NEXT: and.b32 %r28, %r15, -65536;
+; CHECK-SM70-NEXT: setp.gt.f32 %p3, %r28, 0f00000000;
+; CHECK-SM70-NEXT: and.b32 %r29, %r27, -65536;
+; CHECK-SM70-NEXT: setp.gt.f32 %p4, %r29, 0f00000000;
; CHECK-SM70-NEXT: selp.b16 %rs9, %rs8, 0x0000, %p4;
; CHECK-SM70-NEXT: selp.b16 %rs10, %rs7, 0x0000, %p3;
-; CHECK-SM70-NEXT: mov.b32 %r40, {%rs10, %rs9};
-; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r40;
+; CHECK-SM70-NEXT: mov.b32 %r30, {%rs10, %rs9};
+; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r30;
; CHECK-SM70-NEXT: ret;
%1 = fmul <2 x bfloat> %a, %b
%2 = fadd <2 x bfloat> %1, %c
@@ -911,7 +911,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
; CHECK-FTZ-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(
; CHECK-FTZ: {
; CHECK-FTZ-NEXT: .reg .b16 %rs<7>;
-; CHECK-FTZ-NEXT: .reg .b32 %r<36>;
+; CHECK-FTZ-NEXT: .reg .b32 %r<24>;
; CHECK-FTZ-EMPTY:
; CHECK-FTZ-NEXT: // %bb.0:
; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_2];
@@ -922,33 +922,33 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
; CHECK-FTZ-NEXT: max.bf16x2 %r6, %r4, %r5;
; CHECK-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r4;
; CHECK-FTZ-NEXT: cvt.u32.u16 %r7, %rs2;
-; CHECK-FTZ-NEXT: shl.b32 %r30, %r7, 16;
-; CHECK-FTZ-NEXT: add.ftz.f32 %r10, %r30, 0f40E00000;
-; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r10;
-; CHECK-FTZ-NEXT: cvt.u32.u16 %r11, %rs1;
-; CHECK-FTZ-NEXT: shl.b32 %r31, %r11, 16;
-; CHECK-FTZ-NEXT: add.ftz.f32 %r14, %r31, 0f40E00000;
-; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs4, %r14;
+; CHECK-FTZ-NEXT: shl.b32 %r8, %r7, 16;
+; CHECK-FTZ-NEXT: add.ftz.f32 %r9, %r8, 0f40E00000;
+; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r9;
+; CHECK-FTZ-NEXT: cvt.u32.u16 %r10, %rs1;
+; CHECK-FTZ-NEXT: shl.b32 %r11, %r10, 16;
+; CHECK-FTZ-NEXT: add.ftz.f32 %r12, %r11, 0f40E00000;
+; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs4, %r12;
; CHECK-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r6;
-; CHECK-FTZ-NEXT: cvt.u32.u16 %r15, %rs5;
-; CHECK-FTZ-NEXT: shl.b32 %r32, %r15, 16;
-; CHECK-FTZ-NEXT: cvt.u32.u16 %r18, %rs4;
-; CHECK-FTZ-NEXT: shl.b32 %r33, %r18, 16;
-; CHECK-FTZ-NEXT: add.ftz.f32 %r21, %r32, %r33;
-; CHECK-FTZ-NEXT: cvt.u32.u16 %r22, %rs6;
-; CHECK-FTZ-NEXT: shl.b32 %r34, %r22, 16;
-; CHECK-FTZ-NEXT: cvt.u32.u16 %r25, %rs3;
-; CHECK-FTZ-NEXT: shl.b32 %r35, %r25, 16;
-; CHECK-FTZ-NEXT: add.ftz.f32 %r28, %r34, %r35;
-; CHECK-FTZ-NEXT: cvt.rn.bf16x2.f32 %r29, %r28, %r21;
-; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r29;
+; CHECK-FTZ-NEXT: cvt.u32.u16 %r13, %rs5;
+; CHECK-FTZ-NEXT: shl.b32 %r14, %r13, 16;
+; CHECK-FTZ-NEXT: cvt.u32.u16 %r15, %rs4;
+; CHECK-FTZ-NEXT: shl.b32 %r16, %r15, 16;
+; CHECK-FTZ-NEXT: add.ftz.f32 %r17, %r14, %r16;
+; CHECK-FTZ-NEXT: cvt.u32.u16 %r18, %rs6;
+; CHECK-FTZ-NEXT: shl.b32 %r19, %r18, 16;
+; CHECK-FTZ-NEXT: cvt.u32.u16 %r20, %rs3;
+; CHECK-FTZ-NEXT: shl.b32 %r21, %r20, 16;
+; CHECK-FTZ-NEXT: add.ftz.f32 %r22, %r19, %r21;
+; CHECK-FTZ-NEXT: cvt.rn.bf16x2.f32 %r23, %r22, %r17;
+; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r23;
; CHECK-FTZ-NEXT: ret;
;
; CHECK-SM70-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<9>;
; CHECK-SM70-NEXT: .reg .b16 %rs<11>;
-; CHECK-SM70-NEXT: .reg .b32 %r<97>;
+; CHECK-SM70-NEXT: .reg .b32 %r<61>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_0];
@@ -956,77 +956,77 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_2];
; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3;
; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs2;
-; CHECK-SM70-NEXT: shl.b32 %r79, %r4, 16;
+; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r7, %rs4;
-; CHECK-SM70-NEXT: shl.b32 %r80, %r7, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs4;
+; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r10, %rs6;
-; CHECK-SM70-NEXT: shl.b32 %r81, %r10, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r82, %r81, %r80, %r79;
-; CHECK-SM70-NEXT: bfe.u32 %r15, %r82, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r82;
-; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r82, %r82;
-; CHECK-SM70-NEXT: or.b32 %r18, %r82, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p1;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r19; }
-; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs1;
-; CHECK-SM70-NEXT: shl.b32 %r83, %r20, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r23, %rs3;
-; CHECK-SM70-NEXT: shl.b32 %r84, %r23, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r26, %rs5;
-; CHECK-SM70-NEXT: shl.b32 %r85, %r26, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r86, %r85, %r84, %r83;
-; CHECK-SM70-NEXT: bfe.u32 %r31, %r86, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r32, %r31, %r86;
-; CHECK-SM70-NEXT: add.s32 %r33, %r32, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r86, %r86;
-; CHECK-SM70-NEXT: or.b32 %r34, %r86, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r35, %r34, %r33, %p2;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r35; }
-; CHECK-SM70-NEXT: and.b32 %r87, %r19, -65536;
-; CHECK-SM70-NEXT: setp.gt.f32 %p3, %r87, 0f00000000;
-; CHECK-SM70-NEXT: and.b32 %r88, %r35, -65536;
-; CHECK-SM70-NEXT: setp.gt.f32 %p4, %r88, 0f00000000;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs6;
+; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r10, %r9, %r7, %r5;
+; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10;
+; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r10, %r10;
+; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; }
+; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs1;
+; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs3;
+; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs5;
+; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r22, %r21, %r19, %r17;
+; CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22;
+; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r22, %r22;
+; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; }
+; CHECK-SM70-NEXT: and.b32 %r28, %r15, -65536;
+; CHECK-SM70-NEXT: setp.gt.f32 %p3, %r28, 0f00000000;
+; CHECK-SM70-NEXT: and.b32 %r29, %r27, -65536;
+; CHECK-SM70-NEXT: setp.gt.f32 %p4, %r29, 0f00000000;
; CHECK-SM70-NEXT: selp.b16 %rs9, %rs8, 0x0000, %p4;
; CHECK-SM70-NEXT: selp.b16 %rs10, %rs7, 0x0000, %p3;
-; CHECK-SM70-NEXT: add.f32 %r89, %r88, 0f40E00000;
-; CHECK-SM70-NEXT: bfe.u32 %r42, %r89, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r43, %r42, %r89;
-; CHECK-SM70-NEXT: add.s32 %r44, %r43, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p5, %r89, %r89;
-; CHECK-SM70-NEXT: or.b32 %r45, %r89, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r46, %r45, %r44, %p5;
-; CHECK-SM70-NEXT: add.f32 %r90, %r87, 0f40E00000;
-; CHECK-SM70-NEXT: bfe.u32 %r49, %r90, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r50, %r49, %r90;
-; CHECK-SM70-NEXT: add.s32 %r51, %r50, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p6, %r90, %r90;
-; CHECK-SM70-NEXT: or.b32 %r52, %r90, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r53, %r52, %r51, %p6;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r54, %rs10;
-; CHECK-SM70-NEXT: shl.b32 %r91, %r54, 16;
-; CHECK-SM70-NEXT: and.b32 %r92, %r53, -65536;
-; CHECK-SM70-NEXT: add.f32 %r93, %r91, %r92;
-; CHECK-SM70-NEXT: bfe.u32 %r61, %r93, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r62, %r61, %r93;
-; CHECK-SM70-NEXT: add.s32 %r63, %r62, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p7, %r93, %r93;
-; CHECK-SM70-NEXT: or.b32 %r64, %r93, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r65, %r64, %r63, %p7;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r66, %rs9;
-; CHECK-SM70-NEXT: shl.b32 %r94, %r66, 16;
-; CHECK-SM70-NEXT: and.b32 %r95, %r46, -65536;
-; CHECK-SM70-NEXT: add.f32 %r96, %r94, %r95;
-; CHECK-SM70-NEXT: bfe.u32 %r73, %r96, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r74, %r73, %r96;
-; CHECK-SM70-NEXT: add.s32 %r75, %r74, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p8, %r96, %r96;
-; CHECK-SM70-NEXT: or.b32 %r76, %r96, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r77, %r76, %r75, %p8;
-; CHECK-SM70-NEXT: prmt.b32 %r78, %r77, %r65, 0x7632U;
-; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r78;
+; CHECK-SM70-NEXT: add.f32 %r30, %r29, 0f40E00000;
+; CHECK-SM70-NEXT: bfe.u32 %r31, %r30, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r32, %r31, %r30;
+; CHECK-SM70-NEXT: add.s32 %r33, %r32, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p5, %r30, %r30;
+; CHECK-SM70-NEXT: or.b32 %r34, %r30, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r35, %r34, %r33, %p5;
+; CHECK-SM70-NEXT: add.f32 %r36, %r28, 0f40E00000;
+; CHECK-SM70-NEXT: bfe.u32 %r37, %r36, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r38, %r37, %r36;
+; CHECK-SM70-NEXT: add.s32 %r39, %r38, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p6, %r36, %r36;
+; CHECK-SM70-NEXT: or.b32 %r40, %r36, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r41, %r40, %r39, %p6;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r42, %rs10;
+; CHECK-SM70-NEXT: shl.b32 %r43, %r42, 16;
+; CHECK-SM70-NEXT: and.b32 %r44, %r41, -65536;
+; CHECK-SM70-NEXT: add.f32 %r45, %r43, %r44;
+; CHECK-SM70-NEXT: bfe.u32 %r46, %r45, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r47, %r46, %r45;
+; CHECK-SM70-NEXT: add.s32 %r48, %r47, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p7, %r45, %r45;
+; CHECK-SM70-NEXT: or.b32 %r49, %r45, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r50, %r49, %r48, %p7;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r51, %rs9;
+; CHECK-SM70-NEXT: shl.b32 %r52, %r51, 16;
+; CHECK-SM70-NEXT: and.b32 %r53, %r35, -65536;
+; CHECK-SM70-NEXT: add.f32 %r54, %r52, %r53;
+; CHECK-SM70-NEXT: bfe.u32 %r55, %r54, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r56, %r55, %r54;
+; CHECK-SM70-NEXT: add.s32 %r57, %r56, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p8, %r54, %r54;
+; CHECK-SM70-NEXT: or.b32 %r58, %r54, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r59, %r58, %r57, %p8;
+; CHECK-SM70-NEXT: prmt.b32 %r60, %r59, %r50, 0x7632U;
+; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r60;
; CHECK-SM70-NEXT: ret;
%1 = fmul <2 x bfloat> %a, %b
%2 = fadd <2 x bfloat> %1, %c
@@ -1066,7 +1066,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<5>;
; CHECK-SM70-NEXT: .reg .b16 %rs<7>;
-; CHECK-SM70-NEXT: .reg .b32 %r<67>;
+; CHECK-SM70-NEXT: .reg .b32 %r<43>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_maxnum_no_nans_param_0];
@@ -1074,51 +1074,51 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_maxnum_no_nans_param_2];
; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3;
; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1;
-; CHECK-SM70-NEXT: shl.b32 %r55, %r4, 16;
+; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r7, %rs3;
-; CHECK-SM70-NEXT: shl.b32 %r56, %r7, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3;
+; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r10, %rs5;
-; CHECK-SM70-NEXT: shl.b32 %r57, %r10, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r58, %r57, %r56, %r55;
-; CHECK-SM70-NEXT: bfe.u32 %r15, %r58, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r58;
-; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r58, %r58;
-; CHECK-SM70-NEXT: or.b32 %r18, %r58, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p1;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs2;
-; CHECK-SM70-NEXT: shl.b32 %r59, %r20, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r23, %rs4;
-; CHECK-SM70-NEXT: shl.b32 %r60, %r23, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r26, %rs6;
-; CHECK-SM70-NEXT: shl.b32 %r61, %r26, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r62, %r61, %r60, %r59;
-; CHECK-SM70-NEXT: bfe.u32 %r31, %r62, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r32, %r31, %r62;
-; CHECK-SM70-NEXT: add.s32 %r33, %r32, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r62, %r62;
-; CHECK-SM70-NEXT: or.b32 %r34, %r62, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r35, %r34, %r33, %p2;
-; CHECK-SM70-NEXT: and.b32 %r63, %r35, -65536;
-; CHECK-SM70-NEXT: max.f32 %r64, %r63, 0f00000000;
-; CHECK-SM70-NEXT: bfe.u32 %r40, %r64, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r41, %r40, %r64;
-; CHECK-SM70-NEXT: add.s32 %r42, %r41, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p3, %r64, %r64;
-; CHECK-SM70-NEXT: or.b32 %r43, %r64, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r44, %r43, %r42, %p3;
-; CHECK-SM70-NEXT: and.b32 %r65, %r19, -65536;
-; CHECK-SM70-NEXT: max.f32 %r66, %r65, 0f00000000;
-; CHECK-SM70-NEXT: bfe.u32 %r49, %r66, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r50, %r49, %r66;
-; CHECK-SM70-NEXT: add.s32 %r51, %r50, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p4, %r66, %r66;
-; CHECK-SM70-NEXT: or.b32 %r52, %r66, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r53, %r52, %r51, %p4;
-; CHECK-SM70-NEXT: prmt.b32 %r54, %r53, %r44, 0x7632U;
-; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r54;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5;
+; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r10, %r9, %r7, %r5;
+; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10;
+; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r10, %r10;
+; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2;
+; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4;
+; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6;
+; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r22, %r21, %r19, %r17;
+; CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22;
+; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r22, %r22;
+; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2;
+; CHECK-SM70-NEXT: and.b32 %r28, %r27, -65536;
+; CHECK-SM70-NEXT: max.f32 %r29, %r28, 0f00000000;
+; CHECK-SM70-NEXT: bfe.u32 %r30, %r29, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r31, %r30, %r29;
+; CHECK-SM70-NEXT: add.s32 %r32, %r31, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p3, %r29, %r29;
+; CHECK-SM70-NEXT: or.b32 %r33, %r29, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r34, %r33, %r32, %p3;
+; CHECK-SM70-NEXT: and.b32 %r35, %r15, -65536;
+; CHECK-SM70-NEXT: max.f32 %r36, %r35, 0f00000000;
+; CHECK-SM70-NEXT: bfe.u32 %r37, %r36, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r38, %r37, %r36;
+; CHECK-SM70-NEXT: add.s32 %r39, %r38, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p4, %r36, %r36;
+; CHECK-SM70-NEXT: or.b32 %r40, %r36, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r41, %r40, %r39, %p4;
+; CHECK-SM70-NEXT: prmt.b32 %r42, %r41, %r34, 0x7632U;
+; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r42;
; CHECK-SM70-NEXT: ret;
%1 = fmul <2 x bfloat> %a, %b
%2 = fadd <2 x bfloat> %1, %c
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll b/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll
index ec12f3d44b5d0..c725b797526a3 100644
--- a/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll
@@ -183,25 +183,25 @@ define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 {
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<3>;
; CHECK-SM70-NEXT: .reg .b16 %rs<3>;
-; CHECK-SM70-NEXT: .reg .b32 %r<24>;
+; CHECK-SM70-NEXT: .reg .b32 %r<14>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_no_nans_param_2];
-; CHECK-SM70-NEXT: shl.b32 %r19, %r1, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r4, [fma_bf16_no_nans_param_1];
-; CHECK-SM70-NEXT: shl.b32 %r20, %r4, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r7, [fma_bf16_no_nans_param_0];
-; CHECK-SM70-NEXT: shl.b32 %r21, %r7, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r22, %r21, %r20, %r19;
-; CHECK-SM70-NEXT: bfe.u32 %r12, %r22, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r13, %r12, %r22;
-; CHECK-SM70-NEXT: add.s32 %r14, %r13, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r22, %r22;
-; CHECK-SM70-NEXT: or.b32 %r15, %r22, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r16, %r15, %r14, %p1;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r16; }
-; CHECK-SM70-NEXT: and.b32 %r23, %r16, -65536;
-; CHECK-SM70-NEXT: setp.gt.f32 %p2, %r23, 0f00000000;
+; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_no_nans_param_1];
+; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_no_nans_param_0];
+; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r7, %r6, %r4, %r2;
+; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r7, %r7;
+; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; }
+; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT: setp.gt.f32 %p2, %r13, 0f00000000;
; CHECK-SM70-NEXT: selp.b16 %rs2, %rs1, 0x0000, %p2;
; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs2;
; CHECK-SM70-NEXT: ret;
@@ -232,7 +232,7 @@ define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloa
; CHECK-FTZ-LABEL: fma_bf16_no_nans_multiple_uses_of_fma(
; CHECK-FTZ: {
; CHECK-FTZ-NEXT: .reg .b16 %rs<7>;
-; CHECK-FTZ-NEXT: .reg .b32 %r<11>;
+; CHECK-FTZ-NEXT: .reg .b32 %r<7>;
; CHECK-FTZ-EMPTY:
; CHECK-FTZ-NEXT: // %bb.0:
; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_bf16_no_nans_multiple_uses_of_fma_param_0];
@@ -240,13 +240,13 @@ define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloa
; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_bf16_no_nans_multiple_uses_of_fma_param_2];
; CHECK-FTZ-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3;
; CHECK-FTZ-NEXT: cvt.u32.u16 %r1, %rs4;
-; CHECK-FTZ-NEXT: shl.b32 %r9, %r1, 16;
-; CHECK-FTZ-NEXT: add.ftz.f32 %r4, %r9, 0f40E00000;
-; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %r4;
-; CHECK-FTZ-NEXT: cvt.u32.u16 %r5, %rs5;
-; CHECK-FTZ-NEXT: shl.b32 %r10, %r5, 16;
-; CHECK-FTZ-NEXT: add.ftz.f32 %r8, %r10, %r9;
-; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %r8;
+; CHECK-FTZ-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-FTZ-NEXT: add.ftz.f32 %r3, %r2, 0f40E00000;
+; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %r3;
+; CHECK-FTZ-NEXT: cvt.u32.u16 %r4, %rs5;
+; CHECK-FTZ-NEXT: shl.b32 %r5, %r4, 16;
+; CHECK-FTZ-NEXT: add.ftz.f32 %r6, %r5, %r2;
+; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %r6;
; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs6;
; CHECK-FTZ-NEXT: ret;
;
@@ -254,39 +254,39 @@ define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloa
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<4>;
; CHECK-SM70-NEXT: .reg .b16 %rs<2>;
-; CHECK-SM70-NEXT: .reg .b32 %r<43>;
+; CHECK-SM70-NEXT: .reg .b32 %r<27>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_no_nans_multiple_uses_of_fma_param_2];
-; CHECK-SM70-NEXT: shl.b32 %r35, %r1, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r4, [fma_bf16_no_nans_multiple_uses_of_fma_param_1];
-; CHECK-SM70-NEXT: shl.b32 %r36, %r4, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r7, [fma_bf16_no_nans_multiple_uses_of_fma_param_0];
-; CHECK-SM70-NEXT: shl.b32 %r37, %r7, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r38, %r37, %r36, %r35;
-; CHECK-SM70-NEXT: bfe.u32 %r12, %r38, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r13, %r12, %r38;
-; CHECK-SM70-NEXT: add.s32 %r14, %r13, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r38, %r38;
-; CHECK-SM70-NEXT: or.b32 %r15, %r38, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r16, %r15, %r14, %p1;
-; CHECK-SM70-NEXT: and.b32 %r39, %r16, -65536;
-; CHECK-SM70-NEXT: add.f32 %r40, %r39, 0f40E00000;
-; CHECK-SM70-NEXT: bfe.u32 %r21, %r40, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r22, %r21, %r40;
-; CHECK-SM70-NEXT: add.s32 %r23, %r22, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r40, %r40;
-; CHECK-SM70-NEXT: or.b32 %r24, %r40, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r25, %r24, %r23, %p2;
-; CHECK-SM70-NEXT: and.b32 %r41, %r25, -65536;
-; CHECK-SM70-NEXT: add.f32 %r42, %r41, %r39;
-; CHECK-SM70-NEXT: bfe.u32 %r30, %r42, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r31, %r30, %r42;
-; CHECK-SM70-NEXT: add.s32 %r32, %r31, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p3, %r42, %r42;
-; CHECK-SM70-NEXT: or.b32 %r33, %r42, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r34, %r33, %r32, %p3;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r34; }
+; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_no_nans_multiple_uses_of_fma_param_1];
+; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_no_nans_multiple_uses_of_fma_param_0];
+; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r7, %r6, %r4, %r2;
+; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r7, %r7;
+; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT: add.f32 %r14, %r13, 0f40E00000;
+; CHECK-SM70-NEXT: bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r14, %r14;
+; CHECK-SM70-NEXT: or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT: and.b32 %r20, %r19, -65536;
+; CHECK-SM70-NEXT: add.f32 %r21, %r20, %r13;
+; CHECK-SM70-NEXT: bfe.u32 %r22, %r21, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r23, %r22, %r21;
+; CHECK-SM70-NEXT: add.s32 %r24, %r23, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p3, %r21, %r21;
+; CHECK-SM70-NEXT: or.b32 %r25, %r21, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r26, %r25, %r24, %p3;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r26; }
; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; CHECK-SM70-NEXT: ret;
%1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
@@ -326,31 +326,31 @@ define bfloat @fma_bf16_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 {
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<3>;
; CHECK-SM70-NEXT: .reg .b16 %rs<2>;
-; CHECK-SM70-NEXT: .reg .b32 %r<32>;
+; CHECK-SM70-NEXT: .reg .b32 %r<20>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_maxnum_no_nans_param_2];
-; CHECK-SM70-NEXT: shl.b32 %r26, %r1, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r4, [fma_bf16_maxnum_no_nans_param_1];
-; CHECK-SM70-NEXT: shl.b32 %r27, %r4, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r7, [fma_bf16_maxnum_no_nans_param_0];
-; CHECK-SM70-NEXT: shl.b32 %r28, %r7, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r29, %r28, %r27, %r26;
-; CHECK-SM70-NEXT: bfe.u32 %r12, %r29, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r13, %r12, %r29;
-; CHECK-SM70-NEXT: add.s32 %r14, %r13, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r29, %r29;
-; CHECK-SM70-NEXT: or.b32 %r15, %r29, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r16, %r15, %r14, %p1;
-; CHECK-SM70-NEXT: and.b32 %r30, %r16, -65536;
-; CHECK-SM70-NEXT: max.f32 %r31, %r30, 0f00000000;
-; CHECK-SM70-NEXT: bfe.u32 %r21, %r31, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r22, %r21, %r31;
-; CHECK-SM70-NEXT: add.s32 %r23, %r22, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r31, %r31;
-; CHECK-SM70-NEXT: or.b32 %r24, %r31, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r25, %r24, %r23, %p2;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r25; }
+; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_maxnum_no_nans_param_1];
+; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_maxnum_no_nans_param_0];
+; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r7, %r6, %r4, %r2;
+; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r7, %r7;
+; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT: max.f32 %r14, %r13, 0f00000000;
+; CHECK-SM70-NEXT: bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r14, %r14;
+; CHECK-SM70-NEXT: or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; CHECK-SM70-NEXT: ret;
%1 = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
@@ -541,7 +541,7 @@ define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x b
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<5>;
; CHECK-SM70-NEXT: .reg .b16 %rs<11>;
-; CHECK-SM70-NEXT: .reg .b32 %r<51>;
+; CHECK-SM70-NEXT: .reg .b32 %r<31>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_no_nans_param_0];
@@ -549,43 +549,43 @@ define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x b
; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_no_nans_param_2];
; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3;
; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1;
-; CHECK-SM70-NEXT: shl.b32 %r41, %r4, 16;
+; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r7, %rs3;
-; CHECK-SM70-NEXT: shl.b32 %r42, %r7, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3;
+; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r10, %rs5;
-; CHECK-SM70-NEXT: shl.b32 %r43, %r10, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r44, %r43, %r42, %r41;
-; CHECK-SM70-NEXT: bfe.u32 %r15, %r44, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r44;
-; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r44, %r44;
-; CHECK-SM70-NEXT: or.b32 %r18, %r44, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p1;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r19; }
-; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs2;
-; CHECK-SM70-NEXT: shl.b32 %r45, %r20, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r23, %rs4;
-; CHECK-SM70-NEXT: shl.b32 %r46, %r23, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r26, %rs6;
-; CHECK-SM70-NEXT: shl.b32 %r47, %r26, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r48, %r47, %r46, %r45;
-; CHECK-SM70-NEXT: bfe.u32 %r31, %r48, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r32, %r31, %r48;
-; CHECK-SM70-NEXT: add.s32 %r33, %r32, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r48, %r48;
-; CHECK-SM70-NEXT: or.b32 %r34, %r48, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r35, %r34, %r33, %p2;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r35; }
-; CHECK-SM70-NEXT: and.b32 %r49, %r19, -65536;
-; CHECK-SM70-NEXT: setp.gt.f32 %p3, %r49, 0f00000000;
-; CHECK-SM70-NEXT: and.b32 %r50, %r35, -65536;
-; CHECK-SM70-NEXT: setp.gt.f32 %p4, %r50, 0f00000000;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5;
+; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r10, %r9, %r7, %r5;
+; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10;
+; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r10, %r10;
+; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; }
+; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2;
+; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4;
+; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6;
+; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r22, %r21, %r19, %r17;
+; CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22;
+; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r22, %r22;
+; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; }
+; CHECK-SM70-NEXT: and.b32 %r28, %r15, -65536;
+; CHECK-SM70-NEXT: setp.gt.f32 %p3, %r28, 0f00000000;
+; CHECK-SM70-NEXT: and.b32 %r29, %r27, -65536;
+; CHECK-SM70-NEXT: setp.gt.f32 %p4, %r29, 0f00000000;
; CHECK-SM70-NEXT: selp.b16 %rs9, %rs8, 0x0000, %p4;
; CHECK-SM70-NEXT: selp.b16 %rs10, %rs7, 0x0000, %p3;
-; CHECK-SM70-NEXT: mov.b32 %r40, {%rs10, %rs9};
-; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r40;
+; CHECK-SM70-NEXT: mov.b32 %r30, {%rs10, %rs9};
+; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r30;
; CHECK-SM70-NEXT: ret;
%1 = call <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
%2 = fcmp ogt <2 x bfloat> %1, <bfloat 0.0, bfloat 0.0>
@@ -614,7 +614,7 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
; CHECK-FTZ-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma(
; CHECK-FTZ: {
; CHECK-FTZ-NEXT: .reg .b16 %rs<5>;
-; CHECK-FTZ-NEXT: .reg .b32 %r<26>;
+; CHECK-FTZ-NEXT: .reg .b32 %r<18>;
; CHECK-FTZ-EMPTY:
; CHECK-FTZ-NEXT: // %bb.0:
; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_2];
@@ -623,28 +623,28 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
; CHECK-FTZ-NEXT: fma.rn.bf16x2 %r4, %r3, %r2, %r1;
; CHECK-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r4;
; CHECK-FTZ-NEXT: cvt.u32.u16 %r5, %rs2;
-; CHECK-FTZ-NEXT: shl.b32 %r22, %r5, 16;
-; CHECK-FTZ-NEXT: add.ftz.f32 %r8, %r22, 0f40E00000;
-; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r8;
-; CHECK-FTZ-NEXT: cvt.u32.u16 %r9, %rs1;
-; CHECK-FTZ-NEXT: shl.b32 %r23, %r9, 16;
-; CHECK-FTZ-NEXT: add.ftz.f32 %r12, %r23, 0f40E00000;
-; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs4, %r12;
-; CHECK-FTZ-NEXT: cvt.u32.u16 %r13, %rs4;
-; CHECK-FTZ-NEXT: shl.b32 %r24, %r13, 16;
-; CHECK-FTZ-NEXT: add.ftz.f32 %r16, %r24, %r23;
-; CHECK-FTZ-NEXT: cvt.u32.u16 %r17, %rs3;
-; CHECK-FTZ-NEXT: shl.b32 %r25, %r17, 16;
-; CHECK-FTZ-NEXT: add.ftz.f32 %r20, %r25, %r22;
-; CHECK-FTZ-NEXT: cvt.rn.bf16x2.f32 %r21, %r20, %r16;
-; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r21;
+; CHECK-FTZ-NEXT: shl.b32 %r6, %r5, 16;
+; CHECK-FTZ-NEXT: add.ftz.f32 %r7, %r6, 0f40E00000;
+; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r7;
+; CHECK-FTZ-NEXT: cvt.u32.u16 %r8, %rs1;
+; CHECK-FTZ-NEXT: shl.b32 %r9, %r8, 16;
+; CHECK-FTZ-NEXT: add.ftz.f32 %r10, %r9, 0f40E00000;
+; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs4, %r10;
+; CHECK-FTZ-NEXT: cvt.u32.u16 %r11, %rs4;
+; CHECK-FTZ-NEXT: shl.b32 %r12, %r11, 16;
+; CHECK-FTZ-NEXT: add.ftz.f32 %r13, %r12, %r9;
+; CHECK-FTZ-NEXT: cvt.u32.u16 %r14, %rs3;
+; CHECK-FTZ-NEXT: shl.b32 %r15, %r14, 16;
+; CHECK-FTZ-NEXT: add.ftz.f32 %r16, %r15, %r6;
+; CHECK-FTZ-NEXT: cvt.rn.bf16x2.f32 %r17, %r16, %r13;
+; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r17;
; CHECK-FTZ-NEXT: ret;
;
; CHECK-SM70-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma(
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<7>;
; CHECK-SM70-NEXT: .reg .b16 %rs<7>;
-; CHECK-SM70-NEXT: .reg .b32 %r<89>;
+; CHECK-SM70-NEXT: .reg .b32 %r<57>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_0];
@@ -652,67 +652,67 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_2];
; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3;
; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs2;
-; CHECK-SM70-NEXT: shl.b32 %r73, %r4, 16;
+; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r7, %rs4;
-; CHECK-SM70-NEXT: shl.b32 %r74, %r7, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs4;
+; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r10, %rs6;
-; CHECK-SM70-NEXT: shl.b32 %r75, %r10, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r76, %r75, %r74, %r73;
-; CHECK-SM70-NEXT: bfe.u32 %r15, %r76, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r76;
-; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r76, %r76;
-; CHECK-SM70-NEXT: or.b32 %r18, %r76, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p1;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs1;
-; CHECK-SM70-NEXT: shl.b32 %r77, %r20, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r23, %rs3;
-; CHECK-SM70-NEXT: shl.b32 %r78, %r23, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r26, %rs5;
-; CHECK-SM70-NEXT: shl.b32 %r79, %r26, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r80, %r79, %r78, %r77;
-; CHECK-SM70-NEXT: bfe.u32 %r31, %r80, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r32, %r31, %r80;
-; CHECK-SM70-NEXT: add.s32 %r33, %r32, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r80, %r80;
-; CHECK-SM70-NEXT: or.b32 %r34, %r80, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r35, %r34, %r33, %p2;
-; CHECK-SM70-NEXT: and.b32 %r81, %r35, -65536;
-; CHECK-SM70-NEXT: add.f32 %r82, %r81, 0f40E00000;
-; CHECK-SM70-NEXT: bfe.u32 %r40, %r82, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r41, %r40, %r82;
-; CHECK-SM70-NEXT: add.s32 %r42, %r41, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p3, %r82, %r82;
-; CHECK-SM70-NEXT: or.b32 %r43, %r82, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r44, %r43, %r42, %p3;
-; CHECK-SM70-NEXT: and.b32 %r83, %r19, -65536;
-; CHECK-SM70-NEXT: add.f32 %r84, %r83, 0f40E00000;
-; CHECK-SM70-NEXT: bfe.u32 %r49, %r84, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r50, %r49, %r84;
-; CHECK-SM70-NEXT: add.s32 %r51, %r50, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p4, %r84, %r84;
-; CHECK-SM70-NEXT: or.b32 %r52, %r84, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r53, %r52, %r51, %p4;
-; CHECK-SM70-NEXT: and.b32 %r85, %r53, -65536;
-; CHECK-SM70-NEXT: add.f32 %r86, %r85, %r83;
-; CHECK-SM70-NEXT: bfe.u32 %r58, %r86, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r59, %r58, %r86;
-; CHECK-SM70-NEXT: add.s32 %r60, %r59, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p5, %r86, %r86;
-; CHECK-SM70-NEXT: or.b32 %r61, %r86, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r62, %r61, %r60, %p5;
-; CHECK-SM70-NEXT: and.b32 %r87, %r44, -65536;
-; CHECK-SM70-NEXT: add.f32 %r88, %r87, %r81;
-; CHECK-SM70-NEXT: bfe.u32 %r67, %r88, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r68, %r67, %r88;
-; CHECK-SM70-NEXT: add.s32 %r69, %r68, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p6, %r88, %r88;
-; CHECK-SM70-NEXT: or.b32 %r70, %r88, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r71, %r70, %r69, %p6;
-; CHECK-SM70-NEXT: prmt.b32 %r72, %r71, %r62, 0x7632U;
-; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r72;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs6;
+; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r10, %r9, %r7, %r5;
+; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10;
+; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r10, %r10;
+; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs1;
+; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs3;
+; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs5;
+; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r22, %r21, %r19, %r17;
+; CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22;
+; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r22, %r22;
+; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2;
+; CHECK-SM70-NEXT: and.b32 %r28, %r27, -65536;
+; CHECK-SM70-NEXT: add.f32 %r29, %r28, 0f40E00000;
+; CHECK-SM70-NEXT: bfe.u32 %r30, %r29, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r31, %r30, %r29;
+; CHECK-SM70-NEXT: add.s32 %r32, %r31, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p3, %r29, %r29;
+; CHECK-SM70-NEXT: or.b32 %r33, %r29, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r34, %r33, %r32, %p3;
+; CHECK-SM70-NEXT: and.b32 %r35, %r15, -65536;
+; CHECK-SM70-NEXT: add.f32 %r36, %r35, 0f40E00000;
+; CHECK-SM70-NEXT: bfe.u32 %r37, %r36, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r38, %r37, %r36;
+; CHECK-SM70-NEXT: add.s32 %r39, %r38, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p4, %r36, %r36;
+; CHECK-SM70-NEXT: or.b32 %r40, %r36, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r41, %r40, %r39, %p4;
+; CHECK-SM70-NEXT: and.b32 %r42, %r41, -65536;
+; CHECK-SM70-NEXT: add.f32 %r43, %r42, %r35;
+; CHECK-SM70-NEXT: bfe.u32 %r44, %r43, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r45, %r44, %r43;
+; CHECK-SM70-NEXT: add.s32 %r46, %r45, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p5, %r43, %r43;
+; CHECK-SM70-NEXT: or.b32 %r47, %r43, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r48, %r47, %r46, %p5;
+; CHECK-SM70-NEXT: and.b32 %r49, %r34, -65536;
+; CHECK-SM70-NEXT: add.f32 %r50, %r49, %r28;
+; CHECK-SM70-NEXT: bfe.u32 %r51, %r50, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r52, %r51, %r50;
+; CHECK-SM70-NEXT: add.s32 %r53, %r52, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p6, %r50, %r50;
+; CHECK-SM70-NEXT: or.b32 %r54, %r50, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r55, %r54, %r53, %p6;
+; CHECK-SM70-NEXT: prmt.b32 %r56, %r55, %r48, 0x7632U;
+; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r56;
; CHECK-SM70-NEXT: ret;
%1 = call <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
%2 = fcmp ogt <2 x bfloat> %1, <bfloat 0.0, bfloat 0.0>
@@ -751,7 +751,7 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b,
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<5>;
; CHECK-SM70-NEXT: .reg .b16 %rs<7>;
-; CHECK-SM70-NEXT: .reg .b32 %r<67>;
+; CHECK-SM70-NEXT: .reg .b32 %r<43>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_maxnum_no_nans_param_0];
@@ -759,51 +759,51 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b,
; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_maxnum_no_nans_param_2];
; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3;
; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1;
-; CHECK-SM70-NEXT: shl.b32 %r55, %r4, 16;
+; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r7, %rs3;
-; CHECK-SM70-NEXT: shl.b32 %r56, %r7, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3;
+; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r10, %rs5;
-; CHECK-SM70-NEXT: shl.b32 %r57, %r10, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r58, %r57, %r56, %r55;
-; CHECK-SM70-NEXT: bfe.u32 %r15, %r58, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r58;
-; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r58, %r58;
-; CHECK-SM70-NEXT: or.b32 %r18, %r58, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p1;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs2;
-; CHECK-SM70-NEXT: shl.b32 %r59, %r20, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r23, %rs4;
-; CHECK-SM70-NEXT: shl.b32 %r60, %r23, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r26, %rs6;
-; CHECK-SM70-NEXT: shl.b32 %r61, %r26, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r62, %r61, %r60, %r59;
-; CHECK-SM70-NEXT: bfe.u32 %r31, %r62, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r32, %r31, %r62;
-; CHECK-SM70-NEXT: add.s32 %r33, %r32, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r62, %r62;
-; CHECK-SM70-NEXT: or.b32 %r34, %r62, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r35, %r34, %r33, %p2;
-; CHECK-SM70-NEXT: and.b32 %r63, %r35, -65536;
-; CHECK-SM70-NEXT: max.f32 %r64, %r63, 0f00000000;
-; CHECK-SM70-NEXT: bfe.u32 %r40, %r64, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r41, %r40, %r64;
-; CHECK-SM70-NEXT: add.s32 %r42, %r41, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p3, %r64, %r64;
-; CHECK-SM70-NEXT: or.b32 %r43, %r64, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r44, %r43, %r42, %p3;
-; CHECK-SM70-NEXT: and.b32 %r65, %r19, -65536;
-; CHECK-SM70-NEXT: max.f32 %r66, %r65, 0f00000000;
-; CHECK-SM70-NEXT: bfe.u32 %r49, %r66, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r50, %r49, %r66;
-; CHECK-SM70-NEXT: add.s32 %r51, %r50, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p4, %r66, %r66;
-; CHECK-SM70-NEXT: or.b32 %r52, %r66, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r53, %r52, %r51, %p4;
-; CHECK-SM70-NEXT: prmt.b32 %r54, %r53, %r44, 0x7632U;
-; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r54;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5;
+; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r10, %r9, %r7, %r5;
+; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10;
+; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r10, %r10;
+; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2;
+; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4;
+; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6;
+; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r22, %r21, %r19, %r17;
+; CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22;
+; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r22, %r22;
+; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2;
+; CHECK-SM70-NEXT: and.b32 %r28, %r27, -65536;
+; CHECK-SM70-NEXT: max.f32 %r29, %r28, 0f00000000;
+; CHECK-SM70-NEXT: bfe.u32 %r30, %r29, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r31, %r30, %r29;
+; CHECK-SM70-NEXT: add.s32 %r32, %r31, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p3, %r29, %r29;
+; CHECK-SM70-NEXT: or.b32 %r33, %r29, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r34, %r33, %r32, %p3;
+; CHECK-SM70-NEXT: and.b32 %r35, %r15, -65536;
+; CHECK-SM70-NEXT: max.f32 %r36, %r35, 0f00000000;
+; CHECK-SM70-NEXT: bfe.u32 %r37, %r36, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r38, %r37, %r36;
+; CHECK-SM70-NEXT: add.s32 %r39, %r38, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p4, %r36, %r36;
+; CHECK-SM70-NEXT: or.b32 %r40, %r36, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r41, %r40, %r39, %p4;
+; CHECK-SM70-NEXT: prmt.b32 %r42, %r41, %r34, 0x7632U;
+; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r42;
; CHECK-SM70-NEXT: ret;
%1 = call <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
%2 = call <2 x bfloat> @llvm.maxnum.bf16x2(<2 x bfloat> %1, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll b/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll
index 3d70686951fee..6b462f8468596 100644
--- a/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll
@@ -194,25 +194,25 @@ define bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c) {
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<3>;
; CHECK-SM70-NEXT: .reg .b16 %rs<3>;
-; CHECK-SM70-NEXT: .reg .b32 %r<24>;
+; CHECK-SM70-NEXT: .reg .b32 %r<14>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_expanded_no_nans_param_2];
-; CHECK-SM70-NEXT: shl.b32 %r19, %r1, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r4, [fma_bf16_expanded_no_nans_param_1];
-; CHECK-SM70-NEXT: shl.b32 %r20, %r4, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r7, [fma_bf16_expanded_no_nans_param_0];
-; CHECK-SM70-NEXT: shl.b32 %r21, %r7, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r22, %r21, %r20, %r19;
-; CHECK-SM70-NEXT: bfe.u32 %r12, %r22, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r13, %r12, %r22;
-; CHECK-SM70-NEXT: add.s32 %r14, %r13, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r22, %r22;
-; CHECK-SM70-NEXT: or.b32 %r15, %r22, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r16, %r15, %r14, %p1;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r16; }
-; CHECK-SM70-NEXT: and.b32 %r23, %r16, -65536;
-; CHECK-SM70-NEXT: setp.gt.f32 %p2, %r23, 0f00000000;
+; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_expanded_no_nans_param_1];
+; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_expanded_no_nans_param_0];
+; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r7, %r6, %r4, %r2;
+; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r7, %r7;
+; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; }
+; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT: setp.gt.f32 %p2, %r13, 0f00000000;
; CHECK-SM70-NEXT: selp.b16 %rs2, %rs1, 0x0000, %p2;
; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs2;
; CHECK-SM70-NEXT: ret;
@@ -246,7 +246,7 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat
; CHECK-FTZ-LABEL: fma_bf16_expanded_no_nans_multiple_uses_of_fma(
; CHECK-FTZ: {
; CHECK-FTZ-NEXT: .reg .b16 %rs<9>;
-; CHECK-FTZ-NEXT: .reg .b32 %r<15>;
+; CHECK-FTZ-NEXT: .reg .b32 %r<9>;
; CHECK-FTZ-EMPTY:
; CHECK-FTZ-NEXT: // %bb.0:
; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0];
@@ -256,15 +256,15 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat
; CHECK-FTZ-NEXT: mov.b16 %rs5, 0x0000;
; CHECK-FTZ-NEXT: max.bf16 %rs6, %rs4, %rs5;
; CHECK-FTZ-NEXT: cvt.u32.u16 %r1, %rs4;
-; CHECK-FTZ-NEXT: shl.b32 %r12, %r1, 16;
-; CHECK-FTZ-NEXT: add.rn.ftz.f32 %r4, %r12, 0f40E00000;
-; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs7, %r4;
-; CHECK-FTZ-NEXT: cvt.u32.u16 %r5, %rs6;
-; CHECK-FTZ-NEXT: shl.b32 %r13, %r5, 16;
-; CHECK-FTZ-NEXT: cvt.u32.u16 %r8, %rs7;
-; CHECK-FTZ-NEXT: shl.b32 %r14, %r8, 16;
-; CHECK-FTZ-NEXT: add.rn.ftz.f32 %r11, %r13, %r14;
-; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs8, %r11;
+; CHECK-FTZ-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-FTZ-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40E00000;
+; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs7, %r3;
+; CHECK-FTZ-NEXT: cvt.u32.u16 %r4, %rs6;
+; CHECK-FTZ-NEXT: shl.b32 %r5, %r4, 16;
+; CHECK-FTZ-NEXT: cvt.u32.u16 %r6, %rs7;
+; CHECK-FTZ-NEXT: shl.b32 %r7, %r6, 16;
+; CHECK-FTZ-NEXT: add.rn.ftz.f32 %r8, %r5, %r7;
+; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs8, %r8;
; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs8;
; CHECK-FTZ-NEXT: ret;
;
@@ -272,44 +272,44 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<5>;
; CHECK-SM70-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM70-NEXT: .reg .b32 %r<47>;
+; CHECK-SM70-NEXT: .reg .b32 %r<29>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2];
-; CHECK-SM70-NEXT: shl.b32 %r38, %r1, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r4, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1];
-; CHECK-SM70-NEXT: shl.b32 %r39, %r4, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r7, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0];
-; CHECK-SM70-NEXT: shl.b32 %r40, %r7, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r41, %r40, %r39, %r38;
-; CHECK-SM70-NEXT: bfe.u32 %r12, %r41, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r13, %r12, %r41;
-; CHECK-SM70-NEXT: add.s32 %r14, %r13, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r41, %r41;
-; CHECK-SM70-NEXT: or.b32 %r15, %r41, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r16, %r15, %r14, %p1;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r16; }
-; CHECK-SM70-NEXT: and.b32 %r42, %r16, -65536;
-; CHECK-SM70-NEXT: setp.gt.f32 %p2, %r42, 0f00000000;
+; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1];
+; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0];
+; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r7, %r6, %r4, %r2;
+; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r7, %r7;
+; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; }
+; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT: setp.gt.f32 %p2, %r13, 0f00000000;
; CHECK-SM70-NEXT: selp.b16 %rs2, %rs1, 0x0000, %p2;
-; CHECK-SM70-NEXT: add.rn.f32 %r43, %r42, 0f40E00000;
-; CHECK-SM70-NEXT: bfe.u32 %r21, %r43, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r22, %r21, %r43;
-; CHECK-SM70-NEXT: add.s32 %r23, %r22, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p3, %r43, %r43;
-; CHECK-SM70-NEXT: or.b32 %r24, %r43, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r25, %r24, %r23, %p3;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r26, %rs2;
-; CHECK-SM70-NEXT: shl.b32 %r44, %r26, 16;
-; CHECK-SM70-NEXT: and.b32 %r45, %r25, -65536;
-; CHECK-SM70-NEXT: add.rn.f32 %r46, %r44, %r45;
-; CHECK-SM70-NEXT: bfe.u32 %r33, %r46, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r34, %r33, %r46;
-; CHECK-SM70-NEXT: add.s32 %r35, %r34, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p4, %r46, %r46;
-; CHECK-SM70-NEXT: or.b32 %r36, %r46, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r37, %r36, %r35, %p4;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs3}, %r37; }
+; CHECK-SM70-NEXT: add.rn.f32 %r14, %r13, 0f40E00000;
+; CHECK-SM70-NEXT: bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p3, %r14, %r14;
+; CHECK-SM70-NEXT: or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p3;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs2;
+; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16;
+; CHECK-SM70-NEXT: and.b32 %r22, %r19, -65536;
+; CHECK-SM70-NEXT: add.rn.f32 %r23, %r21, %r22;
+; CHECK-SM70-NEXT: bfe.u32 %r24, %r23, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r25, %r24, %r23;
+; CHECK-SM70-NEXT: add.s32 %r26, %r25, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p4, %r23, %r23;
+; CHECK-SM70-NEXT: or.b32 %r27, %r23, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r28, %r27, %r26, %p4;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs3}, %r28; }
; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs3;
; CHECK-SM70-NEXT: ret;
%1 = fmul fast bfloat %a, %b
@@ -352,31 +352,31 @@ define bfloat @fma_bf16_expanded_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c)
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<3>;
; CHECK-SM70-NEXT: .reg .b16 %rs<2>;
-; CHECK-SM70-NEXT: .reg .b32 %r<32>;
+; CHECK-SM70-NEXT: .reg .b32 %r<20>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_expanded_maxnum_no_nans_param_2];
-; CHECK-SM70-NEXT: shl.b32 %r26, %r1, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r4, [fma_bf16_expanded_maxnum_no_nans_param_1];
-; CHECK-SM70-NEXT: shl.b32 %r27, %r4, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r7, [fma_bf16_expanded_maxnum_no_nans_param_0];
-; CHECK-SM70-NEXT: shl.b32 %r28, %r7, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r29, %r28, %r27, %r26;
-; CHECK-SM70-NEXT: bfe.u32 %r12, %r29, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r13, %r12, %r29;
-; CHECK-SM70-NEXT: add.s32 %r14, %r13, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r29, %r29;
-; CHECK-SM70-NEXT: or.b32 %r15, %r29, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r16, %r15, %r14, %p1;
-; CHECK-SM70-NEXT: and.b32 %r30, %r16, -65536;
-; CHECK-SM70-NEXT: max.f32 %r31, %r30, 0f00000000;
-; CHECK-SM70-NEXT: bfe.u32 %r21, %r31, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r22, %r21, %r31;
-; CHECK-SM70-NEXT: add.s32 %r23, %r22, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r31, %r31;
-; CHECK-SM70-NEXT: or.b32 %r24, %r31, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r25, %r24, %r23, %p2;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r25; }
+; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_expanded_maxnum_no_nans_param_1];
+; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_expanded_maxnum_no_nans_param_0];
+; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r7, %r6, %r4, %r2;
+; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r7, %r7;
+; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT: max.f32 %r14, %r13, 0f00000000;
+; CHECK-SM70-NEXT: bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r14, %r14;
+; CHECK-SM70-NEXT: or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; CHECK-SM70-NEXT: ret;
%1 = fmul fast bfloat %a, %b
@@ -587,7 +587,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> %
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<5>;
; CHECK-SM70-NEXT: .reg .b16 %rs<11>;
-; CHECK-SM70-NEXT: .reg .b32 %r<51>;
+; CHECK-SM70-NEXT: .reg .b32 %r<31>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_param_0];
@@ -595,43 +595,43 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans(<2 x bfloat> %a, <2 x bfloat> %
; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_param_2];
; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3;
; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1;
-; CHECK-SM70-NEXT: shl.b32 %r41, %r4, 16;
+; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r7, %rs3;
-; CHECK-SM70-NEXT: shl.b32 %r42, %r7, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3;
+; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r10, %rs5;
-; CHECK-SM70-NEXT: shl.b32 %r43, %r10, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r44, %r43, %r42, %r41;
-; CHECK-SM70-NEXT: bfe.u32 %r15, %r44, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r44;
-; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r44, %r44;
-; CHECK-SM70-NEXT: or.b32 %r18, %r44, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p1;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r19; }
-; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs2;
-; CHECK-SM70-NEXT: shl.b32 %r45, %r20, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r23, %rs4;
-; CHECK-SM70-NEXT: shl.b32 %r46, %r23, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r26, %rs6;
-; CHECK-SM70-NEXT: shl.b32 %r47, %r26, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r48, %r47, %r46, %r45;
-; CHECK-SM70-NEXT: bfe.u32 %r31, %r48, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r32, %r31, %r48;
-; CHECK-SM70-NEXT: add.s32 %r33, %r32, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r48, %r48;
-; CHECK-SM70-NEXT: or.b32 %r34, %r48, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r35, %r34, %r33, %p2;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r35; }
-; CHECK-SM70-NEXT: and.b32 %r49, %r19, -65536;
-; CHECK-SM70-NEXT: setp.gt.f32 %p3, %r49, 0f00000000;
-; CHECK-SM70-NEXT: and.b32 %r50, %r35, -65536;
-; CHECK-SM70-NEXT: setp.gt.f32 %p4, %r50, 0f00000000;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5;
+; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r10, %r9, %r7, %r5;
+; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10;
+; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r10, %r10;
+; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; }
+; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2;
+; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4;
+; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6;
+; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r22, %r21, %r19, %r17;
+; CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22;
+; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r22, %r22;
+; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; }
+; CHECK-SM70-NEXT: and.b32 %r28, %r15, -65536;
+; CHECK-SM70-NEXT: setp.gt.f32 %p3, %r28, 0f00000000;
+; CHECK-SM70-NEXT: and.b32 %r29, %r27, -65536;
+; CHECK-SM70-NEXT: setp.gt.f32 %p4, %r29, 0f00000000;
; CHECK-SM70-NEXT: selp.b16 %rs9, %rs8, 0x0000, %p4;
; CHECK-SM70-NEXT: selp.b16 %rs10, %rs7, 0x0000, %p3;
-; CHECK-SM70-NEXT: mov.b32 %r40, {%rs10, %rs9};
-; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r40;
+; CHECK-SM70-NEXT: mov.b32 %r30, {%rs10, %rs9};
+; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r30;
; CHECK-SM70-NEXT: ret;
%1 = fmul fast <2 x bfloat> %a, %b
%2 = fadd fast <2 x bfloat> %1, %c
@@ -663,7 +663,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
; CHECK-FTZ-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(
; CHECK-FTZ: {
; CHECK-FTZ-NEXT: .reg .b16 %rs<7>;
-; CHECK-FTZ-NEXT: .reg .b32 %r<36>;
+; CHECK-FTZ-NEXT: .reg .b32 %r<24>;
; CHECK-FTZ-EMPTY:
; CHECK-FTZ-NEXT: // %bb.0:
; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_2];
@@ -674,33 +674,33 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
; CHECK-FTZ-NEXT: max.bf16x2 %r6, %r4, %r5;
; CHECK-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r4;
; CHECK-FTZ-NEXT: cvt.u32.u16 %r7, %rs2;
-; CHECK-FTZ-NEXT: shl.b32 %r30, %r7, 16;
-; CHECK-FTZ-NEXT: add.rn.ftz.f32 %r10, %r30, 0f40E00000;
-; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r10;
-; CHECK-FTZ-NEXT: cvt.u32.u16 %r11, %rs1;
-; CHECK-FTZ-NEXT: shl.b32 %r31, %r11, 16;
-; CHECK-FTZ-NEXT: add.rn.ftz.f32 %r14, %r31, 0f40E00000;
-; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs4, %r14;
+; CHECK-FTZ-NEXT: shl.b32 %r8, %r7, 16;
+; CHECK-FTZ-NEXT: add.rn.ftz.f32 %r9, %r8, 0f40E00000;
+; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r9;
+; CHECK-FTZ-NEXT: cvt.u32.u16 %r10, %rs1;
+; CHECK-FTZ-NEXT: shl.b32 %r11, %r10, 16;
+; CHECK-FTZ-NEXT: add.rn.ftz.f32 %r12, %r11, 0f40E00000;
+; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs4, %r12;
; CHECK-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r6;
-; CHECK-FTZ-NEXT: cvt.u32.u16 %r15, %rs5;
-; CHECK-FTZ-NEXT: shl.b32 %r32, %r15, 16;
-; CHECK-FTZ-NEXT: cvt.u32.u16 %r18, %rs4;
-; CHECK-FTZ-NEXT: shl.b32 %r33, %r18, 16;
-; CHECK-FTZ-NEXT: add.rn.ftz.f32 %r21, %r32, %r33;
-; CHECK-FTZ-NEXT: cvt.u32.u16 %r22, %rs6;
-; CHECK-FTZ-NEXT: shl.b32 %r34, %r22, 16;
-; CHECK-FTZ-NEXT: cvt.u32.u16 %r25, %rs3;
-; CHECK-FTZ-NEXT: shl.b32 %r35, %r25, 16;
-; CHECK-FTZ-NEXT: add.rn.ftz.f32 %r28, %r34, %r35;
-; CHECK-FTZ-NEXT: cvt.rn.bf16x2.f32 %r29, %r28, %r21;
-; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r29;
+; CHECK-FTZ-NEXT: cvt.u32.u16 %r13, %rs5;
+; CHECK-FTZ-NEXT: shl.b32 %r14, %r13, 16;
+; CHECK-FTZ-NEXT: cvt.u32.u16 %r15, %rs4;
+; CHECK-FTZ-NEXT: shl.b32 %r16, %r15, 16;
+; CHECK-FTZ-NEXT: add.rn.ftz.f32 %r17, %r14, %r16;
+; CHECK-FTZ-NEXT: cvt.u32.u16 %r18, %rs6;
+; CHECK-FTZ-NEXT: shl.b32 %r19, %r18, 16;
+; CHECK-FTZ-NEXT: cvt.u32.u16 %r20, %rs3;
+; CHECK-FTZ-NEXT: shl.b32 %r21, %r20, 16;
+; CHECK-FTZ-NEXT: add.rn.ftz.f32 %r22, %r19, %r21;
+; CHECK-FTZ-NEXT: cvt.rn.bf16x2.f32 %r23, %r22, %r17;
+; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r23;
; CHECK-FTZ-NEXT: ret;
;
; CHECK-SM70-LABEL: fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<9>;
; CHECK-SM70-NEXT: .reg .b16 %rs<11>;
-; CHECK-SM70-NEXT: .reg .b32 %r<97>;
+; CHECK-SM70-NEXT: .reg .b32 %r<61>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_0];
@@ -708,77 +708,77 @@ define <2 x bfloat> @fma_bf16x2_expanded_no_nans_multiple_uses_of_fma(<2 x bfloa
; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_no_nans_multiple_uses_of_fma_param_2];
; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3;
; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs2;
-; CHECK-SM70-NEXT: shl.b32 %r79, %r4, 16;
+; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r7, %rs4;
-; CHECK-SM70-NEXT: shl.b32 %r80, %r7, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs4;
+; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r10, %rs6;
-; CHECK-SM70-NEXT: shl.b32 %r81, %r10, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r82, %r81, %r80, %r79;
-; CHECK-SM70-NEXT: bfe.u32 %r15, %r82, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r82;
-; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r82, %r82;
-; CHECK-SM70-NEXT: or.b32 %r18, %r82, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p1;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r19; }
-; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs1;
-; CHECK-SM70-NEXT: shl.b32 %r83, %r20, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r23, %rs3;
-; CHECK-SM70-NEXT: shl.b32 %r84, %r23, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r26, %rs5;
-; CHECK-SM70-NEXT: shl.b32 %r85, %r26, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r86, %r85, %r84, %r83;
-; CHECK-SM70-NEXT: bfe.u32 %r31, %r86, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r32, %r31, %r86;
-; CHECK-SM70-NEXT: add.s32 %r33, %r32, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r86, %r86;
-; CHECK-SM70-NEXT: or.b32 %r34, %r86, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r35, %r34, %r33, %p2;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r35; }
-; CHECK-SM70-NEXT: and.b32 %r87, %r19, -65536;
-; CHECK-SM70-NEXT: setp.gt.f32 %p3, %r87, 0f00000000;
-; CHECK-SM70-NEXT: and.b32 %r88, %r35, -65536;
-; CHECK-SM70-NEXT: setp.gt.f32 %p4, %r88, 0f00000000;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs6;
+; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r10, %r9, %r7, %r5;
+; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10;
+; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r10, %r10;
+; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; }
+; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs1;
+; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs3;
+; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs5;
+; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r22, %r21, %r19, %r17;
+; CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22;
+; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r22, %r22;
+; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; }
+; CHECK-SM70-NEXT: and.b32 %r28, %r15, -65536;
+; CHECK-SM70-NEXT: setp.gt.f32 %p3, %r28, 0f00000000;
+; CHECK-SM70-NEXT: and.b32 %r29, %r27, -65536;
+; CHECK-SM70-NEXT: setp.gt.f32 %p4, %r29, 0f00000000;
; CHECK-SM70-NEXT: selp.b16 %rs9, %rs8, 0x0000, %p4;
; CHECK-SM70-NEXT: selp.b16 %rs10, %rs7, 0x0000, %p3;
-; CHECK-SM70-NEXT: add.rn.f32 %r89, %r88, 0f40E00000;
-; CHECK-SM70-NEXT: bfe.u32 %r42, %r89, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r43, %r42, %r89;
-; CHECK-SM70-NEXT: add.s32 %r44, %r43, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p5, %r89, %r89;
-; CHECK-SM70-NEXT: or.b32 %r45, %r89, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r46, %r45, %r44, %p5;
-; CHECK-SM70-NEXT: add.rn.f32 %r90, %r87, 0f40E00000;
-; CHECK-SM70-NEXT: bfe.u32 %r49, %r90, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r50, %r49, %r90;
-; CHECK-SM70-NEXT: add.s32 %r51, %r50, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p6, %r90, %r90;
-; CHECK-SM70-NEXT: or.b32 %r52, %r90, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r53, %r52, %r51, %p6;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r54, %rs10;
-; CHECK-SM70-NEXT: shl.b32 %r91, %r54, 16;
-; CHECK-SM70-NEXT: and.b32 %r92, %r53, -65536;
-; CHECK-SM70-NEXT: add.rn.f32 %r93, %r91, %r92;
-; CHECK-SM70-NEXT: bfe.u32 %r61, %r93, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r62, %r61, %r93;
-; CHECK-SM70-NEXT: add.s32 %r63, %r62, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p7, %r93, %r93;
-; CHECK-SM70-NEXT: or.b32 %r64, %r93, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r65, %r64, %r63, %p7;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r66, %rs9;
-; CHECK-SM70-NEXT: shl.b32 %r94, %r66, 16;
-; CHECK-SM70-NEXT: and.b32 %r95, %r46, -65536;
-; CHECK-SM70-NEXT: add.rn.f32 %r96, %r94, %r95;
-; CHECK-SM70-NEXT: bfe.u32 %r73, %r96, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r74, %r73, %r96;
-; CHECK-SM70-NEXT: add.s32 %r75, %r74, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p8, %r96, %r96;
-; CHECK-SM70-NEXT: or.b32 %r76, %r96, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r77, %r76, %r75, %p8;
-; CHECK-SM70-NEXT: prmt.b32 %r78, %r77, %r65, 0x7632U;
-; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r78;
+; CHECK-SM70-NEXT: add.rn.f32 %r30, %r29, 0f40E00000;
+; CHECK-SM70-NEXT: bfe.u32 %r31, %r30, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r32, %r31, %r30;
+; CHECK-SM70-NEXT: add.s32 %r33, %r32, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p5, %r30, %r30;
+; CHECK-SM70-NEXT: or.b32 %r34, %r30, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r35, %r34, %r33, %p5;
+; CHECK-SM70-NEXT: add.rn.f32 %r36, %r28, 0f40E00000;
+; CHECK-SM70-NEXT: bfe.u32 %r37, %r36, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r38, %r37, %r36;
+; CHECK-SM70-NEXT: add.s32 %r39, %r38, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p6, %r36, %r36;
+; CHECK-SM70-NEXT: or.b32 %r40, %r36, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r41, %r40, %r39, %p6;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r42, %rs10;
+; CHECK-SM70-NEXT: shl.b32 %r43, %r42, 16;
+; CHECK-SM70-NEXT: and.b32 %r44, %r41, -65536;
+; CHECK-SM70-NEXT: add.rn.f32 %r45, %r43, %r44;
+; CHECK-SM70-NEXT: bfe.u32 %r46, %r45, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r47, %r46, %r45;
+; CHECK-SM70-NEXT: add.s32 %r48, %r47, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p7, %r45, %r45;
+; CHECK-SM70-NEXT: or.b32 %r49, %r45, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r50, %r49, %r48, %p7;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r51, %rs9;
+; CHECK-SM70-NEXT: shl.b32 %r52, %r51, 16;
+; CHECK-SM70-NEXT: and.b32 %r53, %r35, -65536;
+; CHECK-SM70-NEXT: add.rn.f32 %r54, %r52, %r53;
+; CHECK-SM70-NEXT: bfe.u32 %r55, %r54, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r56, %r55, %r54;
+; CHECK-SM70-NEXT: add.s32 %r57, %r56, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p8, %r54, %r54;
+; CHECK-SM70-NEXT: or.b32 %r58, %r54, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r59, %r58, %r57, %p8;
+; CHECK-SM70-NEXT: prmt.b32 %r60, %r59, %r50, 0x7632U;
+; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r60;
; CHECK-SM70-NEXT: ret;
%1 = fmul fast <2 x bfloat> %a, %b
%2 = fadd fast <2 x bfloat> %1, %c
@@ -818,7 +818,7 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<5>;
; CHECK-SM70-NEXT: .reg .b16 %rs<7>;
-; CHECK-SM70-NEXT: .reg .b32 %r<67>;
+; CHECK-SM70-NEXT: .reg .b32 %r<43>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_expanded_maxnum_no_nans_param_0];
@@ -826,51 +826,51 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_expanded_maxnum_no_nans_param_2];
; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3;
; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1;
-; CHECK-SM70-NEXT: shl.b32 %r55, %r4, 16;
+; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r7, %rs3;
-; CHECK-SM70-NEXT: shl.b32 %r56, %r7, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3;
+; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r10, %rs5;
-; CHECK-SM70-NEXT: shl.b32 %r57, %r10, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r58, %r57, %r56, %r55;
-; CHECK-SM70-NEXT: bfe.u32 %r15, %r58, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r58;
-; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r58, %r58;
-; CHECK-SM70-NEXT: or.b32 %r18, %r58, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p1;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs2;
-; CHECK-SM70-NEXT: shl.b32 %r59, %r20, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r23, %rs4;
-; CHECK-SM70-NEXT: shl.b32 %r60, %r23, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r26, %rs6;
-; CHECK-SM70-NEXT: shl.b32 %r61, %r26, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r62, %r61, %r60, %r59;
-; CHECK-SM70-NEXT: bfe.u32 %r31, %r62, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r32, %r31, %r62;
-; CHECK-SM70-NEXT: add.s32 %r33, %r32, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r62, %r62;
-; CHECK-SM70-NEXT: or.b32 %r34, %r62, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r35, %r34, %r33, %p2;
-; CHECK-SM70-NEXT: and.b32 %r63, %r35, -65536;
-; CHECK-SM70-NEXT: max.f32 %r64, %r63, 0f00000000;
-; CHECK-SM70-NEXT: bfe.u32 %r40, %r64, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r41, %r40, %r64;
-; CHECK-SM70-NEXT: add.s32 %r42, %r41, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p3, %r64, %r64;
-; CHECK-SM70-NEXT: or.b32 %r43, %r64, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r44, %r43, %r42, %p3;
-; CHECK-SM70-NEXT: and.b32 %r65, %r19, -65536;
-; CHECK-SM70-NEXT: max.f32 %r66, %r65, 0f00000000;
-; CHECK-SM70-NEXT: bfe.u32 %r49, %r66, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r50, %r49, %r66;
-; CHECK-SM70-NEXT: add.s32 %r51, %r50, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p4, %r66, %r66;
-; CHECK-SM70-NEXT: or.b32 %r52, %r66, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r53, %r52, %r51, %p4;
-; CHECK-SM70-NEXT: prmt.b32 %r54, %r53, %r44, 0x7632U;
-; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r54;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5;
+; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r10, %r9, %r7, %r5;
+; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10;
+; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r10, %r10;
+; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2;
+; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4;
+; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6;
+; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r22, %r21, %r19, %r17;
+; CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22;
+; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r22, %r22;
+; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2;
+; CHECK-SM70-NEXT: and.b32 %r28, %r27, -65536;
+; CHECK-SM70-NEXT: max.f32 %r29, %r28, 0f00000000;
+; CHECK-SM70-NEXT: bfe.u32 %r30, %r29, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r31, %r30, %r29;
+; CHECK-SM70-NEXT: add.s32 %r32, %r31, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p3, %r29, %r29;
+; CHECK-SM70-NEXT: or.b32 %r33, %r29, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r34, %r33, %r32, %p3;
+; CHECK-SM70-NEXT: and.b32 %r35, %r15, -65536;
+; CHECK-SM70-NEXT: max.f32 %r36, %r35, 0f00000000;
+; CHECK-SM70-NEXT: bfe.u32 %r37, %r36, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r38, %r37, %r36;
+; CHECK-SM70-NEXT: add.s32 %r39, %r38, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p4, %r36, %r36;
+; CHECK-SM70-NEXT: or.b32 %r40, %r36, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r41, %r40, %r39, %p4;
+; CHECK-SM70-NEXT: prmt.b32 %r42, %r41, %r34, 0x7632U;
+; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r42;
; CHECK-SM70-NEXT: ret;
%1 = fmul fast <2 x bfloat> %a, %b
%2 = fadd fast <2 x bfloat> %1, %c
@@ -1052,25 +1052,25 @@ define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c) {
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<3>;
; CHECK-SM70-NEXT: .reg .b16 %rs<3>;
-; CHECK-SM70-NEXT: .reg .b32 %r<24>;
+; CHECK-SM70-NEXT: .reg .b32 %r<14>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_no_nans_param_2];
-; CHECK-SM70-NEXT: shl.b32 %r19, %r1, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r4, [fma_bf16_no_nans_param_1];
-; CHECK-SM70-NEXT: shl.b32 %r20, %r4, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r7, [fma_bf16_no_nans_param_0];
-; CHECK-SM70-NEXT: shl.b32 %r21, %r7, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r22, %r21, %r20, %r19;
-; CHECK-SM70-NEXT: bfe.u32 %r12, %r22, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r13, %r12, %r22;
-; CHECK-SM70-NEXT: add.s32 %r14, %r13, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r22, %r22;
-; CHECK-SM70-NEXT: or.b32 %r15, %r22, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r16, %r15, %r14, %p1;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r16; }
-; CHECK-SM70-NEXT: and.b32 %r23, %r16, -65536;
-; CHECK-SM70-NEXT: setp.gt.f32 %p2, %r23, 0f00000000;
+; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_no_nans_param_1];
+; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_no_nans_param_0];
+; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r7, %r6, %r4, %r2;
+; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r7, %r7;
+; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r12; }
+; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT: setp.gt.f32 %p2, %r13, 0f00000000;
; CHECK-SM70-NEXT: selp.b16 %rs2, %rs1, 0x0000, %p2;
; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs2;
; CHECK-SM70-NEXT: ret;
@@ -1101,7 +1101,7 @@ define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloa
; CHECK-FTZ-LABEL: fma_bf16_no_nans_multiple_uses_of_fma(
; CHECK-FTZ: {
; CHECK-FTZ-NEXT: .reg .b16 %rs<7>;
-; CHECK-FTZ-NEXT: .reg .b32 %r<11>;
+; CHECK-FTZ-NEXT: .reg .b32 %r<7>;
; CHECK-FTZ-EMPTY:
; CHECK-FTZ-NEXT: // %bb.0:
; CHECK-FTZ-NEXT: ld.param.b16 %rs1, [fma_bf16_no_nans_multiple_uses_of_fma_param_0];
@@ -1109,13 +1109,13 @@ define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloa
; CHECK-FTZ-NEXT: ld.param.b16 %rs3, [fma_bf16_no_nans_multiple_uses_of_fma_param_2];
; CHECK-FTZ-NEXT: fma.rn.bf16 %rs4, %rs1, %rs2, %rs3;
; CHECK-FTZ-NEXT: cvt.u32.u16 %r1, %rs4;
-; CHECK-FTZ-NEXT: shl.b32 %r9, %r1, 16;
-; CHECK-FTZ-NEXT: add.rn.ftz.f32 %r4, %r9, 0f40E00000;
-; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %r4;
-; CHECK-FTZ-NEXT: cvt.u32.u16 %r5, %rs5;
-; CHECK-FTZ-NEXT: shl.b32 %r10, %r5, 16;
-; CHECK-FTZ-NEXT: add.rn.ftz.f32 %r8, %r10, %r9;
-; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %r8;
+; CHECK-FTZ-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-FTZ-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40E00000;
+; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %r3;
+; CHECK-FTZ-NEXT: cvt.u32.u16 %r4, %rs5;
+; CHECK-FTZ-NEXT: shl.b32 %r5, %r4, 16;
+; CHECK-FTZ-NEXT: add.rn.ftz.f32 %r6, %r5, %r2;
+; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %r6;
; CHECK-FTZ-NEXT: st.param.b16 [func_retval0], %rs6;
; CHECK-FTZ-NEXT: ret;
;
@@ -1123,39 +1123,39 @@ define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloa
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<4>;
; CHECK-SM70-NEXT: .reg .b16 %rs<2>;
-; CHECK-SM70-NEXT: .reg .b32 %r<43>;
+; CHECK-SM70-NEXT: .reg .b32 %r<27>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_no_nans_multiple_uses_of_fma_param_2];
-; CHECK-SM70-NEXT: shl.b32 %r35, %r1, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r4, [fma_bf16_no_nans_multiple_uses_of_fma_param_1];
-; CHECK-SM70-NEXT: shl.b32 %r36, %r4, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r7, [fma_bf16_no_nans_multiple_uses_of_fma_param_0];
-; CHECK-SM70-NEXT: shl.b32 %r37, %r7, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r38, %r37, %r36, %r35;
-; CHECK-SM70-NEXT: bfe.u32 %r12, %r38, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r13, %r12, %r38;
-; CHECK-SM70-NEXT: add.s32 %r14, %r13, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r38, %r38;
-; CHECK-SM70-NEXT: or.b32 %r15, %r38, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r16, %r15, %r14, %p1;
-; CHECK-SM70-NEXT: and.b32 %r39, %r16, -65536;
-; CHECK-SM70-NEXT: add.rn.f32 %r40, %r39, 0f40E00000;
-; CHECK-SM70-NEXT: bfe.u32 %r21, %r40, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r22, %r21, %r40;
-; CHECK-SM70-NEXT: add.s32 %r23, %r22, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r40, %r40;
-; CHECK-SM70-NEXT: or.b32 %r24, %r40, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r25, %r24, %r23, %p2;
-; CHECK-SM70-NEXT: and.b32 %r41, %r25, -65536;
-; CHECK-SM70-NEXT: add.rn.f32 %r42, %r41, %r39;
-; CHECK-SM70-NEXT: bfe.u32 %r30, %r42, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r31, %r30, %r42;
-; CHECK-SM70-NEXT: add.s32 %r32, %r31, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p3, %r42, %r42;
-; CHECK-SM70-NEXT: or.b32 %r33, %r42, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r34, %r33, %r32, %p3;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r34; }
+; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_no_nans_multiple_uses_of_fma_param_1];
+; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_no_nans_multiple_uses_of_fma_param_0];
+; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r7, %r6, %r4, %r2;
+; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r7, %r7;
+; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT: add.rn.f32 %r14, %r13, 0f40E00000;
+; CHECK-SM70-NEXT: bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r14, %r14;
+; CHECK-SM70-NEXT: or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT: and.b32 %r20, %r19, -65536;
+; CHECK-SM70-NEXT: add.rn.f32 %r21, %r20, %r13;
+; CHECK-SM70-NEXT: bfe.u32 %r22, %r21, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r23, %r22, %r21;
+; CHECK-SM70-NEXT: add.s32 %r24, %r23, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p3, %r21, %r21;
+; CHECK-SM70-NEXT: or.b32 %r25, %r21, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r26, %r25, %r24, %p3;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r26; }
; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; CHECK-SM70-NEXT: ret;
%1 = call nnan bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
@@ -1195,31 +1195,31 @@ define bfloat @fma_bf16_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c) {
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<3>;
; CHECK-SM70-NEXT: .reg .b16 %rs<2>;
-; CHECK-SM70-NEXT: .reg .b32 %r<32>;
+; CHECK-SM70-NEXT: .reg .b32 %r<20>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_maxnum_no_nans_param_2];
-; CHECK-SM70-NEXT: shl.b32 %r26, %r1, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r4, [fma_bf16_maxnum_no_nans_param_1];
-; CHECK-SM70-NEXT: shl.b32 %r27, %r4, 16;
-; CHECK-SM70-NEXT: ld.param.b16 %r7, [fma_bf16_maxnum_no_nans_param_0];
-; CHECK-SM70-NEXT: shl.b32 %r28, %r7, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r29, %r28, %r27, %r26;
-; CHECK-SM70-NEXT: bfe.u32 %r12, %r29, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r13, %r12, %r29;
-; CHECK-SM70-NEXT: add.s32 %r14, %r13, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r29, %r29;
-; CHECK-SM70-NEXT: or.b32 %r15, %r29, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r16, %r15, %r14, %p1;
-; CHECK-SM70-NEXT: and.b32 %r30, %r16, -65536;
-; CHECK-SM70-NEXT: max.f32 %r31, %r30, 0f00000000;
-; CHECK-SM70-NEXT: bfe.u32 %r21, %r31, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r22, %r21, %r31;
-; CHECK-SM70-NEXT: add.s32 %r23, %r22, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r31, %r31;
-; CHECK-SM70-NEXT: or.b32 %r24, %r31, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r25, %r24, %r23, %p2;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r25; }
+; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_maxnum_no_nans_param_1];
+; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
+; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_maxnum_no_nans_param_0];
+; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r7, %r6, %r4, %r2;
+; CHECK-SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r9, %r8, %r7;
+; CHECK-SM70-NEXT: add.s32 %r10, %r9, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r7, %r7;
+; CHECK-SM70-NEXT: or.b32 %r11, %r7, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
+; CHECK-SM70-NEXT: and.b32 %r13, %r12, -65536;
+; CHECK-SM70-NEXT: max.f32 %r14, %r13, 0f00000000;
+; CHECK-SM70-NEXT: bfe.u32 %r15, %r14, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r14;
+; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r14, %r14;
+; CHECK-SM70-NEXT: or.b32 %r18, %r14, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p2;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r19; }
; CHECK-SM70-NEXT: st.param.b16 [func_retval0], %rs1;
; CHECK-SM70-NEXT: ret;
%1 = call nnan bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
@@ -1414,7 +1414,7 @@ define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x b
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<5>;
; CHECK-SM70-NEXT: .reg .b16 %rs<11>;
-; CHECK-SM70-NEXT: .reg .b32 %r<51>;
+; CHECK-SM70-NEXT: .reg .b32 %r<31>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_no_nans_param_0];
@@ -1422,43 +1422,43 @@ define <2 x bfloat> @fma_bf16x2_no_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x b
; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_no_nans_param_2];
; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3;
; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1;
-; CHECK-SM70-NEXT: shl.b32 %r41, %r4, 16;
+; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r7, %rs3;
-; CHECK-SM70-NEXT: shl.b32 %r42, %r7, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3;
+; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r10, %rs5;
-; CHECK-SM70-NEXT: shl.b32 %r43, %r10, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r44, %r43, %r42, %r41;
-; CHECK-SM70-NEXT: bfe.u32 %r15, %r44, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r44;
-; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r44, %r44;
-; CHECK-SM70-NEXT: or.b32 %r18, %r44, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p1;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r19; }
-; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs2;
-; CHECK-SM70-NEXT: shl.b32 %r45, %r20, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r23, %rs4;
-; CHECK-SM70-NEXT: shl.b32 %r46, %r23, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r26, %rs6;
-; CHECK-SM70-NEXT: shl.b32 %r47, %r26, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r48, %r47, %r46, %r45;
-; CHECK-SM70-NEXT: bfe.u32 %r31, %r48, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r32, %r31, %r48;
-; CHECK-SM70-NEXT: add.s32 %r33, %r32, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r48, %r48;
-; CHECK-SM70-NEXT: or.b32 %r34, %r48, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r35, %r34, %r33, %p2;
-; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r35; }
-; CHECK-SM70-NEXT: and.b32 %r49, %r19, -65536;
-; CHECK-SM70-NEXT: setp.gt.f32 %p3, %r49, 0f00000000;
-; CHECK-SM70-NEXT: and.b32 %r50, %r35, -65536;
-; CHECK-SM70-NEXT: setp.gt.f32 %p4, %r50, 0f00000000;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5;
+; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r10, %r9, %r7, %r5;
+; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10;
+; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r10, %r10;
+; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r15; }
+; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2;
+; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4;
+; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6;
+; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r22, %r21, %r19, %r17;
+; CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22;
+; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r22, %r22;
+; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2;
+; CHECK-SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r27; }
+; CHECK-SM70-NEXT: and.b32 %r28, %r15, -65536;
+; CHECK-SM70-NEXT: setp.gt.f32 %p3, %r28, 0f00000000;
+; CHECK-SM70-NEXT: and.b32 %r29, %r27, -65536;
+; CHECK-SM70-NEXT: setp.gt.f32 %p4, %r29, 0f00000000;
; CHECK-SM70-NEXT: selp.b16 %rs9, %rs8, 0x0000, %p4;
; CHECK-SM70-NEXT: selp.b16 %rs10, %rs7, 0x0000, %p3;
-; CHECK-SM70-NEXT: mov.b32 %r40, {%rs10, %rs9};
-; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r40;
+; CHECK-SM70-NEXT: mov.b32 %r30, {%rs10, %rs9};
+; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r30;
; CHECK-SM70-NEXT: ret;
%1 = call nnan <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
%2 = fcmp nsz ogt <2 x bfloat> %1, <bfloat 0.0, bfloat 0.0>
@@ -1487,7 +1487,7 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
; CHECK-FTZ-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma(
; CHECK-FTZ: {
; CHECK-FTZ-NEXT: .reg .b16 %rs<5>;
-; CHECK-FTZ-NEXT: .reg .b32 %r<26>;
+; CHECK-FTZ-NEXT: .reg .b32 %r<18>;
; CHECK-FTZ-EMPTY:
; CHECK-FTZ-NEXT: // %bb.0:
; CHECK-FTZ-NEXT: ld.param.b32 %r1, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_2];
@@ -1496,28 +1496,28 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
; CHECK-FTZ-NEXT: fma.rn.bf16x2 %r4, %r3, %r2, %r1;
; CHECK-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r4;
; CHECK-FTZ-NEXT: cvt.u32.u16 %r5, %rs2;
-; CHECK-FTZ-NEXT: shl.b32 %r22, %r5, 16;
-; CHECK-FTZ-NEXT: add.rn.ftz.f32 %r8, %r22, 0f40E00000;
-; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r8;
-; CHECK-FTZ-NEXT: cvt.u32.u16 %r9, %rs1;
-; CHECK-FTZ-NEXT: shl.b32 %r23, %r9, 16;
-; CHECK-FTZ-NEXT: add.rn.ftz.f32 %r12, %r23, 0f40E00000;
-; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs4, %r12;
-; CHECK-FTZ-NEXT: cvt.u32.u16 %r13, %rs4;
-; CHECK-FTZ-NEXT: shl.b32 %r24, %r13, 16;
-; CHECK-FTZ-NEXT: add.rn.ftz.f32 %r16, %r24, %r23;
-; CHECK-FTZ-NEXT: cvt.u32.u16 %r17, %rs3;
-; CHECK-FTZ-NEXT: shl.b32 %r25, %r17, 16;
-; CHECK-FTZ-NEXT: add.rn.ftz.f32 %r20, %r25, %r22;
-; CHECK-FTZ-NEXT: cvt.rn.bf16x2.f32 %r21, %r20, %r16;
-; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r21;
+; CHECK-FTZ-NEXT: shl.b32 %r6, %r5, 16;
+; CHECK-FTZ-NEXT: add.rn.ftz.f32 %r7, %r6, 0f40E00000;
+; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r7;
+; CHECK-FTZ-NEXT: cvt.u32.u16 %r8, %rs1;
+; CHECK-FTZ-NEXT: shl.b32 %r9, %r8, 16;
+; CHECK-FTZ-NEXT: add.rn.ftz.f32 %r10, %r9, 0f40E00000;
+; CHECK-FTZ-NEXT: cvt.rn.bf16.f32 %rs4, %r10;
+; CHECK-FTZ-NEXT: cvt.u32.u16 %r11, %rs4;
+; CHECK-FTZ-NEXT: shl.b32 %r12, %r11, 16;
+; CHECK-FTZ-NEXT: add.rn.ftz.f32 %r13, %r12, %r9;
+; CHECK-FTZ-NEXT: cvt.u32.u16 %r14, %rs3;
+; CHECK-FTZ-NEXT: shl.b32 %r15, %r14, 16;
+; CHECK-FTZ-NEXT: add.rn.ftz.f32 %r16, %r15, %r6;
+; CHECK-FTZ-NEXT: cvt.rn.bf16x2.f32 %r17, %r16, %r13;
+; CHECK-FTZ-NEXT: st.param.b32 [func_retval0], %r17;
; CHECK-FTZ-NEXT: ret;
;
; CHECK-SM70-LABEL: fma_bf16x2_no_nans_multiple_uses_of_fma(
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<7>;
; CHECK-SM70-NEXT: .reg .b16 %rs<7>;
-; CHECK-SM70-NEXT: .reg .b32 %r<89>;
+; CHECK-SM70-NEXT: .reg .b32 %r<57>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_0];
@@ -1525,67 +1525,67 @@ define <2 x bfloat> @fma_bf16x2_no_nans_multiple_uses_of_fma(<2 x bfloat> %a, <2
; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_no_nans_multiple_uses_of_fma_param_2];
; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3;
; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs2;
-; CHECK-SM70-NEXT: shl.b32 %r73, %r4, 16;
+; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r7, %rs4;
-; CHECK-SM70-NEXT: shl.b32 %r74, %r7, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs4;
+; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r10, %rs6;
-; CHECK-SM70-NEXT: shl.b32 %r75, %r10, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r76, %r75, %r74, %r73;
-; CHECK-SM70-NEXT: bfe.u32 %r15, %r76, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r76;
-; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r76, %r76;
-; CHECK-SM70-NEXT: or.b32 %r18, %r76, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p1;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs1;
-; CHECK-SM70-NEXT: shl.b32 %r77, %r20, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r23, %rs3;
-; CHECK-SM70-NEXT: shl.b32 %r78, %r23, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r26, %rs5;
-; CHECK-SM70-NEXT: shl.b32 %r79, %r26, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r80, %r79, %r78, %r77;
-; CHECK-SM70-NEXT: bfe.u32 %r31, %r80, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r32, %r31, %r80;
-; CHECK-SM70-NEXT: add.s32 %r33, %r32, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r80, %r80;
-; CHECK-SM70-NEXT: or.b32 %r34, %r80, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r35, %r34, %r33, %p2;
-; CHECK-SM70-NEXT: and.b32 %r81, %r35, -65536;
-; CHECK-SM70-NEXT: add.rn.f32 %r82, %r81, 0f40E00000;
-; CHECK-SM70-NEXT: bfe.u32 %r40, %r82, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r41, %r40, %r82;
-; CHECK-SM70-NEXT: add.s32 %r42, %r41, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p3, %r82, %r82;
-; CHECK-SM70-NEXT: or.b32 %r43, %r82, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r44, %r43, %r42, %p3;
-; CHECK-SM70-NEXT: and.b32 %r83, %r19, -65536;
-; CHECK-SM70-NEXT: add.rn.f32 %r84, %r83, 0f40E00000;
-; CHECK-SM70-NEXT: bfe.u32 %r49, %r84, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r50, %r49, %r84;
-; CHECK-SM70-NEXT: add.s32 %r51, %r50, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p4, %r84, %r84;
-; CHECK-SM70-NEXT: or.b32 %r52, %r84, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r53, %r52, %r51, %p4;
-; CHECK-SM70-NEXT: and.b32 %r85, %r53, -65536;
-; CHECK-SM70-NEXT: add.rn.f32 %r86, %r85, %r83;
-; CHECK-SM70-NEXT: bfe.u32 %r58, %r86, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r59, %r58, %r86;
-; CHECK-SM70-NEXT: add.s32 %r60, %r59, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p5, %r86, %r86;
-; CHECK-SM70-NEXT: or.b32 %r61, %r86, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r62, %r61, %r60, %p5;
-; CHECK-SM70-NEXT: and.b32 %r87, %r44, -65536;
-; CHECK-SM70-NEXT: add.rn.f32 %r88, %r87, %r81;
-; CHECK-SM70-NEXT: bfe.u32 %r67, %r88, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r68, %r67, %r88;
-; CHECK-SM70-NEXT: add.s32 %r69, %r68, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p6, %r88, %r88;
-; CHECK-SM70-NEXT: or.b32 %r70, %r88, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r71, %r70, %r69, %p6;
-; CHECK-SM70-NEXT: prmt.b32 %r72, %r71, %r62, 0x7632U;
-; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r72;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs6;
+; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r10, %r9, %r7, %r5;
+; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10;
+; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r10, %r10;
+; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs1;
+; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs3;
+; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs5;
+; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r22, %r21, %r19, %r17;
+; CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22;
+; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r22, %r22;
+; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2;
+; CHECK-SM70-NEXT: and.b32 %r28, %r27, -65536;
+; CHECK-SM70-NEXT: add.rn.f32 %r29, %r28, 0f40E00000;
+; CHECK-SM70-NEXT: bfe.u32 %r30, %r29, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r31, %r30, %r29;
+; CHECK-SM70-NEXT: add.s32 %r32, %r31, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p3, %r29, %r29;
+; CHECK-SM70-NEXT: or.b32 %r33, %r29, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r34, %r33, %r32, %p3;
+; CHECK-SM70-NEXT: and.b32 %r35, %r15, -65536;
+; CHECK-SM70-NEXT: add.rn.f32 %r36, %r35, 0f40E00000;
+; CHECK-SM70-NEXT: bfe.u32 %r37, %r36, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r38, %r37, %r36;
+; CHECK-SM70-NEXT: add.s32 %r39, %r38, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p4, %r36, %r36;
+; CHECK-SM70-NEXT: or.b32 %r40, %r36, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r41, %r40, %r39, %p4;
+; CHECK-SM70-NEXT: and.b32 %r42, %r41, -65536;
+; CHECK-SM70-NEXT: add.rn.f32 %r43, %r42, %r35;
+; CHECK-SM70-NEXT: bfe.u32 %r44, %r43, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r45, %r44, %r43;
+; CHECK-SM70-NEXT: add.s32 %r46, %r45, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p5, %r43, %r43;
+; CHECK-SM70-NEXT: or.b32 %r47, %r43, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r48, %r47, %r46, %p5;
+; CHECK-SM70-NEXT: and.b32 %r49, %r34, -65536;
+; CHECK-SM70-NEXT: add.rn.f32 %r50, %r49, %r28;
+; CHECK-SM70-NEXT: bfe.u32 %r51, %r50, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r52, %r51, %r50;
+; CHECK-SM70-NEXT: add.s32 %r53, %r52, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p6, %r50, %r50;
+; CHECK-SM70-NEXT: or.b32 %r54, %r50, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r55, %r54, %r53, %p6;
+; CHECK-SM70-NEXT: prmt.b32 %r56, %r55, %r48, 0x7632U;
+; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r56;
; CHECK-SM70-NEXT: ret;
%1 = call nnan <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
%2 = fcmp nsz ogt <2 x bfloat> %1, <bfloat 0.0, bfloat 0.0>
@@ -1624,7 +1624,7 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b,
; CHECK-SM70: {
; CHECK-SM70-NEXT: .reg .pred %p<5>;
; CHECK-SM70-NEXT: .reg .b16 %rs<7>;
-; CHECK-SM70-NEXT: .reg .b32 %r<67>;
+; CHECK-SM70-NEXT: .reg .b32 %r<43>;
; CHECK-SM70-EMPTY:
; CHECK-SM70-NEXT: // %bb.0:
; CHECK-SM70-NEXT: ld.param.b32 %r1, [fma_bf16x2_maxnum_no_nans_param_0];
@@ -1632,51 +1632,51 @@ define <2 x bfloat> @fma_bf16x2_maxnum_no_nans(<2 x bfloat> %a, <2 x bfloat> %b,
; CHECK-SM70-NEXT: ld.param.b32 %r3, [fma_bf16x2_maxnum_no_nans_param_2];
; CHECK-SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3;
; CHECK-SM70-NEXT: cvt.u32.u16 %r4, %rs1;
-; CHECK-SM70-NEXT: shl.b32 %r55, %r4, 16;
+; CHECK-SM70-NEXT: shl.b32 %r5, %r4, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r7, %rs3;
-; CHECK-SM70-NEXT: shl.b32 %r56, %r7, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r6, %rs3;
+; CHECK-SM70-NEXT: shl.b32 %r7, %r6, 16;
; CHECK-SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r10, %rs5;
-; CHECK-SM70-NEXT: shl.b32 %r57, %r10, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r58, %r57, %r56, %r55;
-; CHECK-SM70-NEXT: bfe.u32 %r15, %r58, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r16, %r15, %r58;
-; CHECK-SM70-NEXT: add.s32 %r17, %r16, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r58, %r58;
-; CHECK-SM70-NEXT: or.b32 %r18, %r58, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r19, %r18, %r17, %p1;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs2;
-; CHECK-SM70-NEXT: shl.b32 %r59, %r20, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r23, %rs4;
-; CHECK-SM70-NEXT: shl.b32 %r60, %r23, 16;
-; CHECK-SM70-NEXT: cvt.u32.u16 %r26, %rs6;
-; CHECK-SM70-NEXT: shl.b32 %r61, %r26, 16;
-; CHECK-SM70-NEXT: fma.rn.f32 %r62, %r61, %r60, %r59;
-; CHECK-SM70-NEXT: bfe.u32 %r31, %r62, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r32, %r31, %r62;
-; CHECK-SM70-NEXT: add.s32 %r33, %r32, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r62, %r62;
-; CHECK-SM70-NEXT: or.b32 %r34, %r62, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r35, %r34, %r33, %p2;
-; CHECK-SM70-NEXT: and.b32 %r63, %r35, -65536;
-; CHECK-SM70-NEXT: max.f32 %r64, %r63, 0f00000000;
-; CHECK-SM70-NEXT: bfe.u32 %r40, %r64, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r41, %r40, %r64;
-; CHECK-SM70-NEXT: add.s32 %r42, %r41, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p3, %r64, %r64;
-; CHECK-SM70-NEXT: or.b32 %r43, %r64, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r44, %r43, %r42, %p3;
-; CHECK-SM70-NEXT: and.b32 %r65, %r19, -65536;
-; CHECK-SM70-NEXT: max.f32 %r66, %r65, 0f00000000;
-; CHECK-SM70-NEXT: bfe.u32 %r49, %r66, 16, 1;
-; CHECK-SM70-NEXT: add.s32 %r50, %r49, %r66;
-; CHECK-SM70-NEXT: add.s32 %r51, %r50, 32767;
-; CHECK-SM70-NEXT: setp.nan.f32 %p4, %r66, %r66;
-; CHECK-SM70-NEXT: or.b32 %r52, %r66, 4194304;
-; CHECK-SM70-NEXT: selp.b32 %r53, %r52, %r51, %p4;
-; CHECK-SM70-NEXT: prmt.b32 %r54, %r53, %r44, 0x7632U;
-; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r54;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r8, %rs5;
+; CHECK-SM70-NEXT: shl.b32 %r9, %r8, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r10, %r9, %r7, %r5;
+; CHECK-SM70-NEXT: bfe.u32 %r11, %r10, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r12, %r11, %r10;
+; CHECK-SM70-NEXT: add.s32 %r13, %r12, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p1, %r10, %r10;
+; CHECK-SM70-NEXT: or.b32 %r14, %r10, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r15, %r14, %r13, %p1;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r16, %rs2;
+; CHECK-SM70-NEXT: shl.b32 %r17, %r16, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r18, %rs4;
+; CHECK-SM70-NEXT: shl.b32 %r19, %r18, 16;
+; CHECK-SM70-NEXT: cvt.u32.u16 %r20, %rs6;
+; CHECK-SM70-NEXT: shl.b32 %r21, %r20, 16;
+; CHECK-SM70-NEXT: fma.rn.f32 %r22, %r21, %r19, %r17;
+; CHECK-SM70-NEXT: bfe.u32 %r23, %r22, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r24, %r23, %r22;
+; CHECK-SM70-NEXT: add.s32 %r25, %r24, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p2, %r22, %r22;
+; CHECK-SM70-NEXT: or.b32 %r26, %r22, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r27, %r26, %r25, %p2;
+; CHECK-SM70-NEXT: and.b32 %r28, %r27, -65536;
+; CHECK-SM70-NEXT: max.f32 %r29, %r28, 0f00000000;
+; CHECK-SM70-NEXT: bfe.u32 %r30, %r29, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r31, %r30, %r29;
+; CHECK-SM70-NEXT: add.s32 %r32, %r31, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p3, %r29, %r29;
+; CHECK-SM70-NEXT: or.b32 %r33, %r29, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r34, %r33, %r32, %p3;
+; CHECK-SM70-NEXT: and.b32 %r35, %r15, -65536;
+; CHECK-SM70-NEXT: max.f32 %r36, %r35, 0f00000000;
+; CHECK-SM70-NEXT: bfe.u32 %r37, %r36, 16, 1;
+; CHECK-SM70-NEXT: add.s32 %r38, %r37, %r36;
+; CHECK-SM70-NEXT: add.s32 %r39, %r38, 32767;
+; CHECK-SM70-NEXT: setp.nan.f32 %p4, %r36, %r36;
+; CHECK-SM70-NEXT: or.b32 %r40, %r36, 4194304;
+; CHECK-SM70-NEXT: selp.b32 %r41, %r40, %r39, %p4;
+; CHECK-SM70-NEXT: prmt.b32 %r42, %r41, %r34, 0x7632U;
+; CHECK-SM70-NEXT: st.param.b32 [func_retval0], %r42;
; CHECK-SM70-NEXT: ret;
%1 = call nnan <2 x bfloat> @llvm.fma.bf16x2(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
%2 = call nsz <2 x bfloat> @llvm.maxnum.bf16x2(<2 x bfloat> %1, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index 90847effb6d3f..4e1d13696edfe 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -1140,12 +1140,11 @@ define <4 x i8> @test_bitcast_i32_to_4xi8(i32 %a) #0 {
define <4 x i8> @test_bitcast_float_to_4xi8(float %a) #0 {
; CHECK-LABEL: test_bitcast_float_to_4xi8(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_bitcast_float_to_4xi8_param_0];
-; CHECK-NEXT: mov.b32 %r2, %r1;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
%r = bitcast float %a to <4 x i8>
ret <4 x i8> %r
@@ -1167,12 +1166,11 @@ define i32 @test_bitcast_4xi8_to_i32(<4 x i8> %a) #0 {
define float @test_bitcast_4xi8_to_float(<4 x i8> %a) #0 {
; CHECK-LABEL: test_bitcast_4xi8_to_float(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_bitcast_4xi8_to_float_param_0];
-; CHECK-NEXT: mov.b32 %r2, %r1;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
%r = bitcast <4 x i8> %a to float
ret float %r
diff --git a/llvm/test/CodeGen/NVPTX/inline-asm.ll b/llvm/test/CodeGen/NVPTX/inline-asm.ll
index 8630a68789e3e..64c0cd71f6117 100644
--- a/llvm/test/CodeGen/NVPTX/inline-asm.ll
+++ b/llvm/test/CodeGen/NVPTX/inline-asm.ll
@@ -5,14 +5,14 @@
define float @test(float %x) {
; CHECK-LABEL: test(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: ld.param.b32 %r2, [test_param_0];
; CHECK-NEXT: // begin inline asm
-; CHECK-NEXT: ex2.approx.ftz.f32 %r4, %r2;
+; CHECK-NEXT: ex2.approx.ftz.f32 %r1, %r2;
; CHECK-NEXT: // end inline asm
-; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
entry:
%0 = call float asm "ex2.approx.ftz.f32 $0, $1;", "=f,f"(float %x)
diff --git a/llvm/test/CodeGen/NVPTX/math-intrins.ll b/llvm/test/CodeGen/NVPTX/math-intrins.ll
index e72316ad47136..71af7a7d475d3 100644
--- a/llvm/test/CodeGen/NVPTX/math-intrins.ll
+++ b/llvm/test/CodeGen/NVPTX/math-intrins.ll
@@ -140,21 +140,21 @@ define float @round_float(float %a) {
; CHECK-LABEL: round_float(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b32 %r<14>;
+; CHECK-NEXT: .reg .b32 %r<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r12, [round_float_param_0];
-; CHECK-NEXT: and.b32 %r3, %r12, -2147483648;
-; CHECK-NEXT: or.b32 %r13, %r3, 1056964608;
-; CHECK-NEXT: add.rn.f32 %r6, %r12, %r13;
-; CHECK-NEXT: cvt.rzi.f32.f32 %r7, %r6;
-; CHECK-NEXT: abs.f32 %r8, %r12;
-; CHECK-NEXT: setp.gt.f32 %p1, %r8, 0f4B000000;
-; CHECK-NEXT: selp.f32 %r9, %r12, %r7, %p1;
-; CHECK-NEXT: cvt.rzi.f32.f32 %r10, %r12;
-; CHECK-NEXT: setp.lt.f32 %p2, %r8, 0f3F000000;
-; CHECK-NEXT: selp.f32 %r11, %r10, %r9, %p2;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r11;
+; CHECK-NEXT: ld.param.b32 %r1, [round_float_param_0];
+; CHECK-NEXT: and.b32 %r2, %r1, -2147483648;
+; CHECK-NEXT: or.b32 %r3, %r2, 1056964608;
+; CHECK-NEXT: add.rn.f32 %r4, %r1, %r3;
+; CHECK-NEXT: cvt.rzi.f32.f32 %r5, %r4;
+; CHECK-NEXT: abs.f32 %r6, %r1;
+; CHECK-NEXT: setp.gt.f32 %p1, %r6, 0f4B000000;
+; CHECK-NEXT: selp.f32 %r7, %r1, %r5, %p1;
+; CHECK-NEXT: cvt.rzi.f32.f32 %r8, %r1;
+; CHECK-NEXT: setp.lt.f32 %p2, %r6, 0f3F000000;
+; CHECK-NEXT: selp.f32 %r9, %r8, %r7, %p2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r9;
; CHECK-NEXT: ret;
%b = call float @llvm.round.f32(float %a)
ret float %b
@@ -165,21 +165,21 @@ define float @round_float_ftz(float %a) #1 {
; CHECK-LABEL: round_float_ftz(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b32 %r<14>;
+; CHECK-NEXT: .reg .b32 %r<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r12, [round_float_ftz_param_0];
-; CHECK-NEXT: and.b32 %r3, %r12, -2147483648;
-; CHECK-NEXT: or.b32 %r13, %r3, 1056964608;
-; CHECK-NEXT: add.rn.ftz.f32 %r6, %r12, %r13;
-; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %r7, %r6;
-; CHECK-NEXT: abs.ftz.f32 %r8, %r12;
-; CHECK-NEXT: setp.gt.ftz.f32 %p1, %r8, 0f4B000000;
-; CHECK-NEXT: selp.f32 %r9, %r12, %r7, %p1;
-; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %r10, %r12;
-; CHECK-NEXT: setp.lt.ftz.f32 %p2, %r8, 0f3F000000;
-; CHECK-NEXT: selp.f32 %r11, %r10, %r9, %p2;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r11;
+; CHECK-NEXT: ld.param.b32 %r1, [round_float_ftz_param_0];
+; CHECK-NEXT: and.b32 %r2, %r1, -2147483648;
+; CHECK-NEXT: or.b32 %r3, %r2, 1056964608;
+; CHECK-NEXT: add.rn.ftz.f32 %r4, %r1, %r3;
+; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %r5, %r4;
+; CHECK-NEXT: abs.ftz.f32 %r6, %r1;
+; CHECK-NEXT: setp.gt.ftz.f32 %p1, %r6, 0f4B000000;
+; CHECK-NEXT: selp.f32 %r7, %r1, %r5, %p1;
+; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %r8, %r1;
+; CHECK-NEXT: setp.lt.ftz.f32 %p2, %r6, 0f3F000000;
+; CHECK-NEXT: selp.f32 %r9, %r8, %r7, %p2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r9;
; CHECK-NEXT: ret;
%b = call float @llvm.round.f32(float %a)
ret float %b
@@ -678,21 +678,21 @@ define float @minimum_float(float %a, float %b) {
; CHECK-NOF16-LABEL: minimum_float(
; CHECK-NOF16: {
; CHECK-NOF16-NEXT: .reg .pred %p<5>;
-; CHECK-NOF16-NEXT: .reg .b32 %r<12>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<8>;
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
-; CHECK-NOF16-NEXT: ld.param.b32 %r10, [minimum_float_param_0];
-; CHECK-NOF16-NEXT: ld.param.b32 %r11, [minimum_float_param_1];
-; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r10, %r11;
-; CHECK-NOF16-NEXT: min.f32 %r4, %r10, %r11;
-; CHECK-NOF16-NEXT: selp.f32 %r5, 0f7FC00000, %r4, %p1;
-; CHECK-NOF16-NEXT: setp.eq.s32 %p2, %r10, -2147483648;
-; CHECK-NOF16-NEXT: selp.f32 %r6, %r10, %r5, %p2;
-; CHECK-NOF16-NEXT: setp.eq.s32 %p3, %r11, -2147483648;
-; CHECK-NOF16-NEXT: selp.f32 %r8, %r11, %r6, %p3;
-; CHECK-NOF16-NEXT: setp.eq.f32 %p4, %r5, 0f00000000;
-; CHECK-NOF16-NEXT: selp.f32 %r9, %r8, %r5, %p4;
-; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r9;
+; CHECK-NOF16-NEXT: ld.param.b32 %r1, [minimum_float_param_0];
+; CHECK-NOF16-NEXT: ld.param.b32 %r2, [minimum_float_param_1];
+; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r2;
+; CHECK-NOF16-NEXT: min.f32 %r3, %r1, %r2;
+; CHECK-NOF16-NEXT: selp.f32 %r4, 0f7FC00000, %r3, %p1;
+; CHECK-NOF16-NEXT: setp.eq.s32 %p2, %r1, -2147483648;
+; CHECK-NOF16-NEXT: selp.f32 %r5, %r1, %r4, %p2;
+; CHECK-NOF16-NEXT: setp.eq.s32 %p3, %r2, -2147483648;
+; CHECK-NOF16-NEXT: selp.f32 %r6, %r2, %r5, %p3;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p4, %r4, 0f00000000;
+; CHECK-NOF16-NEXT: selp.f32 %r7, %r6, %r4, %p4;
+; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r7;
; CHECK-NOF16-NEXT: ret;
;
; CHECK-F16-LABEL: minimum_float(
@@ -724,18 +724,18 @@ define float @minimum_imm1(float %a) {
; CHECK-NOF16-LABEL: minimum_imm1(
; CHECK-NOF16: {
; CHECK-NOF16-NEXT: .reg .pred %p<4>;
-; CHECK-NOF16-NEXT: .reg .b32 %r<8>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<6>;
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
-; CHECK-NOF16-NEXT: ld.param.b32 %r7, [minimum_imm1_param_0];
-; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r7, %r7;
-; CHECK-NOF16-NEXT: min.f32 %r3, %r7, 0f00000000;
-; CHECK-NOF16-NEXT: selp.f32 %r4, 0f7FC00000, %r3, %p1;
-; CHECK-NOF16-NEXT: setp.eq.s32 %p2, %r7, -2147483648;
-; CHECK-NOF16-NEXT: selp.f32 %r5, %r7, %r4, %p2;
-; CHECK-NOF16-NEXT: setp.eq.f32 %p3, %r4, 0f00000000;
-; CHECK-NOF16-NEXT: selp.f32 %r6, %r5, %r4, %p3;
-; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r6;
+; CHECK-NOF16-NEXT: ld.param.b32 %r1, [minimum_imm1_param_0];
+; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1;
+; CHECK-NOF16-NEXT: min.f32 %r2, %r1, 0f00000000;
+; CHECK-NOF16-NEXT: selp.f32 %r3, 0f7FC00000, %r2, %p1;
+; CHECK-NOF16-NEXT: setp.eq.s32 %p2, %r1, -2147483648;
+; CHECK-NOF16-NEXT: selp.f32 %r4, %r1, %r3, %p2;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p3, %r3, 0f00000000;
+; CHECK-NOF16-NEXT: selp.f32 %r5, %r4, %r3, %p3;
+; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r5;
; CHECK-NOF16-NEXT: ret;
;
; CHECK-F16-LABEL: minimum_imm1(
@@ -765,18 +765,18 @@ define float @minimum_imm2(float %a) {
; CHECK-NOF16-LABEL: minimum_imm2(
; CHECK-NOF16: {
; CHECK-NOF16-NEXT: .reg .pred %p<4>;
-; CHECK-NOF16-NEXT: .reg .b32 %r<8>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<6>;
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
-; CHECK-NOF16-NEXT: ld.param.b32 %r7, [minimum_imm2_param_0];
-; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r7, %r7;
-; CHECK-NOF16-NEXT: min.f32 %r3, %r7, 0f00000000;
-; CHECK-NOF16-NEXT: selp.f32 %r4, 0f7FC00000, %r3, %p1;
-; CHECK-NOF16-NEXT: setp.eq.s32 %p2, %r7, -2147483648;
-; CHECK-NOF16-NEXT: selp.f32 %r5, %r7, %r4, %p2;
-; CHECK-NOF16-NEXT: setp.eq.f32 %p3, %r4, 0f00000000;
-; CHECK-NOF16-NEXT: selp.f32 %r6, %r5, %r4, %p3;
-; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r6;
+; CHECK-NOF16-NEXT: ld.param.b32 %r1, [minimum_imm2_param_0];
+; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1;
+; CHECK-NOF16-NEXT: min.f32 %r2, %r1, 0f00000000;
+; CHECK-NOF16-NEXT: selp.f32 %r3, 0f7FC00000, %r2, %p1;
+; CHECK-NOF16-NEXT: setp.eq.s32 %p2, %r1, -2147483648;
+; CHECK-NOF16-NEXT: selp.f32 %r4, %r1, %r3, %p2;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p3, %r3, 0f00000000;
+; CHECK-NOF16-NEXT: selp.f32 %r5, %r4, %r3, %p3;
+; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r5;
; CHECK-NOF16-NEXT: ret;
;
; CHECK-F16-LABEL: minimum_imm2(
@@ -806,21 +806,21 @@ define float @minimum_float_ftz(float %a, float %b) #1 {
; CHECK-NOF16-LABEL: minimum_float_ftz(
; CHECK-NOF16: {
; CHECK-NOF16-NEXT: .reg .pred %p<5>;
-; CHECK-NOF16-NEXT: .reg .b32 %r<12>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<8>;
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
-; CHECK-NOF16-NEXT: ld.param.b32 %r10, [minimum_float_ftz_param_0];
-; CHECK-NOF16-NEXT: ld.param.b32 %r11, [minimum_float_ftz_param_1];
-; CHECK-NOF16-NEXT: setp.nan.ftz.f32 %p1, %r10, %r11;
-; CHECK-NOF16-NEXT: min.ftz.f32 %r4, %r10, %r11;
-; CHECK-NOF16-NEXT: selp.f32 %r5, 0f7FC00000, %r4, %p1;
-; CHECK-NOF16-NEXT: setp.eq.s32 %p2, %r10, -2147483648;
-; CHECK-NOF16-NEXT: selp.f32 %r6, %r10, %r5, %p2;
-; CHECK-NOF16-NEXT: setp.eq.s32 %p3, %r11, -2147483648;
-; CHECK-NOF16-NEXT: selp.f32 %r8, %r11, %r6, %p3;
-; CHECK-NOF16-NEXT: setp.eq.ftz.f32 %p4, %r5, 0f00000000;
-; CHECK-NOF16-NEXT: selp.f32 %r9, %r8, %r5, %p4;
-; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r9;
+; CHECK-NOF16-NEXT: ld.param.b32 %r1, [minimum_float_ftz_param_0];
+; CHECK-NOF16-NEXT: ld.param.b32 %r2, [minimum_float_ftz_param_1];
+; CHECK-NOF16-NEXT: setp.nan.ftz.f32 %p1, %r1, %r2;
+; CHECK-NOF16-NEXT: min.ftz.f32 %r3, %r1, %r2;
+; CHECK-NOF16-NEXT: selp.f32 %r4, 0f7FC00000, %r3, %p1;
+; CHECK-NOF16-NEXT: setp.eq.s32 %p2, %r1, -2147483648;
+; CHECK-NOF16-NEXT: selp.f32 %r5, %r1, %r4, %p2;
+; CHECK-NOF16-NEXT: setp.eq.s32 %p3, %r2, -2147483648;
+; CHECK-NOF16-NEXT: selp.f32 %r6, %r2, %r5, %p3;
+; CHECK-NOF16-NEXT: setp.eq.ftz.f32 %p4, %r4, 0f00000000;
+; CHECK-NOF16-NEXT: selp.f32 %r7, %r6, %r4, %p4;
+; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r7;
; CHECK-NOF16-NEXT: ret;
;
; CHECK-F16-LABEL: minimum_float_ftz(
@@ -852,21 +852,21 @@ define double @minimum_double(double %a, double %b) {
; CHECK-LABEL: minimum_double(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<5>;
-; CHECK-NEXT: .reg .b64 %rd<12>;
+; CHECK-NEXT: .reg .b64 %rd<8>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd10, [minimum_double_param_0];
-; CHECK-NEXT: ld.param.b64 %rd11, [minimum_double_param_1];
-; CHECK-NEXT: setp.nan.f64 %p1, %rd10, %rd11;
-; CHECK-NEXT: min.f64 %rd4, %rd10, %rd11;
-; CHECK-NEXT: selp.f64 %rd5, 0d7FF8000000000000, %rd4, %p1;
-; CHECK-NEXT: setp.eq.s64 %p2, %rd10, -9223372036854775808;
-; CHECK-NEXT: selp.f64 %rd6, %rd10, %rd5, %p2;
-; CHECK-NEXT: setp.eq.s64 %p3, %rd11, -9223372036854775808;
-; CHECK-NEXT: selp.f64 %rd8, %rd11, %rd6, %p3;
-; CHECK-NEXT: setp.eq.f64 %p4, %rd5, 0d0000000000000000;
-; CHECK-NEXT: selp.f64 %rd9, %rd8, %rd5, %p4;
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd9;
+; CHECK-NEXT: ld.param.b64 %rd1, [minimum_double_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [minimum_double_param_1];
+; CHECK-NEXT: setp.nan.f64 %p1, %rd1, %rd2;
+; CHECK-NEXT: min.f64 %rd3, %rd1, %rd2;
+; CHECK-NEXT: selp.f64 %rd4, 0d7FF8000000000000, %rd3, %p1;
+; CHECK-NEXT: setp.eq.s64 %p2, %rd1, -9223372036854775808;
+; CHECK-NEXT: selp.f64 %rd5, %rd1, %rd4, %p2;
+; CHECK-NEXT: setp.eq.s64 %p3, %rd2, -9223372036854775808;
+; CHECK-NEXT: selp.f64 %rd6, %rd2, %rd5, %p3;
+; CHECK-NEXT: setp.eq.f64 %p4, %rd4, 0d0000000000000000;
+; CHECK-NEXT: selp.f64 %rd7, %rd6, %rd4, %p4;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd7;
; CHECK-NEXT: ret;
%x = call double @llvm.minimum.f64(double %a, double %b)
ret double %x
@@ -1297,21 +1297,21 @@ define float @maximum_float(float %a, float %b) {
; CHECK-NOF16-LABEL: maximum_float(
; CHECK-NOF16: {
; CHECK-NOF16-NEXT: .reg .pred %p<5>;
-; CHECK-NOF16-NEXT: .reg .b32 %r<12>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<8>;
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
-; CHECK-NOF16-NEXT: ld.param.b32 %r10, [maximum_float_param_0];
-; CHECK-NOF16-NEXT: ld.param.b32 %r11, [maximum_float_param_1];
-; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r10, %r11;
-; CHECK-NOF16-NEXT: max.f32 %r4, %r10, %r11;
-; CHECK-NOF16-NEXT: selp.f32 %r5, 0f7FC00000, %r4, %p1;
-; CHECK-NOF16-NEXT: setp.eq.s32 %p2, %r10, 0;
-; CHECK-NOF16-NEXT: selp.f32 %r6, %r10, %r5, %p2;
-; CHECK-NOF16-NEXT: setp.eq.s32 %p3, %r11, 0;
-; CHECK-NOF16-NEXT: selp.f32 %r8, %r11, %r6, %p3;
-; CHECK-NOF16-NEXT: setp.eq.f32 %p4, %r5, 0f00000000;
-; CHECK-NOF16-NEXT: selp.f32 %r9, %r8, %r5, %p4;
-; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r9;
+; CHECK-NOF16-NEXT: ld.param.b32 %r1, [maximum_float_param_0];
+; CHECK-NOF16-NEXT: ld.param.b32 %r2, [maximum_float_param_1];
+; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r2;
+; CHECK-NOF16-NEXT: max.f32 %r3, %r1, %r2;
+; CHECK-NOF16-NEXT: selp.f32 %r4, 0f7FC00000, %r3, %p1;
+; CHECK-NOF16-NEXT: setp.eq.s32 %p2, %r1, 0;
+; CHECK-NOF16-NEXT: selp.f32 %r5, %r1, %r4, %p2;
+; CHECK-NOF16-NEXT: setp.eq.s32 %p3, %r2, 0;
+; CHECK-NOF16-NEXT: selp.f32 %r6, %r2, %r5, %p3;
+; CHECK-NOF16-NEXT: setp.eq.f32 %p4, %r4, 0f00000000;
+; CHECK-NOF16-NEXT: selp.f32 %r7, %r6, %r4, %p4;
+; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r7;
; CHECK-NOF16-NEXT: ret;
;
; CHECK-F16-LABEL: maximum_float(
@@ -1343,21 +1343,21 @@ define float @maximum_float_ftz(float %a, float %b) #1 {
; CHECK-NOF16-LABEL: maximum_float_ftz(
; CHECK-NOF16: {
; CHECK-NOF16-NEXT: .reg .pred %p<5>;
-; CHECK-NOF16-NEXT: .reg .b32 %r<12>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<8>;
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
-; CHECK-NOF16-NEXT: ld.param.b32 %r10, [maximum_float_ftz_param_0];
-; CHECK-NOF16-NEXT: ld.param.b32 %r11, [maximum_float_ftz_param_1];
-; CHECK-NOF16-NEXT: setp.nan.ftz.f32 %p1, %r10, %r11;
-; CHECK-NOF16-NEXT: max.ftz.f32 %r4, %r10, %r11;
-; CHECK-NOF16-NEXT: selp.f32 %r5, 0f7FC00000, %r4, %p1;
-; CHECK-NOF16-NEXT: setp.eq.s32 %p2, %r10, 0;
-; CHECK-NOF16-NEXT: selp.f32 %r6, %r10, %r5, %p2;
-; CHECK-NOF16-NEXT: setp.eq.s32 %p3, %r11, 0;
-; CHECK-NOF16-NEXT: selp.f32 %r8, %r11, %r6, %p3;
-; CHECK-NOF16-NEXT: setp.eq.ftz.f32 %p4, %r5, 0f00000000;
-; CHECK-NOF16-NEXT: selp.f32 %r9, %r8, %r5, %p4;
-; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r9;
+; CHECK-NOF16-NEXT: ld.param.b32 %r1, [maximum_float_ftz_param_0];
+; CHECK-NOF16-NEXT: ld.param.b32 %r2, [maximum_float_ftz_param_1];
+; CHECK-NOF16-NEXT: setp.nan.ftz.f32 %p1, %r1, %r2;
+; CHECK-NOF16-NEXT: max.ftz.f32 %r3, %r1, %r2;
+; CHECK-NOF16-NEXT: selp.f32 %r4, 0f7FC00000, %r3, %p1;
+; CHECK-NOF16-NEXT: setp.eq.s32 %p2, %r1, 0;
+; CHECK-NOF16-NEXT: selp.f32 %r5, %r1, %r4, %p2;
+; CHECK-NOF16-NEXT: setp.eq.s32 %p3, %r2, 0;
+; CHECK-NOF16-NEXT: selp.f32 %r6, %r2, %r5, %p3;
+; CHECK-NOF16-NEXT: setp.eq.ftz.f32 %p4, %r4, 0f00000000;
+; CHECK-NOF16-NEXT: selp.f32 %r7, %r6, %r4, %p4;
+; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r7;
; CHECK-NOF16-NEXT: ret;
;
; CHECK-F16-LABEL: maximum_float_ftz(
@@ -1389,21 +1389,21 @@ define double @maximum_double(double %a, double %b) {
; CHECK-LABEL: maximum_double(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<5>;
-; CHECK-NEXT: .reg .b64 %rd<12>;
+; CHECK-NEXT: .reg .b64 %rd<8>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd10, [maximum_double_param_0];
-; CHECK-NEXT: ld.param.b64 %rd11, [maximum_double_param_1];
-; CHECK-NEXT: setp.nan.f64 %p1, %rd10, %rd11;
-; CHECK-NEXT: max.f64 %rd4, %rd10, %rd11;
-; CHECK-NEXT: selp.f64 %rd5, 0d7FF8000000000000, %rd4, %p1;
-; CHECK-NEXT: setp.eq.s64 %p2, %rd10, 0;
-; CHECK-NEXT: selp.f64 %rd6, %rd10, %rd5, %p2;
-; CHECK-NEXT: setp.eq.s64 %p3, %rd11, 0;
-; CHECK-NEXT: selp.f64 %rd8, %rd11, %rd6, %p3;
-; CHECK-NEXT: setp.eq.f64 %p4, %rd5, 0d0000000000000000;
-; CHECK-NEXT: selp.f64 %rd9, %rd8, %rd5, %p4;
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd9;
+; CHECK-NEXT: ld.param.b64 %rd1, [maximum_double_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [maximum_double_param_1];
+; CHECK-NEXT: setp.nan.f64 %p1, %rd1, %rd2;
+; CHECK-NEXT: max.f64 %rd3, %rd1, %rd2;
+; CHECK-NEXT: selp.f64 %rd4, 0d7FF8000000000000, %rd3, %p1;
+; CHECK-NEXT: setp.eq.s64 %p2, %rd1, 0;
+; CHECK-NEXT: selp.f64 %rd5, %rd1, %rd4, %p2;
+; CHECK-NEXT: setp.eq.s64 %p3, %rd2, 0;
+; CHECK-NEXT: selp.f64 %rd6, %rd2, %rd5, %p3;
+; CHECK-NEXT: setp.eq.f64 %p4, %rd4, 0d0000000000000000;
+; CHECK-NEXT: selp.f64 %rd7, %rd6, %rd4, %p4;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd7;
; CHECK-NEXT: ret;
%x = call double @llvm.maximum.f64(double %a, double %b)
ret double %x
diff --git a/llvm/test/CodeGen/NVPTX/param-add.ll b/llvm/test/CodeGen/NVPTX/param-add.ll
index 78ec183ede04d..4fc8786c1e2fe 100644
--- a/llvm/test/CodeGen/NVPTX/param-add.ll
+++ b/llvm/test/CodeGen/NVPTX/param-add.ll
@@ -14,7 +14,7 @@ declare i32 @callee(%struct.1float %a)
define i32 @test(%struct.1float alignstack(32) %data) {
; CHECK-LABEL: test(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<20>;
+; CHECK-NEXT: .reg .b32 %r<16>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %r1, [test_param_0+1];
@@ -26,25 +26,25 @@ define i32 @test(%struct.1float alignstack(32) %data) {
; CHECK-NEXT: ld.param.b8 %r7, [test_param_0+2];
; CHECK-NEXT: or.b32 %r8, %r6, %r7;
; CHECK-NEXT: shl.b32 %r9, %r8, 16;
-; CHECK-NEXT: or.b32 %r19, %r9, %r4;
-; CHECK-NEXT: shr.u32 %r13, %r19, 8;
-; CHECK-NEXT: shr.u32 %r14, %r19, 16;
-; CHECK-NEXT: shr.u32 %r15, %r19, 24;
+; CHECK-NEXT: or.b32 %r10, %r9, %r4;
+; CHECK-NEXT: shr.u32 %r11, %r10, 8;
+; CHECK-NEXT: shr.u32 %r12, %r10, 16;
+; CHECK-NEXT: shr.u32 %r13, %r10, 24;
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 1 .b8 param0[4];
-; CHECK-NEXT: st.param.b8 [param0], %r19;
-; CHECK-NEXT: st.param.b8 [param0+1], %r13;
-; CHECK-NEXT: st.param.b8 [param0+2], %r14;
-; CHECK-NEXT: st.param.b8 [param0+3], %r15;
+; CHECK-NEXT: st.param.b8 [param0], %r10;
+; CHECK-NEXT: st.param.b8 [param0+1], %r11;
+; CHECK-NEXT: st.param.b8 [param0+2], %r12;
+; CHECK-NEXT: st.param.b8 [param0+3], %r13;
; CHECK-NEXT: .param .b32 retval0;
; CHECK-NEXT: call.uni (retval0),
; CHECK-NEXT: callee,
; CHECK-NEXT: (
; CHECK-NEXT: param0
; CHECK-NEXT: );
-; CHECK-NEXT: ld.param.b32 %r16, [retval0];
+; CHECK-NEXT: ld.param.b32 %r14, [retval0];
; CHECK-NEXT: } // callseq 0
-; CHECK-NEXT: st.param.b32 [func_retval0], %r16;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r14;
; CHECK-NEXT: ret;
%1 = call i32 @callee(%struct.1float %data)
>From debd06ecaf36b98c27be3728b67d80d3c1755f8a Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Thu, 22 May 2025 16:32:30 +0000
Subject: [PATCH 2/6] more cleanup
---
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 678 +++++++++++------------
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 505 ++++++++---------
2 files changed, 571 insertions(+), 612 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 27035064c1f03..8e67aef76dced 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -282,22 +282,20 @@ class BasicNVPTXInst<dag outs, dag insv, string asmstr, list<dag> pattern = []>
multiclass I3Inst<string op_str, SDPatternOperator op_node, RegTyInfo t,
bit commutative, list<Predicate> requires = []> {
- defvar asmstr = op_str # " \t$dst, $a, $b;";
-
def rr :
- NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.RC:$b),
- asmstr,
+ BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.RC:$b),
+ op_str,
[(set t.Ty:$dst, (op_node t.Ty:$a, t.Ty:$b))]>,
Requires<requires>;
def ri :
- NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.Imm:$b),
- asmstr,
+ BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.Imm:$b),
+ op_str,
[(set t.Ty:$dst, (op_node t.Ty:$a, (t.Ty imm:$b)))]>,
Requires<requires>;
if !not(commutative) then
def ir :
- NVPTXInst<(outs t.RC:$dst), (ins t.Imm:$a, t.RC:$b),
- asmstr,
+ BasicNVPTXInst<(outs t.RC:$dst), (ins t.Imm:$a, t.RC:$b),
+ op_str,
[(set t.Ty:$dst, (op_node (t.Ty imm:$a), t.Ty:$b))]>,
Requires<requires>;
}
@@ -310,8 +308,8 @@ multiclass I3<string op_str, SDPatternOperator op_node, bit commutative> {
}
class I16x2<string OpcStr, SDNode OpNode> :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, "16x2 \t$dst, $a, $b;"),
+ BasicNVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ OpcStr # "16x2",
[(set v2i16:$dst, (OpNode v2i16:$a, v2i16:$b))]>,
Requires<[hasPTX<80>, hasSM<90>]>;
@@ -332,74 +330,74 @@ multiclass ADD_SUB_INT_CARRY<string op_str, SDNode op_node, bit commutative> {
multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDNode OpNode> {
if !not(NaN) then {
def f64rr :
- NVPTXInst<(outs Float64Regs:$dst),
+ BasicNVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, Float64Regs:$b),
- !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+ OpcStr # ".f64",
[(set f64:$dst, (OpNode f64:$a, f64:$b))]>;
def f64ri :
- NVPTXInst<(outs Float64Regs:$dst),
+ BasicNVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, f64imm:$b),
- !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+ OpcStr # ".f64",
[(set f64:$dst, (OpNode f64:$a, fpimm:$b))]>;
}
def f32rr_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
+ BasicNVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
- !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+ OpcStr # ".ftz.f32",
[(set f32:$dst, (OpNode f32:$a, f32:$b))]>,
Requires<[doF32FTZ]>;
def f32ri_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
+ BasicNVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
- !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+ OpcStr # ".ftz.f32",
[(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>,
Requires<[doF32FTZ]>;
def f32rr :
- NVPTXInst<(outs Float32Regs:$dst),
+ BasicNVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
- !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+ OpcStr # ".f32",
[(set f32:$dst, (OpNode f32:$a, f32:$b))]>;
def f32ri :
- NVPTXInst<(outs Float32Regs:$dst),
+ BasicNVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
- !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+ OpcStr # ".f32",
[(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>;
def f16rr_ftz :
- NVPTXInst<(outs Int16Regs:$dst),
+ BasicNVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, Int16Regs:$b),
- !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"),
+ OpcStr # ".ftz.f16",
[(set f16:$dst, (OpNode f16:$a, f16:$b))]>,
Requires<[useFP16Math, doF32FTZ]>;
def f16rr :
- NVPTXInst<(outs Int16Regs:$dst),
+ BasicNVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, Int16Regs:$b),
- !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
+ OpcStr # ".f16",
[(set f16:$dst, (OpNode f16:$a, f16:$b))]>,
Requires<[useFP16Math, hasSM<80>, hasPTX<70>]>;
def f16x2rr_ftz :
- NVPTXInst<(outs Int32Regs:$dst),
+ BasicNVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
+ OpcStr # ".ftz.f16x2",
[(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>,
Requires<[useFP16Math, hasSM<80>, hasPTX<70>, doF32FTZ]>;
def f16x2rr :
- NVPTXInst<(outs Int32Regs:$dst),
+ BasicNVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
+ OpcStr # ".f16x2",
[(set v2f16:$dst, (OpNode v2f16:$a, v2f16:$b))]>,
Requires<[useFP16Math, hasSM<80>, hasPTX<70>]>;
def bf16rr :
- NVPTXInst<(outs Int16Regs:$dst),
+ BasicNVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, Int16Regs:$b),
- !strconcat(OpcStr, ".bf16 \t$dst, $a, $b;"),
+ OpcStr # ".bf16",
[(set bf16:$dst, (OpNode bf16:$a, bf16:$b))]>,
Requires<[hasBF16Math, hasSM<80>, hasPTX<70>]>;
def bf16x2rr :
- NVPTXInst<(outs Int32Regs:$dst),
+ BasicNVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, ".bf16x2 \t$dst, $a, $b;"),
+ OpcStr # ".bf16x2",
[(set v2bf16:$dst, (OpNode v2bf16:$a, v2bf16:$b))]>,
Requires<[hasBF16Math, hasSM<80>, hasPTX<70>]>;
}
@@ -415,74 +413,74 @@ multiclass FMINIMUMMAXIMUM<string OpcStr, bit NaN, SDNode OpNode> {
// just like the non ".rn" op, but prevents ptxas from creating FMAs.
multiclass F3<string op_str, SDPatternOperator op_pat> {
def f64rr :
- NVPTXInst<(outs Float64Regs:$dst),
+ BasicNVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, Float64Regs:$b),
- op_str # ".f64 \t$dst, $a, $b;",
+ op_str # ".f64",
[(set f64:$dst, (op_pat f64:$a, f64:$b))]>;
def f64ri :
- NVPTXInst<(outs Float64Regs:$dst),
+ BasicNVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, f64imm:$b),
- op_str # ".f64 \t$dst, $a, $b;",
+ op_str # ".f64",
[(set f64:$dst, (op_pat f64:$a, fpimm:$b))]>;
def f32rr_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
+ BasicNVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
- op_str # ".ftz.f32 \t$dst, $a, $b;",
+ op_str # ".ftz.f32",
[(set f32:$dst, (op_pat f32:$a, f32:$b))]>,
Requires<[doF32FTZ]>;
def f32ri_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
+ BasicNVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
- op_str # ".ftz.f32 \t$dst, $a, $b;",
+ op_str # ".ftz.f32",
[(set f32:$dst, (op_pat f32:$a, fpimm:$b))]>,
Requires<[doF32FTZ]>;
def f32rr :
- NVPTXInst<(outs Float32Regs:$dst),
+ BasicNVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
- op_str # ".f32 \t$dst, $a, $b;",
+ op_str # ".f32",
[(set f32:$dst, (op_pat f32:$a, f32:$b))]>;
def f32ri :
- NVPTXInst<(outs Float32Regs:$dst),
+ BasicNVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
- op_str # ".f32 \t$dst, $a, $b;",
+ op_str # ".f32",
[(set f32:$dst, (op_pat f32:$a, fpimm:$b))]>;
def f16rr_ftz :
- NVPTXInst<(outs Int16Regs:$dst),
+ BasicNVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, Int16Regs:$b),
- op_str # ".ftz.f16 \t$dst, $a, $b;",
+ op_str # ".ftz.f16",
[(set f16:$dst, (op_pat f16:$a, f16:$b))]>,
Requires<[useFP16Math, doF32FTZ]>;
def f16rr :
- NVPTXInst<(outs Int16Regs:$dst),
+ BasicNVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, Int16Regs:$b),
- op_str # ".f16 \t$dst, $a, $b;",
+ op_str # ".f16",
[(set f16:$dst, (op_pat f16:$a, f16:$b))]>,
Requires<[useFP16Math]>;
def f16x2rr_ftz :
- NVPTXInst<(outs Int32Regs:$dst),
+ BasicNVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b),
- op_str # ".ftz.f16x2 \t$dst, $a, $b;",
+ op_str # ".ftz.f16x2",
[(set v2f16:$dst, (op_pat v2f16:$a, v2f16:$b))]>,
Requires<[useFP16Math, doF32FTZ]>;
def f16x2rr :
- NVPTXInst<(outs Int32Regs:$dst),
+ BasicNVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b),
- op_str # ".f16x2 \t$dst, $a, $b;",
+ op_str # ".f16x2",
[(set v2f16:$dst, (op_pat v2f16:$a, v2f16:$b))]>,
Requires<[useFP16Math]>;
def bf16rr :
- NVPTXInst<(outs Int16Regs:$dst),
+ BasicNVPTXInst<(outs Int16Regs:$dst),
(ins Int16Regs:$a, Int16Regs:$b),
- op_str # ".bf16 \t$dst, $a, $b;",
+ op_str # ".bf16",
[(set bf16:$dst, (op_pat bf16:$a, bf16:$b))]>,
Requires<[hasBF16Math]>;
def bf16x2rr :
- NVPTXInst<(outs Int32Regs:$dst),
+ BasicNVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b),
- op_str # ".bf16x2 \t$dst, $a, $b;",
+ op_str # ".bf16x2",
[(set v2bf16:$dst, (op_pat v2bf16:$a, v2bf16:$b))]>,
Requires<[hasBF16Math]>;
}
@@ -502,41 +500,41 @@ multiclass F3_fma_component<string op_str, SDNode op_node> {
// instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush
// subnormal inputs and results to zero).
multiclass F2<string OpcStr, SDNode OpNode> {
- def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
- !strconcat(OpcStr, ".f64 \t$dst, $a;"),
+ def f64 : BasicNVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
+ OpcStr # ".f64",
[(set f64:$dst, (OpNode f64:$a))]>;
- def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
- !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"),
+ def f32_ftz : BasicNVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+ OpcStr # ".ftz.f32",
[(set f32:$dst, (OpNode f32:$a))]>,
Requires<[doF32FTZ]>;
- def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
- !strconcat(OpcStr, ".f32 \t$dst, $a;"),
+ def f32 : BasicNVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+ OpcStr # ".f32",
[(set f32:$dst, (OpNode f32:$a))]>;
}
multiclass F2_Support_Half<string OpcStr, SDNode OpNode> {
- def bf16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
- !strconcat(OpcStr, ".bf16 \t$dst, $a;"),
+ def bf16 : BasicNVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
+ OpcStr # ".bf16",
[(set bf16:$dst, (OpNode bf16:$a))]>,
Requires<[hasSM<80>, hasPTX<70>]>;
- def bf16x2 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
- !strconcat(OpcStr, ".bf16x2 \t$dst, $a;"),
+ def bf16x2 : BasicNVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
+ OpcStr # ".bf16x2",
[(set v2bf16:$dst, (OpNode v2bf16:$a))]>,
Requires<[hasSM<80>, hasPTX<70>]>;
- def f16_ftz : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
- !strconcat(OpcStr, ".ftz.f16 \t$dst, $a;"),
+ def f16_ftz : BasicNVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
+ OpcStr # ".ftz.f16",
[(set f16:$dst, (OpNode f16:$a))]>,
Requires<[hasSM<53>, hasPTX<65>, doF32FTZ]>;
- def f16x2_ftz : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
- !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a;"),
+ def f16x2_ftz : BasicNVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
+ OpcStr # ".ftz.f16x2",
[(set v2f16:$dst, (OpNode v2f16:$a))]>,
Requires<[hasSM<53>, hasPTX<65>, doF32FTZ]>;
- def f16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
- !strconcat(OpcStr, ".f16 \t$dst, $a;"),
+ def f16 : BasicNVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
+ OpcStr # ".f16",
[(set f16:$dst, (OpNode f16:$a))]>,
Requires<[hasSM<53>, hasPTX<65>]>;
- def f16x2 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
- !strconcat(OpcStr, ".f16x2 \t$dst, $a;"),
+ def f16x2 : BasicNVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
+ OpcStr # ".f16x2",
[(set v2f16:$dst, (OpNode v2f16:$a))]>,
Requires<[hasSM<53>, hasPTX<65>]>;
@@ -544,12 +542,12 @@ multiclass F2_Support_Half<string OpcStr, SDNode OpNode> {
// Variant where only .ftz.bf16 is supported.
multiclass F2_Support_Half_BF<string OpcStr, SDNode OpNode> {
- def bf16_ftz : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
- OpcStr # ".ftz.bf16 \t$dst, $a;",
+ def bf16_ftz : BasicNVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a),
+ OpcStr # ".ftz.bf16",
[(set bf16:$dst, (OpNode bf16:$a))]>,
Requires<[hasSM<90>, hasPTX<78>]>;
- def bf16x2_ftz: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
- OpcStr # ".ftz.bf16x2 \t$dst, $a;",
+ def bf16x2_ftz: BasicNVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
+ OpcStr # ".ftz.bf16x2",
[(set v2bf16:$dst, (OpNode v2bf16:$a))]>,
Requires<[hasSM<90>, hasPTX<78>]>;
}
@@ -568,83 +566,71 @@ let hasSideEffects = false in {
// be CvtNONE to omit a conversion mode.
multiclass CVT_FROM_ALL<string ToType, RegisterClass RC, list<Predicate> Preds = []> {
def _s8 :
- NVPTXInst<(outs RC:$dst),
- (ins Int16Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- ToType, ".s8 \t$dst, $src;"), []>,
+ BasicFlagsNVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src), (ins CvtMode:$mode),
+ "cvt${mode:base}${mode:ftz}${mode:sat}." # ToType # ".s8">,
Requires<Preds>;
def _u8 :
- NVPTXInst<(outs RC:$dst),
- (ins Int16Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- ToType, ".u8 \t$dst, $src;"), []>,
+ BasicFlagsNVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src), (ins CvtMode:$mode),
+ "cvt${mode:base}${mode:ftz}${mode:sat}." # ToType # ".u8">,
Requires<Preds>;
def _s16 :
- NVPTXInst<(outs RC:$dst),
- (ins Int16Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- ToType, ".s16 \t$dst, $src;"), []>,
+ BasicFlagsNVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src), (ins CvtMode:$mode),
+ "cvt${mode:base}${mode:ftz}${mode:sat}." # ToType # ".s16">,
Requires<Preds>;
def _u16 :
- NVPTXInst<(outs RC:$dst),
- (ins Int16Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- ToType, ".u16 \t$dst, $src;"), []>,
+ BasicFlagsNVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src), (ins CvtMode:$mode),
+ "cvt${mode:base}${mode:ftz}${mode:sat}." # ToType # ".u16">,
Requires<Preds>;
def _s32 :
- NVPTXInst<(outs RC:$dst),
- (ins Int32Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- ToType, ".s32 \t$dst, $src;"), []>,
+ BasicFlagsNVPTXInst<(outs RC:$dst),
+ (ins Int32Regs:$src), (ins CvtMode:$mode),
+ "cvt${mode:base}${mode:ftz}${mode:sat}." # ToType # ".s32">,
Requires<Preds>;
def _u32 :
- NVPTXInst<(outs RC:$dst),
- (ins Int32Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- ToType, ".u32 \t$dst, $src;"), []>,
+ BasicFlagsNVPTXInst<(outs RC:$dst),
+ (ins Int32Regs:$src), (ins CvtMode:$mode),
+ "cvt${mode:base}${mode:ftz}${mode:sat}." # ToType # ".u32">,
Requires<Preds>;
def _s64 :
- NVPTXInst<(outs RC:$dst),
- (ins Int64Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- ToType, ".s64 \t$dst, $src;"), []>,
+ BasicFlagsNVPTXInst<(outs RC:$dst),
+ (ins Int64Regs:$src), (ins CvtMode:$mode),
+ "cvt${mode:base}${mode:ftz}${mode:sat}." # ToType # ".s64">,
Requires<Preds>;
def _u64 :
- NVPTXInst<(outs RC:$dst),
- (ins Int64Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- ToType, ".u64 \t$dst, $src;"), []>,
+ BasicFlagsNVPTXInst<(outs RC:$dst),
+ (ins Int64Regs:$src), (ins CvtMode:$mode),
+ "cvt${mode:base}${mode:ftz}${mode:sat}." # ToType # ".u64">,
Requires<Preds>;
def _f16 :
- NVPTXInst<(outs RC:$dst),
- (ins Int16Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- ToType, ".f16 \t$dst, $src;"), []>,
+ BasicFlagsNVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src), (ins CvtMode:$mode),
+ "cvt${mode:base}${mode:ftz}${mode:sat}." # ToType # ".f16">,
Requires<Preds>;
def _bf16 :
- NVPTXInst<(outs RC:$dst),
- (ins Int16Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}.",
- ToType, ".bf16 \t$dst, $src;"), []>,
+ BasicFlagsNVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src), (ins CvtMode:$mode),
+ "cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}." # ToType # ".bf16">,
Requires<!if(!eq(ToType, "f32"),
// bf16->f32 was introduced early.
[hasPTX<71>, hasSM<80>],
// bf16->everything else needs sm90/ptx78
[hasPTX<78>, hasSM<90>])>;
def _f32 :
- NVPTXInst<(outs RC:$dst),
- (ins Float32Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}.",
- ToType, ".f32 \t$dst, $src;"), []>,
+ BasicFlagsNVPTXInst<(outs RC:$dst),
+ (ins Float32Regs:$src), (ins CvtMode:$mode),
+ "cvt${mode:base}${mode:ftz}${mode:relu}${mode:sat}." # ToType # ".f32">,
Requires<!if(!eq(ToType, "bf16"),
// f32->bf16 was introduced early.
[hasPTX<70>, hasSM<80>],
Preds)>;
def _f64 :
- NVPTXInst<(outs RC:$dst),
- (ins Float64Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- ToType, ".f64 \t$dst, $src;"), []>,
+ BasicFlagsNVPTXInst<(outs RC:$dst),
+ (ins Float64Regs:$src), (ins CvtMode:$mode),
+ "cvt${mode:base}${mode:ftz}${mode:sat}." # ToType # ".f64">,
Requires<Preds>;
}
@@ -664,25 +650,24 @@ let hasSideEffects = false in {
// These cvts are different from those above: The source and dest registers
// are of the same type.
- def CVT_INREG_s16_s8 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
- "cvt.s16.s8 \t$dst, $src;", []>;
- def CVT_INREG_s32_s8 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
- "cvt.s32.s8 \t$dst, $src;", []>;
- def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
- "cvt.s32.s16 \t$dst, $src;", []>;
- def CVT_INREG_s64_s8 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
- "cvt.s64.s8 \t$dst, $src;", []>;
- def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
- "cvt.s64.s16 \t$dst, $src;", []>;
- def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
- "cvt.s64.s32 \t$dst, $src;", []>;
+ def CVT_INREG_s16_s8 : BasicNVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+ "cvt.s16.s8">;
+ def CVT_INREG_s32_s8 : BasicNVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+ "cvt.s32.s8">;
+ def CVT_INREG_s32_s16 : BasicNVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+ "cvt.s32.s16">;
+ def CVT_INREG_s64_s8 : BasicNVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "cvt.s64.s8">;
+ def CVT_INREG_s64_s16 : BasicNVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "cvt.s64.s16">;
+ def CVT_INREG_s64_s32 : BasicNVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "cvt.s64.s32">;
multiclass CVT_FROM_FLOAT_V2_SM80<string FromName, RegisterClass RC> {
def _f32 :
- NVPTXInst<(outs RC:$dst),
- (ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:relu}.",
- FromName, ".f32 \t$dst, $src1, $src2;"), []>,
+ BasicFlagsNVPTXInst<(outs RC:$dst),
+ (ins Float32Regs:$src1, Float32Regs:$src2), (ins CvtMode:$mode),
+ "cvt${mode:base}${mode:relu}." # FromName # ".f32">,
Requires<[hasPTX<70>, hasSM<80>]>;
}
@@ -692,16 +677,14 @@ let hasSideEffects = false in {
// FP8 conversions.
multiclass CVT_TO_F8X2<string F8Name> {
def _f32 :
- NVPTXInst<(outs Int16Regs:$dst),
- (ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode),
- !strconcat("cvt${mode:base}.satfinite${mode:relu}.",
- F8Name, "x2.f32 \t$dst, $src1, $src2;"), []>,
+ BasicFlagsNVPTXInst<(outs Int16Regs:$dst),
+ (ins Float32Regs:$src1, Float32Regs:$src2), (ins CvtMode:$mode),
+ "cvt${mode:base}.satfinite${mode:relu}." # F8Name # "x2.f32">,
Requires<[hasPTX<81>, hasSM<89>]>;
def _f16x2 :
- NVPTXInst<(outs Int16Regs:$dst),
- (ins Int32Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}.satfinite${mode:relu}.",
- F8Name, "x2.f16x2 \t$dst, $src;"), []>,
+ BasicFlagsNVPTXInst<(outs Int16Regs:$dst),
+ (ins Int32Regs:$src), (ins CvtMode:$mode),
+ "cvt${mode:base}.satfinite${mode:relu}." # F8Name # "x2.f16x2">,
Requires<[hasPTX<81>, hasSM<89>]>;
}
@@ -709,10 +692,9 @@ let hasSideEffects = false in {
defm CVT_e5m2x2 : CVT_TO_F8X2<"e5m2">;
class CVT_f16x2_fp8<string F8Name> :
- NVPTXInst<(outs Int32Regs:$dst),
- (ins Int16Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:relu}.f16x2.",
- F8Name, "x2 \t$dst, $src;"), []>,
+ BasicFlagsNVPTXInst<(outs Int32Regs:$dst),
+ (ins Int16Regs:$src), (ins CvtMode:$mode),
+ "cvt${mode:base}${mode:relu}.f16x2." # F8Name # "x2">,
Requires<[hasPTX<81>, hasSM<89>]>;
def CVT_f16x2_e4m3x2 : CVT_f16x2_fp8<"e4m3">;
@@ -722,8 +704,8 @@ let hasSideEffects = false in {
multiclass CVT_TO_TF32<string Modifier, list<Predicate> Preds = [hasPTX<78>, hasSM<90>]> {
defvar Intr = !cast<Intrinsic>("int_nvvm_f2tf32_" # !subst(".", "_", Modifier));
- def NAME : NVPTXInst<(outs Int32Regs:$dst), (ins Float32Regs:$src),
- "cvt." # Modifier # ".tf32.f32 \t$dst, $src;",
+ def NAME : BasicNVPTXInst<(outs Int32Regs:$dst), (ins Float32Regs:$src),
+ "cvt." # Modifier # ".tf32.f32",
[(set i32:$dst, (Intr f32:$src))]>,
Requires<Preds>;
}
@@ -742,15 +724,12 @@ let hasSideEffects = false in {
// FP6 conversions.
foreach type = ["e2m3x2", "e3m2x2"] in {
- def CVT_ # type # _f32_sf : NVPTXInst<(outs Int16Regs:$dst),
- (ins Float32Regs:$src1,
- Float32Regs:$src2, CvtMode:$mode),
- "cvt${mode:base}.satfinite${mode:relu}."
- # type # ".f32 \t$dst, $src1, $src2;", []>;
- def CVT_f16x2_ # type : NVPTXInst<(outs Int32Regs:$dst),
- (ins Int16Regs:$src, CvtMode:$mode),
- "cvt${mode:base}${mode:relu}.f16x2."
- # type # " \t$dst, $src;", []>;
+ def CVT_ # type # _f32_sf : BasicFlagsNVPTXInst<(outs Int16Regs:$dst),
+ (ins Float32Regs:$src1, Float32Regs:$src2), (ins CvtMode:$mode),
+ "cvt${mode:base}.satfinite${mode:relu}." # type # ".f32">;
+ def CVT_f16x2_ # type : BasicFlagsNVPTXInst<(outs Int32Regs:$dst),
+ (ins Int16Regs:$src), (ins CvtMode:$mode),
+ "cvt${mode:base}${mode:relu}.f16x2." # type>;
}
// FP4 conversions.
@@ -772,14 +751,14 @@ let hasSideEffects = false in {
// UE8M0x2 conversions.
class CVT_f32_to_ue8m0x2<string sat = ""> :
- NVPTXInst<(outs Int16Regs:$dst),
- (ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode),
- "cvt${mode:base}" # sat # ".ue8m0x2.f32 \t$dst, $src1, $src2;", []>;
+ BasicFlagsNVPTXInst<(outs Int16Regs:$dst),
+ (ins Float32Regs:$src1, Float32Regs:$src2), (ins CvtMode:$mode),
+ "cvt${mode:base}" # sat # ".ue8m0x2.f32">;
class CVT_bf16x2_to_ue8m0x2<string sat = ""> :
- NVPTXInst<(outs Int16Regs:$dst),
- (ins Int32Regs:$src, CvtMode:$mode),
- "cvt${mode:base}" # sat # ".ue8m0x2.bf16x2 \t$dst, $src;", []>;
+ BasicFlagsNVPTXInst<(outs Int16Regs:$dst),
+ (ins Int32Regs:$src), (ins CvtMode:$mode),
+ "cvt${mode:base}" # sat # ".ue8m0x2.bf16x2">;
def CVT_ue8m0x2_f32 : CVT_f32_to_ue8m0x2;
def CVT_ue8m0x2_f32_sf : CVT_f32_to_ue8m0x2<".satfinite">;
@@ -787,9 +766,9 @@ let hasSideEffects = false in {
def CVT_ue8m0x2_bf16x2_sf : CVT_bf16x2_to_ue8m0x2<".satfinite">;
def CVT_bf16x2_ue8m0x2 :
- NVPTXInst<(outs Int32Regs:$dst),
- (ins Int16Regs:$src),
- "cvt.rn.bf16x2.ue8m0x2 \t$dst, $src;", []>;
+ BasicNVPTXInst<(outs Int32Regs:$dst),
+ (ins Int16Regs:$src),
+ "cvt.rn.bf16x2.ue8m0x2">;
}
@@ -814,24 +793,24 @@ def : Pat<(v2f16 (build_vector (f16 (fpround_oneuse f32:$lo)),
// them within this file.
let hasSideEffects = false in {
multiclass SELP_PATTERN<string TypeStr, RegTyInfo t> {
- defvar asm_str = "selp." # TypeStr # " \t$dst, $a, $b, $p;";
+ defvar asm_str = "selp." # TypeStr;
def rr :
- NVPTXInst<(outs t.RC:$dst),
+ BasicNVPTXInst<(outs t.RC:$dst),
(ins t.RC:$a, t.RC:$b, Int1Regs:$p),
asm_str,
[(set t.Ty:$dst, (select i1:$p, t.Ty:$a, t.Ty:$b))]>;
def ri :
- NVPTXInst<(outs t.RC:$dst),
+ BasicNVPTXInst<(outs t.RC:$dst),
(ins t.RC:$a, t.Imm:$b, Int1Regs:$p),
asm_str,
[(set t.Ty:$dst, (select i1:$p, t.Ty:$a, t.ImmNode:$b))]>;
def ir :
- NVPTXInst<(outs t.RC:$dst),
+ BasicNVPTXInst<(outs t.RC:$dst),
(ins t.Imm:$a, t.RC:$b, Int1Regs:$p),
asm_str,
[(set t.Ty:$dst, (select i1:$p, t.ImmNode:$a, t.Ty:$b))]>;
def ii :
- NVPTXInst<(outs t.RC:$dst),
+ BasicNVPTXInst<(outs t.RC:$dst),
(ins t.Imm:$a, t.Imm:$b, Int1Regs:$p),
asm_str,
[(set t.Ty:$dst, (select i1:$p, t.ImmNode:$a, t.ImmNode:$b))]>;
@@ -863,11 +842,11 @@ def : Pat<(vt (select i1:$p, vt:$a, vt:$b)),
def fabs_oneuse : OneUse1<fabs>;
-def TESTINF_f32r : NVPTXInst<(outs Int1Regs:$p), (ins Float32Regs:$a),
- "testp.infinite.f32 \t$p, $a;",
+def TESTINF_f32r : BasicNVPTXInst<(outs Int1Regs:$p), (ins Float32Regs:$a),
+ "testp.infinite.f32",
[(set i1:$p, (seteq (fabs_oneuse f32:$a), fpimm_pos_inf<f32>))]>;
-def TESTINF_f64r : NVPTXInst<(outs Int1Regs:$p), (ins Float64Regs:$a),
- "testp.infinite.f64 \t$p, $a;",
+def TESTINF_f64r : BasicNVPTXInst<(outs Int1Regs:$p), (ins Float64Regs:$a),
+ "testp.infinite.f64",
[(set i1:$p, (seteq (fabs_oneuse f64:$a), fpimm_pos_inf<f64>))]>;
//-----------------------------------
@@ -876,11 +855,11 @@ def TESTINF_f64r : NVPTXInst<(outs Int1Regs:$p), (ins Float64Regs:$a),
// Template for xor masquerading as int1 arithmetic.
multiclass ADD_SUB_i1<SDNode OpNode> {
- def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
- "xor.pred \t$dst, $a, $b;",
+ def _rr: BasicNVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
+ "xor.pred",
[(set i1:$dst, (OpNode i1:$a, i1:$b))]>;
- def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
- "xor.pred \t$dst, $a, $b;",
+ def _ri: BasicNVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
+ "xor.pred",
[(set i1:$dst, (OpNode i1:$a, (imm):$b))]>;
}
@@ -920,8 +899,8 @@ defm UREM : I3<"rem.u", urem, commutative = false>;
// This idiom implements the algorithm at
// http://graphics.stanford.edu/~seander/bithacks.html#IntegerAbs.
multiclass ABS<ValueType T, RegisterClass RC, string SizeName> {
- def : NVPTXInst<(outs RC:$dst), (ins RC:$a),
- !strconcat("abs", SizeName, " \t$dst, $a;"),
+ def : BasicNVPTXInst<(outs RC:$dst), (ins RC:$a),
+ "abs" # SizeName,
[(set T:$dst, (abs T:$a))]>;
}
defm ABS_16 : ABS<i16, Int16Regs, ".s16">;
@@ -944,44 +923,44 @@ def UMIN16x2 : I16x2<"min.u", umin>;
// Wide multiplication
//
def MULWIDES64 :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- "mul.wide.s32 \t$dst, $a, $b;", []>;
+ BasicNVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ "mul.wide.s32">;
def MULWIDES64Imm :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- "mul.wide.s32 \t$dst, $a, $b;", []>;
+ BasicNVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ "mul.wide.s32">;
def MULWIDES64Imm64 :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
- "mul.wide.s32 \t$dst, $a, $b;", []>;
+ BasicNVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
+ "mul.wide.s32">;
def MULWIDEU64 :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- "mul.wide.u32 \t$dst, $a, $b;", []>;
+ BasicNVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ "mul.wide.u32">;
def MULWIDEU64Imm :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- "mul.wide.u32 \t$dst, $a, $b;", []>;
+ BasicNVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ "mul.wide.u32">;
def MULWIDEU64Imm64 :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
- "mul.wide.u32 \t$dst, $a, $b;", []>;
+ BasicNVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
+ "mul.wide.u32">;
def MULWIDES32 :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
- "mul.wide.s16 \t$dst, $a, $b;", []>;
+ BasicNVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ "mul.wide.s16">;
def MULWIDES32Imm :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
- "mul.wide.s16 \t$dst, $a, $b;", []>;
+ BasicNVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ "mul.wide.s16">;
def MULWIDES32Imm32 :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
- "mul.wide.s16 \t$dst, $a, $b;", []>;
+ BasicNVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+ "mul.wide.s16">;
def MULWIDEU32 :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
- "mul.wide.u16 \t$dst, $a, $b;", []>;
+ BasicNVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ "mul.wide.u16">;
def MULWIDEU32Imm :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
- "mul.wide.u16 \t$dst, $a, $b;", []>;
+ BasicNVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ "mul.wide.u16">;
def MULWIDEU32Imm32 :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
- "mul.wide.u16 \t$dst, $a, $b;", []>;
+ BasicNVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+ "mul.wide.u16">;
def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
@@ -1112,25 +1091,25 @@ def mul_oneuse : PatFrag<(ops node:$a, node:$b), (mul node:$a, node:$b), [{
multiclass MAD<string Ptx, ValueType VT, NVPTXRegClass Reg, Operand Imm> {
def rrr:
- NVPTXInst<(outs Reg:$dst),
+ BasicNVPTXInst<(outs Reg:$dst),
(ins Reg:$a, Reg:$b, Reg:$c),
- Ptx # " \t$dst, $a, $b, $c;",
+ Ptx,
[(set VT:$dst, (add (mul_oneuse VT:$a, VT:$b), VT:$c))]>;
def rir:
- NVPTXInst<(outs Reg:$dst),
+ BasicNVPTXInst<(outs Reg:$dst),
(ins Reg:$a, Imm:$b, Reg:$c),
- Ptx # " \t$dst, $a, $b, $c;",
+ Ptx,
[(set VT:$dst, (add (mul_oneuse VT:$a, imm:$b), VT:$c))]>;
def rri:
- NVPTXInst<(outs Reg:$dst),
+ BasicNVPTXInst<(outs Reg:$dst),
(ins Reg:$a, Reg:$b, Imm:$c),
- Ptx # " \t$dst, $a, $b, $c;",
+ Ptx,
[(set VT:$dst, (add (mul_oneuse VT:$a, VT:$b), imm:$c))]>;
def rii:
- NVPTXInst<(outs Reg:$dst),
+ BasicNVPTXInst<(outs Reg:$dst),
(ins Reg:$a, Imm:$b, Imm:$c),
- Ptx # " \t$dst, $a, $b, $c;",
+ Ptx,
[(set VT:$dst, (add (mul_oneuse VT:$a, imm:$b), imm:$c))]>;
}
@@ -1141,16 +1120,16 @@ defm MAD64 : MAD<"mad.lo.s64", i64, Int64Regs, i64imm>;
}
def INEG16 :
- NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
- "neg.s16 \t$dst, $src;",
+ BasicNVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+ "neg.s16",
[(set i16:$dst, (ineg i16:$src))]>;
def INEG32 :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
- "neg.s32 \t$dst, $src;",
+ BasicNVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+ "neg.s32",
[(set i32:$dst, (ineg i32:$src))]>;
def INEG64 :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
- "neg.s64 \t$dst, $src;",
+ BasicNVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "neg.s64",
[(set i64:$dst, (ineg i64:$src))]>;
//-----------------------------------
@@ -1195,8 +1174,8 @@ defm FEXP2_H: F2_Support_Half_BF<"ex2.approx", fexp2>;
// F16 NEG
//
class FNEG_F16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> :
- NVPTXInst<(outs RC:$dst), (ins RC:$src),
- !strconcat(OpcStr, " \t$dst, $src;"),
+ BasicNVPTXInst<(outs RC:$dst), (ins RC:$src),
+ OpcStr,
[(set T:$dst, (fneg T:$src))]>,
Requires<[useFP16Math, hasPTX<60>, hasSM<53>, Pred]>;
def FNEG16_ftz : FNEG_F16_F16X2<"neg.ftz.f16", f16, Int16Regs, doF32FTZ>;
@@ -1209,8 +1188,8 @@ def FNEG16x2 : FNEG_F16_F16X2<"neg.f16x2", v2f16, Int32Regs, True>;
//
class FNEG_BF16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> :
- NVPTXInst<(outs RC:$dst), (ins RC:$src),
- !strconcat(OpcStr, " \t$dst, $src;"),
+ BasicNVPTXInst<(outs RC:$dst), (ins RC:$src),
+ OpcStr,
[(set T:$dst, (fneg T:$src))]>,
Requires<[hasBF16Math, hasPTX<70>, hasSM<80>, Pred]>;
def BFNEG16_ftz : FNEG_BF16_F16X2<"neg.ftz.bf16", bf16, Int16Regs, doF32FTZ>;
@@ -1383,30 +1362,29 @@ def FDIV32ri_prec :
// FMA
//
-multiclass FMA<string OpcStr, RegTyInfo t, list<Predicate> Preds = []> {
- defvar asmstr = OpcStr # " \t$dst, $a, $b, $c;";
- def rrr : NVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.RC:$b, t.RC:$c),
+multiclass FMA<string asmstr, RegTyInfo t, list<Predicate> Preds = []> {
+ def rrr : BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$a, t.RC:$b, t.RC:$c),
asmstr,
[(set t.Ty:$dst, (fma t.Ty:$a, t.Ty:$b, t.Ty:$c))]>,
Requires<Preds>;
if t.SupportsImm then {
- def rri : NVPTXInst<(outs t.RC:$dst),
+ def rri : BasicNVPTXInst<(outs t.RC:$dst),
(ins t.RC:$a, t.RC:$b, t.Imm:$c),
asmstr,
[(set t.Ty:$dst, (fma t.Ty:$a, t.Ty:$b, fpimm:$c))]>,
Requires<Preds>;
- def rir : NVPTXInst<(outs t.RC:$dst),
+ def rir : BasicNVPTXInst<(outs t.RC:$dst),
(ins t.RC:$a, t.Imm:$b, t.RC:$c),
asmstr,
[(set t.Ty:$dst, (fma t.Ty:$a, fpimm:$b, t.Ty:$c))]>,
Requires<Preds>;
- def rii : NVPTXInst<(outs t.RC:$dst),
+ def rii : BasicNVPTXInst<(outs t.RC:$dst),
(ins t.RC:$a, t.Imm:$b, t.Imm:$c),
asmstr,
[(set t.Ty:$dst, (fma t.Ty:$a, fpimm:$b, fpimm:$c))]>,
Requires<Preds>;
- def iir : NVPTXInst<(outs t.RC:$dst),
+ def iir : BasicNVPTXInst<(outs t.RC:$dst),
(ins t.Imm:$a, t.Imm:$b, t.RC:$c),
asmstr,
[(set t.Ty:$dst, (fma fpimm:$a, fpimm:$b, t.Ty:$c))]>,
@@ -1432,11 +1410,11 @@ class UnaryOpAllowsApproxFn<SDPatternOperator operator>
return allowUnsafeFPMath() || N->getFlags().hasApproximateFuncs();
}]>;
-def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
- "sin.approx.f32 \t$dst, $src;",
+def SINF: BasicNVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+ "sin.approx.f32",
[(set f32:$dst, (UnaryOpAllowsApproxFn<fsin> f32:$src))]>;
-def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
- "cos.approx.f32 \t$dst, $src;",
+def COSF: BasicNVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+ "cos.approx.f32",
[(set f32:$dst, (UnaryOpAllowsApproxFn<fcos> f32:$src))]>;
//-----------------------------------
@@ -1485,17 +1463,17 @@ foreach vt = [v2i16, v4i8] in {
(ANDb32ri $a, imm:$b)>;
}
-def NOT1 : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
- "not.pred \t$dst, $src;",
+def NOT1 : BasicNVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
+ "not.pred",
[(set i1:$dst, (not i1:$src))]>;
-def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
- "not.b16 \t$dst, $src;",
+def NOT16 : BasicNVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+ "not.b16",
[(set i16:$dst, (not i16:$src))]>;
-def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
- "not.b32 \t$dst, $src;",
+def NOT32 : BasicNVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+ "not.b32",
[(set i32:$dst, (not i32:$src))]>;
-def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
- "not.b64 \t$dst, $src;",
+def NOT64 : BasicNVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "not.b64",
[(set i64:$dst, (not i64:$src))]>;
// Template for left/right shifts. Takes three operands,
@@ -1505,32 +1483,32 @@ def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
// This template also defines a 32-bit shift (imm, imm) instruction.
multiclass SHIFT<string OpcStr, SDNode OpNode> {
def i64rr :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ BasicNVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b),
+ OpcStr # "64",
[(set i64:$dst, (OpNode i64:$a, i32:$b))]>;
def i64ri :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ BasicNVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
+ OpcStr # "64",
[(set i64:$dst, (OpNode i64:$a, (i32 imm:$b)))]>;
def i32rr :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ BasicNVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ OpcStr # "32",
[(set i32:$dst, (OpNode i32:$a, i32:$b))]>;
def i32ri :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ BasicNVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ OpcStr # "32",
[(set i32:$dst, (OpNode i32:$a, (i32 imm:$b)))]>;
def i32ii :
- NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ BasicNVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
+ OpcStr # "32",
[(set i32:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;
def i16rr :
- NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ BasicNVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b),
+ OpcStr # "16",
[(set i16:$dst, (OpNode i16:$a, i32:$b))]>;
def i16ri :
- NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ BasicNVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+ OpcStr # "16",
[(set i16:$dst, (OpNode i16:$a, (i32 imm:$b)))]>;
}
@@ -1540,12 +1518,12 @@ defm SRL : SHIFT<"shr.u", srl>;
// Bit-reverse
def BREV32 :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
- "brev.b32 \t$dst, $a;",
+ BasicNVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
+ "brev.b32",
[(set i32:$dst, (bitreverse i32:$a))]>;
def BREV64 :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a),
- "brev.b64 \t$dst, $a;",
+ BasicNVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a),
+ "brev.b64",
[(set i64:$dst, (bitreverse i64:$a))]>;
@@ -1576,52 +1554,52 @@ def prmt : SDNode<"NVPTXISD::PRMT", SDTPRMT>;
multiclass BFE<string Instr, ValueType T, RegisterClass RC> {
def rrr
- : NVPTXInst<(outs RC:$d),
+ : BasicNVPTXInst<(outs RC:$d),
(ins RC:$a, Int32Regs:$b, Int32Regs:$c),
- !strconcat(Instr, " \t$d, $a, $b, $c;"),
+ Instr,
[(set T:$d, (bfe T:$a, i32:$b, i32:$c))]>;
def rri
- : NVPTXInst<(outs RC:$d),
+ : BasicNVPTXInst<(outs RC:$d),
(ins RC:$a, Int32Regs:$b, i32imm:$c),
- !strconcat(Instr, " \t$d, $a, $b, $c;"),
+ Instr,
[(set T:$d, (bfe T:$a, i32:$b, imm:$c))]>;
def rii
- : NVPTXInst<(outs RC:$d),
+ : BasicNVPTXInst<(outs RC:$d),
(ins RC:$a, i32imm:$b, i32imm:$c),
- !strconcat(Instr, " \t$d, $a, $b, $c;"),
+ Instr,
[(set T:$d, (bfe T:$a, imm:$b, imm:$c))]>;
}
multiclass BFI<string Instr, ValueType T, RegisterClass RC, Operand ImmCls> {
def rrrr
- : NVPTXInst<(outs RC:$f),
+ : BasicNVPTXInst<(outs RC:$f),
(ins RC:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
- !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
+ Instr,
[(set T:$f, (bfi T:$a, T:$b, i32:$c, i32:$d))]>;
def rrri
- : NVPTXInst<(outs RC:$f),
+ : BasicNVPTXInst<(outs RC:$f),
(ins RC:$a, RC:$b, Int32Regs:$c, i32imm:$d),
- !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
+ Instr,
[(set T:$f, (bfi T:$a, T:$b, i32:$c, imm:$d))]>;
def rrii
- : NVPTXInst<(outs RC:$f),
+ : BasicNVPTXInst<(outs RC:$f),
(ins RC:$a, RC:$b, i32imm:$c, i32imm:$d),
- !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
+ Instr,
[(set T:$f, (bfi T:$a, T:$b, imm:$c, imm:$d))]>;
def irrr
- : NVPTXInst<(outs RC:$f),
+ : BasicNVPTXInst<(outs RC:$f),
(ins ImmCls:$a, RC:$b, Int32Regs:$c, Int32Regs:$d),
- !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
+ Instr,
[(set T:$f, (bfi (T imm:$a), T:$b, i32:$c, i32:$d))]>;
def irri
- : NVPTXInst<(outs RC:$f),
+ : BasicNVPTXInst<(outs RC:$f),
(ins ImmCls:$a, RC:$b, Int32Regs:$c, i32imm:$d),
- !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
+ Instr,
[(set T:$f, (bfi (T imm:$a), T:$b, i32:$c, imm:$d))]>;
def irii
- : NVPTXInst<(outs RC:$f),
+ : BasicNVPTXInst<(outs RC:$f),
(ins ImmCls:$a, RC:$b, i32imm:$c, i32imm:$d),
- !strconcat(Instr, " \t$f, $a, $b, $c, $d;"),
+ Instr,
[(set T:$f, (bfi (T imm:$a), T:$b, imm:$c, imm:$d))]>;
}
@@ -1711,17 +1689,14 @@ def : Pat<(i16 (sext_inreg (trunc (srl i64:$s, (i32 imm:$o))), i8)),
let hasSideEffects = false in {
multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
def rr :
- NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp),
- !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
- " \t$dst, $a, $b;"), []>;
+ BasicFlagsNVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b), (CmpMode:$cmp),
+ "setp${cmp:base}${cmp:ftz}." # TypeStr>;
def ri :
- NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
- !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
- " \t$dst, $a, $b;"), []>;
+ BasicFlagsNVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b), (CmpMode:$cmp),
+ "setp${cmp:base}${cmp:ftz}." # TypeStr>;
def ir :
- NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
- !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
- " \t$dst, $a, $b;"), []>;
+ BasicFlagsNVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b), (CmpMode:$cmp),
+ "setp${cmp:base}${cmp:ftz}." # TypeStr>;
}
}
@@ -1737,28 +1712,26 @@ defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
def SETP_f16rr :
- NVPTXInst<(outs Int1Regs:$dst),
- (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp),
- "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;",
- []>, Requires<[useFP16Math]>;
+ BasicFlagsNVPTXInst<(outs Int1Regs:$dst),
+ (ins Int16Regs:$a, Int16Regs:$b), (CmpMode:$cmp),
+ "setp${cmp:base}${cmp:ftz}.f16">,
+ Requires<[useFP16Math]>;
def SETP_f16x2rr :
- NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
- (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp),
- "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;",
- []>,
+ BasicFlagsNVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
+ (ins Int32Regs:$a, Int32Regs:$b), (CmpMode:$cmp),
+ "setp${cmp:base}${cmp:ftz}.f16x2">,
Requires<[useFP16Math]>;
def SETP_bf16rr :
- NVPTXInst<(outs Int1Regs:$dst),
- (ins Int16Regs:$a, Int16Regs:$b, CmpMode:$cmp),
- "setp${cmp:base}${cmp:ftz}.bf16 \t$dst, $a, $b;",
- []>, Requires<[hasBF16Math, hasPTX<78>, hasSM<90>]>;
+ BasicFlagsNVPTXInst<(outs Int1Regs:$dst),
+ (ins Int16Regs:$a, Int16Regs:$b), (CmpMode:$cmp),
+ "setp${cmp:base}${cmp:ftz}.bf16">,
+ Requires<[hasBF16Math, hasPTX<78>, hasSM<90>]>;
def SETP_bf16x2rr :
- NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
- (ins Int32Regs:$a, Int32Regs:$b, CmpMode:$cmp),
- "setp${cmp:base}${cmp:ftz}.bf16x2 \t$p|$q, $a, $b;",
- []>,
+ BasicFlagsNVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
+ (ins Int32Regs:$a, Int32Regs:$b), (CmpMode:$cmp),
+ "setp${cmp:base}${cmp:ftz}.bf16x2">,
Requires<[hasBF16Math, hasPTX<78>, hasSM<90>]>;
//-----------------------------------
@@ -1792,11 +1765,11 @@ def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
// Load a memory address into a u32 or u64 register.
-def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins ADDR_base:$a),
- "mov.b32 \t$dst, $a;",
+def MOV_ADDR : BasicNVPTXInst<(outs Int32Regs:$dst), (ins ADDR_base:$a),
+ "mov.b32",
[(set i32:$dst, (Wrapper tglobaladdr:$a))]>;
-def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins ADDR_base:$a),
- "mov.b64 \t$dst, $a;",
+def MOV_ADDR64 : BasicNVPTXInst<(outs Int64Regs:$dst), (ins ADDR_base:$a),
+ "mov.b64",
[(set i64:$dst, (Wrapper tglobaladdr:$a))]>;
// Get pointer to local stack.
@@ -1812,13 +1785,13 @@ let hasSideEffects = false in {
let hasSideEffects = false, isAsCheapAsAMove = true in {
// Class for register-to-register moves
class MOVr<RegisterClass RC, string OpStr> :
- NVPTXInst<(outs RC:$dst), (ins RC:$src),
- "mov." # OpStr # " \t$dst, $src;", []>;
+ BasicNVPTXInst<(outs RC:$dst), (ins RC:$src),
+ "mov." # OpStr>;
// Class for immediate-to-register moves
class MOVi<RegisterClass RC, string OpStr, ValueType VT, Operand IMMType, SDNode ImmNode> :
- NVPTXInst<(outs RC:$dst), (ins IMMType:$src),
- "mov." # OpStr # " \t$dst, $src;",
+ BasicNVPTXInst<(outs RC:$dst), (ins IMMType:$src),
+ "mov." # OpStr,
[(set VT:$dst, ImmNode:$src)]>;
}
@@ -2396,8 +2369,8 @@ def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs, f64>;
def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs, f32>;
class ProxyRegInst<string SzStr, ValueType T, NVPTXRegClass regclass> :
- NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
- !strconcat("mov.", SzStr, " \t$dst, $src;"),
+ BasicNVPTXInst<(outs regclass:$dst), (ins regclass:$src),
+ "mov." # SzStr,
[(set T:$dst, (ProxyReg T:$src))]>;
def ProxyRegI1 : ProxyRegInst<"pred", i1, Int1Regs>;
@@ -2901,17 +2874,17 @@ def fshr_clamp : SDNode<"NVPTXISD::FSHR_CLAMP", SDTIntShiftDOp, []>;
let hasSideEffects = false in {
multiclass ShfInst<string mode, SDNode op> {
def _i
- : NVPTXInst<(outs Int32Regs:$dst),
+ : BasicNVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$lo, Int32Regs:$hi, i32imm:$amt),
- "shf." # mode # ".b32 \t$dst, $lo, $hi, $amt;",
+ "shf." # mode # ".b32",
[(set i32:$dst,
(op i32:$hi, i32:$lo, (i32 imm:$amt)))]>,
Requires<[hasHWROT32]>;
def _r
- : NVPTXInst<(outs Int32Regs:$dst),
+ : BasicNVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
- "shf." # mode # ".b32 \t$dst, $lo, $hi, $amt;",
+ "shf." # mode # ".b32",
[(set i32:$dst,
(op i32:$hi, i32:$lo, i32:$amt))]>,
Requires<[hasHWROT32]>;
@@ -2935,13 +2908,13 @@ def : Pat<(i32 (int_nvvm_fshr_clamp i32:$hi, i32:$lo, (i32 imm:$amt))),
let hasSideEffects = false in {
foreach RT = [I32RT, I64RT] in {
// Count leading zeros
- def CLZr # RT.Size : NVPTXInst<(outs Int32Regs:$d), (ins RT.RC:$a),
- "clz.b" # RT.Size # " \t$d, $a;",
+ def CLZr # RT.Size : BasicNVPTXInst<(outs Int32Regs:$d), (ins RT.RC:$a),
+ "clz.b" # RT.Size,
[(set i32:$d, (ctlz RT.Ty:$a))]>;
// Population count
- def POPCr # RT.Size : NVPTXInst<(outs Int32Regs:$d), (ins RT.RC:$a),
- "popc.b" # RT.Size # " \t$d, $a;",
+ def POPCr # RT.Size : BasicNVPTXInst<(outs Int32Regs:$d), (ins RT.RC:$a),
+ "popc.b" # RT.Size,
[(set i32:$d, (ctpop RT.Ty:$a))]>;
}
}
@@ -3028,7 +3001,7 @@ defm : CVT_ROUND<frint, CvtRNI, CvtRNI_FTZ>;
let isTerminator=1 in {
let isReturn=1, isBarrier=1 in
- def Return : NVPTXInst<(outs), (ins), "ret;", [(retglue)]>;
+ def Return : BasicNVPTXInst<(outs), (ins), "ret", [(retglue)]>;
let isBranch=1 in
def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
@@ -3039,8 +3012,8 @@ let isTerminator=1 in {
"@!$a bra \t$target;", []>;
let isBranch=1, isBarrier=1 in
- def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
- "bra.uni \t$target;", [(br bb:$target)]>;
+ def GOTO : BasicNVPTXInst<(outs), (ins brtarget:$target),
+ "bra.uni", [(br bb:$target)]>;
}
def : Pat<(brcond i32:$a, bb:$target),
@@ -3090,12 +3063,12 @@ def Callseq_End :
[(callseq_end timm:$amt1, timm:$amt2)]>;
// trap instruction
-def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>, Requires<[noPTXASUnreachableBug]>;
+def trapinst : BasicNVPTXInst<(outs), (ins), "trap", [(trap)]>, Requires<[noPTXASUnreachableBug]>;
// Emit an `exit` as well to convey to ptxas that `trap` exits the CFG.
// This won't be necessary in a future version of ptxas.
def trapexitinst : NVPTXInst<(outs), (ins), "trap; exit;", [(trap)]>, Requires<[hasPTXASUnreachableBug]>;
// brkpt instruction
-def debugtrapinst : NVPTXInst<(outs), (ins), "brkpt;", [(debugtrap)]>;
+def debugtrapinst : BasicNVPTXInst<(outs), (ins), "brkpt", [(debugtrap)]>;
// Call prototype wrapper
def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
@@ -3118,9 +3091,9 @@ def dyn_alloca :
foreach t = [I32RT, I64RT] in {
def DYNAMIC_STACKALLOC # t.Size :
- NVPTXInst<(outs t.RC:$ptr),
+ BasicNVPTXInst<(outs t.RC:$ptr),
(ins t.RC:$size, i32imm:$align),
- "alloca.u" # t.Size # " \t$ptr, $size, $align;",
+ "alloca.u" # t.Size,
[(set t.Ty:$ptr, (dyn_alloca t.Ty:$size, timm:$align))]>,
Requires<[hasPTX<73>, hasSM<52>]>;
}
@@ -3168,9 +3141,9 @@ foreach a_type = ["s", "u"] in {
foreach b_type = ["s", "u"] in {
def DOT4_ # a_type # b_type :
- NVPTXInst<(outs Int32Regs:$dst),
+ BasicNVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
- "dp4a." # a_type # "32." # b_type # "32 \t$dst, $a, $b, $c;",
+ "dp4a." # a_type # "32." # b_type # "32",
[(set i32:$dst,
(!cast<Intrinsic>("int_nvvm_idp4a_" # a_type # "_" # b_type)
i32:$a, i32:$b, i32:$c))]>,
@@ -3180,9 +3153,9 @@ foreach a_type = ["s", "u"] in {
defvar lohi_suffix = !if(is_hi, "hi", "lo");
def DOT2_ # lohi_suffix # _ # a_type # b_type :
- NVPTXInst<(outs Int32Regs:$dst),
+ BasicNVPTXInst<(outs Int32Regs:$dst),
(ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
- "dp2a." # lohi_suffix # "." # a_type # "32." # b_type # "32 \t$dst, $a, $b, $c;",
+ "dp2a." # lohi_suffix # "." # a_type # "32." # b_type # "32",
[(set i32:$dst,
(!cast<Intrinsic>("int_nvvm_idp2a_" # a_type # "_" # b_type)
i32:$a, i32:$b, is_hi, i32:$c))]>,
@@ -3206,26 +3179,26 @@ def stacksave :
[SDNPHasChain, SDNPSideEffect]>;
def STACKRESTORE_32 :
- NVPTXInst<(outs), (ins Int32Regs:$ptr),
- "stackrestore.u32 \t$ptr;",
+ BasicNVPTXInst<(outs), (ins Int32Regs:$ptr),
+ "stackrestore.u32",
[(stackrestore i32:$ptr)]>,
Requires<[hasPTX<73>, hasSM<52>]>;
def STACKSAVE_32 :
- NVPTXInst<(outs Int32Regs:$dst), (ins),
- "stacksave.u32 \t$dst;",
+ BasicNVPTXInst<(outs Int32Regs:$dst), (ins),
+ "stacksave.u32",
[(set i32:$dst, (i32 stacksave))]>,
Requires<[hasPTX<73>, hasSM<52>]>;
def STACKRESTORE_64 :
- NVPTXInst<(outs), (ins Int64Regs:$ptr),
- "stackrestore.u64 \t$ptr;",
+ BasicNVPTXInst<(outs), (ins Int64Regs:$ptr),
+ "stackrestore.u64",
[(stackrestore i64:$ptr)]>,
Requires<[hasPTX<73>, hasSM<52>]>;
def STACKSAVE_64 :
- NVPTXInst<(outs Int64Regs:$dst), (ins),
- "stacksave.u64 \t$dst;",
+ BasicNVPTXInst<(outs Int64Regs:$dst), (ins),
+ "stacksave.u64",
[(set i64:$dst, (i64 stacksave))]>,
Requires<[hasPTX<73>, hasSM<52>]>;
@@ -3300,9 +3273,8 @@ def NVPTX_fmaxnum_nsz : PatFrag<(ops node:$a, node:$b),
}]>;
class NVPTXInst_rrr<RegisterClass RC, string Instruction, list<Predicate> Preds>
- : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
- !strconcat(Instruction, "\t$dst, $a, $b, $c;"), []>,
- Requires<Preds>;
+ : BasicNVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c), Instruction>,
+ Requires<Preds>;
def FMARELU_F16 : NVPTXInst_rrr<Int16Regs, "fma.rn.relu.f16", [useFP16Math, hasPTX<70>, hasSM<80>]>;
def FMARELU_F16_FTZ : NVPTXInst_rrr<Int16Regs, "fma.rn.ftz.relu.f16", [useFP16Math, hasPTX<70>, hasSM<80>]>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 8fb5884fa2a20..5002d1bd2ca09 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -93,10 +93,10 @@ def INT_BARRIER0_OR : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
"}}"),
[(set i32:$dst, (int_nvvm_barrier0_or i32:$pred))]>;
-def INT_BAR_WARP_SYNC_I : NVPTXInst<(outs), (ins i32imm:$i), "bar.warp.sync \t$i;",
+def INT_BAR_WARP_SYNC_I : BasicNVPTXInst<(outs), (ins i32imm:$i), "bar.warp.sync",
[(int_nvvm_bar_warp_sync imm:$i)]>,
Requires<[hasPTX<60>, hasSM<30>]>;
-def INT_BAR_WARP_SYNC_R : NVPTXInst<(outs), (ins Int32Regs:$i), "bar.warp.sync \t$i;",
+def INT_BAR_WARP_SYNC_R : BasicNVPTXInst<(outs), (ins Int32Regs:$i), "bar.warp.sync",
[(int_nvvm_bar_warp_sync i32:$i)]>,
Requires<[hasPTX<60>, hasSM<30>]>;
@@ -234,12 +234,12 @@ defm VOTE_BALLOT : VOTE<Int32Regs, "ballot.b32", int_nvvm_vote_ballot>;
// vote.sync.{all,any,uni,ballot}
multiclass VOTE_SYNC<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
- def i : NVPTXInst<(outs regclass:$dest), (ins i32imm:$mask, Int1Regs:$pred),
- "vote.sync." # mode # " \t$dest, $pred, $mask;",
+ def i : NVPTXInst<(outs regclass:$dest), (ins Int1Regs:$pred, i32imm:$mask),
+ "vote.sync." # mode,
[(set regclass:$dest, (IntOp imm:$mask, i1:$pred))]>,
Requires<[hasPTX<60>, hasSM<30>]>;
- def r : NVPTXInst<(outs regclass:$dest), (ins Int32Regs:$mask, Int1Regs:$pred),
- "vote.sync." # mode #" \t$dest, $pred, $mask;",
+ def r : NVPTXInst<(outs regclass:$dest), (ins Int1Regs:$pred, Int32Regs:$mask),
+ "vote.sync." # mode,
[(set regclass:$dest, (IntOp i32:$mask, i1:$pred))]>,
Requires<[hasPTX<60>, hasSM<30>]>;
}
@@ -250,38 +250,38 @@ defm VOTE_SYNC_UNI : VOTE_SYNC<Int1Regs, "uni.pred", int_nvvm_vote_uni_sync>;
defm VOTE_SYNC_BALLOT : VOTE_SYNC<Int32Regs, "ballot.b32", int_nvvm_vote_ballot_sync>;
// elect.sync
-def INT_ELECT_SYNC_I : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred), (ins i32imm:$mask),
- "elect.sync \t$dest|$pred, $mask;",
+def INT_ELECT_SYNC_I : BasicNVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred), (ins i32imm:$mask),
+ "elect.sync",
[(set i32:$dest, i1:$pred, (int_nvvm_elect_sync imm:$mask))]>,
Requires<[hasPTX<80>, hasSM<90>]>;
-def INT_ELECT_SYNC_R : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred), (ins Int32Regs:$mask),
- "elect.sync \t$dest|$pred, $mask;",
+def INT_ELECT_SYNC_R : BasicNVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred), (ins Int32Regs:$mask),
+ "elect.sync",
[(set i32:$dest, i1:$pred, (int_nvvm_elect_sync i32:$mask))]>,
Requires<[hasPTX<80>, hasSM<90>]>;
multiclass MATCH_ANY_SYNC<NVPTXRegClass regclass, string ptxtype, Intrinsic IntOp,
Operand ImmOp> {
- def ii : NVPTXInst<(outs Int32Regs:$dest), (ins i32imm:$mask, ImmOp:$value),
- "match.any.sync." # ptxtype # " \t$dest, $value, $mask;",
+ def ii : BasicNVPTXInst<(outs Int32Regs:$dest), (ins ImmOp:$value, i32imm:$mask),
+ "match.any.sync." # ptxtype,
[(set i32:$dest, (IntOp imm:$mask, imm:$value))]>,
Requires<[hasPTX<60>, hasSM<70>]>;
- def ir : NVPTXInst<(outs Int32Regs:$dest), (ins Int32Regs:$mask, ImmOp:$value),
- "match.any.sync." # ptxtype # " \t$dest, $value, $mask;",
+ def ir : BasicNVPTXInst<(outs Int32Regs:$dest), (ins ImmOp:$value, Int32Regs:$mask),
+ "match.any.sync." # ptxtype,
[(set i32:$dest, (IntOp i32:$mask, imm:$value))]>,
Requires<[hasPTX<60>, hasSM<70>]>;
- def ri : NVPTXInst<(outs Int32Regs:$dest), (ins i32imm:$mask, regclass:$value),
- "match.any.sync." # ptxtype # " \t$dest, $value, $mask;",
+ def ri : BasicNVPTXInst<(outs Int32Regs:$dest), (ins regclass:$value, i32imm:$mask),
+ "match.any.sync." # ptxtype,
[(set i32:$dest, (IntOp imm:$mask, regclass:$value))]>,
Requires<[hasPTX<60>, hasSM<70>]>;
- def rr : NVPTXInst<(outs Int32Regs:$dest), (ins Int32Regs:$mask, regclass:$value),
- "match.any.sync." # ptxtype # " \t$dest, $value, $mask;",
+ def rr : BasicNVPTXInst<(outs Int32Regs:$dest), (ins regclass:$value, Int32Regs:$mask),
+ "match.any.sync." # ptxtype,
[(set i32:$dest, (IntOp i32:$mask, regclass:$value))]>,
Requires<[hasPTX<60>, hasSM<70>]>;
}
// activemask.b32
-def ACTIVEMASK : NVPTXInst<(outs Int32Regs:$dest), (ins),
- "activemask.b32 \t$dest;",
+def ACTIVEMASK : BasicNVPTXInst<(outs Int32Regs:$dest), (ins),
+ "activemask.b32",
[(set i32:$dest, (int_nvvm_activemask))]>,
Requires<[hasPTX<62>, hasSM<30>]>;
@@ -293,23 +293,23 @@ defm MATCH_ANY_SYNC_64 : MATCH_ANY_SYNC<Int64Regs, "b64", int_nvvm_match_any_syn
multiclass MATCH_ALLP_SYNC<NVPTXRegClass regclass, string ptxtype, Intrinsic IntOp,
Operand ImmOp> {
def ii : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred),
- (ins i32imm:$mask, ImmOp:$value),
- "match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;",
+ (ins ImmOp:$value, i32imm:$mask),
+ "match.all.sync." # ptxtype,
[(set i32:$dest, i1:$pred, (IntOp imm:$mask, imm:$value))]>,
Requires<[hasPTX<60>, hasSM<70>]>;
def ir : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred),
- (ins Int32Regs:$mask, ImmOp:$value),
- "match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;",
+ (ins ImmOp:$value, Int32Regs:$mask),
+ "match.all.sync." # ptxtype,
[(set i32:$dest, i1:$pred, (IntOp i32:$mask, imm:$value))]>,
Requires<[hasPTX<60>, hasSM<70>]>;
def ri : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred),
- (ins i32imm:$mask, regclass:$value),
- "match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;",
+ (ins regclass:$value, i32imm:$mask),
+ "match.all.sync." # ptxtype,
[(set i32:$dest, i1:$pred, (IntOp imm:$mask, regclass:$value))]>,
Requires<[hasPTX<60>, hasSM<70>]>;
def rr : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred),
- (ins Int32Regs:$mask, regclass:$value),
- "match.all.sync." # ptxtype # " \t$dest|$pred, $value, $mask;",
+ (ins regclass:$value, Int32Regs:$mask),
+ "match.all.sync." # ptxtype,
[(set i32:$dest, i1:$pred, (IntOp i32:$mask, regclass:$value))]>,
Requires<[hasPTX<60>, hasSM<70>]>;
}
@@ -319,8 +319,8 @@ defm MATCH_ALLP_SYNC_64 : MATCH_ALLP_SYNC<Int64Regs, "b64", int_nvvm_match_all_s
i64imm>;
multiclass REDUX_SYNC<string BinOp, string PTXType, Intrinsic Intrin> {
- def : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$mask),
- "redux.sync." # BinOp # "." # PTXType # " $dst, $src, $mask;",
+ def : BasicNVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$mask),
+ "redux.sync." # BinOp # "." # PTXType,
[(set i32:$dst, (Intrin i32:$src, Int32Regs:$mask))]>,
Requires<[hasPTX<70>, hasSM<80>]>;
}
@@ -337,9 +337,9 @@ defm REDUX_SYNC_OR : REDUX_SYNC<"or", "b32", int_nvvm_redux_sync_or>;
multiclass REDUX_SYNC_F<string BinOp, string abs, string NaN> {
defvar intr_name = "int_nvvm_redux_sync_f" # BinOp # !subst(".", "_", abs) # !subst(".", "_", NaN);
- def : NVPTXInst<(outs Float32Regs:$dst),
+ def : BasicNVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$src, Int32Regs:$mask),
- "redux.sync." # BinOp # abs # NaN # ".f32 $dst, $src, $mask;",
+ "redux.sync." # BinOp # abs # NaN # ".f32",
[(set f32:$dst, (!cast<Intrinsic>(intr_name) f32:$src, Int32Regs:$mask))]>,
Requires<[hasPTX<86>, hasSM100a]>;
}
@@ -359,23 +359,23 @@ defm REDUX_SYNC_FMAX_ABS_NAN: REDUX_SYNC_F<"max", ".abs", ".NaN">;
// Explicit Memory Fence Functions
//-----------------------------------
class MEMBAR<string StrOp, Intrinsic IntOP> :
- NVPTXInst<(outs), (ins),
+ BasicNVPTXInst<(outs), (ins),
StrOp, [(IntOP)]>;
-def INT_MEMBAR_CTA : MEMBAR<"membar.cta;", int_nvvm_membar_cta>;
-def INT_MEMBAR_GL : MEMBAR<"membar.gl;", int_nvvm_membar_gl>;
-def INT_MEMBAR_SYS : MEMBAR<"membar.sys;", int_nvvm_membar_sys>;
+def INT_MEMBAR_CTA : MEMBAR<"membar.cta", int_nvvm_membar_cta>;
+def INT_MEMBAR_GL : MEMBAR<"membar.gl", int_nvvm_membar_gl>;
+def INT_MEMBAR_SYS : MEMBAR<"membar.sys", int_nvvm_membar_sys>;
def INT_FENCE_SC_CLUSTER:
- MEMBAR<"fence.sc.cluster;", int_nvvm_fence_sc_cluster>,
+ MEMBAR<"fence.sc.cluster", int_nvvm_fence_sc_cluster>,
Requires<[hasPTX<78>, hasSM<90>]>;
// Proxy fence (uni-directional)
// fence.proxy.tensormap.release variants
class FENCE_PROXY_TENSORMAP_GENERIC_RELEASE<string Scope, Intrinsic Intr> :
- NVPTXInst<(outs), (ins),
- "fence.proxy.tensormap::generic.release." # Scope # ";", [(Intr)]>,
+ BasicNVPTXInst<(outs), (ins),
+ "fence.proxy.tensormap::generic.release." # Scope, [(Intr)]>,
Requires<[hasPTX<83>, hasSM<90>]>;
def INT_FENCE_PROXY_TENSORMAP_GENERIC_RELEASE_CTA:
@@ -481,32 +481,32 @@ defm CP_ASYNC_CG_SHARED_GLOBAL_16 :
int_nvvm_cp_async_cg_shared_global_16_s>;
def CP_ASYNC_COMMIT_GROUP :
- NVPTXInst<(outs), (ins), "cp.async.commit_group;", [(int_nvvm_cp_async_commit_group)]>,
+ BasicNVPTXInst<(outs), (ins), "cp.async.commit_group", [(int_nvvm_cp_async_commit_group)]>,
Requires<[hasPTX<70>, hasSM<80>]>;
def CP_ASYNC_WAIT_GROUP :
- NVPTXInst<(outs), (ins i32imm:$n), "cp.async.wait_group $n;",
+ BasicNVPTXInst<(outs), (ins i32imm:$n), "cp.async.wait_group",
[(int_nvvm_cp_async_wait_group timm:$n)]>,
Requires<[hasPTX<70>, hasSM<80>]>;
def CP_ASYNC_WAIT_ALL :
- NVPTXInst<(outs), (ins), "cp.async.wait_all;",
+ BasicNVPTXInst<(outs), (ins), "cp.async.wait_all",
[(int_nvvm_cp_async_wait_all)]>,
Requires<[hasPTX<70>, hasSM<80>]>;
// cp.async.bulk variants of the commit/wait group
def CP_ASYNC_BULK_COMMIT_GROUP :
- NVPTXInst<(outs), (ins), "cp.async.bulk.commit_group;",
+ BasicNVPTXInst<(outs), (ins), "cp.async.bulk.commit_group",
[(int_nvvm_cp_async_bulk_commit_group)]>,
Requires<[hasPTX<80>, hasSM<90>]>;
def CP_ASYNC_BULK_WAIT_GROUP :
- NVPTXInst<(outs), (ins i32imm:$n), "cp.async.bulk.wait_group $n;",
+ BasicNVPTXInst<(outs), (ins i32imm:$n), "cp.async.bulk.wait_group",
[(int_nvvm_cp_async_bulk_wait_group timm:$n)]>,
Requires<[hasPTX<80>, hasSM<90>]>;
def CP_ASYNC_BULK_WAIT_GROUP_READ :
- NVPTXInst<(outs), (ins i32imm:$n), "cp.async.bulk.wait_group.read $n;",
+ BasicNVPTXInst<(outs), (ins i32imm:$n), "cp.async.bulk.wait_group.read",
[(int_nvvm_cp_async_bulk_wait_group_read timm:$n)]>,
Requires<[hasPTX<80>, hasSM<90>]>;
@@ -997,7 +997,7 @@ def : Pat<(int_nvvm_fmin_d
// INT_PTX_RECIP.
class F_MATH_1<string OpcStr, RegTyInfo dst, RegTyInfo src, Intrinsic IntOP,
list<Predicate> Preds = []>
- : NVPTXInst<(outs dst.RC:$dst),
+ : BasicNVPTXInst<(outs dst.RC:$dst),
(ins src.RC:$src0),
OpcStr,
[(set dst.Ty:$dst, (IntOP src.Ty:$src0))]>,
@@ -1008,7 +1008,7 @@ class F_MATH_1<string OpcStr, RegTyInfo dst, RegTyInfo src, Intrinsic IntOP,
class F_MATH_2<string OpcStr, NVPTXRegClass t_regclass,
NVPTXRegClass s0_regclass, NVPTXRegClass s1_regclass, Intrinsic IntOP,
list<Predicate> Preds = []>
- : NVPTXInst<(outs t_regclass:$dst),
+ : BasicNVPTXInst<(outs t_regclass:$dst),
(ins s0_regclass:$src0, s1_regclass:$src1),
OpcStr,
[(set t_regclass:$dst, (IntOP s0_regclass:$src0, s1_regclass:$src1))]>,
@@ -1017,7 +1017,7 @@ class F_MATH_2<string OpcStr, NVPTXRegClass t_regclass,
class F_MATH_3<string OpcStr, NVPTXRegClass t_regclass,
NVPTXRegClass s0_regclass, NVPTXRegClass s1_regclass,
NVPTXRegClass s2_regclass, Intrinsic IntOP, list<Predicate> Preds = []>
- : NVPTXInst<(outs t_regclass:$dst),
+ : BasicNVPTXInst<(outs t_regclass:$dst),
(ins s0_regclass:$src0, s1_regclass:$src1, s2_regclass:$src2),
OpcStr,
[(set t_regclass:$dst,
@@ -1046,73 +1046,73 @@ def : PRMT2Pat<int_nvvm_prmt_ecr, PrmtECR>;
def : PRMT2Pat<int_nvvm_prmt_rc16, PrmtRC16>;
-def INT_NVVM_NANOSLEEP_I : NVPTXInst<(outs), (ins i32imm:$i), "nanosleep.u32 \t$i;",
+def INT_NVVM_NANOSLEEP_I : BasicNVPTXInst<(outs), (ins i32imm:$i), "nanosleep.u32",
[(int_nvvm_nanosleep imm:$i)]>,
Requires<[hasPTX<63>, hasSM<70>]>;
-def INT_NVVM_NANOSLEEP_R : NVPTXInst<(outs), (ins Int32Regs:$i), "nanosleep.u32 \t$i;",
+def INT_NVVM_NANOSLEEP_R : BasicNVPTXInst<(outs), (ins Int32Regs:$i), "nanosleep.u32",
[(int_nvvm_nanosleep i32:$i)]>,
Requires<[hasPTX<63>, hasSM<70>]>;
//
// Min Max
//
-def INT_NVVM_FMIN_F : F_MATH_2<"min.f32 \t$dst, $src0, $src1;", Float32Regs,
+def INT_NVVM_FMIN_F : F_MATH_2<"min.f32", Float32Regs,
Float32Regs, Float32Regs, int_nvvm_fmin_f>;
-def INT_NVVM_FMIN_FTZ_F : F_MATH_2<"min.ftz.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_FMIN_FTZ_F : F_MATH_2<"min.ftz.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_f>;
-def INT_NVVM_FMIN_NAN_F : F_MATH_2<"min.NaN.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_FMIN_NAN_F : F_MATH_2<"min.NaN.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_nan_f,
[hasPTX<70>, hasSM<80>]>;
-def INT_NVVM_FMIN_FTZ_NAN_F : F_MATH_2<"min.ftz.NaN.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_FMIN_FTZ_NAN_F : F_MATH_2<"min.ftz.NaN.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_nan_f,
[hasPTX<70>, hasSM<80>]>;
def INT_NVVM_FMIN_XORSIGN_ABS_F :
- F_MATH_2<"min.xorsign.abs.f32 \t$dst, $src0, $src1;",
+ F_MATH_2<"min.xorsign.abs.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_xorsign_abs_f,
[hasPTX<72>, hasSM<86>]>;
def INT_NVVM_FMIN_FTZ_XORSIGN_ABS_F :
- F_MATH_2<"min.ftz.xorsign.abs.f32 \t$dst, $src0, $src1;",
+ F_MATH_2<"min.ftz.xorsign.abs.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_xorsign_abs_f,
[hasPTX<72>, hasSM<86>]>;
def INT_NVVM_FMIN_NAN_XORSIGN_ABS_F :
- F_MATH_2<"min.NaN.xorsign.abs.f32 \t$dst, $src0, $src1;",
+ F_MATH_2<"min.NaN.xorsign.abs.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_nan_xorsign_abs_f,
[hasPTX<72>, hasSM<86>]>;
def INT_NVVM_FMIN_FTZ_NAN_XORSIGN_ABS_F :
- F_MATH_2<"min.ftz.NaN.xorsign.abs.f32 \t$dst, $src0, $src1;",
+ F_MATH_2<"min.ftz.NaN.xorsign.abs.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_nan_xorsign_abs_f,
[hasPTX<72>, hasSM<86>]>;
-def INT_NVVM_FMAX_F : F_MATH_2<"max.f32 \t$dst, $src0, $src1;", Float32Regs,
+def INT_NVVM_FMAX_F : F_MATH_2<"max.f32", Float32Regs,
Float32Regs, Float32Regs, int_nvvm_fmax_f>;
-def INT_NVVM_FMAX_FTZ_F : F_MATH_2<"max.ftz.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_FMAX_FTZ_F : F_MATH_2<"max.ftz.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_f>;
-def INT_NVVM_FMAX_NAN_F : F_MATH_2<"max.NaN.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_FMAX_NAN_F : F_MATH_2<"max.NaN.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_nan_f,
[hasPTX<70>, hasSM<80>]>;
-def INT_NVVM_FMAX_FTZ_NAN_F : F_MATH_2<"max.ftz.NaN.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_FMAX_FTZ_NAN_F : F_MATH_2<"max.ftz.NaN.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_nan_f,
[hasPTX<70>, hasSM<80>]>;
def INT_NVVM_FMAX_XORSIGN_ABS_F :
- F_MATH_2<"max.xorsign.abs.f32 \t$dst, $src0, $src1;",
+ F_MATH_2<"max.xorsign.abs.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_xorsign_abs_f,
[hasPTX<72>, hasSM<86>]>;
def INT_NVVM_FMAX_FTZ_XORSIGN_ABS_F :
- F_MATH_2<"max.ftz.xorsign.abs.f32 \t$dst, $src0, $src1;",
+ F_MATH_2<"max.ftz.xorsign.abs.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_xorsign_abs_f,
[hasPTX<72>, hasSM<86>]>;
def INT_NVVM_FMAX_NAN_XORSIGN_ABS_F :
- F_MATH_2<"max.NaN.xorsign.abs.f32 \t$dst, $src0, $src1;",
+ F_MATH_2<"max.NaN.xorsign.abs.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_nan_xorsign_abs_f,
[hasPTX<72>, hasSM<86>]>;
def INT_NVVM_FMAX_FTZ_NAN_XORSIGN_ABS_F :
- F_MATH_2<"max.ftz.NaN.xorsign.abs.f32 \t$dst, $src0, $src1;",
+ F_MATH_2<"max.ftz.NaN.xorsign.abs.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_nan_xorsign_abs_f,
[hasPTX<72>, hasSM<86>]>;
-def INT_NVVM_FMIN_D : F_MATH_2<"min.f64 \t$dst, $src0, $src1;", Float64Regs,
+def INT_NVVM_FMIN_D : F_MATH_2<"min.f64", Float64Regs,
Float64Regs, Float64Regs, int_nvvm_fmin_d>;
-def INT_NVVM_FMAX_D : F_MATH_2<"max.f64 \t$dst, $src0, $src1;", Float64Regs,
+def INT_NVVM_FMAX_D : F_MATH_2<"max.f64", Float64Regs,
Float64Regs, Float64Regs, int_nvvm_fmax_d>;
//
@@ -1192,7 +1192,7 @@ multiclass MIN_MAX<string IntName> {
int_nvvm_fmax_nan_xorsign_abs_bf16x2),
Int32Regs, [hasPTX<72>, hasSM<86>]>] in {
def P.Variant : F_MATH_2<!strconcat(
- IntName, !subst("_", ".", P.Variant), " \t$dst, $src0, $src1;"),
+ IntName, !subst("_", ".", P.Variant)),
P.RegClass, P.RegClass, P.RegClass, P.Intr, P.Predicates>;
}
}
@@ -1204,48 +1204,48 @@ defm INT_NVVM_FMAN : MIN_MAX<"max">;
// Multiplication
//
-def INT_NVVM_MULHI_S : F_MATH_2<"mul.hi.s16 \t$dst, $src0, $src1;", Int16Regs,
+def INT_NVVM_MULHI_S : F_MATH_2<"mul.hi.s16", Int16Regs,
Int16Regs, Int16Regs, int_nvvm_mulhi_s>;
-def INT_NVVM_MULHI_US : F_MATH_2<"mul.hi.u16 \t$dst, $src0, $src1;", Int16Regs,
+def INT_NVVM_MULHI_US : F_MATH_2<"mul.hi.u16", Int16Regs,
Int16Regs, Int16Regs, int_nvvm_mulhi_us>;
-def INT_NVVM_MULHI_I : F_MATH_2<"mul.hi.s32 \t$dst, $src0, $src1;", Int32Regs,
+def INT_NVVM_MULHI_I : F_MATH_2<"mul.hi.s32", Int32Regs,
Int32Regs, Int32Regs, int_nvvm_mulhi_i>;
-def INT_NVVM_MULHI_UI : F_MATH_2<"mul.hi.u32 \t$dst, $src0, $src1;", Int32Regs,
+def INT_NVVM_MULHI_UI : F_MATH_2<"mul.hi.u32", Int32Regs,
Int32Regs, Int32Regs, int_nvvm_mulhi_ui>;
-def INT_NVVM_MULHI_LL : F_MATH_2<"mul.hi.s64 \t$dst, $src0, $src1;", Int64Regs,
+def INT_NVVM_MULHI_LL : F_MATH_2<"mul.hi.s64", Int64Regs,
Int64Regs, Int64Regs, int_nvvm_mulhi_ll>;
-def INT_NVVM_MULHI_ULL : F_MATH_2<"mul.hi.u64 \t$dst, $src0, $src1;", Int64Regs,
+def INT_NVVM_MULHI_ULL : F_MATH_2<"mul.hi.u64", Int64Regs,
Int64Regs, Int64Regs, int_nvvm_mulhi_ull>;
-def INT_NVVM_MUL_RN_FTZ_F : F_MATH_2<"mul.rn.ftz.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_MUL_RN_FTZ_F : F_MATH_2<"mul.rn.ftz.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rn_ftz_f>;
-def INT_NVVM_MUL_RN_F : F_MATH_2<"mul.rn.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_MUL_RN_F : F_MATH_2<"mul.rn.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rn_f>;
-def INT_NVVM_MUL_RZ_FTZ_F : F_MATH_2<"mul.rz.ftz.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_MUL_RZ_FTZ_F : F_MATH_2<"mul.rz.ftz.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rz_ftz_f>;
-def INT_NVVM_MUL_RZ_F : F_MATH_2<"mul.rz.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_MUL_RZ_F : F_MATH_2<"mul.rz.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rz_f>;
-def INT_NVVM_MUL_RM_FTZ_F : F_MATH_2<"mul.rm.ftz.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_MUL_RM_FTZ_F : F_MATH_2<"mul.rm.ftz.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rm_ftz_f>;
-def INT_NVVM_MUL_RM_F : F_MATH_2<"mul.rm.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_MUL_RM_F : F_MATH_2<"mul.rm.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rm_f>;
-def INT_NVVM_MUL_RP_FTZ_F : F_MATH_2<"mul.rp.ftz.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_MUL_RP_FTZ_F : F_MATH_2<"mul.rp.ftz.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rp_ftz_f>;
-def INT_NVVM_MUL_RP_F : F_MATH_2<"mul.rp.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_MUL_RP_F : F_MATH_2<"mul.rp.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rp_f>;
-def INT_NVVM_MUL_RN_D : F_MATH_2<"mul.rn.f64 \t$dst, $src0, $src1;",
+def INT_NVVM_MUL_RN_D : F_MATH_2<"mul.rn.f64",
Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rn_d>;
-def INT_NVVM_MUL_RZ_D : F_MATH_2<"mul.rz.f64 \t$dst, $src0, $src1;",
+def INT_NVVM_MUL_RZ_D : F_MATH_2<"mul.rz.f64",
Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rz_d>;
-def INT_NVVM_MUL_RM_D : F_MATH_2<"mul.rm.f64 \t$dst, $src0, $src1;",
+def INT_NVVM_MUL_RM_D : F_MATH_2<"mul.rm.f64",
Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rm_d>;
-def INT_NVVM_MUL_RP_D : F_MATH_2<"mul.rp.f64 \t$dst, $src0, $src1;",
+def INT_NVVM_MUL_RP_D : F_MATH_2<"mul.rp.f64",
Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rp_d>;
-def INT_NVVM_MUL24_I : F_MATH_2<"mul24.lo.s32 \t$dst, $src0, $src1;",
+def INT_NVVM_MUL24_I : F_MATH_2<"mul24.lo.s32",
Int32Regs, Int32Regs, Int32Regs, int_nvvm_mul24_i>;
-def INT_NVVM_MUL24_UI : F_MATH_2<"mul24.lo.u32 \t$dst, $src0, $src1;",
+def INT_NVVM_MUL24_UI : F_MATH_2<"mul24.lo.u32",
Int32Regs, Int32Regs, Int32Regs, int_nvvm_mul24_ui>;
//
@@ -1253,35 +1253,35 @@ def INT_NVVM_MUL24_UI : F_MATH_2<"mul24.lo.u32 \t$dst, $src0, $src1;",
//
def INT_NVVM_DIV_APPROX_FTZ_F
- : F_MATH_2<"div.approx.ftz.f32 \t$dst, $src0, $src1;", Float32Regs,
+ : F_MATH_2<"div.approx.ftz.f32", Float32Regs,
Float32Regs, Float32Regs, int_nvvm_div_approx_ftz_f>;
-def INT_NVVM_DIV_APPROX_F : F_MATH_2<"div.approx.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_DIV_APPROX_F : F_MATH_2<"div.approx.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_approx_f>;
-def INT_NVVM_DIV_RN_FTZ_F : F_MATH_2<"div.rn.ftz.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_DIV_RN_FTZ_F : F_MATH_2<"div.rn.ftz.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rn_ftz_f>;
-def INT_NVVM_DIV_RN_F : F_MATH_2<"div.rn.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_DIV_RN_F : F_MATH_2<"div.rn.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rn_f>;
-def INT_NVVM_DIV_RZ_FTZ_F : F_MATH_2<"div.rz.ftz.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_DIV_RZ_FTZ_F : F_MATH_2<"div.rz.ftz.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rz_ftz_f>;
-def INT_NVVM_DIV_RZ_F : F_MATH_2<"div.rz.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_DIV_RZ_F : F_MATH_2<"div.rz.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rz_f>;
-def INT_NVVM_DIV_RM_FTZ_F : F_MATH_2<"div.rm.ftz.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_DIV_RM_FTZ_F : F_MATH_2<"div.rm.ftz.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rm_ftz_f>;
-def INT_NVVM_DIV_RM_F : F_MATH_2<"div.rm.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_DIV_RM_F : F_MATH_2<"div.rm.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rm_f>;
-def INT_NVVM_DIV_RP_FTZ_F : F_MATH_2<"div.rp.ftz.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_DIV_RP_FTZ_F : F_MATH_2<"div.rp.ftz.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rp_ftz_f>;
-def INT_NVVM_DIV_RP_F : F_MATH_2<"div.rp.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_DIV_RP_F : F_MATH_2<"div.rp.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rp_f>;
-def INT_NVVM_DIV_RN_D : F_MATH_2<"div.rn.f64 \t$dst, $src0, $src1;",
+def INT_NVVM_DIV_RN_D : F_MATH_2<"div.rn.f64",
Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rn_d>;
-def INT_NVVM_DIV_RZ_D : F_MATH_2<"div.rz.f64 \t$dst, $src0, $src1;",
+def INT_NVVM_DIV_RZ_D : F_MATH_2<"div.rz.f64",
Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rz_d>;
-def INT_NVVM_DIV_RM_D : F_MATH_2<"div.rm.f64 \t$dst, $src0, $src1;",
+def INT_NVVM_DIV_RM_D : F_MATH_2<"div.rm.f64",
Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rm_d>;
-def INT_NVVM_DIV_RP_D : F_MATH_2<"div.rp.f64 \t$dst, $src0, $src1;",
+def INT_NVVM_DIV_RP_D : F_MATH_2<"div.rp.f64",
Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rp_d>;
def : Pat<(int_nvvm_div_full f32:$a, f32:$b),
@@ -1300,17 +1300,17 @@ def : Pat<(int_nvvm_div_full_ftz f32:$a, fpimm:$b),
// Sad
//
-def INT_NVVM_SAD_S : F_MATH_3<"sad.s16 \t$dst, $src0, $src1, $src2;",
+def INT_NVVM_SAD_S : F_MATH_3<"sad.s16",
Int16Regs, Int16Regs, Int16Regs, Int16Regs, int_nvvm_sad_s>;
-def INT_NVVM_SAD_US : F_MATH_3<"sad.u16 \t$dst, $src0, $src1, $src2;",
+def INT_NVVM_SAD_US : F_MATH_3<"sad.u16",
Int16Regs, Int16Regs, Int16Regs, Int16Regs, int_nvvm_sad_us>;
-def INT_NVVM_SAD_I : F_MATH_3<"sad.s32 \t$dst, $src0, $src1, $src2;",
+def INT_NVVM_SAD_I : F_MATH_3<"sad.s32",
Int32Regs, Int32Regs, Int32Regs, Int32Regs, int_nvvm_sad_i>;
-def INT_NVVM_SAD_UI : F_MATH_3<"sad.u32 \t$dst, $src0, $src1, $src2;",
+def INT_NVVM_SAD_UI : F_MATH_3<"sad.u32",
Int32Regs, Int32Regs, Int32Regs, Int32Regs, int_nvvm_sad_ui>;
-def INT_NVVM_SAD_LL : F_MATH_3<"sad.s64 \t$dst, $src0, $src1, $src2;",
+def INT_NVVM_SAD_LL : F_MATH_3<"sad.s64",
Int64Regs, Int64Regs, Int64Regs, Int64Regs, int_nvvm_sad_ll>;
-def INT_NVVM_SAD_ULL : F_MATH_3<"sad.u64 \t$dst, $src0, $src1, $src2;",
+def INT_NVVM_SAD_ULL : F_MATH_3<"sad.u64",
Int64Regs, Int64Regs, Int64Regs, Int64Regs, int_nvvm_sad_ull>;
//
@@ -1336,9 +1336,9 @@ def : Pat<(int_nvvm_ceil_d f64:$a),
//
multiclass F_ABS<string suffix, RegTyInfo RT, bit support_ftz, list<Predicate> preds = []> {
- def "" : F_MATH_1<"abs." # suffix # " \t$dst, $src0;", RT, RT, int_nvvm_fabs, preds>;
+ def "" : F_MATH_1<"abs." # suffix, RT, RT, int_nvvm_fabs, preds>;
if support_ftz then
- def _FTZ : F_MATH_1<"abs.ftz." # suffix # " \t$dst, $src0;", RT, RT, int_nvvm_fabs_ftz, preds>;
+ def _FTZ : F_MATH_1<"abs.ftz." # suffix, RT, RT, int_nvvm_fabs_ftz, preds>;
}
defm ABS_F16 : F_ABS<"f16", F16RT, support_ftz = true, preds = [hasPTX<65>, hasSM<53>]>;
@@ -1357,22 +1357,22 @@ defm ABS_F64 : F_ABS<"f64", F64RT, support_ftz = false>;
def fcopysign_nvptx : SDNode<"NVPTXISD::FCOPYSIGN", SDTFPBinOp>;
def COPYSIGN_F :
- NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src0, Float32Regs:$src1),
- "copysign.f32 \t$dst, $src0, $src1;",
+ BasicNVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src0, Float32Regs:$src1),
+ "copysign.f32",
[(set f32:$dst, (fcopysign_nvptx f32:$src1, f32:$src0))]>;
def COPYSIGN_D :
- NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src0, Float64Regs:$src1),
- "copysign.f64 \t$dst, $src0, $src1;",
+ BasicNVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src0, Float64Regs:$src1),
+ "copysign.f64",
[(set f64:$dst, (fcopysign_nvptx f64:$src1, f64:$src0))]>;
//
// Neg bf16, bf16x2
//
-def INT_NVVM_NEG_BF16 : F_MATH_1<"neg.bf16 \t$dst, $src0;", BF16RT,
+def INT_NVVM_NEG_BF16 : F_MATH_1<"neg.bf16", BF16RT,
BF16RT, int_nvvm_neg_bf16, [hasPTX<70>, hasSM<80>]>;
-def INT_NVVM_NEG_BF16X2 : F_MATH_1<"neg.bf16x2 \t$dst, $src0;", BF16X2RT,
+def INT_NVVM_NEG_BF16X2 : F_MATH_1<"neg.bf16x2", BF16X2RT,
BF16X2RT, int_nvvm_neg_bf16x2, [hasPTX<70>, hasSM<80>]>;
//
@@ -1412,16 +1412,16 @@ def : Pat<(int_nvvm_saturate_d f64:$a),
// Exp2 Log2
//
-def INT_NVVM_EX2_APPROX_FTZ_F : F_MATH_1<"ex2.approx.ftz.f32 \t$dst, $src0;",
+def INT_NVVM_EX2_APPROX_FTZ_F : F_MATH_1<"ex2.approx.ftz.f32",
F32RT, F32RT, int_nvvm_ex2_approx_ftz_f>;
-def INT_NVVM_EX2_APPROX_F : F_MATH_1<"ex2.approx.f32 \t$dst, $src0;",
+def INT_NVVM_EX2_APPROX_F : F_MATH_1<"ex2.approx.f32",
F32RT, F32RT, int_nvvm_ex2_approx_f>;
-def INT_NVVM_EX2_APPROX_D : F_MATH_1<"ex2.approx.f64 \t$dst, $src0;",
+def INT_NVVM_EX2_APPROX_D : F_MATH_1<"ex2.approx.f64",
F64RT, F64RT, int_nvvm_ex2_approx_d>;
-def INT_NVVM_EX2_APPROX_F16 : F_MATH_1<"ex2.approx.f16 \t$dst, $src0;",
+def INT_NVVM_EX2_APPROX_F16 : F_MATH_1<"ex2.approx.f16",
F16RT, F16RT, int_nvvm_ex2_approx_f16, [hasPTX<70>, hasSM<75>]>;
-def INT_NVVM_EX2_APPROX_F16X2 : F_MATH_1<"ex2.approx.f16x2 \t$dst, $src0;",
+def INT_NVVM_EX2_APPROX_F16X2 : F_MATH_1<"ex2.approx.f16x2",
F16X2RT, F16X2RT, int_nvvm_ex2_approx_f16x2, [hasPTX<70>, hasSM<75>]>;
def : Pat<(fexp2 f32:$a),
@@ -1433,11 +1433,11 @@ def : Pat<(fexp2 f16:$a),
def : Pat<(fexp2 v2f16:$a),
(INT_NVVM_EX2_APPROX_F16X2 $a)>, Requires<[useFP16Math]>;
-def INT_NVVM_LG2_APPROX_FTZ_F : F_MATH_1<"lg2.approx.ftz.f32 \t$dst, $src0;",
+def INT_NVVM_LG2_APPROX_FTZ_F : F_MATH_1<"lg2.approx.ftz.f32",
F32RT, F32RT, int_nvvm_lg2_approx_ftz_f>;
-def INT_NVVM_LG2_APPROX_F : F_MATH_1<"lg2.approx.f32 \t$dst, $src0;",
+def INT_NVVM_LG2_APPROX_F : F_MATH_1<"lg2.approx.f32",
F32RT, F32RT, int_nvvm_lg2_approx_f>;
-def INT_NVVM_LG2_APPROX_D : F_MATH_1<"lg2.approx.f64 \t$dst, $src0;",
+def INT_NVVM_LG2_APPROX_D : F_MATH_1<"lg2.approx.f64",
F64RT, F64RT, int_nvvm_lg2_approx_d>;
def : Pat<(flog2 f32:$a), (INT_NVVM_LG2_APPROX_FTZ_F $a)>,
@@ -1449,14 +1449,14 @@ def : Pat<(flog2 f32:$a), (INT_NVVM_LG2_APPROX_F $a)>,
// Sin Cos
//
-def INT_NVVM_SIN_APPROX_FTZ_F : F_MATH_1<"sin.approx.ftz.f32 \t$dst, $src0;",
+def INT_NVVM_SIN_APPROX_FTZ_F : F_MATH_1<"sin.approx.ftz.f32",
F32RT, F32RT, int_nvvm_sin_approx_ftz_f>;
-def INT_NVVM_SIN_APPROX_F : F_MATH_1<"sin.approx.f32 \t$dst, $src0;",
+def INT_NVVM_SIN_APPROX_F : F_MATH_1<"sin.approx.f32",
F32RT, F32RT, int_nvvm_sin_approx_f>;
-def INT_NVVM_COS_APPROX_FTZ_F : F_MATH_1<"cos.approx.ftz.f32 \t$dst, $src0;",
+def INT_NVVM_COS_APPROX_FTZ_F : F_MATH_1<"cos.approx.ftz.f32",
F32RT, F32RT, int_nvvm_cos_approx_ftz_f>;
-def INT_NVVM_COS_APPROX_F : F_MATH_1<"cos.approx.f32 \t$dst, $src0;",
+def INT_NVVM_COS_APPROX_F : F_MATH_1<"cos.approx.f32",
F32RT, F32RT, int_nvvm_cos_approx_f>;
//
@@ -1529,8 +1529,7 @@ multiclass FMA_INST {
[hasPTX<70>, hasSM<80>]>
] in {
def P.Variant :
- F_MATH_3<!strconcat("fma",
- !subst("_", ".", P.Variant), " \t$dst, $src0, $src1, $src2;"),
+ F_MATH_3<!strconcat("fma", !subst("_", ".", P.Variant)),
P.RegClass, P.RegClass, P.RegClass, P.RegClass, P.Intr, P.Predicates>;
}
}
@@ -1541,69 +1540,69 @@ defm INT_NVVM_FMA : FMA_INST;
// Rcp
//
-def INT_NVVM_RCP_RN_FTZ_F : F_MATH_1<"rcp.rn.ftz.f32 \t$dst, $src0;",
+def INT_NVVM_RCP_RN_FTZ_F : F_MATH_1<"rcp.rn.ftz.f32",
F32RT, F32RT, int_nvvm_rcp_rn_ftz_f>;
-def INT_NVVM_RCP_RN_F : F_MATH_1<"rcp.rn.f32 \t$dst, $src0;",
+def INT_NVVM_RCP_RN_F : F_MATH_1<"rcp.rn.f32",
F32RT, F32RT, int_nvvm_rcp_rn_f>;
-def INT_NVVM_RCP_RZ_FTZ_F : F_MATH_1<"rcp.rz.ftz.f32 \t$dst, $src0;",
+def INT_NVVM_RCP_RZ_FTZ_F : F_MATH_1<"rcp.rz.ftz.f32",
F32RT, F32RT, int_nvvm_rcp_rz_ftz_f>;
-def INT_NVVM_RCP_RZ_F : F_MATH_1<"rcp.rz.f32 \t$dst, $src0;",
+def INT_NVVM_RCP_RZ_F : F_MATH_1<"rcp.rz.f32",
F32RT, F32RT, int_nvvm_rcp_rz_f>;
-def INT_NVVM_RCP_RM_FTZ_F : F_MATH_1<"rcp.rm.ftz.f32 \t$dst, $src0;",
+def INT_NVVM_RCP_RM_FTZ_F : F_MATH_1<"rcp.rm.ftz.f32",
F32RT, F32RT, int_nvvm_rcp_rm_ftz_f>;
-def INT_NVVM_RCP_RM_F : F_MATH_1<"rcp.rm.f32 \t$dst, $src0;",
+def INT_NVVM_RCP_RM_F : F_MATH_1<"rcp.rm.f32",
F32RT, F32RT, int_nvvm_rcp_rm_f>;
-def INT_NVVM_RCP_RP_FTZ_F : F_MATH_1<"rcp.rp.ftz.f32 \t$dst, $src0;",
+def INT_NVVM_RCP_RP_FTZ_F : F_MATH_1<"rcp.rp.ftz.f32",
F32RT, F32RT, int_nvvm_rcp_rp_ftz_f>;
-def INT_NVVM_RCP_RP_F : F_MATH_1<"rcp.rp.f32 \t$dst, $src0;",
+def INT_NVVM_RCP_RP_F : F_MATH_1<"rcp.rp.f32",
F32RT, F32RT, int_nvvm_rcp_rp_f>;
-def INT_NVVM_RCP_RN_D : F_MATH_1<"rcp.rn.f64 \t$dst, $src0;", F64RT,
+def INT_NVVM_RCP_RN_D : F_MATH_1<"rcp.rn.f64", F64RT,
F64RT, int_nvvm_rcp_rn_d>;
-def INT_NVVM_RCP_RZ_D : F_MATH_1<"rcp.rz.f64 \t$dst, $src0;", F64RT,
+def INT_NVVM_RCP_RZ_D : F_MATH_1<"rcp.rz.f64", F64RT,
F64RT, int_nvvm_rcp_rz_d>;
-def INT_NVVM_RCP_RM_D : F_MATH_1<"rcp.rm.f64 \t$dst, $src0;", F64RT,
+def INT_NVVM_RCP_RM_D : F_MATH_1<"rcp.rm.f64", F64RT,
F64RT, int_nvvm_rcp_rm_d>;
-def INT_NVVM_RCP_RP_D : F_MATH_1<"rcp.rp.f64 \t$dst, $src0;", F64RT,
+def INT_NVVM_RCP_RP_D : F_MATH_1<"rcp.rp.f64", F64RT,
F64RT, int_nvvm_rcp_rp_d>;
-def INT_NVVM_RCP_APPROX_FTZ_F : F_MATH_1<"rcp.approx.ftz.f32 \t$dst, $src0;",
+def INT_NVVM_RCP_APPROX_FTZ_F : F_MATH_1<"rcp.approx.ftz.f32",
F32RT, F32RT, int_nvvm_rcp_approx_ftz_f>;
-def INT_NVVM_RCP_APPROX_FTZ_D : F_MATH_1<"rcp.approx.ftz.f64 \t$dst, $src0;",
+def INT_NVVM_RCP_APPROX_FTZ_D : F_MATH_1<"rcp.approx.ftz.f64",
F64RT, F64RT, int_nvvm_rcp_approx_ftz_d>;
//
// Sqrt
//
-def INT_NVVM_SQRT_RN_FTZ_F : F_MATH_1<"sqrt.rn.ftz.f32 \t$dst, $src0;",
+def INT_NVVM_SQRT_RN_FTZ_F : F_MATH_1<"sqrt.rn.ftz.f32",
F32RT, F32RT, int_nvvm_sqrt_rn_ftz_f>;
-def INT_NVVM_SQRT_RN_F : F_MATH_1<"sqrt.rn.f32 \t$dst, $src0;", F32RT,
+def INT_NVVM_SQRT_RN_F : F_MATH_1<"sqrt.rn.f32", F32RT,
F32RT, int_nvvm_sqrt_rn_f>;
-def INT_NVVM_SQRT_RZ_FTZ_F : F_MATH_1<"sqrt.rz.ftz.f32 \t$dst, $src0;",
+def INT_NVVM_SQRT_RZ_FTZ_F : F_MATH_1<"sqrt.rz.ftz.f32",
F32RT, F32RT, int_nvvm_sqrt_rz_ftz_f>;
-def INT_NVVM_SQRT_RZ_F : F_MATH_1<"sqrt.rz.f32 \t$dst, $src0;", F32RT,
+def INT_NVVM_SQRT_RZ_F : F_MATH_1<"sqrt.rz.f32", F32RT,
F32RT, int_nvvm_sqrt_rz_f>;
-def INT_NVVM_SQRT_RM_FTZ_F : F_MATH_1<"sqrt.rm.ftz.f32 \t$dst, $src0;",
+def INT_NVVM_SQRT_RM_FTZ_F : F_MATH_1<"sqrt.rm.ftz.f32",
F32RT, F32RT, int_nvvm_sqrt_rm_ftz_f>;
-def INT_NVVM_SQRT_RM_F : F_MATH_1<"sqrt.rm.f32 \t$dst, $src0;", F32RT,
+def INT_NVVM_SQRT_RM_F : F_MATH_1<"sqrt.rm.f32", F32RT,
F32RT, int_nvvm_sqrt_rm_f>;
-def INT_NVVM_SQRT_RP_FTZ_F : F_MATH_1<"sqrt.rp.ftz.f32 \t$dst, $src0;",
+def INT_NVVM_SQRT_RP_FTZ_F : F_MATH_1<"sqrt.rp.ftz.f32",
F32RT, F32RT, int_nvvm_sqrt_rp_ftz_f>;
-def INT_NVVM_SQRT_RP_F : F_MATH_1<"sqrt.rp.f32 \t$dst, $src0;", F32RT,
+def INT_NVVM_SQRT_RP_F : F_MATH_1<"sqrt.rp.f32", F32RT,
F32RT, int_nvvm_sqrt_rp_f>;
-def INT_NVVM_SQRT_APPROX_FTZ_F : F_MATH_1<"sqrt.approx.ftz.f32 \t$dst, $src0;",
+def INT_NVVM_SQRT_APPROX_FTZ_F : F_MATH_1<"sqrt.approx.ftz.f32",
F32RT, F32RT, int_nvvm_sqrt_approx_ftz_f>;
-def INT_NVVM_SQRT_APPROX_F : F_MATH_1<"sqrt.approx.f32 \t$dst, $src0;",
+def INT_NVVM_SQRT_APPROX_F : F_MATH_1<"sqrt.approx.f32",
F32RT, F32RT, int_nvvm_sqrt_approx_f>;
-def INT_NVVM_SQRT_RN_D : F_MATH_1<"sqrt.rn.f64 \t$dst, $src0;", F64RT,
+def INT_NVVM_SQRT_RN_D : F_MATH_1<"sqrt.rn.f64", F64RT,
F64RT, int_nvvm_sqrt_rn_d>;
-def INT_NVVM_SQRT_RZ_D : F_MATH_1<"sqrt.rz.f64 \t$dst, $src0;", F64RT,
+def INT_NVVM_SQRT_RZ_D : F_MATH_1<"sqrt.rz.f64", F64RT,
F64RT, int_nvvm_sqrt_rz_d>;
-def INT_NVVM_SQRT_RM_D : F_MATH_1<"sqrt.rm.f64 \t$dst, $src0;", F64RT,
+def INT_NVVM_SQRT_RM_D : F_MATH_1<"sqrt.rm.f64", F64RT,
F64RT, int_nvvm_sqrt_rm_d>;
-def INT_NVVM_SQRT_RP_D : F_MATH_1<"sqrt.rp.f64 \t$dst, $src0;", F64RT,
+def INT_NVVM_SQRT_RP_D : F_MATH_1<"sqrt.rp.f64", F64RT,
F64RT, int_nvvm_sqrt_rp_d>;
// nvvm_sqrt intrinsic
@@ -1621,15 +1620,13 @@ def : Pat<(int_nvvm_sqrt_f f32:$a),
//
def INT_NVVM_RSQRT_APPROX_FTZ_F
- : F_MATH_1<"rsqrt.approx.ftz.f32 \t$dst, $src0;", F32RT, F32RT,
- int_nvvm_rsqrt_approx_ftz_f>;
+ : F_MATH_1<"rsqrt.approx.ftz.f32", F32RT, F32RT, int_nvvm_rsqrt_approx_ftz_f>;
def INT_NVVM_RSQRT_APPROX_FTZ_D
- : F_MATH_1<"rsqrt.approx.ftz.f64 \t$dst, $src0;", F64RT, F64RT,
- int_nvvm_rsqrt_approx_ftz_d>;
+ : F_MATH_1<"rsqrt.approx.ftz.f64", F64RT, F64RT, int_nvvm_rsqrt_approx_ftz_d>;
-def INT_NVVM_RSQRT_APPROX_F : F_MATH_1<"rsqrt.approx.f32 \t$dst, $src0;",
+def INT_NVVM_RSQRT_APPROX_F : F_MATH_1<"rsqrt.approx.f32",
F32RT, F32RT, int_nvvm_rsqrt_approx_f>;
-def INT_NVVM_RSQRT_APPROX_D : F_MATH_1<"rsqrt.approx.f64 \t$dst, $src0;",
+def INT_NVVM_RSQRT_APPROX_D : F_MATH_1<"rsqrt.approx.f64",
F64RT, F64RT, int_nvvm_rsqrt_approx_d>;
// 1.0f / sqrt_approx -> rsqrt_approx
@@ -1657,30 +1654,30 @@ def: Pat<(fdiv f32imm_1, (fsqrt f32:$a)),
// Add
//
-def INT_NVVM_ADD_RN_FTZ_F : F_MATH_2<"add.rn.ftz.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_ADD_RN_FTZ_F : F_MATH_2<"add.rn.ftz.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rn_ftz_f>;
-def INT_NVVM_ADD_RN_F : F_MATH_2<"add.rn.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_ADD_RN_F : F_MATH_2<"add.rn.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rn_f>;
-def INT_NVVM_ADD_RZ_FTZ_F : F_MATH_2<"add.rz.ftz.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_ADD_RZ_FTZ_F : F_MATH_2<"add.rz.ftz.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rz_ftz_f>;
-def INT_NVVM_ADD_RZ_F : F_MATH_2<"add.rz.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_ADD_RZ_F : F_MATH_2<"add.rz.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rz_f>;
-def INT_NVVM_ADD_RM_FTZ_F : F_MATH_2<"add.rm.ftz.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_ADD_RM_FTZ_F : F_MATH_2<"add.rm.ftz.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rm_ftz_f>;
-def INT_NVVM_ADD_RM_F : F_MATH_2<"add.rm.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_ADD_RM_F : F_MATH_2<"add.rm.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rm_f>;
-def INT_NVVM_ADD_RP_FTZ_F : F_MATH_2<"add.rp.ftz.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_ADD_RP_FTZ_F : F_MATH_2<"add.rp.ftz.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rp_ftz_f>;
-def INT_NVVM_ADD_RP_F : F_MATH_2<"add.rp.f32 \t$dst, $src0, $src1;",
+def INT_NVVM_ADD_RP_F : F_MATH_2<"add.rp.f32",
Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rp_f>;
-def INT_NVVM_ADD_RN_D : F_MATH_2<"add.rn.f64 \t$dst, $src0, $src1;",
+def INT_NVVM_ADD_RN_D : F_MATH_2<"add.rn.f64",
Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rn_d>;
-def INT_NVVM_ADD_RZ_D : F_MATH_2<"add.rz.f64 \t$dst, $src0, $src1;",
+def INT_NVVM_ADD_RZ_D : F_MATH_2<"add.rz.f64",
Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rz_d>;
-def INT_NVVM_ADD_RM_D : F_MATH_2<"add.rm.f64 \t$dst, $src0, $src1;",
+def INT_NVVM_ADD_RM_D : F_MATH_2<"add.rm.f64",
Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rm_d>;
-def INT_NVVM_ADD_RP_D : F_MATH_2<"add.rp.f64 \t$dst, $src0, $src1;",
+def INT_NVVM_ADD_RP_D : F_MATH_2<"add.rp.f64",
Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rp_d>;
//
@@ -1691,13 +1688,13 @@ foreach t = [I32RT, I64RT] in {
foreach sign = ["s", "u"] in {
defvar flo_intrin = !cast<Intrinsic>("int_nvvm_flo_" # sign);
def BFIND_ # sign # t.Size
- : NVPTXInst<(outs Int32Regs:$dst), (ins t.RC:$src),
- "bfind." # sign # t.Size # " \t$dst, $src;",
+ : BasicNVPTXInst<(outs Int32Regs:$dst), (ins t.RC:$src),
+ "bfind." # sign # t.Size,
[(set i32:$dst, (flo_intrin t.Ty:$src, 0))]>;
def BFIND_SHIFTAMT_ # sign # t.Size
- : NVPTXInst<(outs Int32Regs:$dst), (ins t.RC:$src),
- "bfind.shiftamt." # sign # t.Size # " \t$dst, $src;",
+ : BasicNVPTXInst<(outs Int32Regs:$dst), (ins t.RC:$src),
+ "bfind.shiftamt." # sign # t.Size,
[(set i32:$dst, (flo_intrin t.Ty:$src, -1))]>;
}
}
@@ -1865,21 +1862,12 @@ def : Pat<(int_nvvm_f2bf16_rz f32:$a),
def : Pat<(int_nvvm_f2bf16_rz_relu f32:$a),
(CVT_bf16_f32 $a, CvtRZ_RELU)>;
-def INT_NVVM_LOHI_I2D : F_MATH_2<"mov.b64 \t$dst, {{$src0, $src1}};",
- Float64Regs, Int32Regs, Int32Regs, int_nvvm_lohi_i2d>;
+def : Pat<(int_nvvm_lohi_i2d i32:$a, i32:$b), (V2I32toI64 $a, $b)>;
+def : Pat<(int_nvvm_d2i_lo f64:$a), (I64toI32L $a)>;
+def : Pat<(int_nvvm_d2i_hi f64:$a), (I64toI32H $a)>;
-def INT_NVVM_D2I_LO : F_MATH_1<
- !strconcat("{{\n\t",
- ".reg .b32 %temp; \n\t",
- "mov.b64 \t{$dst, %temp}, $src0;\n\t",
- "}}"),
- I32RT, F64RT, int_nvvm_d2i_lo>;
-def INT_NVVM_D2I_HI : F_MATH_1<
- !strconcat("{{\n\t",
- ".reg .b32 %temp; \n\t",
- "mov.b64 \t{%temp, $dst}, $src0;\n\t",
- "}}"),
- I32RT, F64RT, int_nvvm_d2i_hi>;
+def : Pat<(int_nvvm_d2i_lo f64:$a), (I64toI32L_Sink $a)>, Requires<[hasPTX<71>]>;
+def : Pat<(int_nvvm_d2i_hi f64:$a), (I64toI32H_Sink $a)>, Requires<[hasPTX<71>]>;
def : Pat<(int_nvvm_f2ll_rn_ftz f32:$a),
(CVT_s64_f32 $a, CvtRNI_FTZ)>;
@@ -2077,8 +2065,8 @@ def : Pat<(int_nvvm_ue8m0x2_to_bf16x2 i16:$a),
//
class INT_FNS_MBO<dag ins, dag Operands>
- : NVPTXInst<(outs Int32Regs:$dst), ins,
- "fns.b32 \t$dst, $mask, $base, $offset;",
+ : BasicNVPTXInst<(outs Int32Regs:$dst), ins,
+ "fns.b32",
[(set i32:$dst, Operands)]>,
Requires<[hasPTX<60>, hasSM<30>]>;
@@ -2464,20 +2452,20 @@ def INT_PTX_LDG_G_v8f32_ELE : VLDG_G_ELE_V8<"b32", Float32Regs>;
multiclass NG_TO_G<string Str, bit Supports32 = 1, list<Predicate> Preds = []> {
if Supports32 then
- def "" : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
- "cvta." # Str # ".u32 \t$result, $src;", []>, Requires<Preds>;
+ def "" : BasicNVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+ "cvta." # Str # ".u32", []>, Requires<Preds>;
- def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
- "cvta." # Str # ".u64 \t$result, $src;", []>, Requires<Preds>;
+ def _64 : BasicNVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+ "cvta." # Str # ".u64", []>, Requires<Preds>;
}
multiclass G_TO_NG<string Str, bit Supports32 = 1, list<Predicate> Preds = []> {
if Supports32 then
- def "" : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
- "cvta.to." # Str # ".u32 \t$result, $src;", []>, Requires<Preds>;
+ def "" : BasicNVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+ "cvta.to." # Str # ".u32", []>, Requires<Preds>;
- def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
- "cvta.to." # Str # ".u64 \t$result, $src;", []>, Requires<Preds>;
+ def _64 : BasicNVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+ "cvta.to." # Str # ".u64", []>, Requires<Preds>;
}
foreach space = ["local", "shared", "global", "const", "param"] in {
@@ -2490,32 +2478,32 @@ defm cvta_to_shared_cluster : G_TO_NG<"shared::cluster", false, [hasClusters]>;
// nvvm.move intrinsicc
-def nvvm_move_i16 : NVPTXInst<(outs Int16Regs:$r), (ins Int16Regs:$s),
- "mov.b16 \t$r, $s;",
+def nvvm_move_i16 : BasicNVPTXInst<(outs Int16Regs:$r), (ins Int16Regs:$s),
+ "mov.b16",
[(set i16:$r,
(int_nvvm_move_i16 i16:$s))]>;
-def nvvm_move_i32 : NVPTXInst<(outs Int32Regs:$r), (ins Int32Regs:$s),
- "mov.b32 \t$r, $s;",
+def nvvm_move_i32 : BasicNVPTXInst<(outs Int32Regs:$r), (ins Int32Regs:$s),
+ "mov.b32",
[(set i32:$r,
(int_nvvm_move_i32 i32:$s))]>;
-def nvvm_move_i64 : NVPTXInst<(outs Int64Regs:$r), (ins Int64Regs:$s),
- "mov.b64 \t$r, $s;",
+def nvvm_move_i64 : BasicNVPTXInst<(outs Int64Regs:$r), (ins Int64Regs:$s),
+ "mov.b64",
[(set i64:$r,
(int_nvvm_move_i64 i64:$s))]>;
-def nvvm_move_float : NVPTXInst<(outs Float32Regs:$r), (ins Float32Regs:$s),
- "mov.f32 \t$r, $s;",
+def nvvm_move_float : BasicNVPTXInst<(outs Float32Regs:$r), (ins Float32Regs:$s),
+ "mov.f32",
[(set f32:$r,
(int_nvvm_move_float f32:$s))]>;
-def nvvm_move_double : NVPTXInst<(outs Float64Regs:$r), (ins Float64Regs:$s),
- "mov.f64 \t$r, $s;",
+def nvvm_move_double : BasicNVPTXInst<(outs Float64Regs:$r), (ins Float64Regs:$s),
+ "mov.f64",
[(set f64:$r,
(int_nvvm_move_double f64:$s))]>;
-def nvvm_move_ptr32 : NVPTXInst<(outs Int32Regs:$r), (ins Int32Regs:$s),
- "mov.u32 \t$r, $s;",
+def nvvm_move_ptr32 : BasicNVPTXInst<(outs Int32Regs:$r), (ins Int32Regs:$s),
+ "mov.u32",
[(set i32:$r,
(int_nvvm_move_ptr i32:$s))]>;
-def nvvm_move_ptr64 : NVPTXInst<(outs Int64Regs:$r), (ins Int64Regs:$s),
- "mov.u64 \t$r, $s;",
+def nvvm_move_ptr64 : BasicNVPTXInst<(outs Int64Regs:$r), (ins Int64Regs:$s),
+ "mov.u64",
[(set i64:$r,
(int_nvvm_move_ptr i64:$s))]>;
@@ -2531,8 +2519,7 @@ def nvvm_move_sym64 : NVPTXInst<(outs Int64Regs:$r), (ins ADDR_base:$s),
(int_nvvm_move_ptr texternalsym:$s))]>;*/
def texsurf_handles
- : NVPTXInst<(outs Int64Regs:$result), (ins ADDR_base:$src),
- "mov.u64 \t$result, $src;", []>;
+ : BasicNVPTXInst<(outs Int64Regs:$result), (ins ADDR_base:$src), "mov.u64">;
//-----------------------------------
// Compiler Error Warn
@@ -2556,12 +2543,12 @@ def INT_NVVM_COMPILER_ERROR_64 : NVPTXInst<(outs), (ins Int64Regs:$a),
// isspacep
multiclass ISSPACEP<string suffix, Intrinsic Intr, list<Predicate> Preds = []> {
- def _32: NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
- "isspacep." # suffix # "\t$d, $a;",
+ def _32: BasicNVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+ "isspacep." # suffix,
[(set i1:$d, (Intr i32:$a))]>,
Requires<Preds>;
- def _64: NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
- "isspacep." # suffix # "\t$d, $a;",
+ def _64: BasicNVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+ "isspacep." # suffix,
[(set i1:$d, (Intr i64:$a))]>,
Requires<Preds>;
}
@@ -2575,9 +2562,9 @@ defm isspace_shared_cluster : ISSPACEP<"shared::cluster",
[hasPTX<78>, hasSM<90>]>;
// Special register reads
-def MOV_SPECIAL : NVPTXInst<(outs Int32Regs:$d),
+def MOV_SPECIAL : BasicNVPTXInst<(outs Int32Regs:$d),
(ins SpecialRegs:$r),
- "mov.b32 \t$d, $r;", []>;
+ "mov.b32", []>;
def : Pat<(int_nvvm_read_ptx_sreg_envreg0), (MOV_SPECIAL ENVREG0)>;
def : Pat<(int_nvvm_read_ptx_sreg_envreg1), (MOV_SPECIAL ENVREG1)>;
@@ -7117,20 +7104,20 @@ foreach mma = !listconcat(MMAs, WMMAs, MMA_LDSTs, LDMATRIXs) in
def : MMA_PAT<mma>;
multiclass MAPA<string suffix, Intrinsic Intr> {
- def _32: NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a, Int32Regs:$b),
- "mapa" # suffix # ".u32\t$d, $a, $b;",
+ def _32: BasicNVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a, Int32Regs:$b),
+ "mapa" # suffix # ".u32",
[(set i32:$d, (Intr i32:$a, i32:$b))]>,
Requires<[hasSM<90>, hasPTX<78>]>;
- def _32i: NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a, i32imm:$b),
- "mapa" # suffix # ".u32\t$d, $a, $b;",
+ def _32i: BasicNVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a, i32imm:$b),
+ "mapa" # suffix # ".u32",
[(set i32:$d, (Intr i32:$a, imm:$b))]>,
Requires<[hasSM<90>, hasPTX<78>]>;
- def _64: NVPTXInst<(outs Int64Regs:$d), (ins Int64Regs:$a, Int32Regs:$b),
- "mapa" # suffix # ".u64\t$d, $a, $b;",
+ def _64: BasicNVPTXInst<(outs Int64Regs:$d), (ins Int64Regs:$a, Int32Regs:$b),
+ "mapa" # suffix # ".u64",
[(set i64:$d, (Intr i64:$a, i32:$b))]>,
Requires<[hasSM<90>, hasPTX<78>]>;
- def _64i: NVPTXInst<(outs Int64Regs:$d), (ins Int64Regs:$a, i32imm:$b),
- "mapa" # suffix # ".u64\t$d, $a, $b;",
+ def _64i: BasicNVPTXInst<(outs Int64Regs:$d), (ins Int64Regs:$a, i32imm:$b),
+ "mapa" # suffix # ".u64",
[(set i64:$d, (Intr i64:$a, imm:$b))]>,
Requires<[hasSM<90>, hasPTX<78>]>;
}
@@ -7140,12 +7127,12 @@ defm mapa_shared_cluster : MAPA<".shared::cluster", int_nvvm_mapa_shared_cluste
multiclass GETCTARANK<string suffix, Intrinsic Intr> {
- def _32: NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
- "getctarank" # suffix # ".u32\t$d, $a;",
+ def _32: BasicNVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
+ "getctarank" # suffix # ".u32",
[(set i32:$d, (Intr i32:$a))]>,
Requires<[hasSM<90>, hasPTX<78>]>;
- def _64: NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
- "getctarank" # suffix # ".u64\t$d, $a;",
+ def _64: BasicNVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "getctarank" # suffix # ".u64",
[(set i32:$d, (Intr i64:$a))]>,
Requires<[hasSM<90>, hasPTX<78>]>;
}
@@ -7161,8 +7148,8 @@ def is_explicit_cluster: NVPTXInst<(outs Int1Regs:$d), (ins),
// setmaxnreg inc/dec intrinsics
let isConvergent = true in {
multiclass SET_MAXNREG<string Action, Intrinsic Intr> {
- def : NVPTXInst<(outs), (ins i32imm:$reg_count),
- "setmaxnreg." # Action # ".sync.aligned.u32 $reg_count;",
+ def : BasicNVPTXInst<(outs), (ins i32imm:$reg_count),
+ "setmaxnreg." # Action # ".sync.aligned.u32",
[(Intr timm:$reg_count)]>,
Requires<[hasArchAccelFeatures, hasSM<90>, hasPTX<80>]>;
}
@@ -7176,29 +7163,29 @@ defm INT_SET_MAXNREG_DEC : SET_MAXNREG<"dec", int_nvvm_setmaxnreg_dec_sync_align
// WGMMA fence instructions
//
let isConvergent = true in {
-def INT_NVVM_WGMMA_FENCE_SYNC_ALIGNED : NVPTXInst<(outs), (ins), "wgmma.fence.sync.aligned;",
+def INT_NVVM_WGMMA_FENCE_SYNC_ALIGNED : BasicNVPTXInst<(outs), (ins), "wgmma.fence.sync.aligned",
[(int_nvvm_wgmma_fence_sync_aligned)]>, Requires<[hasSM90a, hasPTX<80>]>;
-def INT_NVVM_WGMMA_COMMIT_GROUP_SYNC_ALIGNED : NVPTXInst<(outs), (ins), "wgmma.commit_group.sync.aligned;",
+def INT_NVVM_WGMMA_COMMIT_GROUP_SYNC_ALIGNED : BasicNVPTXInst<(outs), (ins), "wgmma.commit_group.sync.aligned",
[(int_nvvm_wgmma_commit_group_sync_aligned)]>, Requires<[hasSM90a, hasPTX<80>]>;
-def INT_NVVM_WGMMA_WAIT_GROUP_SYNC_ALIGNED : NVPTXInst<(outs), (ins i64imm:$n), "wgmma.wait_group.sync.aligned \t$n;",
+def INT_NVVM_WGMMA_WAIT_GROUP_SYNC_ALIGNED : BasicNVPTXInst<(outs), (ins i64imm:$n), "wgmma.wait_group.sync.aligned",
[(int_nvvm_wgmma_wait_group_sync_aligned timm:$n)]>, Requires<[hasSM90a, hasPTX<80>]>;
} // isConvergent = true
def GRIDDEPCONTROL_LAUNCH_DEPENDENTS :
- NVPTXInst<(outs), (ins),
- "griddepcontrol.launch_dependents;",
+ BasicNVPTXInst<(outs), (ins),
+ "griddepcontrol.launch_dependents",
[(int_nvvm_griddepcontrol_launch_dependents)]>,
Requires<[hasSM<90>, hasPTX<78>]>;
def GRIDDEPCONTROL_WAIT :
- NVPTXInst<(outs), (ins),
- "griddepcontrol.wait;",
+ BasicNVPTXInst<(outs), (ins),
+ "griddepcontrol.wait",
[(int_nvvm_griddepcontrol_wait)]>,
Requires<[hasSM<90>, hasPTX<78>]>;
-def INT_EXIT : NVPTXInst<(outs), (ins), "exit;", [(int_nvvm_exit)]>;
+def INT_EXIT : BasicNVPTXInst<(outs), (ins), "exit", [(int_nvvm_exit)]>;
// Tcgen05 intrinsics
let isConvergent = true in {
@@ -7221,9 +7208,9 @@ defm TCGEN05_ALLOC_S32_CG1 : TCGEN05_ALLOC_INTR<Int32Regs, ".shared::cta", "1",
defm TCGEN05_ALLOC_S32_CG2 : TCGEN05_ALLOC_INTR<Int32Regs, ".shared::cta", "2", int_nvvm_tcgen05_alloc_shared_cg2>;
multiclass TCGEN05_DEALLOC_INTR<string num, Intrinsic Intr> {
- def NAME : NVPTXInst<(outs),
+ def NAME : BasicNVPTXInst<(outs),
(ins Int32Regs:$tmem_addr, Int32Regs:$ncols),
- !strconcat("tcgen05.dealloc.cta_group::", num, ".sync.aligned.b32 $tmem_addr, $ncols;"),
+ "tcgen05.dealloc.cta_group::" # num # ".sync.aligned.b32",
[(Intr Int32Regs:$tmem_addr, Int32Regs:$ncols)]>,
Requires<[hasTcgen05Instructions]>;
}
@@ -7231,19 +7218,19 @@ defm TCGEN05_DEALLOC_CG1: TCGEN05_DEALLOC_INTR<"1", int_nvvm_tcgen05_dealloc_cg1
defm TCGEN05_DEALLOC_CG2: TCGEN05_DEALLOC_INTR<"2", int_nvvm_tcgen05_dealloc_cg2>;
multiclass TCGEN05_RELINQ_PERMIT_INTR<string num, Intrinsic Intr> {
- def NAME : NVPTXInst<(outs), (ins),
- !strconcat("tcgen05.relinquish_alloc_permit.cta_group::", num, ".sync.aligned;"),
+ def NAME : BasicNVPTXInst<(outs), (ins),
+ "tcgen05.relinquish_alloc_permit.cta_group::" # num # ".sync.aligned",
[(Intr)]>,
Requires<[hasTcgen05Instructions]>;
}
defm TCGEN05_RELINQ_CG1: TCGEN05_RELINQ_PERMIT_INTR<"1", int_nvvm_tcgen05_relinq_alloc_permit_cg1>;
defm TCGEN05_RELINQ_CG2: TCGEN05_RELINQ_PERMIT_INTR<"2", int_nvvm_tcgen05_relinq_alloc_permit_cg2>;
-def tcgen05_wait_ld: NVPTXInst<(outs), (ins), "tcgen05.wait::ld.sync.aligned;",
+def tcgen05_wait_ld: BasicNVPTXInst<(outs), (ins), "tcgen05.wait::ld.sync.aligned",
[(int_nvvm_tcgen05_wait_ld)]>,
Requires<[hasTcgen05Instructions]>;
-def tcgen05_wait_st: NVPTXInst<(outs), (ins), "tcgen05.wait::st.sync.aligned;",
+def tcgen05_wait_st: BasicNVPTXInst<(outs), (ins), "tcgen05.wait::st.sync.aligned",
[(int_nvvm_tcgen05_wait_st)]>,
Requires<[hasTcgen05Instructions]>;
@@ -7318,13 +7305,13 @@ foreach src_fmt = ["", "b6x16_p32", "b4x16_p64"] in {
let hasSideEffects = 1 in {
-def tcgen05_fence_before_thread_sync: NVPTXInst<(outs), (ins),
- "tcgen05.fence::before_thread_sync;",
+def tcgen05_fence_before_thread_sync: BasicNVPTXInst<(outs), (ins),
+ "tcgen05.fence::before_thread_sync",
[(int_nvvm_tcgen05_fence_before_thread_sync)]>,
Requires<[hasTcgen05Instructions]>;
-def tcgen05_fence_after_thread_sync: NVPTXInst<(outs), (ins),
- "tcgen05.fence::after_thread_sync;",
+def tcgen05_fence_after_thread_sync: BasicNVPTXInst<(outs), (ins),
+ "tcgen05.fence::after_thread_sync",
[(int_nvvm_tcgen05_fence_after_thread_sync)]>,
Requires<[hasTcgen05Instructions]>;
>From ec8de6f1d51f816541b8f72a51c5504b03705cef Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Tue, 27 May 2025 00:22:01 +0000
Subject: [PATCH 3/6] misc. cleanup
---
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 436 +++++++++--------------
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 6 +-
2 files changed, 164 insertions(+), 278 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 8e67aef76dced..d07d79fb645fe 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -853,20 +853,6 @@ def TESTINF_f64r : BasicNVPTXInst<(outs Int1Regs:$p), (ins Float64Regs:$a),
// Integer Arithmetic
//-----------------------------------
-// Template for xor masquerading as int1 arithmetic.
-multiclass ADD_SUB_i1<SDNode OpNode> {
- def _rr: BasicNVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
- "xor.pred",
- [(set i1:$dst, (OpNode i1:$a, i1:$b))]>;
- def _ri: BasicNVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
- "xor.pred",
- [(set i1:$dst, (OpNode i1:$a, (imm):$b))]>;
-}
-
-// int1 addition and subtraction are both just xor.
-defm ADD_i1 : ADD_SUB_i1<add>;
-defm SUB_i1 : ADD_SUB_i1<sub>;
-
// int16, int32, and int64 signed addition. Since nvptx is 2's complement, we
// also use these for unsigned arithmetic.
defm ADD : I3<"add.s", add, commutative = true>;
@@ -1438,6 +1424,11 @@ defm XOR : BITWISE<"xor", xor>;
def : Pat<(mul i1:$a, i1:$b), (ANDb1rr $a, $b)>;
def : Pat<(mul i1:$a, imm:$b), (ANDb1ri $a, imm:$b)>;
+foreach op = [add, sub] in {
+ def : Pat<(op i1:$a, i1:$b), (XORb1rr $a, $b)>;
+ def : Pat<(op i1:$a, imm:$b), (XORb1ri $a, imm:$b)>;
+}
+
// These transformations were once reliably performed by instcombine, but thanks
// to poison semantics they are no longer safe for LLVM IR, perform them here
// instead.
@@ -1446,12 +1437,9 @@ def : Pat<(select i1:$a, 1, i1:$b), (ORb1rr $a, $b)>;
// Lower logical v2i16/v4i8 ops as bitwise ops on b32.
foreach vt = [v2i16, v4i8] in {
- def: Pat<(or vt:$a, vt:$b),
- (ORb32rr $a, $b)>;
- def: Pat<(xor vt:$a, vt:$b),
- (XORb32rr $a, $b)>;
- def: Pat<(and vt:$a, vt:$b),
- (ANDb32rr $a, $b)>;
+ def : Pat<(or vt:$a, vt:$b), (ORb32rr $a, $b)>;
+ def : Pat<(xor vt:$a, vt:$b), (XORb32rr $a, $b)>;
+ def : Pat<(and vt:$a, vt:$b), (ANDb32rr $a, $b)>;
// The constants get legalized into a bitcast from i32, so that's what we need
// to match here.
@@ -1689,13 +1677,13 @@ def : Pat<(i16 (sext_inreg (trunc (srl i64:$s, (i32 imm:$o))), i8)),
let hasSideEffects = false in {
multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
def rr :
- BasicFlagsNVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b), (CmpMode:$cmp),
+ BasicFlagsNVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b), (ins CmpMode:$cmp),
"setp${cmp:base}${cmp:ftz}." # TypeStr>;
def ri :
- BasicFlagsNVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b), (CmpMode:$cmp),
+ BasicFlagsNVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b), (ins CmpMode:$cmp),
"setp${cmp:base}${cmp:ftz}." # TypeStr>;
def ir :
- BasicFlagsNVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b), (CmpMode:$cmp),
+ BasicFlagsNVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b), (ins CmpMode:$cmp),
"setp${cmp:base}${cmp:ftz}." # TypeStr>;
}
}
@@ -1713,24 +1701,24 @@ defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
def SETP_f16rr :
BasicFlagsNVPTXInst<(outs Int1Regs:$dst),
- (ins Int16Regs:$a, Int16Regs:$b), (CmpMode:$cmp),
+ (ins Int16Regs:$a, Int16Regs:$b), (ins CmpMode:$cmp),
"setp${cmp:base}${cmp:ftz}.f16">,
Requires<[useFP16Math]>;
def SETP_f16x2rr :
BasicFlagsNVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
- (ins Int32Regs:$a, Int32Regs:$b), (CmpMode:$cmp),
+ (ins Int32Regs:$a, Int32Regs:$b), (ins CmpMode:$cmp),
"setp${cmp:base}${cmp:ftz}.f16x2">,
Requires<[useFP16Math]>;
def SETP_bf16rr :
BasicFlagsNVPTXInst<(outs Int1Regs:$dst),
- (ins Int16Regs:$a, Int16Regs:$b), (CmpMode:$cmp),
+ (ins Int16Regs:$a, Int16Regs:$b), (ins CmpMode:$cmp),
"setp${cmp:base}${cmp:ftz}.bf16">,
Requires<[hasBF16Math, hasPTX<78>, hasSM<90>]>;
def SETP_bf16x2rr :
BasicFlagsNVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
- (ins Int32Regs:$a, Int32Regs:$b), (CmpMode:$cmp),
+ (ins Int32Regs:$a, Int32Regs:$b), (ins CmpMode:$cmp),
"setp${cmp:base}${cmp:ftz}.bf16x2">,
Requires<[hasBF16Math, hasPTX<78>, hasSM<90>]>;
@@ -2348,14 +2336,13 @@ def DeclareScalarRegInst :
".reg .b$size param$a;",
[(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
-class MoveParamSymbolInst<NVPTXRegClass regclass, Operand srcty, ValueType vt,
- string asmstr> :
- NVPTXInst<(outs regclass:$dst), (ins srcty:$src),
- !strconcat("mov", asmstr, " \t$dst, $src;"),
- [(set vt:$dst, (MoveParam texternalsym:$src))]>;
+class MoveParamSymbolInst<RegTypeInfo t> :
+ BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$src),
+ "mov.b" # t.Size,
+ [(set t.Ty:$dst, (MoveParam texternalsym:$src))]>;
-def MOV64_PARAM : MoveParamSymbolInst<Int64Regs, i64imm, i64, ".b64">;
-def MOV32_PARAM : MoveParamSymbolInst<Int32Regs, i32imm, i32, ".b32">;
+def MOV64_PARAM : MoveParamSymbolInst<I64RT>;
+def MOV32_PARAM : MoveParamSymbolInst<I32RT>;
class PseudoUseParamInst<NVPTXRegClass regclass, ValueType vt> :
NVPTXInst<(outs), (ins regclass:$src),
@@ -2368,24 +2355,19 @@ def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs, i16>;
def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs, f64>;
def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs, f32>;
-class ProxyRegInst<string SzStr, ValueType T, NVPTXRegClass regclass> :
+class ProxyRegInst<string SzStr, NVPTXRegClass regclass> :
BasicNVPTXInst<(outs regclass:$dst), (ins regclass:$src),
- "mov." # SzStr,
- [(set T:$dst, (ProxyReg T:$src))]>;
-
-def ProxyRegI1 : ProxyRegInst<"pred", i1, Int1Regs>;
-def ProxyRegI16 : ProxyRegInst<"b16", i16, Int16Regs>;
-def ProxyRegI32 : ProxyRegInst<"b32", i32, Int32Regs>;
-def ProxyRegI64 : ProxyRegInst<"b64", i64, Int64Regs>;
-def ProxyRegF32 : ProxyRegInst<"b32", f32, Float32Regs>;
-def ProxyRegF64 : ProxyRegInst<"b64", f64, Float64Regs>;
+ "mov." # SzStr>;
-foreach vt = [f16, bf16] in {
- def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI16 $src)>;
-}
+def ProxyRegB1 : ProxyRegInst<"pred", Int1Regs>;
+def ProxyRegB16 : ProxyRegInst<"b16", Int16Regs>;
+def ProxyRegB32 : ProxyRegInst<"b32", Int32Regs>;
+def ProxyRegB64 : ProxyRegInst<"b64", Int64Regs>;
-foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
- def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI32 $src)>;
+foreach rc = [Int1Regs, Int16Regs, Int32Regs, Int64Regs] in {
+ defvar ProxyRegInst = cast<NVPTXInst>(ProxyRegB # rc.Size);
+ foreach vt = rc.RegTypes in
+ def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegInst $src)>;
}
//
@@ -2503,249 +2485,169 @@ foreach rc = [Int16Regs, Int32Regs, Int64Regs] in
// and then cvt to floating-point.
// sint -> f16
-def : Pat<(f16 (sint_to_fp i1:$a)),
- (CVT_f16_s32 (SELP_b32ii -1, 0, $a), CvtRN)>;
-def : Pat<(f16 (sint_to_fp Int16Regs:$a)),
- (CVT_f16_s16 $a, CvtRN)>;
-def : Pat<(f16 (sint_to_fp i32:$a)),
- (CVT_f16_s32 $a, CvtRN)>;
-def : Pat<(f16 (sint_to_fp i64:$a)),
- (CVT_f16_s64 $a, CvtRN)>;
+def : Pat<(f16 (sint_to_fp i1:$a)), (CVT_f16_s32 (SELP_b32ii -1, 0, $a), CvtRN)>;
+def : Pat<(f16 (sint_to_fp i16:$a)), (CVT_f16_s16 $a, CvtRN)>;
+def : Pat<(f16 (sint_to_fp i32:$a)), (CVT_f16_s32 $a, CvtRN)>;
+def : Pat<(f16 (sint_to_fp i64:$a)), (CVT_f16_s64 $a, CvtRN)>;
// uint -> f16
-def : Pat<(f16 (uint_to_fp i1:$a)),
- (CVT_f16_u32 (SELP_b32ii 1, 0, $a), CvtRN)>;
-def : Pat<(f16 (uint_to_fp Int16Regs:$a)),
- (CVT_f16_u16 $a, CvtRN)>;
-def : Pat<(f16 (uint_to_fp i32:$a)),
- (CVT_f16_u32 $a, CvtRN)>;
-def : Pat<(f16 (uint_to_fp i64:$a)),
- (CVT_f16_u64 $a, CvtRN)>;
+def : Pat<(f16 (uint_to_fp i1:$a)), (CVT_f16_u32 (SELP_b32ii 1, 0, $a), CvtRN)>;
+def : Pat<(f16 (uint_to_fp i16:$a)), (CVT_f16_u16 $a, CvtRN)>;
+def : Pat<(f16 (uint_to_fp i32:$a)), (CVT_f16_u32 $a, CvtRN)>;
+def : Pat<(f16 (uint_to_fp i64:$a)), (CVT_f16_u64 $a, CvtRN)>;
// sint -> bf16
-def : Pat<(bf16 (sint_to_fp i1:$a)),
- (CVT_bf16_s32 (SELP_b32ii 1, 0, $a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (sint_to_fp i16:$a)),
- (CVT_bf16_s16 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (sint_to_fp i32:$a)),
- (CVT_bf16_s32 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (sint_to_fp i64:$a)),
- (CVT_bf16_s64 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(bf16 (sint_to_fp i1:$a)), (CVT_bf16_s32 (SELP_b32ii 1, 0, $a), CvtRN)>,
+ Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(bf16 (sint_to_fp i16:$a)), (CVT_bf16_s16 $a, CvtRN)>,
+ Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(bf16 (sint_to_fp i32:$a)), (CVT_bf16_s32 $a, CvtRN)>,
+ Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(bf16 (sint_to_fp i64:$a)), (CVT_bf16_s64 $a, CvtRN)>,
+ Requires<[hasPTX<78>, hasSM<90>]>;
// uint -> bf16
-def : Pat<(bf16 (uint_to_fp i1:$a)),
- (CVT_bf16_u32 (SELP_b32ii 1, 0, $a), CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (uint_to_fp i16:$a)),
- (CVT_bf16_u16 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (uint_to_fp i32:$a)),
- (CVT_bf16_u32 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (uint_to_fp i64:$a)),
- (CVT_bf16_u64 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(bf16 (uint_to_fp i1:$a)), (CVT_bf16_u32 (SELP_b32ii 1, 0, $a), CvtRN)>,
+ Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(bf16 (uint_to_fp i16:$a)), (CVT_bf16_u16 $a, CvtRN)>,
+ Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(bf16 (uint_to_fp i32:$a)), (CVT_bf16_u32 $a, CvtRN)>,
+ Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(bf16 (uint_to_fp i64:$a)), (CVT_bf16_u64 $a, CvtRN)>,
+ Requires<[hasPTX<78>, hasSM<90>]>;
// sint -> f32
-def : Pat<(f32 (sint_to_fp i1:$a)),
- (CVT_f32_s32 (SELP_b32ii -1, 0, $a), CvtRN)>;
-def : Pat<(f32 (sint_to_fp i16:$a)),
- (CVT_f32_s16 $a, CvtRN)>;
-def : Pat<(f32 (sint_to_fp i32:$a)),
- (CVT_f32_s32 $a, CvtRN)>;
-def : Pat<(f32 (sint_to_fp i64:$a)),
- (CVT_f32_s64 $a, CvtRN)>;
+def : Pat<(f32 (sint_to_fp i1:$a)), (CVT_f32_s32 (SELP_b32ii -1, 0, $a), CvtRN)>;
+def : Pat<(f32 (sint_to_fp i16:$a)), (CVT_f32_s16 $a, CvtRN)>;
+def : Pat<(f32 (sint_to_fp i32:$a)), (CVT_f32_s32 $a, CvtRN)>;
+def : Pat<(f32 (sint_to_fp i64:$a)), (CVT_f32_s64 $a, CvtRN)>;
// uint -> f32
-def : Pat<(f32 (uint_to_fp i1:$a)),
- (CVT_f32_u32 (SELP_b32ii 1, 0, $a), CvtRN)>;
-def : Pat<(f32 (uint_to_fp i16:$a)),
- (CVT_f32_u16 $a, CvtRN)>;
-def : Pat<(f32 (uint_to_fp i32:$a)),
- (CVT_f32_u32 $a, CvtRN)>;
-def : Pat<(f32 (uint_to_fp i64:$a)),
- (CVT_f32_u64 $a, CvtRN)>;
+def : Pat<(f32 (uint_to_fp i1:$a)), (CVT_f32_u32 (SELP_b32ii 1, 0, $a), CvtRN)>;
+def : Pat<(f32 (uint_to_fp i16:$a)), (CVT_f32_u16 $a, CvtRN)>;
+def : Pat<(f32 (uint_to_fp i32:$a)), (CVT_f32_u32 $a, CvtRN)>;
+def : Pat<(f32 (uint_to_fp i64:$a)), (CVT_f32_u64 $a, CvtRN)>;
// sint -> f64
-def : Pat<(f64 (sint_to_fp i1:$a)),
- (CVT_f64_s32 (SELP_b32ii -1, 0, $a), CvtRN)>;
-def : Pat<(f64 (sint_to_fp i16:$a)),
- (CVT_f64_s16 $a, CvtRN)>;
-def : Pat<(f64 (sint_to_fp i32:$a)),
- (CVT_f64_s32 $a, CvtRN)>;
-def : Pat<(f64 (sint_to_fp i64:$a)),
- (CVT_f64_s64 $a, CvtRN)>;
+def : Pat<(f64 (sint_to_fp i1:$a)), (CVT_f64_s32 (SELP_b32ii -1, 0, $a), CvtRN)>;
+def : Pat<(f64 (sint_to_fp i16:$a)), (CVT_f64_s16 $a, CvtRN)>;
+def : Pat<(f64 (sint_to_fp i32:$a)), (CVT_f64_s32 $a, CvtRN)>;
+def : Pat<(f64 (sint_to_fp i64:$a)), (CVT_f64_s64 $a, CvtRN)>;
// uint -> f64
-def : Pat<(f64 (uint_to_fp i1:$a)),
- (CVT_f64_u32 (SELP_b32ii 1, 0, $a), CvtRN)>;
-def : Pat<(f64 (uint_to_fp i16:$a)),
- (CVT_f64_u16 $a, CvtRN)>;
-def : Pat<(f64 (uint_to_fp i32:$a)),
- (CVT_f64_u32 $a, CvtRN)>;
-def : Pat<(f64 (uint_to_fp i64:$a)),
- (CVT_f64_u64 $a, CvtRN)>;
+def : Pat<(f64 (uint_to_fp i1:$a)), (CVT_f64_u32 (SELP_b32ii 1, 0, $a), CvtRN)>;
+def : Pat<(f64 (uint_to_fp i16:$a)), (CVT_f64_u16 $a, CvtRN)>;
+def : Pat<(f64 (uint_to_fp i32:$a)), (CVT_f64_u32 $a, CvtRN)>;
+def : Pat<(f64 (uint_to_fp i64:$a)), (CVT_f64_u64 $a, CvtRN)>;
// f16 -> sint
-def : Pat<(i1 (fp_to_sint f16:$a)),
- (SETP_b16ri $a, 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_sint f16:$a)),
- (CVT_s16_f16 $a, CvtRZI)>;
-def : Pat<(i32 (fp_to_sint f16:$a)),
- (CVT_s32_f16 $a, CvtRZI)>;
-def : Pat<(i64 (fp_to_sint f16:$a)),
- (CVT_s64_f16 $a, CvtRZI)>;
+def : Pat<(i1 (fp_to_sint f16:$a)), (SETP_b16ri $a, 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_sint f16:$a)), (CVT_s16_f16 $a, CvtRZI)>;
+def : Pat<(i32 (fp_to_sint f16:$a)), (CVT_s32_f16 $a, CvtRZI)>;
+def : Pat<(i64 (fp_to_sint f16:$a)), (CVT_s64_f16 $a, CvtRZI)>;
// f16 -> uint
-def : Pat<(i1 (fp_to_uint f16:$a)),
- (SETP_b16ri $a, 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_uint f16:$a)),
- (CVT_u16_f16 $a, CvtRZI)>;
-def : Pat<(i32 (fp_to_uint f16:$a)),
- (CVT_u32_f16 $a, CvtRZI)>;
-def : Pat<(i64 (fp_to_uint f16:$a)),
- (CVT_u64_f16 $a, CvtRZI)>;
+def : Pat<(i1 (fp_to_uint f16:$a)), (SETP_b16ri $a, 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_uint f16:$a)), (CVT_u16_f16 $a, CvtRZI)>;
+def : Pat<(i32 (fp_to_uint f16:$a)), (CVT_u32_f16 $a, CvtRZI)>;
+def : Pat<(i64 (fp_to_uint f16:$a)), (CVT_u64_f16 $a, CvtRZI)>;
// bf16 -> sint
-def : Pat<(i1 (fp_to_sint bf16:$a)),
- (SETP_b16ri $a, 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_sint bf16:$a)),
- (CVT_s16_bf16 $a, CvtRZI)>;
-def : Pat<(i32 (fp_to_sint bf16:$a)),
- (CVT_s32_bf16 $a, CvtRZI)>;
-def : Pat<(i64 (fp_to_sint bf16:$a)),
- (CVT_s64_bf16 $a, CvtRZI)>;
+def : Pat<(i1 (fp_to_sint bf16:$a)), (SETP_b16ri $a, 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_sint bf16:$a)), (CVT_s16_bf16 $a, CvtRZI)>;
+def : Pat<(i32 (fp_to_sint bf16:$a)), (CVT_s32_bf16 $a, CvtRZI)>;
+def : Pat<(i64 (fp_to_sint bf16:$a)), (CVT_s64_bf16 $a, CvtRZI)>;
// bf16 -> uint
-def : Pat<(i1 (fp_to_uint bf16:$a)),
- (SETP_b16ri $a, 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_uint bf16:$a)),
- (CVT_u16_bf16 $a, CvtRZI)>;
-def : Pat<(i32 (fp_to_uint bf16:$a)),
- (CVT_u32_bf16 $a, CvtRZI)>;
-def : Pat<(i64 (fp_to_uint bf16:$a)),
- (CVT_u64_bf16 $a, CvtRZI)>;
+def : Pat<(i1 (fp_to_uint bf16:$a)), (SETP_b16ri $a, 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_uint bf16:$a)), (CVT_u16_bf16 $a, CvtRZI)>;
+def : Pat<(i32 (fp_to_uint bf16:$a)), (CVT_u32_bf16 $a, CvtRZI)>;
+def : Pat<(i64 (fp_to_uint bf16:$a)), (CVT_u64_bf16 $a, CvtRZI)>;
// f32 -> sint
-def : Pat<(i1 (fp_to_sint f32:$a)),
- (SETP_b32ri $a, 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_sint f32:$a)),
- (CVT_s16_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i16 (fp_to_sint f32:$a)),
- (CVT_s16_f32 $a, CvtRZI)>;
-def : Pat<(i32 (fp_to_sint f32:$a)),
- (CVT_s32_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i32 (fp_to_sint f32:$a)),
- (CVT_s32_f32 $a, CvtRZI)>;
-def : Pat<(i64 (fp_to_sint f32:$a)),
- (CVT_s64_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i64 (fp_to_sint f32:$a)),
- (CVT_s64_f32 $a, CvtRZI)>;
+def : Pat<(i1 (fp_to_sint f32:$a)), (SETP_b32ri $a, 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_sint f32:$a)), (CVT_s16_f32 $a, CvtRZI_FTZ)>,
+ Requires<[doF32FTZ]>;
+def : Pat<(i16 (fp_to_sint f32:$a)), (CVT_s16_f32 $a, CvtRZI)>;
+def : Pat<(i32 (fp_to_sint f32:$a)), (CVT_s32_f32 $a, CvtRZI_FTZ)>,
+ Requires<[doF32FTZ]>;
+def : Pat<(i32 (fp_to_sint f32:$a)), (CVT_s32_f32 $a, CvtRZI)>;
+def : Pat<(i64 (fp_to_sint f32:$a)), (CVT_s64_f32 $a, CvtRZI_FTZ)>,
+ Requires<[doF32FTZ]>;
+def : Pat<(i64 (fp_to_sint f32:$a)), (CVT_s64_f32 $a, CvtRZI)>;
// f32 -> uint
-def : Pat<(i1 (fp_to_uint f32:$a)),
- (SETP_b32ri $a, 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_uint f32:$a)),
- (CVT_u16_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i16 (fp_to_uint f32:$a)),
- (CVT_u16_f32 $a, CvtRZI)>;
-def : Pat<(i32 (fp_to_uint f32:$a)),
- (CVT_u32_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i32 (fp_to_uint f32:$a)),
- (CVT_u32_f32 $a, CvtRZI)>;
-def : Pat<(i64 (fp_to_uint f32:$a)),
- (CVT_u64_f32 $a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i64 (fp_to_uint f32:$a)),
- (CVT_u64_f32 $a, CvtRZI)>;
+def : Pat<(i1 (fp_to_uint f32:$a)), (SETP_b32ri $a, 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_uint f32:$a)), (CVT_u16_f32 $a, CvtRZI_FTZ)>,
+ Requires<[doF32FTZ]>;
+def : Pat<(i16 (fp_to_uint f32:$a)), (CVT_u16_f32 $a, CvtRZI)>;
+def : Pat<(i32 (fp_to_uint f32:$a)), (CVT_u32_f32 $a, CvtRZI_FTZ)>,
+ Requires<[doF32FTZ]>;
+def : Pat<(i32 (fp_to_uint f32:$a)), (CVT_u32_f32 $a, CvtRZI)>;
+def : Pat<(i64 (fp_to_uint f32:$a)), (CVT_u64_f32 $a, CvtRZI_FTZ)>,
+ Requires<[doF32FTZ]>;
+def : Pat<(i64 (fp_to_uint f32:$a)), (CVT_u64_f32 $a, CvtRZI)>;
// f64 -> sint
-def : Pat<(i1 (fp_to_sint f64:$a)),
- (SETP_b64ri $a, 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_sint f64:$a)),
- (CVT_s16_f64 $a, CvtRZI)>;
-def : Pat<(i32 (fp_to_sint f64:$a)),
- (CVT_s32_f64 $a, CvtRZI)>;
-def : Pat<(i64 (fp_to_sint f64:$a)),
- (CVT_s64_f64 $a, CvtRZI)>;
+def : Pat<(i1 (fp_to_sint f64:$a)), (SETP_b64ri $a, 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_sint f64:$a)), (CVT_s16_f64 $a, CvtRZI)>;
+def : Pat<(i32 (fp_to_sint f64:$a)), (CVT_s32_f64 $a, CvtRZI)>;
+def : Pat<(i64 (fp_to_sint f64:$a)), (CVT_s64_f64 $a, CvtRZI)>;
// f64 -> uint
-def : Pat<(i1 (fp_to_uint f64:$a)),
- (SETP_b64ri $a, 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_uint f64:$a)),
- (CVT_u16_f64 $a, CvtRZI)>;
-def : Pat<(i32 (fp_to_uint f64:$a)),
- (CVT_u32_f64 $a, CvtRZI)>;
-def : Pat<(i64 (fp_to_uint f64:$a)),
- (CVT_u64_f64 $a, CvtRZI)>;
+def : Pat<(i1 (fp_to_uint f64:$a)), (SETP_b64ri $a, 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_uint f64:$a)), (CVT_u16_f64 $a, CvtRZI)>;
+def : Pat<(i32 (fp_to_uint f64:$a)), (CVT_u32_f64 $a, CvtRZI)>;
+def : Pat<(i64 (fp_to_uint f64:$a)), (CVT_u64_f64 $a, CvtRZI)>;
// sext i1
-def : Pat<(i16 (sext i1:$a)),
- (SELP_b16ii -1, 0, $a)>;
-def : Pat<(i32 (sext i1:$a)),
- (SELP_b32ii -1, 0, $a)>;
-def : Pat<(i64 (sext i1:$a)),
- (SELP_b64ii -1, 0, $a)>;
+def : Pat<(i16 (sext i1:$a)), (SELP_b16ii -1, 0, $a)>;
+def : Pat<(i32 (sext i1:$a)), (SELP_b32ii -1, 0, $a)>;
+def : Pat<(i64 (sext i1:$a)), (SELP_b64ii -1, 0, $a)>;
// zext i1
-def : Pat<(i16 (zext i1:$a)),
- (SELP_b16ii 1, 0, $a)>;
-def : Pat<(i32 (zext i1:$a)),
- (SELP_b32ii 1, 0, $a)>;
-def : Pat<(i64 (zext i1:$a)),
- (SELP_b64ii 1, 0, $a)>;
+def : Pat<(i16 (zext i1:$a)), (SELP_b16ii 1, 0, $a)>;
+def : Pat<(i32 (zext i1:$a)), (SELP_b32ii 1, 0, $a)>;
+def : Pat<(i64 (zext i1:$a)), (SELP_b64ii 1, 0, $a)>;
// anyext i1
-def : Pat<(i16 (anyext i1:$a)),
- (SELP_b16ii -1, 0, $a)>;
-def : Pat<(i32 (anyext i1:$a)),
- (SELP_b32ii -1, 0, $a)>;
-def : Pat<(i64 (anyext i1:$a)),
- (SELP_b64ii -1, 0, $a)>;
+def : Pat<(i16 (anyext i1:$a)), (SELP_b16ii -1, 0, $a)>;
+def : Pat<(i32 (anyext i1:$a)), (SELP_b32ii -1, 0, $a)>;
+def : Pat<(i64 (anyext i1:$a)), (SELP_b64ii -1, 0, $a)>;
// sext i16
-def : Pat<(i32 (sext i16:$a)),
- (CVT_s32_s16 $a, CvtNONE)>;
-def : Pat<(i64 (sext i16:$a)),
- (CVT_s64_s16 $a, CvtNONE)>;
+def : Pat<(i32 (sext i16:$a)), (CVT_s32_s16 $a, CvtNONE)>;
+def : Pat<(i64 (sext i16:$a)), (CVT_s64_s16 $a, CvtNONE)>;
// zext i16
-def : Pat<(i32 (zext i16:$a)),
- (CVT_u32_u16 $a, CvtNONE)>;
-def : Pat<(i64 (zext i16:$a)),
- (CVT_u64_u16 $a, CvtNONE)>;
+def : Pat<(i32 (zext i16:$a)), (CVT_u32_u16 $a, CvtNONE)>;
+def : Pat<(i64 (zext i16:$a)), (CVT_u64_u16 $a, CvtNONE)>;
// anyext i16
-def : Pat<(i32 (anyext i16:$a)),
- (CVT_u32_u16 $a, CvtNONE)>;
-def : Pat<(i64 (anyext i16:$a)),
- (CVT_u64_u16 $a, CvtNONE)>;
+def : Pat<(i32 (anyext i16:$a)), (CVT_u32_u16 $a, CvtNONE)>;
+def : Pat<(i64 (anyext i16:$a)), (CVT_u64_u16 $a, CvtNONE)>;
// sext i32
-def : Pat<(i64 (sext i32:$a)),
- (CVT_s64_s32 $a, CvtNONE)>;
+def : Pat<(i64 (sext i32:$a)), (CVT_s64_s32 $a, CvtNONE)>;
// zext i32
-def : Pat<(i64 (zext i32:$a)),
- (CVT_u64_u32 $a, CvtNONE)>;
+def : Pat<(i64 (zext i32:$a)), (CVT_u64_u32 $a, CvtNONE)>;
// anyext i32
-def : Pat<(i64 (anyext i32:$a)),
- (CVT_u64_u32 $a, CvtNONE)>;
+def : Pat<(i64 (anyext i32:$a)), (CVT_u64_u32 $a, CvtNONE)>;
// truncate i64
-def : Pat<(i32 (trunc i64:$a)),
- (CVT_u32_u64 $a, CvtNONE)>;
-def : Pat<(i16 (trunc i64:$a)),
- (CVT_u16_u64 $a, CvtNONE)>;
-def : Pat<(i1 (trunc i64:$a)),
- (SETP_b64ri (ANDb64ri $a, 1), 0, CmpNE)>;
+def : Pat<(i32 (trunc i64:$a)), (CVT_u32_u64 $a, CvtNONE)>;
+def : Pat<(i16 (trunc i64:$a)), (CVT_u16_u64 $a, CvtNONE)>;
+def : Pat<(i1 (trunc i64:$a)), (SETP_b64ri (ANDb64ri $a, 1), 0, CmpNE)>;
// truncate i32
-def : Pat<(i16 (trunc i32:$a)),
- (CVT_u16_u32 $a, CvtNONE)>;
-def : Pat<(i1 (trunc i32:$a)),
- (SETP_b32ri (ANDb32ri $a, 1), 0, CmpNE)>;
+def : Pat<(i16 (trunc i32:$a)), (CVT_u16_u32 $a, CvtNONE)>;
+def : Pat<(i1 (trunc i32:$a)), (SETP_b32ri (ANDb32ri $a, 1), 0, CmpNE)>;
// truncate i16
-def : Pat<(i1 (trunc i16:$a)),
- (SETP_b16ri (ANDb16ri $a, 1), 0, CmpNE)>;
+def : Pat<(i1 (trunc i16:$a)), (SETP_b16ri (ANDb16ri $a, 1), 0, CmpNE)>;
// sext_inreg
def : Pat<(sext_inreg i16:$a, i8), (CVT_INREG_s16_s8 $a)>;
@@ -2920,50 +2822,39 @@ let hasSideEffects = false in {
}
// fpround f32 -> f16
-def : Pat<(f16 (fpround f32:$a)),
- (CVT_f16_f32 $a, CvtRN)>;
+def : Pat<(f16 (fpround f32:$a)), (CVT_f16_f32 $a, CvtRN)>;
// fpround f32 -> bf16
-def : Pat<(bf16 (fpround f32:$a)),
- (CVT_bf16_f32 $a, CvtRN)>, Requires<[hasPTX<70>, hasSM<80>]>;
+def : Pat<(bf16 (fpround f32:$a)), (CVT_bf16_f32 $a, CvtRN)>,
+ Requires<[hasPTX<70>, hasSM<80>]>;
// fpround f64 -> f16
-def : Pat<(f16 (fpround f64:$a)),
- (CVT_f16_f64 $a, CvtRN)>;
+def : Pat<(f16 (fpround f64:$a)), (CVT_f16_f64 $a, CvtRN)>;
// fpround f64 -> bf16
-def : Pat<(bf16 (fpround f64:$a)),
- (CVT_bf16_f64 $a, CvtRN)>, Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(bf16 (fpround f64:$a)), (CVT_bf16_f64 $a, CvtRN)>,
+ Requires<[hasPTX<78>, hasSM<90>]>;
+
// fpround f64 -> f32
-def : Pat<(f32 (fpround f64:$a)),
- (CVT_f32_f64 $a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(f32 (fpround f64:$a)),
- (CVT_f32_f64 $a, CvtRN)>;
+def : Pat<(f32 (fpround f64:$a)), (CVT_f32_f64 $a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f32 (fpround f64:$a)), (CVT_f32_f64 $a, CvtRN)>;
// fpextend f16 -> f32
-def : Pat<(f32 (fpextend f16:$a)),
- (CVT_f32_f16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(f32 (fpextend f16:$a)),
- (CVT_f32_f16 $a, CvtNONE)>;
+def : Pat<(f32 (fpextend f16:$a)), (CVT_f32_f16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f32 (fpextend f16:$a)), (CVT_f32_f16 $a, CvtNONE)>;
// fpextend bf16 -> f32
-def : Pat<(f32 (fpextend bf16:$a)),
- (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(f32 (fpextend bf16:$a)),
- (CVT_f32_bf16 $a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>;
+def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>;
// fpextend f16 -> f64
-def : Pat<(f64 (fpextend f16:$a)),
- (CVT_f64_f16 $a, CvtNONE)>;
+def : Pat<(f64 (fpextend f16:$a)), (CVT_f64_f16 $a, CvtNONE)>;
// fpextend bf16 -> f64
-def : Pat<(f64 (fpextend bf16:$a)),
- (CVT_f64_bf16 $a, CvtNONE)>, Requires<[hasPTX<78>, hasSM<90>]>;
+def : Pat<(f64 (fpextend bf16:$a)), (CVT_f64_bf16 $a, CvtNONE)>, Requires<[hasPTX<78>, hasSM<90>]>;
// fpextend f32 -> f64
-def : Pat<(f64 (fpextend f32:$a)),
- (CVT_f64_f32 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(f64 (fpextend f32:$a)),
- (CVT_f64_f32 $a, CvtNONE)>;
+def : Pat<(f64 (fpextend f32:$a)), (CVT_f64_f32 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f64 (fpextend f32:$a)), (CVT_f64_f32 $a, CvtNONE)>;
def retglue : SDNode<"NVPTXISD::RET_GLUE", SDTNone,
[SDNPHasChain, SDNPOptInGlue]>;
@@ -2971,16 +2862,11 @@ def retglue : SDNode<"NVPTXISD::RET_GLUE", SDTNone,
// fceil, ffloor, froundeven, ftrunc.
multiclass CVT_ROUND<SDNode OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
- def : Pat<(OpNode f16:$a),
- (CVT_f16_f16 $a, Mode)>;
- def : Pat<(OpNode bf16:$a),
- (CVT_bf16_bf16 $a, Mode)>;
- def : Pat<(OpNode f32:$a),
- (CVT_f32_f32 $a, ModeFTZ)>, Requires<[doF32FTZ]>;
- def : Pat<(OpNode f32:$a),
- (CVT_f32_f32 $a, Mode)>, Requires<[doNoF32FTZ]>;
- def : Pat<(OpNode f64:$a),
- (CVT_f64_f64 $a, Mode)>;
+ def : Pat<(OpNode f16:$a), (CVT_f16_f16 $a, Mode)>;
+ def : Pat<(OpNode bf16:$a), (CVT_bf16_bf16 $a, Mode)>;
+ def : Pat<(OpNode f32:$a), (CVT_f32_f32 $a, ModeFTZ)>, Requires<[doF32FTZ]>;
+ def : Pat<(OpNode f32:$a), (CVT_f32_f32 $a, Mode)>, Requires<[doNoF32FTZ]>;
+ def : Pat<(OpNode f64:$a), (CVT_f64_f64 $a, Mode)>;
}
defm : CVT_ROUND<fceil, CvtRPI, CvtRPI_FTZ>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 5002d1bd2ca09..4b21a80846ee0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1862,12 +1862,12 @@ def : Pat<(int_nvvm_f2bf16_rz f32:$a),
def : Pat<(int_nvvm_f2bf16_rz_relu f32:$a),
(CVT_bf16_f32 $a, CvtRZ_RELU)>;
-def : Pat<(int_nvvm_lohi_i2d i32:$a i32:$b), (V2I32toI64 $a, $b)>;
+def : Pat<(int_nvvm_lohi_i2d i32:$a, i32:$b), (V2I32toI64 $a, $b)>;
def : Pat<(int_nvvm_d2i_lo f64:$a), (I64toI32L $a)>;
def : Pat<(int_nvvm_d2i_hi f64:$a), (I64toI32H $a)>;
-def : Pat<(int_nvvm_d2i_lo f64:$a), (I64toI32H_Sink i64:$s)>, Requires<[hasPTX<71>]>;
-def : Pat<(int_nvvm_d2i_hi f64:$a), (I64toI32H_Sink i64:$s)>, Requires<[hasPTX<71>]>;
+def : Pat<(int_nvvm_d2i_lo f64:$a), (I64toI32L_Sink $a)>, Requires<[hasPTX<71>]>;
+def : Pat<(int_nvvm_d2i_hi f64:$a), (I64toI32H_Sink $a)>, Requires<[hasPTX<71>]>;
def : Pat<(int_nvvm_f2ll_rn_ftz f32:$a),
(CVT_s64_f32 $a, CvtRNI_FTZ)>;
>From e96fee71436b48db9017a59a97624c9c897bdf62 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Tue, 27 May 2025 16:24:51 +0000
Subject: [PATCH 4/6] misc. cleanup
---
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 47 +-
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 632 ++++++------------
.../lib/Target/NVPTX/NVPTXProxyRegErasure.cpp | 10 +-
llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir | 14 +-
4 files changed, 254 insertions(+), 449 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index d07d79fb645fe..b646d39194c7e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -208,6 +208,9 @@ class ValueToRegClass<ValueType T> {
class OneUse1<SDPatternOperator operator>
: PatFrag<(ops node:$A), (operator node:$A), [{ return N->hasOneUse(); }]>;
+class OneUse2<SDPatternOperator operator>
+ : PatFrag<(ops node:$A, node:$B), (operator node:$A, node:$B), [{ return N->hasOneUse(); }]>;
+
class fpimm_pos_inf<ValueType vt>
: FPImmLeaf<vt, [{ return Imm.isPosInfinity(); }]>;
@@ -1071,9 +1074,7 @@ def : Pat<(mul (zext i16:$a), (i32 UInt16Const:$b)),
//
// Integer multiply-add
//
-def mul_oneuse : PatFrag<(ops node:$a, node:$b), (mul node:$a, node:$b), [{
- return N->hasOneUse();
-}]>;
+def mul_oneuse : OneUse2<mul>;
multiclass MAD<string Ptx, ValueType VT, NVPTXRegClass Reg, Operand Imm> {
def rrr:
@@ -1784,14 +1785,16 @@ let hasSideEffects = false, isAsCheapAsAMove = true in {
}
def IMOV1r : MOVr<Int1Regs, "pred">;
-def IMOV1i : MOVi<Int1Regs, "pred", i1, i1imm, imm>;
def MOV16r : MOVr<Int16Regs, "b16">;
-def IMOV16i : MOVi<Int16Regs, "b16", i16, i16imm, imm>;
def IMOV32r : MOVr<Int32Regs, "b32">;
-def IMOV32i : MOVi<Int32Regs, "b32", i32, i32imm, imm>;
def IMOV64r : MOVr<Int64Regs, "b64">;
-def IMOV64i : MOVi<Int64Regs, "b64", i64, i64imm, imm>;
def IMOV128r : MOVr<Int128Regs, "b128">;
+
+
+def IMOV1i : MOVi<Int1Regs, "pred", i1, i1imm, imm>;
+def IMOV16i : MOVi<Int16Regs, "b16", i16, i16imm, imm>;
+def IMOV32i : MOVi<Int32Regs, "b32", i32, i32imm, imm>;
+def IMOV64i : MOVi<Int64Regs, "b64", i64, i64imm, imm>;
def FMOV16i : MOVi<Int16Regs, "b16", f16, f16imm, fpimm>;
def BFMOV16i : MOVi<Int16Regs, "b16", bf16, bf16imm, fpimm>;
def FMOV32i : MOVi<Float32Regs, "b32", f32, f32imm, fpimm>;
@@ -2336,7 +2339,7 @@ def DeclareScalarRegInst :
".reg .b$size param$a;",
[(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
-class MoveParamSymbolInst<RegTypeInfo t> :
+class MoveParamSymbolInst<RegTyInfo t> :
BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$src),
"mov.b" # t.Size,
[(set t.Ty:$dst, (MoveParam texternalsym:$src))]>;
@@ -2355,21 +2358,18 @@ def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs, i16>;
def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs, f64>;
def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs, f32>;
-class ProxyRegInst<string SzStr, NVPTXRegClass regclass> :
- BasicNVPTXInst<(outs regclass:$dst), (ins regclass:$src),
+multiclass ProxyRegInst<string SzStr, NVPTXRegClass rc> {
+ def NAME : BasicNVPTXInst<(outs rc:$dst), (ins rc:$src),
"mov." # SzStr>;
-
-def ProxyRegB1 : ProxyRegInst<"pred", Int1Regs>;
-def ProxyRegB16 : ProxyRegInst<"b16", Int16Regs>;
-def ProxyRegB32 : ProxyRegInst<"b32", Int32Regs>;
-def ProxyRegB64 : ProxyRegInst<"b64", Int64Regs>;
-
-foreach rc = [Int1Regs, Int16Regs, Int32Regs, Int64Regs] in {
- defvar ProxyRegInst = cast<NVPTXInst>(ProxyRegB # rc.Size);
foreach vt = rc.RegTypes in
- def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegInst $src)>;
+ def : Pat<(vt (ProxyReg vt:$src)), (!cast<NVPTXInst>(NAME) $src)>;
}
+defm ProxyRegB1 : ProxyRegInst<"pred", Int1Regs>;
+defm ProxyRegB16 : ProxyRegInst<"b16", Int16Regs>;
+defm ProxyRegB32 : ProxyRegInst<"b32", Int32Regs>;
+defm ProxyRegB64 : ProxyRegInst<"b64", Int64Regs>;
+
//
// Load / Store Handling
//
@@ -2745,13 +2745,10 @@ foreach vt = [v2f16, v2bf16, v2i16] in {
def : Pat<(extractelt vt:$src, 0), (I32toI16L $src)>;
def : Pat<(extractelt vt:$src, 1), (I32toI16H $src)>;
+
+ def : Pat<(vt (build_vector vt.ElementType:$a, vt.ElementType:$b)),
+ (V2I16toI32 $a, $b)>;
}
-def : Pat<(v2f16 (build_vector f16:$a, f16:$b)),
- (V2I16toI32 $a, $b)>;
-def : Pat<(v2bf16 (build_vector bf16:$a, bf16:$b)),
- (V2I16toI32 $a, $b)>;
-def : Pat<(v2i16 (build_vector i16:$a, i16:$b)),
- (V2I16toI32 $a, $b)>;
def: Pat<(v2i16 (scalar_to_vector i16:$a)),
(CVT_u32_u16 $a, CvtNONE)>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 4b21a80846ee0..1eb2243037b07 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -59,10 +59,6 @@ class RegSeq<int n, string prefix> {
[]);
}
-class THREADMASK_INFO<bit sync> {
- list<bit> ret = !if(sync, [0, 1], [0]);
-}
-
//-----------------------------------
// Synchronization and shuffle functions
//-----------------------------------
@@ -161,55 +157,42 @@ def barrier_cluster_arrive_relaxed_aligned:
def barrier_cluster_wait_aligned:
INT_BARRIER_CLUSTER<"wait.aligned", int_nvvm_barrier_cluster_wait_aligned>;
-class SHFL_INSTR<bit sync, string mode, string reg, bit return_pred,
- bit offset_imm, bit mask_imm, bit threadmask_imm>
- : NVPTXInst<(outs), (ins), "?", []> {
- NVPTXRegClass rc = !cond(
- !eq(reg, "i32"): Int32Regs,
- !eq(reg, "f32"): Float32Regs);
- string IntrName = "int_nvvm_shfl_"
- # !if(sync, "sync_", "")
- # mode
- # "_" # reg
- # !if(return_pred, "p", "");
- Intrinsic Intr = !cast<Intrinsic>(IntrName);
- let InOperandList = !con(
- !if(sync,
- !dag(ins, !if(threadmask_imm, [i32imm], [Int32Regs]), ["threadmask"]),
- (ins)),
- (ins rc:$src),
- !dag(ins, !if(offset_imm, [i32imm], [Int32Regs]), ["offset"]),
- !dag(ins, !if(mask_imm, [i32imm], [Int32Regs]), ["mask"])
- );
- let OutOperandList = !if(return_pred, (outs rc:$dst, Int1Regs:$pred), (outs rc:$dst));
- let AsmString = "shfl."
- # !if(sync, "sync.", "")
- # mode # ".b32\t"
- # "$dst"
- # !if(return_pred, "|$pred", "") # ", "
- # "$src, $offset, $mask"
- # !if(sync, ", $threadmask", "")
- # ";"
- ;
- let Pattern = [!con(
- !foreach(tmp, OutOperandList,
- !subst(outs, set,
- !subst(i32imm, imm, tmp))),
- (set !foreach(tmp, InOperandList,
- !subst(ins, Intr,
- !subst(i32imm, imm, tmp))))
- )];
-}
-
foreach sync = [false, true] in {
foreach mode = ["up", "down", "bfly", "idx"] in {
foreach regclass = ["i32", "f32"] in {
foreach return_pred = [false, true] in {
foreach offset_imm = [false, true] in {
foreach mask_imm = [false, true] in {
- foreach threadmask_imm = THREADMASK_INFO<sync>.ret in {
- def : SHFL_INSTR<sync, mode, regclass, return_pred,
- offset_imm, mask_imm, threadmask_imm>,
+ foreach threadmask_imm = !if(sync, [0, 1], [0]) in {
+ defvar Intr = !cast<Intrinsic>("int_nvvm_shfl_"
+ # !if(sync, "sync_", "")
+ # mode
+ # "_" # regclass
+ # !if(return_pred, "p", ""));
+ defvar InOperandList = !con(
+ (ins Int32Regs:$src),
+ !dag(ins, !if(offset_imm, [i32imm], [Int32Regs]), ["offset"]),
+ !dag(ins, !if(mask_imm, [i32imm], [Int32Regs]), ["mask"]),
+ !if(sync,
+ !dag(ins, !if(threadmask_imm, [i32imm], [Int32Regs]), ["threadmask"]),
+ (ins)));
+ defvar Pattern = !con(
+ (set Int32Regs:$dst),
+ !if(return_pred, (set Int1Regs:$pred), (set)),
+ (set !con(
+ !if(sync,
+ !dag(Intr, !if(threadmask_imm, [imm], [Int32Regs]), ["threadmask"]),
+ (Intr)),
+ (Intr Int32Regs:$src),
+ !dag(Intr, !if(offset_imm, [imm], [Int32Regs]), ["offset"]),
+ !dag(Intr, !if(mask_imm, [imm], [Int32Regs]), ["mask"]))));
+
+ def : BasicNVPTXInst<
+ !if(return_pred, (outs Int32Regs:$dst, Int1Regs:$pred),
+ (outs Int32Regs:$dst)),
+ InOperandList,
+ "shfl." # !if(sync, "sync.", "") # mode # ".b32",
+ [Pattern]>,
Requires<!if(sync, [hasSM<30>, hasPTX<60>], [hasSM<30>, hasSHFL])>;
}
}
@@ -221,8 +204,8 @@ foreach sync = [false, true] in {
// vote.{all,any,uni,ballot}
multiclass VOTE<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
- def : NVPTXInst<(outs regclass:$dest), (ins Int1Regs:$pred),
- "vote." # mode # " \t$dest, $pred;",
+ def : BasicNVPTXInst<(outs regclass:$dest), (ins Int1Regs:$pred),
+ "vote." # mode,
[(set regclass:$dest, (IntOp i1:$pred))]>,
Requires<[hasPTX<60>, hasSM<30>]>;
}
@@ -234,11 +217,11 @@ defm VOTE_BALLOT : VOTE<Int32Regs, "ballot.b32", int_nvvm_vote_ballot>;
// vote.sync.{all,any,uni,ballot}
multiclass VOTE_SYNC<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
- def i : NVPTXInst<(outs regclass:$dest), (ins Int1Regs:$pred, i32imm:$mask),
+ def i : BasicNVPTXInst<(outs regclass:$dest), (ins Int1Regs:$pred, i32imm:$mask),
"vote.sync." # mode,
[(set regclass:$dest, (IntOp imm:$mask, i1:$pred))]>,
Requires<[hasPTX<60>, hasSM<30>]>;
- def r : NVPTXInst<(outs regclass:$dest), (ins Int1Regs:$pred, Int32Regs:$mask),
+ def r : BasicNVPTXInst<(outs regclass:$dest), (ins Int1Regs:$pred, Int32Regs:$mask),
"vote.sync." # mode,
[(set regclass:$dest, (IntOp i32:$mask, i1:$pred))]>,
Requires<[hasPTX<60>, hasSM<30>]>;
@@ -292,22 +275,22 @@ defm MATCH_ANY_SYNC_64 : MATCH_ANY_SYNC<Int64Regs, "b64", int_nvvm_match_any_syn
multiclass MATCH_ALLP_SYNC<NVPTXRegClass regclass, string ptxtype, Intrinsic IntOp,
Operand ImmOp> {
- def ii : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred),
+ def ii : BasicNVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred),
(ins ImmOp:$value, i32imm:$mask),
"match.all.sync." # ptxtype,
[(set i32:$dest, i1:$pred, (IntOp imm:$mask, imm:$value))]>,
Requires<[hasPTX<60>, hasSM<70>]>;
- def ir : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred),
+ def ir : BasicNVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred),
(ins ImmOp:$value, Int32Regs:$mask),
"match.all.sync." # ptxtype,
[(set i32:$dest, i1:$pred, (IntOp i32:$mask, imm:$value))]>,
Requires<[hasPTX<60>, hasSM<70>]>;
- def ri : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred),
+ def ri : BasicNVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred),
(ins regclass:$value, i32imm:$mask),
"match.all.sync." # ptxtype,
[(set i32:$dest, i1:$pred, (IntOp imm:$mask, regclass:$value))]>,
Requires<[hasPTX<60>, hasSM<70>]>;
- def rr : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred),
+ def rr : BasicNVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred),
(ins regclass:$value, Int32Regs:$mask),
"match.all.sync." # ptxtype,
[(set i32:$dest, i1:$pred, (IntOp i32:$mask, regclass:$value))]>,
@@ -828,13 +811,9 @@ def DISCARD_GLOBAL_L2 : DISCARD_L2_INTRS<"global">;
//-----------------------------------
multiclass MBARRIER_INIT<string AddrSpace, Intrinsic Intrin> {
- def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr, Int32Regs:$count),
- !strconcat("mbarrier.init", AddrSpace, ".b64 [$addr], $count;"),
- [(Intrin i32:$addr, i32:$count)]>,
- Requires<[hasPTX<70>, hasSM<80>]>;
- def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr, Int32Regs:$count),
- !strconcat("mbarrier.init", AddrSpace, ".b64 [$addr], $count;"),
- [(Intrin i64:$addr, i32:$count)]>,
+ def "" : NVPTXInst<(outs), (ins ADDR:$addr, Int32Regs:$count),
+ "mbarrier.init" # AddrSpace # ".b64 [$addr], $count;",
+ [(Intrin addr:$addr, i32:$count)]>,
Requires<[hasPTX<70>, hasSM<80>]>;
}
@@ -843,13 +822,9 @@ defm MBARRIER_INIT_SHARED : MBARRIER_INIT<".shared",
int_nvvm_mbarrier_init_shared>;
multiclass MBARRIER_INVAL<string AddrSpace, Intrinsic Intrin> {
- def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr),
- !strconcat("mbarrier.inval", AddrSpace, ".b64 [$addr];"),
- [(Intrin i32:$addr)]>,
- Requires<[hasPTX<70>, hasSM<80>]>;
- def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr),
- !strconcat("mbarrier.inval", AddrSpace, ".b64 [$addr];"),
- [(Intrin i64:$addr)]>,
+ def "" : NVPTXInst<(outs), (ins ADDR:$addr),
+ "mbarrier.inval" # AddrSpace # ".b64 [$addr];",
+ [(Intrin addr:$addr)]>,
Requires<[hasPTX<70>, hasSM<80>]>;
}
@@ -858,13 +833,9 @@ defm MBARRIER_INVAL_SHARED : MBARRIER_INVAL<".shared",
int_nvvm_mbarrier_inval_shared>;
multiclass MBARRIER_ARRIVE<string AddrSpace, Intrinsic Intrin> {
- def _32 : NVPTXInst<(outs Int64Regs:$state), (ins Int32Regs:$addr),
- !strconcat("mbarrier.arrive", AddrSpace, ".b64 $state, [$addr];"),
- [(set i64:$state, (Intrin i32:$addr))]>,
- Requires<[hasPTX<70>, hasSM<80>]>;
- def _64 : NVPTXInst<(outs Int64Regs:$state), (ins Int64Regs:$addr),
- !strconcat("mbarrier.arrive", AddrSpace, ".b64 $state, [$addr];"),
- [(set i64:$state, (Intrin i64:$addr))]>,
+ def "" : NVPTXInst<(outs Int64Regs:$state), (ins ADDR:$addr),
+ "mbarrier.arrive" # AddrSpace # ".b64 $state, [$addr];",
+ [(set i64:$state, (Intrin addr:$addr))]>,
Requires<[hasPTX<70>, hasSM<80>]>;
}
@@ -873,17 +844,10 @@ defm MBARRIER_ARRIVE_SHARED :
MBARRIER_ARRIVE<".shared", int_nvvm_mbarrier_arrive_shared>;
multiclass MBARRIER_ARRIVE_NOCOMPLETE<string AddrSpace, Intrinsic Intrin> {
- def _32 : NVPTXInst<(outs Int64Regs:$state),
- (ins Int32Regs:$addr, Int32Regs:$count),
- !strconcat("mbarrier.arrive.noComplete", AddrSpace,
- ".b64 $state, [$addr], $count;"),
- [(set i64:$state, (Intrin i32:$addr, i32:$count))]>,
- Requires<[hasPTX<70>, hasSM<80>]>;
- def _64 : NVPTXInst<(outs Int64Regs:$state),
- (ins Int64Regs:$addr, Int32Regs:$count),
- !strconcat("mbarrier.arrive.noComplete", AddrSpace,
- ".b64 $state, [$addr], $count;"),
- [(set i64:$state, (Intrin i64:$addr, i32:$count))]>,
+ def "" : NVPTXInst<(outs Int64Regs:$state),
+ (ins ADDR:$addr, Int32Regs:$count),
+ "mbarrier.arrive.noComplete" # AddrSpace # ".b64 $state, [$addr], $count;",
+ [(set i64:$state, (Intrin addr:$addr, i32:$count))]>,
Requires<[hasPTX<70>, hasSM<80>]>;
}
@@ -893,15 +857,9 @@ defm MBARRIER_ARRIVE_NOCOMPLETE_SHARED :
MBARRIER_ARRIVE_NOCOMPLETE<".shared", int_nvvm_mbarrier_arrive_noComplete_shared>;
multiclass MBARRIER_ARRIVE_DROP<string AddrSpace, Intrinsic Intrin> {
- def _32 : NVPTXInst<(outs Int64Regs:$state), (ins Int32Regs:$addr),
- !strconcat("mbarrier.arrive_drop", AddrSpace,
- ".b64 $state, [$addr];"),
- [(set i64:$state, (Intrin i32:$addr))]>,
- Requires<[hasPTX<70>, hasSM<80>]>;
- def _64 : NVPTXInst<(outs Int64Regs:$state), (ins Int64Regs:$addr),
- !strconcat("mbarrier.arrive_drop", AddrSpace,
- ".b64 $state, [$addr];"),
- [(set i64:$state, (Intrin i64:$addr))]>,
+ def "" : NVPTXInst<(outs Int64Regs:$state), (ins ADDR:$addr),
+ "mbarrier.arrive_drop" # AddrSpace # ".b64 $state, [$addr];",
+ [(set i64:$state, (Intrin addr:$addr))]>,
Requires<[hasPTX<70>, hasSM<80>]>;
}
@@ -911,17 +869,10 @@ defm MBARRIER_ARRIVE_DROP_SHARED :
MBARRIER_ARRIVE_DROP<".shared", int_nvvm_mbarrier_arrive_drop_shared>;
multiclass MBARRIER_ARRIVE_DROP_NOCOMPLETE<string AddrSpace, Intrinsic Intrin> {
- def _32 : NVPTXInst<(outs Int64Regs:$state),
- (ins Int32Regs:$addr, Int32Regs:$count),
- !strconcat("mbarrier.arrive_drop.noComplete", AddrSpace,
- ".b64 $state, [$addr], $count;"),
- [(set i64:$state, (Intrin i32:$addr, i32:$count))]>,
- Requires<[hasPTX<70>, hasSM<80>]>;
- def _64 : NVPTXInst<(outs Int64Regs:$state),
- (ins Int64Regs:$addr, Int32Regs:$count),
- !strconcat("mbarrier.arrive_drop.noComplete", AddrSpace,
- ".b64 $state, [$addr], $count;"),
- [(set i64:$state, (Intrin i64:$addr, i32:$count))]>,
+ def "" : NVPTXInst<(outs Int64Regs:$state),
+ (ins ADDR:$addr, Int32Regs:$count),
+ "mbarrier.arrive_drop.noComplete" # AddrSpace # ".b64 $state, [$addr], $count;",
+ [(set i64:$state, (Intrin addr:$addr, i32:$count))]>,
Requires<[hasPTX<70>, hasSM<80>]>;
}
@@ -932,13 +883,9 @@ defm MBARRIER_ARRIVE_DROP_NOCOMPLETE_SHARED :
int_nvvm_mbarrier_arrive_drop_noComplete_shared>;
multiclass MBARRIER_TEST_WAIT<string AddrSpace, Intrinsic Intrin> {
- def _32 : NVPTXInst<(outs Int1Regs:$res), (ins Int32Regs:$addr, Int64Regs:$state),
- !strconcat("mbarrier.test_wait", AddrSpace, ".b64 $res, [$addr], $state;"),
- [(set i1:$res, (Intrin i32:$addr, i64:$state))]>,
- Requires<[hasPTX<70>, hasSM<80>]>;
- def _64 : NVPTXInst<(outs Int1Regs:$res), (ins Int64Regs:$addr, Int64Regs:$state),
- !strconcat("mbarrier.test_wait", AddrSpace, ".b64 $res, [$addr], $state;"),
- [(set i1:$res, (Intrin i64:$addr, i64:$state))]>,
+ def "" : NVPTXInst<(outs Int1Regs:$res), (ins ADDR:$addr, Int64Regs:$state),
+ "mbarrier.test_wait" # AddrSpace # ".b64 $res, [$addr], $state;",
+ [(set i1:$res, (Intrin addr:$addr, i64:$state))]>,
Requires<[hasPTX<70>, hasSM<80>]>;
}
@@ -948,8 +895,8 @@ defm MBARRIER_TEST_WAIT_SHARED :
MBARRIER_TEST_WAIT<".shared", int_nvvm_mbarrier_test_wait_shared>;
class MBARRIER_PENDING_COUNT<Intrinsic Intrin> :
- NVPTXInst<(outs Int32Regs:$res), (ins Int64Regs:$state),
- "mbarrier.pending_count.b64 $res, $state;",
+ BasicNVPTXInst<(outs Int32Regs:$res), (ins Int64Regs:$state),
+ "mbarrier.pending_count.b64",
[(set i32:$res, (Intrin i64:$state))]>,
Requires<[hasPTX<70>, hasSM<80>]>;
@@ -1317,19 +1264,13 @@ def INT_NVVM_SAD_ULL : F_MATH_3<"sad.u64",
// Floor Ceil
//
-def : Pat<(int_nvvm_floor_ftz_f f32:$a),
- (CVT_f32_f32 $a, CvtRMI_FTZ)>;
-def : Pat<(int_nvvm_floor_f f32:$a),
- (CVT_f32_f32 $a, CvtRMI)>;
-def : Pat<(int_nvvm_floor_d f64:$a),
- (CVT_f64_f64 $a, CvtRMI)>;
+def : Pat<(int_nvvm_floor_ftz_f f32:$a), (CVT_f32_f32 $a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_floor_f f32:$a), (CVT_f32_f32 $a, CvtRMI)>;
+def : Pat<(int_nvvm_floor_d f64:$a), (CVT_f64_f64 $a, CvtRMI)>;
-def : Pat<(int_nvvm_ceil_ftz_f f32:$a),
- (CVT_f32_f32 $a, CvtRPI_FTZ)>;
-def : Pat<(int_nvvm_ceil_f f32:$a),
- (CVT_f32_f32 $a, CvtRPI)>;
-def : Pat<(int_nvvm_ceil_d f64:$a),
- (CVT_f64_f64 $a, CvtRPI)>;
+def : Pat<(int_nvvm_ceil_ftz_f f32:$a), (CVT_f32_f32 $a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_ceil_f f32:$a), (CVT_f32_f32 $a, CvtRPI)>;
+def : Pat<(int_nvvm_ceil_d f64:$a), (CVT_f64_f64 $a, CvtRPI)>;
//
// Abs
@@ -1379,34 +1320,25 @@ def INT_NVVM_NEG_BF16X2 : F_MATH_1<"neg.bf16x2", BF16X2RT,
// Round
//
-def : Pat<(int_nvvm_round_ftz_f f32:$a),
- (CVT_f32_f32 $a, CvtRNI_FTZ)>;
-def : Pat<(int_nvvm_round_f f32:$a),
- (CVT_f32_f32 $a, CvtRNI)>;
-def : Pat<(int_nvvm_round_d f64:$a),
- (CVT_f64_f64 $a, CvtRNI)>;
+def : Pat<(int_nvvm_round_ftz_f f32:$a), (CVT_f32_f32 $a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_round_f f32:$a), (CVT_f32_f32 $a, CvtRNI)>;
+def : Pat<(int_nvvm_round_d f64:$a), (CVT_f64_f64 $a, CvtRNI)>;
//
// Trunc
//
-def : Pat<(int_nvvm_trunc_ftz_f f32:$a),
- (CVT_f32_f32 $a, CvtRZI_FTZ)>;
-def : Pat<(int_nvvm_trunc_f f32:$a),
- (CVT_f32_f32 $a, CvtRZI)>;
-def : Pat<(int_nvvm_trunc_d f64:$a),
- (CVT_f64_f64 $a, CvtRZI)>;
+def : Pat<(int_nvvm_trunc_ftz_f f32:$a), (CVT_f32_f32 $a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_trunc_f f32:$a), (CVT_f32_f32 $a, CvtRZI)>;
+def : Pat<(int_nvvm_trunc_d f64:$a), (CVT_f64_f64 $a, CvtRZI)>;
//
// Saturate
//
-def : Pat<(int_nvvm_saturate_ftz_f f32:$a),
- (CVT_f32_f32 $a, CvtSAT_FTZ)>;
-def : Pat<(int_nvvm_saturate_f f32:$a),
- (CVT_f32_f32 $a, CvtSAT)>;
-def : Pat<(int_nvvm_saturate_d f64:$a),
- (CVT_f64_f64 $a, CvtSAT)>;
+def : Pat<(int_nvvm_saturate_ftz_f f32:$a), (CVT_f32_f32 $a, CvtSAT_FTZ)>;
+def : Pat<(int_nvvm_saturate_f f32:$a), (CVT_f32_f32 $a, CvtSAT)>;
+def : Pat<(int_nvvm_saturate_d f64:$a), (CVT_f64_f64 $a, CvtSAT)>;
//
// Exp2 Log2
@@ -1424,14 +1356,10 @@ def INT_NVVM_EX2_APPROX_F16 : F_MATH_1<"ex2.approx.f16",
def INT_NVVM_EX2_APPROX_F16X2 : F_MATH_1<"ex2.approx.f16x2",
F16X2RT, F16X2RT, int_nvvm_ex2_approx_f16x2, [hasPTX<70>, hasSM<75>]>;
-def : Pat<(fexp2 f32:$a),
- (INT_NVVM_EX2_APPROX_FTZ_F $a)>, Requires<[doF32FTZ]>;
-def : Pat<(fexp2 f32:$a),
- (INT_NVVM_EX2_APPROX_F $a)>, Requires<[doNoF32FTZ]>;
-def : Pat<(fexp2 f16:$a),
- (INT_NVVM_EX2_APPROX_F16 $a)>, Requires<[useFP16Math]>;
-def : Pat<(fexp2 v2f16:$a),
- (INT_NVVM_EX2_APPROX_F16X2 $a)>, Requires<[useFP16Math]>;
+def : Pat<(fexp2 f32:$a), (INT_NVVM_EX2_APPROX_FTZ_F $a)>, Requires<[doF32FTZ]>;
+def : Pat<(fexp2 f32:$a), (INT_NVVM_EX2_APPROX_F $a)>;
+def : Pat<(fexp2 f16:$a), (INT_NVVM_EX2_APPROX_F16 $a)>, Requires<[useFP16Math]>;
+def : Pat<(fexp2 v2f16:$a), (INT_NVVM_EX2_APPROX_F16X2 $a)>, Requires<[useFP16Math]>;
def INT_NVVM_LG2_APPROX_FTZ_F : F_MATH_1<"lg2.approx.ftz.f32",
F32RT, F32RT, int_nvvm_lg2_approx_ftz_f>;
@@ -1540,31 +1468,19 @@ defm INT_NVVM_FMA : FMA_INST;
// Rcp
//
-def INT_NVVM_RCP_RN_FTZ_F : F_MATH_1<"rcp.rn.ftz.f32",
- F32RT, F32RT, int_nvvm_rcp_rn_ftz_f>;
-def INT_NVVM_RCP_RN_F : F_MATH_1<"rcp.rn.f32",
- F32RT, F32RT, int_nvvm_rcp_rn_f>;
-def INT_NVVM_RCP_RZ_FTZ_F : F_MATH_1<"rcp.rz.ftz.f32",
- F32RT, F32RT, int_nvvm_rcp_rz_ftz_f>;
-def INT_NVVM_RCP_RZ_F : F_MATH_1<"rcp.rz.f32",
- F32RT, F32RT, int_nvvm_rcp_rz_f>;
-def INT_NVVM_RCP_RM_FTZ_F : F_MATH_1<"rcp.rm.ftz.f32",
- F32RT, F32RT, int_nvvm_rcp_rm_ftz_f>;
-def INT_NVVM_RCP_RM_F : F_MATH_1<"rcp.rm.f32",
- F32RT, F32RT, int_nvvm_rcp_rm_f>;
-def INT_NVVM_RCP_RP_FTZ_F : F_MATH_1<"rcp.rp.ftz.f32",
- F32RT, F32RT, int_nvvm_rcp_rp_ftz_f>;
-def INT_NVVM_RCP_RP_F : F_MATH_1<"rcp.rp.f32",
- F32RT, F32RT, int_nvvm_rcp_rp_f>;
-
-def INT_NVVM_RCP_RN_D : F_MATH_1<"rcp.rn.f64", F64RT,
- F64RT, int_nvvm_rcp_rn_d>;
-def INT_NVVM_RCP_RZ_D : F_MATH_1<"rcp.rz.f64", F64RT,
- F64RT, int_nvvm_rcp_rz_d>;
-def INT_NVVM_RCP_RM_D : F_MATH_1<"rcp.rm.f64", F64RT,
- F64RT, int_nvvm_rcp_rm_d>;
-def INT_NVVM_RCP_RP_D : F_MATH_1<"rcp.rp.f64", F64RT,
- F64RT, int_nvvm_rcp_rp_d>;
+def INT_NVVM_RCP_RN_FTZ_F : F_MATH_1<"rcp.rn.ftz.f32", F32RT, F32RT, int_nvvm_rcp_rn_ftz_f>;
+def INT_NVVM_RCP_RN_F : F_MATH_1<"rcp.rn.f32", F32RT, F32RT, int_nvvm_rcp_rn_f>;
+def INT_NVVM_RCP_RZ_FTZ_F : F_MATH_1<"rcp.rz.ftz.f32", F32RT, F32RT, int_nvvm_rcp_rz_ftz_f>;
+def INT_NVVM_RCP_RZ_F : F_MATH_1<"rcp.rz.f32", F32RT, F32RT, int_nvvm_rcp_rz_f>;
+def INT_NVVM_RCP_RM_FTZ_F : F_MATH_1<"rcp.rm.ftz.f32", F32RT, F32RT, int_nvvm_rcp_rm_ftz_f>;
+def INT_NVVM_RCP_RM_F : F_MATH_1<"rcp.rm.f32", F32RT, F32RT, int_nvvm_rcp_rm_f>;
+def INT_NVVM_RCP_RP_FTZ_F : F_MATH_1<"rcp.rp.ftz.f32", F32RT, F32RT, int_nvvm_rcp_rp_ftz_f>;
+def INT_NVVM_RCP_RP_F : F_MATH_1<"rcp.rp.f32", F32RT, F32RT, int_nvvm_rcp_rp_f>;
+
+def INT_NVVM_RCP_RN_D : F_MATH_1<"rcp.rn.f64", F64RT, F64RT, int_nvvm_rcp_rn_d>;
+def INT_NVVM_RCP_RZ_D : F_MATH_1<"rcp.rz.f64", F64RT, F64RT, int_nvvm_rcp_rz_d>;
+def INT_NVVM_RCP_RM_D : F_MATH_1<"rcp.rm.f64", F64RT, F64RT, int_nvvm_rcp_rm_d>;
+def INT_NVVM_RCP_RP_D : F_MATH_1<"rcp.rp.f64", F64RT, F64RT, int_nvvm_rcp_rp_d>;
def INT_NVVM_RCP_APPROX_FTZ_F : F_MATH_1<"rcp.approx.ftz.f32",
F32RT, F32RT, int_nvvm_rcp_approx_ftz_f>;
@@ -1596,14 +1512,10 @@ def INT_NVVM_SQRT_APPROX_FTZ_F : F_MATH_1<"sqrt.approx.ftz.f32",
def INT_NVVM_SQRT_APPROX_F : F_MATH_1<"sqrt.approx.f32",
F32RT, F32RT, int_nvvm_sqrt_approx_f>;
-def INT_NVVM_SQRT_RN_D : F_MATH_1<"sqrt.rn.f64", F64RT,
- F64RT, int_nvvm_sqrt_rn_d>;
-def INT_NVVM_SQRT_RZ_D : F_MATH_1<"sqrt.rz.f64", F64RT,
- F64RT, int_nvvm_sqrt_rz_d>;
-def INT_NVVM_SQRT_RM_D : F_MATH_1<"sqrt.rm.f64", F64RT,
- F64RT, int_nvvm_sqrt_rm_d>;
-def INT_NVVM_SQRT_RP_D : F_MATH_1<"sqrt.rp.f64", F64RT,
- F64RT, int_nvvm_sqrt_rp_d>;
+def INT_NVVM_SQRT_RN_D : F_MATH_1<"sqrt.rn.f64", F64RT, F64RT, int_nvvm_sqrt_rn_d>;
+def INT_NVVM_SQRT_RZ_D : F_MATH_1<"sqrt.rz.f64", F64RT, F64RT, int_nvvm_sqrt_rz_d>;
+def INT_NVVM_SQRT_RM_D : F_MATH_1<"sqrt.rm.f64", F64RT, F64RT, int_nvvm_sqrt_rm_d>;
+def INT_NVVM_SQRT_RP_D : F_MATH_1<"sqrt.rp.f64", F64RT, F64RT, int_nvvm_sqrt_rp_d>;
// nvvm_sqrt intrinsic
def : Pat<(int_nvvm_sqrt_f f32:$a),
@@ -1730,137 +1642,77 @@ foreach mode = ["wrap", "clamp"] in {
// Convert
//
-def : Pat<(int_nvvm_d2f_rn_ftz f64:$a),
- (CVT_f32_f64 $a, CvtRN_FTZ)>;
-def : Pat<(int_nvvm_d2f_rn f64:$a),
- (CVT_f32_f64 $a, CvtRN)>;
-def : Pat<(int_nvvm_d2f_rz_ftz f64:$a),
- (CVT_f32_f64 $a, CvtRZ_FTZ)>;
-def : Pat<(int_nvvm_d2f_rz f64:$a),
- (CVT_f32_f64 $a, CvtRZ)>;
-def : Pat<(int_nvvm_d2f_rm_ftz f64:$a),
- (CVT_f32_f64 $a, CvtRM_FTZ)>;
-def : Pat<(int_nvvm_d2f_rm f64:$a),
- (CVT_f32_f64 $a, CvtRM)>;
-def : Pat<(int_nvvm_d2f_rp_ftz f64:$a),
- (CVT_f32_f64 $a, CvtRP_FTZ)>;
-def : Pat<(int_nvvm_d2f_rp f64:$a),
- (CVT_f32_f64 $a, CvtRP)>;
-
-def : Pat<(int_nvvm_d2i_rn f64:$a),
- (CVT_s32_f64 $a, CvtRNI)>;
-def : Pat<(int_nvvm_d2i_rz f64:$a),
- (CVT_s32_f64 $a, CvtRZI)>;
-def : Pat<(int_nvvm_d2i_rm f64:$a),
- (CVT_s32_f64 $a, CvtRMI)>;
-def : Pat<(int_nvvm_d2i_rp f64:$a),
- (CVT_s32_f64 $a, CvtRPI)>;
-
-def : Pat<(int_nvvm_d2ui_rn f64:$a),
- (CVT_u32_f64 $a, CvtRNI)>;
-def : Pat<(int_nvvm_d2ui_rz f64:$a),
- (CVT_u32_f64 $a, CvtRZI)>;
-def : Pat<(int_nvvm_d2ui_rm f64:$a),
- (CVT_u32_f64 $a, CvtRMI)>;
-def : Pat<(int_nvvm_d2ui_rp f64:$a),
- (CVT_u32_f64 $a, CvtRPI)>;
-
-def : Pat<(int_nvvm_i2d_rn i32:$a),
- (CVT_f64_s32 $a, CvtRN)>;
-def : Pat<(int_nvvm_i2d_rz i32:$a),
- (CVT_f64_s32 $a, CvtRZ)>;
-def : Pat<(int_nvvm_i2d_rm i32:$a),
- (CVT_f64_s32 $a, CvtRM)>;
-def : Pat<(int_nvvm_i2d_rp i32:$a),
- (CVT_f64_s32 $a, CvtRP)>;
-
-def : Pat<(int_nvvm_ui2d_rn i32:$a),
- (CVT_f64_u32 $a, CvtRN)>;
-def : Pat<(int_nvvm_ui2d_rz i32:$a),
- (CVT_f64_u32 $a, CvtRZ)>;
-def : Pat<(int_nvvm_ui2d_rm i32:$a),
- (CVT_f64_u32 $a, CvtRM)>;
-def : Pat<(int_nvvm_ui2d_rp i32:$a),
- (CVT_f64_u32 $a, CvtRP)>;
-
-def : Pat<(int_nvvm_f2i_rn_ftz f32:$a),
- (CVT_s32_f32 $a, CvtRNI_FTZ)>;
-def : Pat<(int_nvvm_f2i_rn f32:$a),
- (CVT_s32_f32 $a, CvtRNI)>;
-def : Pat<(int_nvvm_f2i_rz_ftz f32:$a),
- (CVT_s32_f32 $a, CvtRZI_FTZ)>;
-def : Pat<(int_nvvm_f2i_rz f32:$a),
- (CVT_s32_f32 $a, CvtRZI)>;
-def : Pat<(int_nvvm_f2i_rm_ftz f32:$a),
- (CVT_s32_f32 $a, CvtRMI_FTZ)>;
-def : Pat<(int_nvvm_f2i_rm f32:$a),
- (CVT_s32_f32 $a, CvtRMI)>;
-def : Pat<(int_nvvm_f2i_rp_ftz f32:$a),
- (CVT_s32_f32 $a, CvtRPI_FTZ)>;
-def : Pat<(int_nvvm_f2i_rp f32:$a),
- (CVT_s32_f32 $a, CvtRPI)>;
-
-def : Pat<(int_nvvm_f2ui_rn_ftz f32:$a),
- (CVT_u32_f32 $a, CvtRNI_FTZ)>;
-def : Pat<(int_nvvm_f2ui_rn f32:$a),
- (CVT_u32_f32 $a, CvtRNI)>;
-def : Pat<(int_nvvm_f2ui_rz_ftz f32:$a),
- (CVT_u32_f32 $a, CvtRZI_FTZ)>;
-def : Pat<(int_nvvm_f2ui_rz f32:$a),
- (CVT_u32_f32 $a, CvtRZI)>;
-def : Pat<(int_nvvm_f2ui_rm_ftz f32:$a),
- (CVT_u32_f32 $a, CvtRMI_FTZ)>;
-def : Pat<(int_nvvm_f2ui_rm f32:$a),
- (CVT_u32_f32 $a, CvtRMI)>;
-def : Pat<(int_nvvm_f2ui_rp_ftz f32:$a),
- (CVT_u32_f32 $a, CvtRPI_FTZ)>;
-def : Pat<(int_nvvm_f2ui_rp f32:$a),
- (CVT_u32_f32 $a, CvtRPI)>;
-
-def : Pat<(int_nvvm_i2f_rn i32:$a),
- (CVT_f32_s32 $a, CvtRN)>;
-def : Pat<(int_nvvm_i2f_rz i32:$a),
- (CVT_f32_s32 $a, CvtRZ)>;
-def : Pat<(int_nvvm_i2f_rm i32:$a),
- (CVT_f32_s32 $a, CvtRM)>;
-def : Pat<(int_nvvm_i2f_rp i32:$a),
- (CVT_f32_s32 $a, CvtRP)>;
-
-def : Pat<(int_nvvm_ui2f_rn i32:$a),
- (CVT_f32_u32 $a, CvtRN)>;
-def : Pat<(int_nvvm_ui2f_rz i32:$a),
- (CVT_f32_u32 $a, CvtRZ)>;
-def : Pat<(int_nvvm_ui2f_rm i32:$a),
- (CVT_f32_u32 $a, CvtRM)>;
-def : Pat<(int_nvvm_ui2f_rp i32:$a),
- (CVT_f32_u32 $a, CvtRP)>;
-
-def : Pat<(int_nvvm_ff2bf16x2_rn f32:$a, f32:$b),
- (CVT_bf16x2_f32 $a, $b, CvtRN)>;
-def : Pat<(int_nvvm_ff2bf16x2_rn_relu f32:$a, f32:$b),
- (CVT_bf16x2_f32 $a, $b, CvtRN_RELU)>;
-def : Pat<(int_nvvm_ff2bf16x2_rz f32:$a, f32:$b),
- (CVT_bf16x2_f32 $a, $b, CvtRZ)>;
-def : Pat<(int_nvvm_ff2bf16x2_rz_relu f32:$a, f32:$b),
- (CVT_bf16x2_f32 $a, $b, CvtRZ_RELU)>;
-
-def : Pat<(int_nvvm_ff2f16x2_rn f32:$a, f32:$b),
- (CVT_f16x2_f32 $a, $b, CvtRN)>;
-def : Pat<(int_nvvm_ff2f16x2_rn_relu f32:$a, f32:$b),
- (CVT_f16x2_f32 $a, $b, CvtRN_RELU)>;
-def : Pat<(int_nvvm_ff2f16x2_rz f32:$a, f32:$b),
- (CVT_f16x2_f32 $a, $b, CvtRZ)>;
-def : Pat<(int_nvvm_ff2f16x2_rz_relu f32:$a, f32:$b),
- (CVT_f16x2_f32 $a, $b, CvtRZ_RELU)>;
-
-def : Pat<(int_nvvm_f2bf16_rn f32:$a),
- (CVT_bf16_f32 $a, CvtRN)>;
-def : Pat<(int_nvvm_f2bf16_rn_relu f32:$a),
- (CVT_bf16_f32 $a, CvtRN_RELU)>;
-def : Pat<(int_nvvm_f2bf16_rz f32:$a),
- (CVT_bf16_f32 $a, CvtRZ)>;
-def : Pat<(int_nvvm_f2bf16_rz_relu f32:$a),
- (CVT_bf16_f32 $a, CvtRZ_RELU)>;
+def : Pat<(int_nvvm_d2f_rn_ftz f64:$a), (CVT_f32_f64 $a, CvtRN_FTZ)>;
+def : Pat<(int_nvvm_d2f_rn f64:$a), (CVT_f32_f64 $a, CvtRN)>;
+def : Pat<(int_nvvm_d2f_rz_ftz f64:$a), (CVT_f32_f64 $a, CvtRZ_FTZ)>;
+def : Pat<(int_nvvm_d2f_rz f64:$a), (CVT_f32_f64 $a, CvtRZ)>;
+def : Pat<(int_nvvm_d2f_rm_ftz f64:$a), (CVT_f32_f64 $a, CvtRM_FTZ)>;
+def : Pat<(int_nvvm_d2f_rm f64:$a), (CVT_f32_f64 $a, CvtRM)>;
+def : Pat<(int_nvvm_d2f_rp_ftz f64:$a), (CVT_f32_f64 $a, CvtRP_FTZ)>;
+def : Pat<(int_nvvm_d2f_rp f64:$a), (CVT_f32_f64 $a, CvtRP)>;
+
+def : Pat<(int_nvvm_d2i_rn f64:$a), (CVT_s32_f64 $a, CvtRNI)>;
+def : Pat<(int_nvvm_d2i_rz f64:$a), (CVT_s32_f64 $a, CvtRZI)>;
+def : Pat<(int_nvvm_d2i_rm f64:$a), (CVT_s32_f64 $a, CvtRMI)>;
+def : Pat<(int_nvvm_d2i_rp f64:$a), (CVT_s32_f64 $a, CvtRPI)>;
+
+def : Pat<(int_nvvm_d2ui_rn f64:$a), (CVT_u32_f64 $a, CvtRNI)>;
+def : Pat<(int_nvvm_d2ui_rz f64:$a), (CVT_u32_f64 $a, CvtRZI)>;
+def : Pat<(int_nvvm_d2ui_rm f64:$a), (CVT_u32_f64 $a, CvtRMI)>;
+def : Pat<(int_nvvm_d2ui_rp f64:$a), (CVT_u32_f64 $a, CvtRPI)>;
+
+def : Pat<(int_nvvm_i2d_rn i32:$a), (CVT_f64_s32 $a, CvtRN)>;
+def : Pat<(int_nvvm_i2d_rz i32:$a), (CVT_f64_s32 $a, CvtRZ)>;
+def : Pat<(int_nvvm_i2d_rm i32:$a), (CVT_f64_s32 $a, CvtRM)>;
+def : Pat<(int_nvvm_i2d_rp i32:$a), (CVT_f64_s32 $a, CvtRP)>;
+
+def : Pat<(int_nvvm_ui2d_rn i32:$a), (CVT_f64_u32 $a, CvtRN)>;
+def : Pat<(int_nvvm_ui2d_rz i32:$a), (CVT_f64_u32 $a, CvtRZ)>;
+def : Pat<(int_nvvm_ui2d_rm i32:$a), (CVT_f64_u32 $a, CvtRM)>;
+def : Pat<(int_nvvm_ui2d_rp i32:$a), (CVT_f64_u32 $a, CvtRP)>;
+
+def : Pat<(int_nvvm_f2i_rn_ftz f32:$a), (CVT_s32_f32 $a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_f2i_rn f32:$a), (CVT_s32_f32 $a, CvtRNI)>;
+def : Pat<(int_nvvm_f2i_rz_ftz f32:$a), (CVT_s32_f32 $a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_f2i_rz f32:$a), (CVT_s32_f32 $a, CvtRZI)>;
+def : Pat<(int_nvvm_f2i_rm_ftz f32:$a), (CVT_s32_f32 $a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_f2i_rm f32:$a), (CVT_s32_f32 $a, CvtRMI)>;
+def : Pat<(int_nvvm_f2i_rp_ftz f32:$a), (CVT_s32_f32 $a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_f2i_rp f32:$a), (CVT_s32_f32 $a, CvtRPI)>;
+
+def : Pat<(int_nvvm_f2ui_rn_ftz f32:$a), (CVT_u32_f32 $a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_f2ui_rn f32:$a), (CVT_u32_f32 $a, CvtRNI)>;
+def : Pat<(int_nvvm_f2ui_rz_ftz f32:$a), (CVT_u32_f32 $a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_f2ui_rz f32:$a), (CVT_u32_f32 $a, CvtRZI)>;
+def : Pat<(int_nvvm_f2ui_rm_ftz f32:$a), (CVT_u32_f32 $a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_f2ui_rm f32:$a), (CVT_u32_f32 $a, CvtRMI)>;
+def : Pat<(int_nvvm_f2ui_rp_ftz f32:$a), (CVT_u32_f32 $a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_f2ui_rp f32:$a), (CVT_u32_f32 $a, CvtRPI)>;
+
+def : Pat<(int_nvvm_i2f_rn i32:$a), (CVT_f32_s32 $a, CvtRN)>;
+def : Pat<(int_nvvm_i2f_rz i32:$a), (CVT_f32_s32 $a, CvtRZ)>;
+def : Pat<(int_nvvm_i2f_rm i32:$a), (CVT_f32_s32 $a, CvtRM)>;
+def : Pat<(int_nvvm_i2f_rp i32:$a), (CVT_f32_s32 $a, CvtRP)>;
+
+def : Pat<(int_nvvm_ui2f_rn i32:$a), (CVT_f32_u32 $a, CvtRN)>;
+def : Pat<(int_nvvm_ui2f_rz i32:$a), (CVT_f32_u32 $a, CvtRZ)>;
+def : Pat<(int_nvvm_ui2f_rm i32:$a), (CVT_f32_u32 $a, CvtRM)>;
+def : Pat<(int_nvvm_ui2f_rp i32:$a), (CVT_f32_u32 $a, CvtRP)>;
+
+def : Pat<(int_nvvm_ff2bf16x2_rn f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, CvtRN)>;
+def : Pat<(int_nvvm_ff2bf16x2_rn_relu f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, CvtRN_RELU)>;
+def : Pat<(int_nvvm_ff2bf16x2_rz f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, CvtRZ)>;
+def : Pat<(int_nvvm_ff2bf16x2_rz_relu f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, CvtRZ_RELU)>;
+
+def : Pat<(int_nvvm_ff2f16x2_rn f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, CvtRN)>;
+def : Pat<(int_nvvm_ff2f16x2_rn_relu f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, CvtRN_RELU)>;
+def : Pat<(int_nvvm_ff2f16x2_rz f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, CvtRZ)>;
+def : Pat<(int_nvvm_ff2f16x2_rz_relu f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, CvtRZ_RELU)>;
+
+def : Pat<(int_nvvm_f2bf16_rn f32:$a), (CVT_bf16_f32 $a, CvtRN)>;
+def : Pat<(int_nvvm_f2bf16_rn_relu f32:$a), (CVT_bf16_f32 $a, CvtRN_RELU)>;
+def : Pat<(int_nvvm_f2bf16_rz f32:$a), (CVT_bf16_f32 $a, CvtRZ)>;
+def : Pat<(int_nvvm_f2bf16_rz_relu f32:$a), (CVT_bf16_f32 $a, CvtRZ_RELU)>;
def : Pat<(int_nvvm_lohi_i2d i32:$a, i32:$b), (V2I32toI64 $a, $b)>;
def : Pat<(int_nvvm_d2i_lo f64:$a), (I64toI32L $a)>;
@@ -1869,99 +1721,57 @@ def : Pat<(int_nvvm_d2i_hi f64:$a), (I64toI32H $a)>;
def : Pat<(int_nvvm_d2i_lo f64:$a), (I64toI32L_Sink $a)>, Requires<[hasPTX<71>]>;
def : Pat<(int_nvvm_d2i_hi f64:$a), (I64toI32H_Sink $a)>, Requires<[hasPTX<71>]>;
-def : Pat<(int_nvvm_f2ll_rn_ftz f32:$a),
- (CVT_s64_f32 $a, CvtRNI_FTZ)>;
-def : Pat<(int_nvvm_f2ll_rn f32:$a),
- (CVT_s64_f32 $a, CvtRNI)>;
-def : Pat<(int_nvvm_f2ll_rz_ftz f32:$a),
- (CVT_s64_f32 $a, CvtRZI_FTZ)>;
-def : Pat<(int_nvvm_f2ll_rz f32:$a),
- (CVT_s64_f32 $a, CvtRZI)>;
-def : Pat<(int_nvvm_f2ll_rm_ftz f32:$a),
- (CVT_s64_f32 $a, CvtRMI_FTZ)>;
-def : Pat<(int_nvvm_f2ll_rm f32:$a),
- (CVT_s64_f32 $a, CvtRMI)>;
-def : Pat<(int_nvvm_f2ll_rp_ftz f32:$a),
- (CVT_s64_f32 $a, CvtRPI_FTZ)>;
-def : Pat<(int_nvvm_f2ll_rp f32:$a),
- (CVT_s64_f32 $a, CvtRPI)>;
-
-def : Pat<(int_nvvm_f2ull_rn_ftz f32:$a),
- (CVT_u64_f32 $a, CvtRNI_FTZ)>;
-def : Pat<(int_nvvm_f2ull_rn f32:$a),
- (CVT_u64_f32 $a, CvtRNI)>;
-def : Pat<(int_nvvm_f2ull_rz_ftz f32:$a),
- (CVT_u64_f32 $a, CvtRZI_FTZ)>;
-def : Pat<(int_nvvm_f2ull_rz f32:$a),
- (CVT_u64_f32 $a, CvtRZI)>;
-def : Pat<(int_nvvm_f2ull_rm_ftz f32:$a),
- (CVT_u64_f32 $a, CvtRMI_FTZ)>;
-def : Pat<(int_nvvm_f2ull_rm f32:$a),
- (CVT_u64_f32 $a, CvtRMI)>;
-def : Pat<(int_nvvm_f2ull_rp_ftz f32:$a),
- (CVT_u64_f32 $a, CvtRPI_FTZ)>;
-def : Pat<(int_nvvm_f2ull_rp f32:$a),
- (CVT_u64_f32 $a, CvtRPI)>;
-
-def : Pat<(int_nvvm_d2ll_rn f64:$a),
- (CVT_s64_f64 $a, CvtRNI)>;
-def : Pat<(int_nvvm_d2ll_rz f64:$a),
- (CVT_s64_f64 $a, CvtRZI)>;
-def : Pat<(int_nvvm_d2ll_rm f64:$a),
- (CVT_s64_f64 $a, CvtRMI)>;
-def : Pat<(int_nvvm_d2ll_rp f64:$a),
- (CVT_s64_f64 $a, CvtRPI)>;
-
-def : Pat<(int_nvvm_d2ull_rn f64:$a),
- (CVT_u64_f64 $a, CvtRNI)>;
-def : Pat<(int_nvvm_d2ull_rz f64:$a),
- (CVT_u64_f64 $a, CvtRZI)>;
-def : Pat<(int_nvvm_d2ull_rm f64:$a),
- (CVT_u64_f64 $a, CvtRMI)>;
-def : Pat<(int_nvvm_d2ull_rp f64:$a),
- (CVT_u64_f64 $a, CvtRPI)>;
-
-def : Pat<(int_nvvm_ll2f_rn i64:$a),
- (CVT_f32_s64 $a, CvtRN)>;
-def : Pat<(int_nvvm_ll2f_rz i64:$a),
- (CVT_f32_s64 $a, CvtRZ)>;
-def : Pat<(int_nvvm_ll2f_rm i64:$a),
- (CVT_f32_s64 $a, CvtRM)>;
-def : Pat<(int_nvvm_ll2f_rp i64:$a),
- (CVT_f32_s64 $a, CvtRP)>;
-
-def : Pat<(int_nvvm_ull2f_rn i64:$a),
- (CVT_f32_u64 $a, CvtRN)>;
-def : Pat<(int_nvvm_ull2f_rz i64:$a),
- (CVT_f32_u64 $a, CvtRZ)>;
-def : Pat<(int_nvvm_ull2f_rm i64:$a),
- (CVT_f32_u64 $a, CvtRM)>;
-def : Pat<(int_nvvm_ull2f_rp i64:$a),
- (CVT_f32_u64 $a, CvtRP)>;
-
-def : Pat<(int_nvvm_ll2d_rn i64:$a),
- (CVT_f64_s64 $a, CvtRN)>;
-def : Pat<(int_nvvm_ll2d_rz i64:$a),
- (CVT_f64_s64 $a, CvtRZ)>;
-def : Pat<(int_nvvm_ll2d_rm i64:$a),
- (CVT_f64_s64 $a, CvtRM)>;
-def : Pat<(int_nvvm_ll2d_rp i64:$a),
- (CVT_f64_s64 $a, CvtRP)>;
-
-def : Pat<(int_nvvm_ull2d_rn i64:$a),
- (CVT_f64_u64 $a, CvtRN)>;
-def : Pat<(int_nvvm_ull2d_rz i64:$a),
- (CVT_f64_u64 $a, CvtRZ)>;
-def : Pat<(int_nvvm_ull2d_rm i64:$a),
- (CVT_f64_u64 $a, CvtRM)>;
-def : Pat<(int_nvvm_ull2d_rp i64:$a),
- (CVT_f64_u64 $a, CvtRP)>;
-
-
-def : Pat<(int_nvvm_f2h_rn_ftz f32:$a),
- (CVT_f16_f32 $a, CvtRN_FTZ)>;
-def : Pat<(int_nvvm_f2h_rn f32:$a),
- (CVT_f16_f32 $a, CvtRN)>;
+def : Pat<(int_nvvm_f2ll_rn_ftz f32:$a), (CVT_s64_f32 $a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_f2ll_rn f32:$a), (CVT_s64_f32 $a, CvtRNI)>;
+def : Pat<(int_nvvm_f2ll_rz_ftz f32:$a), (CVT_s64_f32 $a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_f2ll_rz f32:$a), (CVT_s64_f32 $a, CvtRZI)>;
+def : Pat<(int_nvvm_f2ll_rm_ftz f32:$a), (CVT_s64_f32 $a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_f2ll_rm f32:$a), (CVT_s64_f32 $a, CvtRMI)>;
+def : Pat<(int_nvvm_f2ll_rp_ftz f32:$a), (CVT_s64_f32 $a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_f2ll_rp f32:$a), (CVT_s64_f32 $a, CvtRPI)>;
+
+def : Pat<(int_nvvm_f2ull_rn_ftz f32:$a), (CVT_u64_f32 $a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_f2ull_rn f32:$a), (CVT_u64_f32 $a, CvtRNI)>;
+def : Pat<(int_nvvm_f2ull_rz_ftz f32:$a), (CVT_u64_f32 $a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_f2ull_rz f32:$a), (CVT_u64_f32 $a, CvtRZI)>;
+def : Pat<(int_nvvm_f2ull_rm_ftz f32:$a), (CVT_u64_f32 $a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_f2ull_rm f32:$a), (CVT_u64_f32 $a, CvtRMI)>;
+def : Pat<(int_nvvm_f2ull_rp_ftz f32:$a), (CVT_u64_f32 $a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_f2ull_rp f32:$a), (CVT_u64_f32 $a, CvtRPI)>;
+
+def : Pat<(int_nvvm_d2ll_rn f64:$a), (CVT_s64_f64 $a, CvtRNI)>;
+def : Pat<(int_nvvm_d2ll_rz f64:$a), (CVT_s64_f64 $a, CvtRZI)>;
+def : Pat<(int_nvvm_d2ll_rm f64:$a), (CVT_s64_f64 $a, CvtRMI)>;
+def : Pat<(int_nvvm_d2ll_rp f64:$a), (CVT_s64_f64 $a, CvtRPI)>;
+
+def : Pat<(int_nvvm_d2ull_rn f64:$a), (CVT_u64_f64 $a, CvtRNI)>;
+def : Pat<(int_nvvm_d2ull_rz f64:$a), (CVT_u64_f64 $a, CvtRZI)>;
+def : Pat<(int_nvvm_d2ull_rm f64:$a), (CVT_u64_f64 $a, CvtRMI)>;
+def : Pat<(int_nvvm_d2ull_rp f64:$a), (CVT_u64_f64 $a, CvtRPI)>;
+
+def : Pat<(int_nvvm_ll2f_rn i64:$a), (CVT_f32_s64 $a, CvtRN)>;
+def : Pat<(int_nvvm_ll2f_rz i64:$a), (CVT_f32_s64 $a, CvtRZ)>;
+def : Pat<(int_nvvm_ll2f_rm i64:$a), (CVT_f32_s64 $a, CvtRM)>;
+def : Pat<(int_nvvm_ll2f_rp i64:$a), (CVT_f32_s64 $a, CvtRP)>;
+
+def : Pat<(int_nvvm_ull2f_rn i64:$a), (CVT_f32_u64 $a, CvtRN)>;
+def : Pat<(int_nvvm_ull2f_rz i64:$a), (CVT_f32_u64 $a, CvtRZ)>;
+def : Pat<(int_nvvm_ull2f_rm i64:$a), (CVT_f32_u64 $a, CvtRM)>;
+def : Pat<(int_nvvm_ull2f_rp i64:$a), (CVT_f32_u64 $a, CvtRP)>;
+
+def : Pat<(int_nvvm_ll2d_rn i64:$a), (CVT_f64_s64 $a, CvtRN)>;
+def : Pat<(int_nvvm_ll2d_rz i64:$a), (CVT_f64_s64 $a, CvtRZ)>;
+def : Pat<(int_nvvm_ll2d_rm i64:$a), (CVT_f64_s64 $a, CvtRM)>;
+def : Pat<(int_nvvm_ll2d_rp i64:$a), (CVT_f64_s64 $a, CvtRP)>;
+
+def : Pat<(int_nvvm_ull2d_rn i64:$a), (CVT_f64_u64 $a, CvtRN)>;
+def : Pat<(int_nvvm_ull2d_rz i64:$a), (CVT_f64_u64 $a, CvtRZ)>;
+def : Pat<(int_nvvm_ull2d_rm i64:$a), (CVT_f64_u64 $a, CvtRM)>;
+def : Pat<(int_nvvm_ull2d_rp i64:$a), (CVT_f64_u64 $a, CvtRP)>;
+
+
+def : Pat<(int_nvvm_f2h_rn_ftz f32:$a), (CVT_f16_f32 $a, CvtRN_FTZ)>;
+def : Pat<(int_nvvm_f2h_rn f32:$a), (CVT_f16_f32 $a, CvtRN)>;
def : Pat<(int_nvvm_ff_to_e4m3x2_rn f32:$a, f32:$b),
(CVT_e4m3x2_f32 $a, $b, CvtRN)>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp b/llvm/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp
index 2253afa2806c3..bac036f3a93ae 100644
--- a/llvm/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp
@@ -58,12 +58,10 @@ bool NVPTXProxyRegErasure::runOnMachineFunction(MachineFunction &MF) {
for (auto &BB : MF) {
for (auto &MI : BB) {
switch (MI.getOpcode()) {
- case NVPTX::ProxyRegI1:
- case NVPTX::ProxyRegI16:
- case NVPTX::ProxyRegI32:
- case NVPTX::ProxyRegI64:
- case NVPTX::ProxyRegF32:
- case NVPTX::ProxyRegF64: {
+ case NVPTX::ProxyRegB1:
+ case NVPTX::ProxyRegB16:
+ case NVPTX::ProxyRegB32:
+ case NVPTX::ProxyRegB64: {
auto &InOp = *MI.uses().begin();
auto &OutOp = *MI.defs().begin();
assert(InOp.isReg() && "ProxyReg input should be a register.");
diff --git a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir
index fc60c056483ce..c2c87b6b24285 100644
--- a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir
+++ b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir
@@ -79,18 +79,18 @@ body: |
bb.0:
%0:int32regs, %1:int32regs, %2:int32regs, %3:int32regs = LoadParamMemV4I32 0
; CHECK-NOT: ProxyReg
- %4:int32regs = ProxyRegI32 killed %0
- %5:int32regs = ProxyRegI32 killed %1
- %6:int32regs = ProxyRegI32 killed %2
- %7:int32regs = ProxyRegI32 killed %3
+ %4:int32regs = ProxyRegB32 killed %0
+ %5:int32regs = ProxyRegB32 killed %1
+ %6:int32regs = ProxyRegB32 killed %2
+ %7:int32regs = ProxyRegB32 killed %3
; CHECK: StoreRetvalV4I32 killed %0, killed %1, killed %2, killed %3
StoreRetvalV4I32 killed %4, killed %5, killed %6, killed %7, 0
%8:int32regs = LoadParamMemI32 0
; CHECK-NOT: ProxyReg
- %9:int32regs = ProxyRegI32 killed %8
- %10:int32regs = ProxyRegI32 killed %9
- %11:int32regs = ProxyRegI32 killed %10
+ %9:int32regs = ProxyRegB32 killed %8
+ %10:int32regs = ProxyRegB32 killed %9
+ %11:int32regs = ProxyRegB32 killed %10
; CHECK: StoreRetvalI32 killed %8
StoreRetvalI32 killed %11, 0
Return
>From 49343b099d7d0d897cbc8f0ffa94736343638838 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Thu, 29 May 2025 14:10:07 +0000
Subject: [PATCH 5/6] update tests
---
.../CodeGen/MIR/NVPTX/expected-floating-point-literal.mir | 2 +-
.../CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir | 4 ++--
.../CodeGen/MIR/NVPTX/floating-point-invalid-type-error.mir | 2 +-
3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/test/CodeGen/MIR/NVPTX/expected-floating-point-literal.mir b/llvm/test/CodeGen/MIR/NVPTX/expected-floating-point-literal.mir
index 40fc3c7952071..25dbc280c7a55 100644
--- a/llvm/test/CodeGen/MIR/NVPTX/expected-floating-point-literal.mir
+++ b/llvm/test/CodeGen/MIR/NVPTX/expected-floating-point-literal.mir
@@ -16,7 +16,7 @@ registers:
- { id: 1, class: int32regs }
body: |
bb.0.entry:
- %0 = LD_f32 0, 4, 1, 2, 32, &test_param_0, 0
+ %0 = LD_i32 0, 4, 1, 2, 32, &test_param_0, 0
; CHECK: [[@LINE+1]]:33: expected a floating point literal
%1 = FADD_rnf32ri %0, float 3
StoreRetvalF32 %1, 0
diff --git a/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir b/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir
index 308f81a66ccb9..76655add75882 100644
--- a/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir
+++ b/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir
@@ -40,7 +40,7 @@ registers:
- { id: 7, class: int32regs }
body: |
bb.0.entry:
- %0 = LD_f32 0, 0, 4, 2, 32, &test_param_0, 0
+ %0 = LD_i32 0, 0, 4, 2, 32, &test_param_0, 0
%1 = CVT_f64_f32 %0, 0
%2 = LD_i32 0, 0, 4, 0, 32, &test_param_1, 0
; CHECK: %3:int64regs = FADD_rnf64ri %1, double 3.250000e+00
@@ -66,7 +66,7 @@ registers:
- { id: 7, class: int32regs }
body: |
bb.0.entry:
- %0 = LD_f32 0, 0, 4, 2, 32, &test2_param_0, 0
+ %0 = LD_i32 0, 0, 4, 2, 32, &test2_param_0, 0
%1 = CVT_f64_f32 %0, 0
%2 = LD_i32 0, 0, 4, 0, 32, &test2_param_1, 0
; CHECK: %3:int64regs = FADD_rnf64ri %1, double 0x7FF8000000000000
diff --git a/llvm/test/CodeGen/MIR/NVPTX/floating-point-invalid-type-error.mir b/llvm/test/CodeGen/MIR/NVPTX/floating-point-invalid-type-error.mir
index 583fd626ec7b6..3545e864450f8 100644
--- a/llvm/test/CodeGen/MIR/NVPTX/floating-point-invalid-type-error.mir
+++ b/llvm/test/CodeGen/MIR/NVPTX/floating-point-invalid-type-error.mir
@@ -16,7 +16,7 @@ registers:
- { id: 1, class: int32regs }
body: |
bb.0.entry:
- %0 = LD_f32 0, 4, 1, 2, 32, &test_param_0, 0
+ %0 = LD_i32 0, 4, 1, 2, 32, &test_param_0, 0
; CHECK: [[@LINE+1]]:33: floating point constant does not have type 'float'
%1 = FADD_rnf32ri %0, float 0xH3C00
StoreRetvalF32 %1, 0
>From d07ea0be8ad3988658637359ea32e8942b65407a Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Thu, 29 May 2025 16:11:55 +0000
Subject: [PATCH 6/6] update tests
---
.../CodeGen/MIR/NVPTX/expected-floating-point-literal.mir | 2 +-
.../CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir | 4 ++--
.../CodeGen/MIR/NVPTX/floating-point-invalid-type-error.mir | 2 +-
3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/test/CodeGen/MIR/NVPTX/expected-floating-point-literal.mir b/llvm/test/CodeGen/MIR/NVPTX/expected-floating-point-literal.mir
index 25dbc280c7a55..ef8394005943c 100644
--- a/llvm/test/CodeGen/MIR/NVPTX/expected-floating-point-literal.mir
+++ b/llvm/test/CodeGen/MIR/NVPTX/expected-floating-point-literal.mir
@@ -19,6 +19,6 @@ body: |
%0 = LD_i32 0, 4, 1, 2, 32, &test_param_0, 0
; CHECK: [[@LINE+1]]:33: expected a floating point literal
%1 = FADD_rnf32ri %0, float 3
- StoreRetvalF32 %1, 0
+ StoreRetvalI32 %1, 0
Return
...
diff --git a/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir b/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir
index 76655add75882..146a45a9b1c20 100644
--- a/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir
+++ b/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir
@@ -50,7 +50,7 @@ body: |
; CHECK: %6:int32regs = FADD_rnf32ri %5, float 6.250000e+00
%6 = FADD_rnf32ri %5, float 6.250000e+00
%7 = FMUL_rnf32rr %6, %4
- StoreRetvalF32 %7, 0
+ StoreRetvalI32 %7, 0
Return
...
---
@@ -76,6 +76,6 @@ body: |
; CHECK: %6:int32regs = FADD_rnf32ri %5, float 0x7FF8000000000000
%6 = FADD_rnf32ri %5, float 0x7FF8000000000000
%7 = FMUL_rnf32rr %6, %4
- StoreRetvalF32 %7, 0
+ StoreRetvalI32 %7, 0
Return
...
diff --git a/llvm/test/CodeGen/MIR/NVPTX/floating-point-invalid-type-error.mir b/llvm/test/CodeGen/MIR/NVPTX/floating-point-invalid-type-error.mir
index 3545e864450f8..c5bed1244d50e 100644
--- a/llvm/test/CodeGen/MIR/NVPTX/floating-point-invalid-type-error.mir
+++ b/llvm/test/CodeGen/MIR/NVPTX/floating-point-invalid-type-error.mir
@@ -19,6 +19,6 @@ body: |
%0 = LD_i32 0, 4, 1, 2, 32, &test_param_0, 0
; CHECK: [[@LINE+1]]:33: floating point constant does not have type 'float'
%1 = FADD_rnf32ri %0, float 0xH3C00
- StoreRetvalF32 %1, 0
+ StoreRetvalI32 %1, 0
Return
...
More information about the llvm-commits
mailing list