[llvm-branch-commits] [llvm] [NVPTX] Add commutativity to SETP instructions to enable MachineCSE of inverted predicates (PR #191890)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Apr 20 19:58:17 PDT 2026
https://github.com/modiking updated https://github.com/llvm/llvm-project/pull/191890
>From b77ccdf6b84fb599582845d7a85c997d056fad71 Mon Sep 17 00:00:00 2001
From: root <mmo at nvidia.com>
Date: Mon, 13 Apr 2026 14:05:18 -0700
Subject: [PATCH 1/2] [NVPTX] Add commutativity to SETP instructions to enable
MachineCSE of inverted predicates
Inverted predicates can be used freely in PTX. If we can invert a
predicate and CSE the instruction that generates it, we avoid computing
the inverse separately.
Teach the NVPTX commuteInstructionImpl that SETP instructions can be
inverted, which allows CSEing them with a previous SETP that matches the
inverted form. The branch users of the predicate are inverted as well to
maintain correctness.
Currently the SETP inversion is only allowed if all users are branches.
Future work can extend this to `sel` and `not` instructions.
Made-with: Cursor
---
llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp | 166 +++++
llvm/lib/Target/NVPTX/NVPTXInstrInfo.h | 8 +
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 7 +-
...achine-cse-predicate-inversion-bfloat16.ll | 695 ++++++++++++++++++
...machine-cse-predicate-inversion-float16.ll | 695 ++++++++++++++++++
...machine-cse-predicate-inversion-float32.ll | 663 +++++++++++++++++
...machine-cse-predicate-inversion-float64.ll | 679 +++++++++++++++++
.../machine-cse-predicate-inversion-int16.ll | 437 +++++++++++
.../machine-cse-predicate-inversion-int32.ll | 427 +++++++++++
.../machine-cse-predicate-inversion-int64.ll | 437 +++++++++++
...-cse-predicate-inversion-multiple-users.ll | 50 ++
...ne-cse-predicate-inversion-vector-float.ll | 100 +++
...ine-cse-predicate-no-inversion-bfloat16.ll | 224 ++++++
...hine-cse-predicate-no-inversion-float16.ll | 224 ++++++
...hine-cse-predicate-no-inversion-float32.ll | 214 ++++++
...hine-cse-predicate-no-inversion-float64.ll | 219 ++++++
...achine-cse-predicate-no-inversion-int16.ll | 224 ++++++
...achine-cse-predicate-no-inversion-int32.ll | 219 ++++++
...achine-cse-predicate-no-inversion-int64.ll | 224 ++++++
19 files changed, 5908 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-bfloat16.ll
create mode 100644 llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-float16.ll
create mode 100644 llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-float32.ll
create mode 100644 llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-float64.ll
create mode 100644 llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-int16.ll
create mode 100644 llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-int32.ll
create mode 100644 llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-int64.ll
create mode 100644 llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-multiple-users.ll
create mode 100644 llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-vector-float.ll
create mode 100644 llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-bfloat16.ll
create mode 100644 llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-float16.ll
create mode 100644 llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-float32.ll
create mode 100644 llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-float64.ll
create mode 100644 llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-int16.ll
create mode 100644 llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-int32.ll
create mode 100644 llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-int64.ll
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index 8d33876b1f8ae..495dd2518bf4d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -205,3 +205,169 @@ bool NVPTXInstrInfo::reverseBranchCondition(
Cond[1].setImm(!Cond[1].getImm());
return false;
}
+
+/// Reverses the condition of the branch sequence that terminates \p MBB while
+/// keeping its destinations unchanged.
+///
+/// The terminators are analyzed without being modified. Nothing is changed and
+/// false is returned when analyzeBranch() returns true, when the analysis
+/// yields an empty condition, or when reverseBranchCondition() returns true.
+/// Otherwise the existing branch sequence is removed and re-inserted with the
+/// reversed condition, reusing the debug location of the original branch.
+///
+/// \returns true if the branch was rewritten.
+bool NVPTXInstrInfo::invertPredicateBranchInstr(MachineBasicBlock &MBB) const {
+  MachineBasicBlock *TrueDest = nullptr;
+  MachineBasicBlock *FalseDest = nullptr;
+  SmallVector<MachineOperand, 4> BranchCond;
+
+  // Short-circuit evaluation keeps the required order: analyze first, then
+  // reject an empty condition, and only then try to reverse the condition.
+  const bool CannotInvert =
+      analyzeBranch(MBB, TrueDest, FalseDest, BranchCond,
+                    /*AllowModify=*/false) ||
+      BranchCond.empty() || reverseBranchCondition(BranchCond);
+  if (CannotInvert)
+    return false;
+
+  // Capture the debug location before the old branch is removed.
+  DebugLoc BranchLoc = MBB.findBranchDebugLoc();
+  removeBranch(MBB);
+  insertBranch(MBB, TrueDest, FalseDest, BranchCond, BranchLoc);
+  return true;
+}
+
+/// Returns true if the opcode of \p MI is one of the scalar integer SETP
+/// opcodes: SETP_i16, SETP_i32 or SETP_i64 with an rr, ri or ir suffix.
+static bool isIntegerSetp(const MachineInstr &MI) {
+  const unsigned Opc = MI.getOpcode();
+  const bool Is16Bit = Opc == NVPTX::SETP_i16rr || Opc == NVPTX::SETP_i16ri ||
+                       Opc == NVPTX::SETP_i16ir;
+  const bool Is32Bit = Opc == NVPTX::SETP_i32rr || Opc == NVPTX::SETP_i32ri ||
+                       Opc == NVPTX::SETP_i32ir;
+  const bool Is64Bit = Opc == NVPTX::SETP_i64rr || Opc == NVPTX::SETP_i64ri ||
+                       Opc == NVPTX::SETP_i64ir;
+  return Is16Bit || Is32Bit || Is64Bit;
+}
+
+/// Returns true if the opcode of \p MI is one of the scalar floating-point
+/// SETP opcodes: SETP_bf16rr, SETP_f16rr, or SETP_f32 / SETP_f64 with an rr,
+/// ri or ir suffix.
+static bool isScalarFloatSetp(const MachineInstr &MI) {
+  const unsigned Opc = MI.getOpcode();
+  const bool Is16Bit = Opc == NVPTX::SETP_bf16rr || Opc == NVPTX::SETP_f16rr;
+  const bool Is32Bit = Opc == NVPTX::SETP_f32rr || Opc == NVPTX::SETP_f32ri ||
+                       Opc == NVPTX::SETP_f32ir;
+  const bool Is64Bit = Opc == NVPTX::SETP_f64rr || Opc == NVPTX::SETP_f64ri ||
+                       Opc == NVPTX::SETP_f64ir;
+  return Is16Bit || Is32Bit || Is64Bit;
+}
+
+/// Maps an integer comparison mode to the mode that yields the opposite
+/// result for the same operands (EQ <-> NE, LT <-> GE, LE <-> GT and the
+/// corresponding LTU/GEU and LEU/GTU pairs).
+///
+/// Any other mode is treated as unreachable.
+static int64_t invertIntegerCmpMode(int64_t Mode) {
+  // Every mapping is symmetric, so each pair is listed once and matched in
+  // both directions.
+  static constexpr int64_t Opposites[][2] = {
+      {NVPTX::PTXCmpMode::EQ, NVPTX::PTXCmpMode::NE},
+      {NVPTX::PTXCmpMode::LT, NVPTX::PTXCmpMode::GE},
+      {NVPTX::PTXCmpMode::LE, NVPTX::PTXCmpMode::GT},
+      {NVPTX::PTXCmpMode::LTU, NVPTX::PTXCmpMode::GEU},
+      {NVPTX::PTXCmpMode::LEU, NVPTX::PTXCmpMode::GTU},
+  };
+
+  for (const auto &Pair : Opposites) {
+    if (Mode == Pair[0])
+      return Pair[1];
+    if (Mode == Pair[1])
+      return Pair[0];
+  }
+  llvm_unreachable("Invalid integer comparison mode");
+}
+
+/// Maps a scalar floating-point comparison mode to the mode that yields the
+/// opposite result for the same operands.
+///
+/// Each mode without the U suffix is paired with the U-suffixed mode of the
+/// opposite relation (EQ <-> NEU, NE <-> EQU, LT <-> GEU, LE <-> GTU,
+/// GT <-> LEU, GE <-> LTU), and NUM is paired with NotANumber. Any other mode
+/// is treated as unreachable.
+static int64_t invertScalarFloatCmpMode(int64_t Mode) {
+  // Every mapping is symmetric, so each pair is listed once and matched in
+  // both directions.
+  static constexpr int64_t Opposites[][2] = {
+      {NVPTX::PTXCmpMode::EQ, NVPTX::PTXCmpMode::NEU},
+      {NVPTX::PTXCmpMode::NE, NVPTX::PTXCmpMode::EQU},
+      {NVPTX::PTXCmpMode::LT, NVPTX::PTXCmpMode::GEU},
+      {NVPTX::PTXCmpMode::LE, NVPTX::PTXCmpMode::GTU},
+      {NVPTX::PTXCmpMode::GT, NVPTX::PTXCmpMode::LEU},
+      {NVPTX::PTXCmpMode::GE, NVPTX::PTXCmpMode::LTU},
+      {NVPTX::PTXCmpMode::NUM, NVPTX::PTXCmpMode::NotANumber},
+  };
+
+  for (const auto &Pair : Opposites) {
+    if (Mode == Pair[0])
+      return Pair[1];
+    if (Mode == Pair[1])
+      return Pair[0];
+  }
+  llvm_unreachable("Invalid scalar float comparison mode");
+}
+
+/// Rewrites the comparison-mode immediate (operand 3) of the scalar SETP
+/// \p MI to its inverse, using the integer or floating-point inversion that
+/// matches the opcode. The source operands are left untouched.
+///
+/// \p MI must be an integer or scalar floating-point SETP; anything else is
+/// treated as unreachable.
+static void invertScalarCompareInstr(MachineInstr &MI) {
+  MachineOperand &CmpModeOp = MI.getOperand(3);
+
+  const bool IsInteger = isIntegerSetp(MI);
+  if (!IsInteger && !isScalarFloatSetp(MI))
+    llvm_unreachable("Invalid SETP instruction");
+
+  const int64_t OldMode = CmpModeOp.getImm();
+  CmpModeOp.setImm(IsInteger ? invertIntegerCmpMode(OldMode)
+                             : invertScalarFloatCmpMode(OldMode));
+}
+
+/// Reports operands 1 and 2 as the commutable pair of a scalar integer or
+/// floating-point SETP by forwarding them to fixCommutedOpIndices(). All other
+/// instructions are handled by TargetInstrInfo::findCommutedOpIndices().
+bool NVPTXInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
+                                           unsigned &SrcOpIdx1,
+                                           unsigned &SrcOpIdx2) const {
+  const bool IsScalarSetp = isIntegerSetp(MI) || isScalarFloatSetp(MI);
+  if (!IsScalarSetp)
+    return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+
+  return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2);
+}
+
+/// "Commutes" a scalar SETP by inverting its comparison mode in place.
+///
+/// The source operands are not swapped. Instead the comparison mode is
+/// replaced by its inverse and the conditional branch of every block that
+/// uses the predicate is reversed, so the SETP can match an earlier SETP of
+/// the inverted form. Applying the transformation a second time restores the
+/// original comparison mode and branch conditions.
+///
+/// For now all non-debug users must be invertible conditional branches.
+/// TODO: Support other users such as selects.
+///
+/// \param MI     Instruction to commute.
+/// \param NewMI  Must be false; creating a new instruction is not supported.
+/// \param OpIdx1 Only forwarded to TargetInstrInfo for non-SETP instructions.
+/// \param OpIdx2 Only forwarded to TargetInstrInfo for non-SETP instructions.
+/// \returns \p MI after a successful inversion, or nullptr when the SETP
+///          cannot be inverted, in which case neither \p MI nor the branches
+///          using its predicate are left modified. Non-SETP instructions
+///          return whatever TargetInstrInfo::commuteInstructionImpl() returns.
+MachineInstr *NVPTXInstrInfo::commuteInstructionImpl(MachineInstr &MI,
+                                                     bool NewMI,
+                                                     unsigned OpIdx1,
+                                                     unsigned OpIdx2) const {
+  assert(!NewMI && "this should never be used");
+
+  if (!isIntegerSetp(MI) && !isScalarFloatSetp(MI))
+    return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+
+  // Snapshot the blocks whose branch must be reversed before changing
+  // anything. Reversing a branch removes and re-inserts the branch
+  // instruction, which edits the use list of the predicate; doing that while
+  // iterating the list would leave the iterator on a removed operand.
+  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  SmallVector<MachineBasicBlock *, 4> BranchBlocks;
+  for (MachineInstr &UseMI :
+       MRI.use_nodbg_instructions(MI.getOperand(0).getReg())) {
+    if (!UseMI.isConditionalBranch())
+      return nullptr; // Nothing has been modified yet.
+    BranchBlocks.push_back(UseMI.getParent());
+  }
+
+  for (unsigned I = 0, E = BranchBlocks.size(); I != E; ++I) {
+    if (invertPredicateBranchInstr(*BranchBlocks[I]))
+      continue;
+
+    // Undo exactly the branches reversed so far. Walking the snapshot keeps
+    // the rollback independent of the current order of the use list.
+    for (unsigned J = 0; J != I; ++J)
+      (void)invertPredicateBranchInstr(*BranchBlocks[J]);
+    return nullptr;
+  }
+
+  // Every user has been reversed; invert the compare itself last so that the
+  // failure paths above never have to restore it.
+  invertScalarCompareInstr(MI);
+  return &MI;
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
index c0b7e77cf63a5..957fcf5cae69a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -69,6 +69,14 @@ class NVPTXInstrInfo : public NVPTXGenInstrInfo {
int *BytesAdded = nullptr) const override;
bool
reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+ /// Reports operands 1 and 2 as the commutable pair for scalar integer and
+ /// floating-point SETP instructions; every other instruction is deferred to
+ /// TargetInstrInfo::findCommutedOpIndices().
+ bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const override;
+ /// For scalar integer and floating-point SETP instructions this inverts the
+ /// comparison mode in place instead of swapping operands, and reverses the
+ /// conditional branches that use the predicate. Returns nullptr when a user
+ /// of the predicate is not an invertible conditional branch. \p NewMI must
+ /// be false. Every other instruction is deferred to
+ /// TargetInstrInfo::commuteInstructionImpl().
+ MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const override;
+
+private:
+ /// Reverses the condition of the branch terminating \p MBB, keeping its
+ /// destinations. Returns true if the branch was rewritten, false if the
+ /// block's branch could not be analyzed, has no condition, or could not be
+ /// reversed (in which case the block is left unchanged).
+ bool invertPredicateBranchInstr(MachineBasicBlock &MBB) const;
};
} // namespace llvm
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 8b2c600b8e7df..969c1ffbe510e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -73,8 +73,7 @@ def FTZFlag : OperandWithDefaultOps<i1, (ops (getFTZFlag (i1 0)))> {
let PrintMethod = "printFTZFlag";
}
-// Flag to indicate predicate negation (@!p) for conditional branches.
-def BranchFlag : OperandWithDefaultOps<i1, (ops (i1 0))> {
+// Flag to indicate predicate negation (@!p) for conditional branches.
+def BranchFlag : OperandWithDefaultOps<i32, (ops (i32 0))> {
let PrintMethod = "printNegatedPredicate";
}
@@ -1573,7 +1572,7 @@ multiclass FSETP<RegTyInfo t, bit allow_ftz = true> {
defvar ftz_str = !if(allow_ftz, "$ftz", "");
defvar op_str = "setp.${cmp:FCmp}" # ftz_str # "." # t.PtxType;
defvar flags = !con((ins CmpMode:$cmp), !if(allow_ftz, (ins FTZFlag:$ftz), (ins)));
- let hasSideEffects = false in {
+ let hasSideEffects = false, isCompare = 1, isCommutable = 1 in {
def rr :
BasicFlagsNVPTXInst<(outs B1:$dst), (ins t.RC:$a, t.RC:$b),
flags, op_str>;
@@ -1599,7 +1598,7 @@ multiclass FSETP<RegTyInfo t, bit allow_ftz = true> {
multiclass ISETP<RegTyInfo t> {
defvar op_str = "setp.${cmp:ICmp}.${cmp:IType}" # t.Size;
- let hasSideEffects = false in {
+ let hasSideEffects = false, isCompare = 1, isCommutable = 1 in {
def rr :
BasicFlagsNVPTXInst<(outs B1:$dst), (ins t.RC:$a, t.RC:$b),
(ins CmpMode:$cmp), op_str>;
diff --git a/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-bfloat16.ll b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-bfloat16.ll
new file mode 100644
index 0000000000000..b6b77e9c09108
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-bfloat16.ll
@@ -0,0 +1,695 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -march=nvptx64 -mcpu=sm_100 -O2 < %s | FileCheck %s
+; RUN: %if ptxas-sm_100 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -O2 | %ptxas-verify -arch=sm_100 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+; Checks that `fcmp oeq` followed by its inverse `fcmp une` on the same
+; operands is lowered to a single setp.eq whose predicate feeds both branches,
+; the second one in negated form.
+; NOTE(review): the IR body uses the same predicates (oeq / une) as
+; @test_bfloat16_oeq, so this looks like duplicated coverage - confirm.
+define i32 @test_bfloat16_eq(bfloat %a, bfloat %b) {
+; CHECK-LABEL: test_bfloat16_eq(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_bfloat16_eq_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_bfloat16_eq_param_0];
+; CHECK-NEXT: setp.eq.bf16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB0_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB0_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB0_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB0_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp oeq bfloat %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp une bfloat %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+; Checks that `fcmp one` followed by its inverse `fcmp ueq` on the same
+; operands is lowered to a single setp.ne whose predicate feeds both branches,
+; the second one in negated form.
+; NOTE(review): the IR body uses the same predicates (one / ueq) as
+; @test_bfloat16_one, so this looks like duplicated coverage - confirm.
+define i32 @test_bfloat16_ne(bfloat %a, bfloat %b) {
+; CHECK-LABEL: test_bfloat16_ne(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_bfloat16_ne_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_bfloat16_ne_param_0];
+; CHECK-NEXT: setp.ne.bf16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB1_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB1_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB1_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB1_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp one bfloat %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ueq bfloat %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_bfloat16_oeq(bfloat %a, bfloat %b) {
+; CHECK-LABEL: test_bfloat16_oeq(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_bfloat16_oeq_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_bfloat16_oeq_param_0];
+; CHECK-NEXT: setp.eq.bf16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB2_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB2_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB2_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB2_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp oeq bfloat %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp une bfloat %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_bfloat16_one(bfloat %a, bfloat %b) {
+; CHECK-LABEL: test_bfloat16_one(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_bfloat16_one_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_bfloat16_one_param_0];
+; CHECK-NEXT: setp.ne.bf16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB3_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB3_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB3_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB3_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp one bfloat %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ueq bfloat %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_bfloat16_ueq(bfloat %a, bfloat %b) {
+; CHECK-LABEL: test_bfloat16_ueq(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_bfloat16_ueq_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_bfloat16_ueq_param_0];
+; CHECK-NEXT: setp.equ.bf16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB4_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB4_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB4_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB4_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ueq bfloat %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp one bfloat %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_bfloat16_une(bfloat %a, bfloat %b) {
+; CHECK-LABEL: test_bfloat16_une(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_bfloat16_une_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_bfloat16_une_param_0];
+; CHECK-NEXT: setp.neu.bf16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB5_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB5_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB5_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB5_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp une bfloat %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp oeq bfloat %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_bfloat16_olt(bfloat %a, bfloat %b) {
+; CHECK-LABEL: test_bfloat16_olt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_bfloat16_olt_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_bfloat16_olt_param_0];
+; CHECK-NEXT: setp.lt.bf16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB6_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB6_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB6_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB6_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp olt bfloat %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp uge bfloat %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_bfloat16_ole(bfloat %a, bfloat %b) {
+; CHECK-LABEL: test_bfloat16_ole(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_bfloat16_ole_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_bfloat16_ole_param_0];
+; CHECK-NEXT: setp.le.bf16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB7_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB7_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB7_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB7_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ole bfloat %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ugt bfloat %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_bfloat16_ogt(bfloat %a, bfloat %b) {
+; CHECK-LABEL: test_bfloat16_ogt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_bfloat16_ogt_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_bfloat16_ogt_param_0];
+; CHECK-NEXT: setp.gt.bf16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB8_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB8_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB8_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB8_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ogt bfloat %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ule bfloat %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_bfloat16_oge(bfloat %a, bfloat %b) {
+; CHECK-LABEL: test_bfloat16_oge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_bfloat16_oge_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_bfloat16_oge_param_0];
+; CHECK-NEXT: setp.ge.bf16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB9_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB9_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB9_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB9_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp oge bfloat %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ult bfloat %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_bfloat16_ult(bfloat %a, bfloat %b) {
+; CHECK-LABEL: test_bfloat16_ult(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_bfloat16_ult_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_bfloat16_ult_param_0];
+; CHECK-NEXT: setp.ltu.bf16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB10_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB10_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB10_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB10_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ult bfloat %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp oge bfloat %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_bfloat16_ule(bfloat %a, bfloat %b) {
+; CHECK-LABEL: test_bfloat16_ule(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_bfloat16_ule_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_bfloat16_ule_param_0];
+; CHECK-NEXT: setp.leu.bf16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB11_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB11_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB11_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB11_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ule bfloat %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ogt bfloat %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_bfloat16_ugt(bfloat %a, bfloat %b) {
+; CHECK-LABEL: test_bfloat16_ugt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_bfloat16_ugt_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_bfloat16_ugt_param_0];
+; CHECK-NEXT: setp.gtu.bf16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB12_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB12_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB12_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB12_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ugt bfloat %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ole bfloat %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_bfloat16_uge(bfloat %a, bfloat %b) {
+; CHECK-LABEL: test_bfloat16_uge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_bfloat16_uge_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_bfloat16_uge_param_0];
+; CHECK-NEXT: setp.geu.bf16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB13_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB13_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB13_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB13_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp uge bfloat %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp olt bfloat %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_bfloat16_ord(bfloat %a, bfloat %b) {
+; CHECK-LABEL: test_bfloat16_ord(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_bfloat16_ord_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_bfloat16_ord_param_0];
+; CHECK-NEXT: setp.num.bf16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @!%p1 bra $L__BB14_1;
+; CHECK-NEXT: $L__BB14_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB14_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB14_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+; CHECK-NEXT: $L__BB14_1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: bra.uni $L__BB14_2;
+entry:
+ %cmp = fcmp ord bfloat %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp uno bfloat %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_bfloat16_uno(bfloat %a, bfloat %b) {
+; CHECK-LABEL: test_bfloat16_uno(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_bfloat16_uno_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_bfloat16_uno_param_0];
+; CHECK-NEXT: setp.nan.bf16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB15_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB15_2: // %merge1
+; CHECK-NEXT: @%p1 bra $L__BB15_3;
+; CHECK-NEXT: $L__BB15_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+; CHECK-NEXT: $L__BB15_3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: bra.uni $L__BB15_4;
+entry:
+ %cmp = fcmp uno bfloat %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ord bfloat %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
diff --git a/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-float16.ll b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-float16.ll
new file mode 100644
index 0000000000000..42aa05da4c76a
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-float16.ll
@@ -0,0 +1,695 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -march=nvptx64 -mcpu=sm_100 -O2 < %s | FileCheck %s
+; RUN: %if ptxas-sm_100 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -O2 | %ptxas-verify -arch=sm_100 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @test_float16_eq(half %a, half %b) {
+; CHECK-LABEL: test_float16_eq(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_float16_eq_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_float16_eq_param_0];
+; CHECK-NEXT: setp.eq.f16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB0_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB0_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB0_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB0_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp oeq half %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp une half %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float16_ne(half %a, half %b) {
+; CHECK-LABEL: test_float16_ne(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_float16_ne_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_float16_ne_param_0];
+; CHECK-NEXT: setp.ne.f16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB1_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB1_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB1_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB1_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp one half %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ueq half %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float16_oeq(half %a, half %b) {
+; CHECK-LABEL: test_float16_oeq(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_float16_oeq_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_float16_oeq_param_0];
+; CHECK-NEXT: setp.eq.f16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB2_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB2_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB2_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB2_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp oeq half %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp une half %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float16_one(half %a, half %b) {
+; CHECK-LABEL: test_float16_one(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_float16_one_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_float16_one_param_0];
+; CHECK-NEXT: setp.ne.f16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB3_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB3_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB3_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB3_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp one half %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ueq half %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float16_ueq(half %a, half %b) {
+; CHECK-LABEL: test_float16_ueq(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_float16_ueq_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_float16_ueq_param_0];
+; CHECK-NEXT: setp.equ.f16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB4_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB4_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB4_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB4_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ueq half %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp one half %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float16_une(half %a, half %b) {
+; CHECK-LABEL: test_float16_une(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_float16_une_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_float16_une_param_0];
+; CHECK-NEXT: setp.neu.f16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB5_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB5_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB5_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB5_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp une half %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp oeq half %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float16_olt(half %a, half %b) {
+; CHECK-LABEL: test_float16_olt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_float16_olt_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_float16_olt_param_0];
+; CHECK-NEXT: setp.lt.f16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB6_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB6_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB6_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB6_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp olt half %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp uge half %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float16_ole(half %a, half %b) {
+; CHECK-LABEL: test_float16_ole(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_float16_ole_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_float16_ole_param_0];
+; CHECK-NEXT: setp.le.f16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB7_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB7_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB7_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB7_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ole half %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ugt half %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float16_ogt(half %a, half %b) {
+; CHECK-LABEL: test_float16_ogt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_float16_ogt_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_float16_ogt_param_0];
+; CHECK-NEXT: setp.gt.f16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB8_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB8_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB8_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB8_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ogt half %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ule half %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float16_oge(half %a, half %b) {
+; CHECK-LABEL: test_float16_oge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_float16_oge_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_float16_oge_param_0];
+; CHECK-NEXT: setp.ge.f16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB9_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB9_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB9_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB9_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp oge half %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ult half %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float16_ult(half %a, half %b) {
+; CHECK-LABEL: test_float16_ult(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_float16_ult_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_float16_ult_param_0];
+; CHECK-NEXT: setp.ltu.f16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB10_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB10_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB10_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB10_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ult half %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp oge half %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float16_ule(half %a, half %b) {
+; CHECK-LABEL: test_float16_ule(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_float16_ule_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_float16_ule_param_0];
+; CHECK-NEXT: setp.leu.f16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB11_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB11_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB11_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB11_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ule half %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ogt half %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float16_ugt(half %a, half %b) {
+; CHECK-LABEL: test_float16_ugt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_float16_ugt_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_float16_ugt_param_0];
+; CHECK-NEXT: setp.gtu.f16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB12_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB12_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB12_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB12_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ugt half %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ole half %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float16_uge(half %a, half %b) {
+; CHECK-LABEL: test_float16_uge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_float16_uge_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_float16_uge_param_0];
+; CHECK-NEXT: setp.geu.f16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB13_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB13_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB13_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB13_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp uge half %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp olt half %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float16_ord(half %a, half %b) {
+; CHECK-LABEL: test_float16_ord(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_float16_ord_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_float16_ord_param_0];
+; CHECK-NEXT: setp.num.f16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @!%p1 bra $L__BB14_1;
+; CHECK-NEXT: $L__BB14_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB14_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB14_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+; CHECK-NEXT: $L__BB14_1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: bra.uni $L__BB14_2;
+entry:
+ %cmp = fcmp ord half %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp uno half %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float16_uno(half %a, half %b) {
+; CHECK-LABEL: test_float16_uno(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_float16_uno_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_float16_uno_param_0];
+; CHECK-NEXT: setp.nan.f16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB15_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB15_2: // %merge1
+; CHECK-NEXT: @%p1 bra $L__BB15_3;
+; CHECK-NEXT: $L__BB15_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+; CHECK-NEXT: $L__BB15_3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: bra.uni $L__BB15_4;
+entry:
+ %cmp = fcmp uno half %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ord half %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
diff --git a/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-float32.ll b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-float32.ll
new file mode 100644
index 0000000000000..a034ef25aea11
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-float32.ll
@@ -0,0 +1,663 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -march=nvptx64 -mcpu=sm_100 -O2 < %s | FileCheck %s
+; RUN: %if ptxas-sm_100 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -O2 | %ptxas-verify -arch=sm_100 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @test_float32_eq(float %arg) {
+; CHECK-LABEL: test_float32_eq(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [test_float32_eq_param_0];
+; CHECK-NEXT: setp.eq.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: @%p1 bra $L__BB0_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r2, 1;
+; CHECK-NEXT: $L__BB0_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB0_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: $L__BB0_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp oeq float %arg, 0.000000e+00
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp une float %arg, 0.000000e+00
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float32_ne(float %arg) {
+; CHECK-LABEL: test_float32_ne(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [test_float32_ne_param_0];
+; CHECK-NEXT: setp.ne.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: @%p1 bra $L__BB1_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r2, 1;
+; CHECK-NEXT: $L__BB1_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB1_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: $L__BB1_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp one float %arg, 0.000000e+00
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ueq float %arg, 0.000000e+00
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float32_oeq(float %arg) {
+; CHECK-LABEL: test_float32_oeq(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [test_float32_oeq_param_0];
+; CHECK-NEXT: setp.eq.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: @%p1 bra $L__BB2_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r2, 1;
+; CHECK-NEXT: $L__BB2_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB2_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: $L__BB2_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp oeq float %arg, 0.000000e+00
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp une float %arg, 0.000000e+00
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float32_one(float %arg) {
+; CHECK-LABEL: test_float32_one(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [test_float32_one_param_0];
+; CHECK-NEXT: setp.ne.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: @%p1 bra $L__BB3_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r2, 1;
+; CHECK-NEXT: $L__BB3_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB3_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: $L__BB3_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp one float %arg, 0.000000e+00
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ueq float %arg, 0.000000e+00
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float32_ueq(float %arg) {
+; CHECK-LABEL: test_float32_ueq(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [test_float32_ueq_param_0];
+; CHECK-NEXT: setp.equ.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: @%p1 bra $L__BB4_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r2, 1;
+; CHECK-NEXT: $L__BB4_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB4_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: $L__BB4_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ueq float %arg, 0.000000e+00
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp one float %arg, 0.000000e+00
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float32_une(float %arg) {
+; CHECK-LABEL: test_float32_une(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [test_float32_une_param_0];
+; CHECK-NEXT: setp.neu.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: @%p1 bra $L__BB5_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r2, 1;
+; CHECK-NEXT: $L__BB5_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB5_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: $L__BB5_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp une float %arg, 0.000000e+00
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp oeq float %arg, 0.000000e+00
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float32_olt(float %arg) {
+; CHECK-LABEL: test_float32_olt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [test_float32_olt_param_0];
+; CHECK-NEXT: setp.lt.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: @%p1 bra $L__BB6_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r2, 1;
+; CHECK-NEXT: $L__BB6_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB6_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: $L__BB6_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp olt float %arg, 0.000000e+00
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp uge float %arg, 0.000000e+00
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float32_ole(float %arg) {
+; CHECK-LABEL: test_float32_ole(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [test_float32_ole_param_0];
+; CHECK-NEXT: setp.le.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: @%p1 bra $L__BB7_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r2, 1;
+; CHECK-NEXT: $L__BB7_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB7_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: $L__BB7_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ole float %arg, 0.000000e+00
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ugt float %arg, 0.000000e+00
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float32_ogt(float %arg) {
+; CHECK-LABEL: test_float32_ogt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [test_float32_ogt_param_0];
+; CHECK-NEXT: setp.gt.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: @%p1 bra $L__BB8_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r2, 1;
+; CHECK-NEXT: $L__BB8_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB8_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: $L__BB8_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ogt float %arg, 0.000000e+00
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ule float %arg, 0.000000e+00
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float32_oge(float %arg) {
+; CHECK-LABEL: test_float32_oge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [test_float32_oge_param_0];
+; CHECK-NEXT: setp.ge.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: @%p1 bra $L__BB9_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r2, 1;
+; CHECK-NEXT: $L__BB9_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB9_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: $L__BB9_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp oge float %arg, 0.000000e+00
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ult float %arg, 0.000000e+00
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float32_ult(float %arg) {
+; CHECK-LABEL: test_float32_ult(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [test_float32_ult_param_0];
+; CHECK-NEXT: setp.ltu.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: @%p1 bra $L__BB10_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r2, 1;
+; CHECK-NEXT: $L__BB10_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB10_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: $L__BB10_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ult float %arg, 0.000000e+00 ; first compare; per the assertions above this is the only setp in the output
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp oge float %arg, 0.000000e+00 ; oge is the exact inverse of ult above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float32_ule(float %arg) {
+; CHECK-LABEL: test_float32_ule(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [test_float32_ule_param_0];
+; CHECK-NEXT: setp.leu.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: @%p1 bra $L__BB11_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r2, 1;
+; CHECK-NEXT: $L__BB11_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB11_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: $L__BB11_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ule float %arg, 0.000000e+00 ; first compare; per the assertions above this is the only setp in the output
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ogt float %arg, 0.000000e+00 ; ogt is the exact inverse of ule above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float32_ugt(float %arg) {
+; CHECK-LABEL: test_float32_ugt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [test_float32_ugt_param_0];
+; CHECK-NEXT: setp.gtu.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: @%p1 bra $L__BB12_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r2, 1;
+; CHECK-NEXT: $L__BB12_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB12_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: $L__BB12_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ugt float %arg, 0.000000e+00 ; first compare; per the assertions above this is the only setp in the output
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ole float %arg, 0.000000e+00 ; ole is the exact inverse of ugt above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float32_uge(float %arg) {
+; CHECK-LABEL: test_float32_uge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [test_float32_uge_param_0];
+; CHECK-NEXT: setp.geu.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: @%p1 bra $L__BB13_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r2, 1;
+; CHECK-NEXT: $L__BB13_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB13_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: $L__BB13_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp uge float %arg, 0.000000e+00 ; first compare; per the assertions above this is the only setp in the output
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp olt float %arg, 0.000000e+00 ; olt is the exact inverse of uge above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float32_ord(float %arg) {
+; CHECK-LABEL: test_float32_ord(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [test_float32_ord_param_0];
+; CHECK-NEXT: setp.num.f32 %p1, %r1, %r1;
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: @!%p1 bra $L__BB14_1;
+; CHECK-NEXT: $L__BB14_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB14_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: $L__BB14_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+; CHECK-NEXT: $L__BB14_1: // %then
+; CHECK-NEXT: mov.b32 %r2, 1;
+; CHECK-NEXT: bra.uni $L__BB14_2;
+entry:
+ %cmp = fcmp ord float %arg, 0.000000e+00 ; first compare; the assertions show a single setp.num on (%r1, %r1) rather than against the constant
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp uno float %arg, 0.000000e+00 ; uno is the exact inverse of ord above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float32_uno(float %arg) {
+; CHECK-LABEL: test_float32_uno(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [test_float32_uno_param_0];
+; CHECK-NEXT: setp.nan.f32 %p1, %r1, %r1;
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: @%p1 bra $L__BB15_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r2, 1;
+; CHECK-NEXT: $L__BB15_2: // %merge1
+; CHECK-NEXT: @%p1 bra $L__BB15_3;
+; CHECK-NEXT: $L__BB15_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+; CHECK-NEXT: $L__BB15_3: // %else
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: bra.uni $L__BB15_4;
+entry:
+ %cmp = fcmp uno float %arg, 0.000000e+00 ; first compare; the assertions show a single setp.nan on (%r1, %r1) rather than against the constant
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ord float %arg, 0.000000e+00 ; ord is the exact inverse of uno above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
diff --git a/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-float64.ll b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-float64.ll
new file mode 100644
index 0000000000000..4106d7b9a9184
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-float64.ll
@@ -0,0 +1,679 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -march=nvptx64 -mcpu=sm_100 -O2 < %s | FileCheck %s
+; RUN: %if ptxas-sm_100 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -O2 | %ptxas-verify -arch=sm_100 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @test_float64_eq(double %arg) {
+; CHECK-LABEL: test_float64_eq(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [test_float64_eq_param_0];
+; CHECK-NEXT: setp.eq.f64 %p1, %rd1, 0d0000000000000000;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB0_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB0_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB0_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB0_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp oeq double %arg, 0.000000e+00 ; first compare, the only setp per the assertions above. NOTE(review): this IR body is identical to @test_float64_oeq (oeq/une) - possibly redundant coverage
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp une double %arg, 0.000000e+00 ; une is the exact inverse of oeq above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float64_ne(double %arg) {
+; CHECK-LABEL: test_float64_ne(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [test_float64_ne_param_0];
+; CHECK-NEXT: setp.ne.f64 %p1, %rd1, 0d0000000000000000;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB1_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB1_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB1_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB1_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp one double %arg, 0.000000e+00 ; first compare, the only setp per the assertions above. NOTE(review): this IR body is identical to @test_float64_one (one/ueq) - possibly redundant coverage
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ueq double %arg, 0.000000e+00 ; ueq is the exact inverse of one above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float64_oeq(double %arg) {
+; CHECK-LABEL: test_float64_oeq(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [test_float64_oeq_param_0];
+; CHECK-NEXT: setp.eq.f64 %p1, %rd1, 0d0000000000000000;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB2_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB2_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB2_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB2_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp oeq double %arg, 0.000000e+00 ; first compare, the only setp per the assertions above. NOTE(review): this IR body is identical to @test_float64_eq (oeq/une) - possibly redundant coverage
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp une double %arg, 0.000000e+00 ; une is the exact inverse of oeq above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float64_one(double %arg) {
+; CHECK-LABEL: test_float64_one(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [test_float64_one_param_0];
+; CHECK-NEXT: setp.ne.f64 %p1, %rd1, 0d0000000000000000;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB3_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB3_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB3_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB3_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp one double %arg, 0.000000e+00 ; first compare, the only setp per the assertions above. NOTE(review): this IR body is identical to @test_float64_ne (one/ueq) - possibly redundant coverage
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ueq double %arg, 0.000000e+00 ; ueq is the exact inverse of one above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float64_ueq(double %arg) {
+; CHECK-LABEL: test_float64_ueq(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [test_float64_ueq_param_0];
+; CHECK-NEXT: setp.equ.f64 %p1, %rd1, 0d0000000000000000;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB4_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB4_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB4_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB4_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ueq double %arg, 0.000000e+00 ; first compare; per the assertions above this is the only setp in the output
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp one double %arg, 0.000000e+00 ; one is the exact inverse of ueq above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float64_une(double %arg) {
+; CHECK-LABEL: test_float64_une(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [test_float64_une_param_0];
+; CHECK-NEXT: setp.neu.f64 %p1, %rd1, 0d0000000000000000;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB5_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB5_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB5_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB5_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp une double %arg, 0.000000e+00 ; first compare; per the assertions above this is the only setp in the output
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp oeq double %arg, 0.000000e+00 ; oeq is the exact inverse of une above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float64_olt(double %arg) {
+; CHECK-LABEL: test_float64_olt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [test_float64_olt_param_0];
+; CHECK-NEXT: setp.lt.f64 %p1, %rd1, 0d0000000000000000;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB6_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB6_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB6_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB6_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp olt double %arg, 0.000000e+00 ; first compare; per the assertions above this is the only setp in the output
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp uge double %arg, 0.000000e+00 ; uge is the exact inverse of olt above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float64_ole(double %arg) {
+; CHECK-LABEL: test_float64_ole(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [test_float64_ole_param_0];
+; CHECK-NEXT: setp.le.f64 %p1, %rd1, 0d0000000000000000;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB7_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB7_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB7_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB7_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ole double %arg, 0.000000e+00 ; first compare; per the assertions above this is the only setp in the output
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ugt double %arg, 0.000000e+00 ; ugt is the exact inverse of ole above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float64_ogt(double %arg) {
+; CHECK-LABEL: test_float64_ogt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [test_float64_ogt_param_0];
+; CHECK-NEXT: setp.gt.f64 %p1, %rd1, 0d0000000000000000;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB8_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB8_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB8_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB8_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ogt double %arg, 0.000000e+00 ; first compare; per the assertions above this is the only setp in the output
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ule double %arg, 0.000000e+00 ; ule is the exact inverse of ogt above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float64_oge(double %arg) {
+; CHECK-LABEL: test_float64_oge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [test_float64_oge_param_0];
+; CHECK-NEXT: setp.ge.f64 %p1, %rd1, 0d0000000000000000;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB9_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB9_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB9_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB9_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp oge double %arg, 0.000000e+00 ; first compare; per the assertions above this is the only setp in the output
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ult double %arg, 0.000000e+00 ; ult is the exact inverse of oge above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float64_ult(double %arg) {
+; CHECK-LABEL: test_float64_ult(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [test_float64_ult_param_0];
+; CHECK-NEXT: setp.ltu.f64 %p1, %rd1, 0d0000000000000000;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB10_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB10_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB10_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB10_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ult double %arg, 0.000000e+00 ; first compare; per the assertions above this is the only setp in the output
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp oge double %arg, 0.000000e+00 ; oge is the exact inverse of ult above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float64_ule(double %arg) {
+; CHECK-LABEL: test_float64_ule(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [test_float64_ule_param_0];
+; CHECK-NEXT: setp.leu.f64 %p1, %rd1, 0d0000000000000000;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB11_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB11_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB11_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB11_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ule double %arg, 0.000000e+00 ; first compare; per the assertions above this is the only setp in the output
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ogt double %arg, 0.000000e+00 ; ogt is the exact inverse of ule above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float64_ugt(double %arg) {
+; CHECK-LABEL: test_float64_ugt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [test_float64_ugt_param_0];
+; CHECK-NEXT: setp.gtu.f64 %p1, %rd1, 0d0000000000000000;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB12_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB12_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB12_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB12_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ugt double %arg, 0.000000e+00 ; first compare; per the assertions above this is the only setp in the output
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ole double %arg, 0.000000e+00 ; ole is the exact inverse of ugt above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float64_uge(double %arg) {
+; CHECK-LABEL: test_float64_uge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [test_float64_uge_param_0];
+; CHECK-NEXT: setp.geu.f64 %p1, %rd1, 0d0000000000000000;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB13_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB13_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB13_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB13_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp uge double %arg, 0.000000e+00 ; first compare; per the assertions above this is the only setp in the output
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp olt double %arg, 0.000000e+00 ; olt is the exact inverse of uge above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float64_ord(double %arg) {
+; CHECK-LABEL: test_float64_ord(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [test_float64_ord_param_0];
+; CHECK-NEXT: setp.num.f64 %p1, %rd1, %rd1;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @!%p1 bra $L__BB14_1;
+; CHECK-NEXT: $L__BB14_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB14_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB14_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+; CHECK-NEXT: $L__BB14_1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: bra.uni $L__BB14_2;
+entry:
+ %cmp = fcmp ord double %arg, 0.000000e+00 ; first compare; the assertions show a single setp.num on (%rd1, %rd1) rather than against the constant
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp uno double %arg, 0.000000e+00 ; uno is the exact inverse of ord above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float64_uno(double %arg) {
+; CHECK-LABEL: test_float64_uno(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [test_float64_uno_param_0];
+; CHECK-NEXT: setp.nan.f64 %p1, %rd1, %rd1;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB15_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB15_2: // %merge1
+; CHECK-NEXT: @%p1 bra $L__BB15_3;
+; CHECK-NEXT: $L__BB15_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+; CHECK-NEXT: $L__BB15_3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: bra.uni $L__BB15_4;
+entry:
+ %cmp = fcmp uno double %arg, 0.000000e+00 ; first compare; the assertions show a single setp.nan on (%rd1, %rd1) rather than against the constant
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ord double %arg, 0.000000e+00 ; ord is the exact inverse of uno above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
diff --git a/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-int16.ll b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-int16.ll
new file mode 100644
index 0000000000000..2cb3fa6110c18
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-int16.ll
@@ -0,0 +1,437 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -march=nvptx64 -mcpu=sm_100 -O2 < %s | FileCheck %s
+; RUN: %if ptxas-sm_100 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -O2 | %ptxas-verify -arch=sm_100 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+; Test int16 integer comparison inversions
+
+define i32 @test_int16_eq(i16 %a, i16 %b) {
+; CHECK-LABEL: test_int16_eq(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_int16_eq_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_int16_eq_param_0];
+; CHECK-NEXT: setp.eq.b16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB0_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB0_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB0_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB0_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp eq i16 %a, %b ; first compare; per the assertions above this is the only setp in the output
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp ne i16 %a, %b ; ne is the exact inverse of eq above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int16_ne(i16 %a, i16 %b) {
+; CHECK-LABEL: test_int16_ne(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_int16_ne_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_int16_ne_param_0];
+; CHECK-NEXT: setp.ne.b16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB1_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB1_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB1_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB1_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp ne i16 %a, %b ; first compare; per the assertions above this is the only setp in the output
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp eq i16 %a, %b ; eq is the exact inverse of ne above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int16_slt(i16 %a, i16 %b) {
+; CHECK-LABEL: test_int16_slt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_int16_slt_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_int16_slt_param_0];
+; CHECK-NEXT: setp.lt.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB2_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB2_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB2_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB2_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp slt i16 %a, %b ; first compare; per the assertions above this is the only setp in the output
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp sge i16 %a, %b ; sge is the exact inverse of slt above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int16_sle(i16 %a, i16 %b) {
+; CHECK-LABEL: test_int16_sle(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_int16_sle_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_int16_sle_param_0];
+; CHECK-NEXT: setp.le.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB3_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB3_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB3_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB3_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp sle i16 %a, %b ; first compare; per the assertions above this is the only setp in the output
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp sgt i16 %a, %b ; sgt is the exact inverse of sle above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int16_sgt(i16 %a, i16 %b) {
+; CHECK-LABEL: test_int16_sgt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_int16_sgt_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_int16_sgt_param_0];
+; CHECK-NEXT: setp.gt.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB4_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB4_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB4_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB4_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp sgt i16 %a, %b ; first compare; per the assertions above this is the only setp in the output
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp sle i16 %a, %b ; sle is the exact inverse of sgt above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int16_sge(i16 %a, i16 %b) {
+; CHECK-LABEL: test_int16_sge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_int16_sge_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_int16_sge_param_0];
+; CHECK-NEXT: setp.ge.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB5_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB5_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB5_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB5_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp sge i16 %a, %b ; first compare; per the assertions above this is the only setp in the output
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; loaded value is never used
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp slt i16 %a, %b ; slt is the exact inverse of sge above; assertions expect the branch to reuse %p1 rather than emit a second setp
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int16_ult(i16 %a, i16 %b) { ; ult and its inverse uge on the same operands; the expected PTX below has one setp.lt.u16 driving both branches
+; CHECK-LABEL: test_int16_ult(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_int16_ult_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_int16_ult_param_0];
+; CHECK-NEXT: setp.lt.u16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB6_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB6_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB6_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB6_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp ult i16 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; %tmp has no users
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp uge i16 %a, %b ; inverse predicate of %cmp (ult)
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int16_ule(i16 %a, i16 %b) { ; ule and its inverse ugt on the same operands; the expected PTX below has one setp.le.u16 driving both branches
+; CHECK-LABEL: test_int16_ule(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_int16_ule_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_int16_ule_param_0];
+; CHECK-NEXT: setp.le.u16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB7_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB7_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB7_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB7_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp ule i16 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; %tmp has no users
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp ugt i16 %a, %b ; inverse predicate of %cmp (ule)
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int16_ugt(i16 %a, i16 %b) { ; ugt and its inverse ule on the same operands; the expected PTX below has one setp.gt.u16 driving both branches
+; CHECK-LABEL: test_int16_ugt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_int16_ugt_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_int16_ugt_param_0];
+; CHECK-NEXT: setp.gt.u16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB8_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB8_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB8_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB8_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp ugt i16 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; %tmp has no users
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp ule i16 %a, %b ; inverse predicate of %cmp (ugt)
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int16_uge(i16 %a, i16 %b) { ; uge and its inverse ult on the same operands; the expected PTX below has one setp.ge.u16 driving both branches
+; CHECK-LABEL: test_int16_uge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_int16_uge_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_int16_uge_param_0];
+; CHECK-NEXT: setp.ge.u16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB9_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB9_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB9_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB9_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp uge i16 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; %tmp has no users
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp ult i16 %a, %b ; inverse predicate of %cmp (uge)
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
diff --git a/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-int32.ll b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-int32.ll
new file mode 100644
index 0000000000000..4d1f7c97fe594
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-int32.ll
@@ -0,0 +1,427 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -march=nvptx64 -mcpu=sm_100 -O2 < %s | FileCheck %s
+; RUN: %if ptxas-sm_100 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -O2 | %ptxas-verify -arch=sm_100 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
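+; Test that an i32 integer comparison and its inverse on the same operands (eq/ne, slt/sge, sle/sgt, ult/uge, ule/ugt) share one SETP, with the second branch using the negated predicate.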
+
+define i32 @test_int32_eq(i32 %a, i32 %b) { ; eq and its inverse ne on the same operands; the expected PTX below has one setp.eq.b32 driving both branches
+; CHECK-LABEL: test_int32_eq(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r2, [test_int32_eq_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_int32_eq_param_0];
+; CHECK-NEXT: setp.eq.b32 %p1, %r1, %r2;
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: @%p1 bra $L__BB0_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r3, 1;
+; CHECK-NEXT: $L__BB0_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB0_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: $L__BB0_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp eq i32 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; %tmp has no users
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp ne i32 %a, %b ; inverse predicate of %cmp (eq)
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int32_ne(i32 %a, i32 %b) { ; ne and its inverse eq on the same operands; the expected PTX below has one setp.ne.b32 driving both branches
+; CHECK-LABEL: test_int32_ne(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r2, [test_int32_ne_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_int32_ne_param_0];
+; CHECK-NEXT: setp.ne.b32 %p1, %r1, %r2;
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: @%p1 bra $L__BB1_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r3, 1;
+; CHECK-NEXT: $L__BB1_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB1_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: $L__BB1_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp ne i32 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; %tmp has no users
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp eq i32 %a, %b ; inverse predicate of %cmp (ne)
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int32_slt(i32 %a, i32 %b) { ; slt and its inverse sge on the same operands; the expected PTX below has one setp.lt.s32 driving both branches
+; CHECK-LABEL: test_int32_slt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r2, [test_int32_slt_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_int32_slt_param_0];
+; CHECK-NEXT: setp.lt.s32 %p1, %r1, %r2;
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: @%p1 bra $L__BB2_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r3, 1;
+; CHECK-NEXT: $L__BB2_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB2_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: $L__BB2_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp slt i32 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; %tmp has no users
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp sge i32 %a, %b ; inverse predicate of %cmp (slt)
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int32_sle(i32 %a, i32 %b) { ; sle and its inverse sgt on the same operands; the expected PTX below has one setp.le.s32 driving both branches
+; CHECK-LABEL: test_int32_sle(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r2, [test_int32_sle_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_int32_sle_param_0];
+; CHECK-NEXT: setp.le.s32 %p1, %r1, %r2;
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: @%p1 bra $L__BB3_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r3, 1;
+; CHECK-NEXT: $L__BB3_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB3_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: $L__BB3_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp sle i32 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; %tmp has no users
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp sgt i32 %a, %b ; inverse predicate of %cmp (sle)
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int32_sgt(i32 %a, i32 %b) { ; sgt and its inverse sle on the same operands; the expected PTX below has one setp.gt.s32 driving both branches
+; CHECK-LABEL: test_int32_sgt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r2, [test_int32_sgt_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_int32_sgt_param_0];
+; CHECK-NEXT: setp.gt.s32 %p1, %r1, %r2;
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: @%p1 bra $L__BB4_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r3, 1;
+; CHECK-NEXT: $L__BB4_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB4_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: $L__BB4_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp sgt i32 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; %tmp has no users
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp sle i32 %a, %b ; inverse predicate of %cmp (sgt)
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int32_sge(i32 %a, i32 %b) { ; sge and its inverse slt on the same operands; the expected PTX below has one setp.ge.s32 driving both branches
+; CHECK-LABEL: test_int32_sge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r2, [test_int32_sge_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_int32_sge_param_0];
+; CHECK-NEXT: setp.ge.s32 %p1, %r1, %r2;
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: @%p1 bra $L__BB5_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r3, 1;
+; CHECK-NEXT: $L__BB5_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB5_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: $L__BB5_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp sge i32 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; %tmp has no users
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp slt i32 %a, %b ; inverse predicate of %cmp (sge)
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int32_ult(i32 %a, i32 %b) { ; ult and its inverse uge on the same operands; the expected PTX below has one setp.lt.u32 driving both branches
+; CHECK-LABEL: test_int32_ult(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r2, [test_int32_ult_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_int32_ult_param_0];
+; CHECK-NEXT: setp.lt.u32 %p1, %r1, %r2;
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: @%p1 bra $L__BB6_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r3, 1;
+; CHECK-NEXT: $L__BB6_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB6_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: $L__BB6_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp ult i32 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; %tmp has no users
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp uge i32 %a, %b ; inverse predicate of %cmp (ult)
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int32_ule(i32 %a, i32 %b) { ; ule and its inverse ugt on the same operands; the expected PTX below has one setp.le.u32 driving both branches
+; CHECK-LABEL: test_int32_ule(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r2, [test_int32_ule_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_int32_ule_param_0];
+; CHECK-NEXT: setp.le.u32 %p1, %r1, %r2;
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: @%p1 bra $L__BB7_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r3, 1;
+; CHECK-NEXT: $L__BB7_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB7_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: $L__BB7_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp ule i32 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; %tmp has no users
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp ugt i32 %a, %b ; inverse predicate of %cmp (ule)
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int32_ugt(i32 %a, i32 %b) { ; ugt and its inverse ule on the same operands; the expected PTX below has one setp.gt.u32 driving both branches
+; CHECK-LABEL: test_int32_ugt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r2, [test_int32_ugt_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_int32_ugt_param_0];
+; CHECK-NEXT: setp.gt.u32 %p1, %r1, %r2;
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: @%p1 bra $L__BB8_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r3, 1;
+; CHECK-NEXT: $L__BB8_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB8_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: $L__BB8_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp ugt i32 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; %tmp has no users
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp ule i32 %a, %b ; inverse predicate of %cmp (ugt)
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int32_uge(i32 %a, i32 %b) { ; uge and its inverse ult on the same operands; the expected PTX below has one setp.ge.u32 driving both branches
+; CHECK-LABEL: test_int32_uge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r2, [test_int32_uge_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_int32_uge_param_0];
+; CHECK-NEXT: setp.ge.u32 %p1, %r1, %r2;
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: @%p1 bra $L__BB9_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r3, 1;
+; CHECK-NEXT: $L__BB9_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB9_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: $L__BB9_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp uge i32 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; %tmp has no users
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp ult i32 %a, %b ; inverse predicate of %cmp (uge)
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
diff --git a/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-int64.ll b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-int64.ll
new file mode 100644
index 0000000000000..bc90ea4335d2e
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-int64.ll
@@ -0,0 +1,437 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -march=nvptx64 -mcpu=sm_100 -O2 < %s | FileCheck %s
+; RUN: %if ptxas-sm_100 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -O2 | %ptxas-verify -arch=sm_100 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
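+; Test that an i64 integer comparison and its inverse on the same operands (eq/ne, slt/sge, sle/sgt, ult/uge, ule/ugt) share one SETP, with the second branch using the negated predicate.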
+
+define i32 @test_int64_eq(i64 %a, i64 %b) { ; eq and its inverse ne on the same operands; the expected PTX below has one setp.eq.b64 driving both branches
+; CHECK-LABEL: test_int64_eq(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd2, [test_int64_eq_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_int64_eq_param_0];
+; CHECK-NEXT: setp.eq.b64 %p1, %rd1, %rd2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB0_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB0_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB0_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB0_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp eq i64 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; %tmp has no users
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp ne i64 %a, %b ; inverse predicate of %cmp (eq)
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int64_ne(i64 %a, i64 %b) { ; ne and its inverse eq on the same operands; the expected PTX below has one setp.ne.b64 driving both branches
+; CHECK-LABEL: test_int64_ne(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd2, [test_int64_ne_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_int64_ne_param_0];
+; CHECK-NEXT: setp.ne.b64 %p1, %rd1, %rd2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB1_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB1_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB1_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB1_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp ne i64 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; %tmp has no users
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp eq i64 %a, %b ; inverse predicate of %cmp (ne)
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int64_slt(i64 %a, i64 %b) { ; slt and its inverse sge on the same operands; the expected PTX below has one setp.lt.s64 driving both branches
+; CHECK-LABEL: test_int64_slt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd2, [test_int64_slt_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_int64_slt_param_0];
+; CHECK-NEXT: setp.lt.s64 %p1, %rd1, %rd2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB2_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB2_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB2_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB2_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp slt i64 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; %tmp has no users
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp sge i64 %a, %b ; inverse predicate of %cmp (slt)
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int64_sle(i64 %a, i64 %b) { ; sle and its inverse sgt on the same operands; the expected PTX below has one setp.le.s64 driving both branches
+; CHECK-LABEL: test_int64_sle(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd2, [test_int64_sle_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_int64_sle_param_0];
+; CHECK-NEXT: setp.le.s64 %p1, %rd1, %rd2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB3_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB3_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB3_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB3_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp sle i64 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; %tmp has no users
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp sgt i64 %a, %b ; inverse predicate of %cmp (sle)
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int64_sgt(i64 %a, i64 %b) { ; sgt and its inverse sle on the same operands; the expected PTX below has one setp.gt.s64 driving both branches
+; CHECK-LABEL: test_int64_sgt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd2, [test_int64_sgt_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_int64_sgt_param_0];
+; CHECK-NEXT: setp.gt.s64 %p1, %rd1, %rd2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB4_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB4_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB4_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB4_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp sgt i64 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; %tmp has no users
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp sle i64 %a, %b ; inverse predicate of %cmp (sgt)
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int64_sge(i64 %a, i64 %b) { ; sge and its inverse slt on the same operands; the expected PTX below has one setp.ge.s64 driving both branches
+; CHECK-LABEL: test_int64_sge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd2, [test_int64_sge_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_int64_sge_param_0];
+; CHECK-NEXT: setp.ge.s64 %p1, %rd1, %rd2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB5_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB5_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB5_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB5_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp sge i64 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; %tmp has no users
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp slt i64 %a, %b ; inverse predicate of %cmp (sge)
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int64_ult(i64 %a, i64 %b) { ; ult and its inverse uge on the same operands; the expected PTX below has one setp.lt.u64 driving both branches
+; CHECK-LABEL: test_int64_ult(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd2, [test_int64_ult_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_int64_ult_param_0];
+; CHECK-NEXT: setp.lt.u64 %p1, %rd1, %rd2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB6_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB6_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB6_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB6_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp ult i64 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; %tmp has no users
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp uge i64 %a, %b ; inverse predicate of %cmp (ult)
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int64_ule(i64 %a, i64 %b) { ; ule and its inverse ugt on the same operands; the expected PTX below has one setp.le.u64 driving both branches
+; CHECK-LABEL: test_int64_ule(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd2, [test_int64_ule_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_int64_ule_param_0];
+; CHECK-NEXT: setp.le.u64 %p1, %rd1, %rd2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB7_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB7_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB7_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB7_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp ule i64 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; %tmp has no users
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp ugt i64 %a, %b ; inverse predicate of %cmp (ule)
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int64_ugt(i64 %a, i64 %b) { ; ugt and its inverse ule on the same operands; the expected PTX below has one setp.gt.u64 driving both branches
+; CHECK-LABEL: test_int64_ugt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd2, [test_int64_ugt_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_int64_ugt_param_0];
+; CHECK-NEXT: setp.gt.u64 %p1, %rd1, %rd2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB8_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB8_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB8_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB8_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp ugt i64 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; %tmp has no users
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp ule i64 %a, %b ; inverse predicate of %cmp (ugt)
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int64_uge(i64 %a, i64 %b) { ; uge and its inverse ult on the same operands; the expected PTX below has one setp.ge.u64 driving both branches
+; CHECK-LABEL: test_int64_uge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd2, [test_int64_uge_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_int64_uge_param_0];
+; CHECK-NEXT: setp.ge.u64 %p1, %rd1, %rd2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB9_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB9_2: // %merge1
+; CHECK-NEXT: @!%p1 bra $L__BB9_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB9_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp uge i64 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; %tmp has no users
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp ult i64 %a, %b ; inverse predicate of %cmp (uge)
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
diff --git a/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-multiple-users.ll b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-multiple-users.ll
new file mode 100644
index 0000000000000..bf4a77f599061
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-multiple-users.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -march=nvptx64 -mcpu=sm_100 -O2 < %s | FileCheck %s
+; RUN: %if ptxas-sm_100 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -O2 | %ptxas-verify -arch=sm_100 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @test_multiple_users(i32 %a, i32 %b) { ; %cmp feeds two branches and a select; the expected PTX below has one setp.eq.b32 used by selp.b32, @%p1 and @!%p1
+; CHECK-LABEL: test_multiple_users(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r3, [test_multiple_users_param_1];
+; CHECK-NEXT: ld.param.b32 %r2, [test_multiple_users_param_0];
+; CHECK-NEXT: setp.eq.b32 %p1, %r2, %r3;
+; CHECK-NEXT: mov.b32 %r5, 0;
+; CHECK-NEXT: @%p1 bra $L__BB0_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r5, 1;
+; CHECK-NEXT: $L__BB0_2: // %merge1
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
+; CHECK-NEXT: @!%p1 bra $L__BB0_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r5, 0;
+; CHECK-NEXT: $L__BB0_4: // %merge2
+; CHECK-NEXT: add.s32 %r4, %r5, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp eq i32 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4 ; %tmp has no users
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %val = select i1 %cmp, i32 1, i32 0 ; non-branch user of %cmp
+ br i1 %cmp, label %else, label %merge2 ; reuses %cmp directly; NOTE(review): this function contains no inverted icmp - confirm it exercises the multiple-users case intended
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ %ret = add i32 %phi2, %val
+ ret i32 %ret
+}
diff --git a/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-vector-float.ll b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-vector-float.ll
new file mode 100644
index 0000000000000..85df5da09a0e7
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-vector-float.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -march=nvptx64 -mcpu=sm_100 -O2 < %s | FileCheck %s
+; RUN: %if ptxas-sm_100 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -O2 | %ptxas-verify -arch=sm_100 %}
+
+; NOTE: Only scalar SETP predicate inversions are currently optimized by MachineCSE; the packed f16x2/bf16x2 compares below still emit an explicit not.pred.
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @test_float_f16x2_eq(i32 %arg) {
+; CHECK-LABEL: test_float_f16x2_eq(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<5>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [test_float_f16x2_eq_param_0];
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: setp.eq.f16x2 %p2|%p3, %r1, %r2;
+; CHECK-NEXT: and.pred %p1, %p2, %p3;
+; CHECK-NEXT: @%p1 bra $L__BB0_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r2, 1;
+; CHECK-NEXT: $L__BB0_2: // %merge1
+; CHECK-NEXT: not.pred %p4, %p1;
+; CHECK-NEXT: @%p4 bra $L__BB0_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: $L__BB0_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+entry:
+ %a = bitcast i32 %arg to <2 x half>
+ %zero = bitcast i32 0 to <2 x half>
+ %cmp = fcmp oeq <2 x half> %a, %zero
+ %e0 = extractelement <2 x i1> %cmp, i32 0
+ %e1 = extractelement <2 x i1> %cmp, i32 1
+ %and = and i1 %e0, %e1
+ br i1 %and, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ br i1 %and, label %else, label %merge2
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float_bf16x2_eq(i32 %arg) {
+; CHECK-LABEL: test_float_bf16x2_eq(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<5>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [test_float_bf16x2_eq_param_0];
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: setp.eq.bf16x2 %p2|%p3, %r1, %r2;
+; CHECK-NEXT: and.pred %p1, %p2, %p3;
+; CHECK-NEXT: @%p1 bra $L__BB1_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r2, 1;
+; CHECK-NEXT: $L__BB1_2: // %merge1
+; CHECK-NEXT: not.pred %p4, %p1;
+; CHECK-NEXT: @%p4 bra $L__BB1_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: $L__BB1_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+entry:
+ %a = bitcast i32 %arg to <2 x bfloat>
+ %zero = bitcast i32 0 to <2 x bfloat>
+ %cmp = fcmp oeq <2 x bfloat> %a, %zero
+ %e0 = extractelement <2 x i1> %cmp, i32 0
+ %e1 = extractelement <2 x i1> %cmp, i32 1
+ %and = and i1 %e0, %e1
+ br i1 %and, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ br i1 %and, label %else, label %merge2
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
diff --git a/llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-bfloat16.ll b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-bfloat16.ll
new file mode 100644
index 0000000000000..9e966e1843811
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-bfloat16.ll
@@ -0,0 +1,224 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -march=nvptx64 -mcpu=sm_100 -O2 < %s | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @test_bfloat16_ueq_vs_une(bfloat %a, bfloat %b) {
+; CHECK-LABEL: test_bfloat16_ueq_vs_une(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_bfloat16_ueq_vs_une_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_bfloat16_ueq_vs_une_param_0];
+; CHECK-NEXT: setp.equ.bf16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB0_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB0_2: // %merge1
+; CHECK-NEXT: setp.neu.bf16 %p2, %rs1, %rs2;
+; CHECK-NEXT: @%p2 bra $L__BB0_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB0_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ueq bfloat %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp une bfloat %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_bfloat16_oeq_vs_one(bfloat %a, bfloat %b) {
+; CHECK-LABEL: test_bfloat16_oeq_vs_one(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_bfloat16_oeq_vs_one_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_bfloat16_oeq_vs_one_param_0];
+; CHECK-NEXT: setp.eq.bf16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB1_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB1_2: // %merge1
+; CHECK-NEXT: setp.ne.bf16 %p2, %rs1, %rs2;
+; CHECK-NEXT: @%p2 bra $L__BB1_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB1_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp oeq bfloat %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp one bfloat %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_bfloat16_olt_vs_ogt(bfloat %a, bfloat %b) {
+; CHECK-LABEL: test_bfloat16_olt_vs_ogt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_bfloat16_olt_vs_ogt_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_bfloat16_olt_vs_ogt_param_0];
+; CHECK-NEXT: setp.lt.bf16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB2_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB2_2: // %merge1
+; CHECK-NEXT: setp.gt.bf16 %p2, %rs1, %rs2;
+; CHECK-NEXT: @%p2 bra $L__BB2_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB2_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp olt bfloat %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ogt bfloat %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_bfloat16_ult_vs_ugt(bfloat %a, bfloat %b) {
+; CHECK-LABEL: test_bfloat16_ult_vs_ugt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_bfloat16_ult_vs_ugt_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_bfloat16_ult_vs_ugt_param_0];
+; CHECK-NEXT: setp.ltu.bf16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB3_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB3_2: // %merge1
+; CHECK-NEXT: setp.gtu.bf16 %p2, %rs1, %rs2;
+; CHECK-NEXT: @%p2 bra $L__BB3_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB3_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ult bfloat %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ugt bfloat %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_bfloat16_oeq_vs_olt(bfloat %a, bfloat %b) {
+; CHECK-LABEL: test_bfloat16_oeq_vs_olt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_bfloat16_oeq_vs_olt_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_bfloat16_oeq_vs_olt_param_0];
+; CHECK-NEXT: setp.eq.bf16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB4_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB4_2: // %merge1
+; CHECK-NEXT: setp.lt.bf16 %p2, %rs1, %rs2;
+; CHECK-NEXT: @%p2 bra $L__BB4_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB4_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp oeq bfloat %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp olt bfloat %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
diff --git a/llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-float16.ll b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-float16.ll
new file mode 100644
index 0000000000000..6ea217ab97b53
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-float16.ll
@@ -0,0 +1,224 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -march=nvptx64 -mcpu=sm_100 -O2 < %s | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @test_float16_ueq_vs_une(half %a, half %b) {
+; CHECK-LABEL: test_float16_ueq_vs_une(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_float16_ueq_vs_une_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_float16_ueq_vs_une_param_0];
+; CHECK-NEXT: setp.equ.f16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB0_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB0_2: // %merge1
+; CHECK-NEXT: setp.neu.f16 %p2, %rs1, %rs2;
+; CHECK-NEXT: @%p2 bra $L__BB0_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB0_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ueq half %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp une half %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float16_oeq_vs_one(half %a, half %b) {
+; CHECK-LABEL: test_float16_oeq_vs_one(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_float16_oeq_vs_one_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_float16_oeq_vs_one_param_0];
+; CHECK-NEXT: setp.eq.f16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB1_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB1_2: // %merge1
+; CHECK-NEXT: setp.ne.f16 %p2, %rs1, %rs2;
+; CHECK-NEXT: @%p2 bra $L__BB1_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB1_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp oeq half %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp one half %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float16_olt_vs_ogt(half %a, half %b) {
+; CHECK-LABEL: test_float16_olt_vs_ogt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_float16_olt_vs_ogt_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_float16_olt_vs_ogt_param_0];
+; CHECK-NEXT: setp.lt.f16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB2_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB2_2: // %merge1
+; CHECK-NEXT: setp.gt.f16 %p2, %rs1, %rs2;
+; CHECK-NEXT: @%p2 bra $L__BB2_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB2_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp olt half %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ogt half %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float16_ult_vs_ugt(half %a, half %b) {
+; CHECK-LABEL: test_float16_ult_vs_ugt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_float16_ult_vs_ugt_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_float16_ult_vs_ugt_param_0];
+; CHECK-NEXT: setp.ltu.f16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB3_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB3_2: // %merge1
+; CHECK-NEXT: setp.gtu.f16 %p2, %rs1, %rs2;
+; CHECK-NEXT: @%p2 bra $L__BB3_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB3_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ult half %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ugt half %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float16_oeq_vs_olt(half %a, half %b) {
+; CHECK-LABEL: test_float16_oeq_vs_olt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_float16_oeq_vs_olt_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_float16_oeq_vs_olt_param_0];
+; CHECK-NEXT: setp.eq.f16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB4_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB4_2: // %merge1
+; CHECK-NEXT: setp.lt.f16 %p2, %rs1, %rs2;
+; CHECK-NEXT: @%p2 bra $L__BB4_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB4_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp oeq half %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp olt half %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
diff --git a/llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-float32.ll b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-float32.ll
new file mode 100644
index 0000000000000..9008782b70b25
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-float32.ll
@@ -0,0 +1,214 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -march=nvptx64 -mcpu=sm_100 -O2 < %s | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @test_float32_ueq_vs_une(float %arg) {
+; CHECK-LABEL: test_float32_ueq_vs_une(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [test_float32_ueq_vs_une_param_0];
+; CHECK-NEXT: setp.equ.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: @%p1 bra $L__BB0_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r2, 1;
+; CHECK-NEXT: $L__BB0_2: // %merge1
+; CHECK-NEXT: setp.neu.f32 %p2, %r1, 0f00000000;
+; CHECK-NEXT: @%p2 bra $L__BB0_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: $L__BB0_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ueq float %arg, 0.000000e+00
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp une float %arg, 0.000000e+00
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float32_oeq_vs_one(float %arg) {
+; CHECK-LABEL: test_float32_oeq_vs_one(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [test_float32_oeq_vs_one_param_0];
+; CHECK-NEXT: setp.eq.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: @%p1 bra $L__BB1_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r2, 1;
+; CHECK-NEXT: $L__BB1_2: // %merge1
+; CHECK-NEXT: setp.ne.f32 %p2, %r1, 0f00000000;
+; CHECK-NEXT: @%p2 bra $L__BB1_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: $L__BB1_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp oeq float %arg, 0.000000e+00
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp one float %arg, 0.000000e+00
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float32_olt_vs_ogt(float %arg) {
+; CHECK-LABEL: test_float32_olt_vs_ogt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [test_float32_olt_vs_ogt_param_0];
+; CHECK-NEXT: setp.lt.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: @%p1 bra $L__BB2_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r2, 1;
+; CHECK-NEXT: $L__BB2_2: // %merge1
+; CHECK-NEXT: setp.gt.f32 %p2, %r1, 0f00000000;
+; CHECK-NEXT: @%p2 bra $L__BB2_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: $L__BB2_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp olt float %arg, 0.000000e+00
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ogt float %arg, 0.000000e+00
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float32_ult_vs_ugt(float %arg) {
+; CHECK-LABEL: test_float32_ult_vs_ugt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [test_float32_ult_vs_ugt_param_0];
+; CHECK-NEXT: setp.ltu.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: @%p1 bra $L__BB3_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r2, 1;
+; CHECK-NEXT: $L__BB3_2: // %merge1
+; CHECK-NEXT: setp.gtu.f32 %p2, %r1, 0f00000000;
+; CHECK-NEXT: @%p2 bra $L__BB3_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: $L__BB3_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ult float %arg, 0.000000e+00
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ugt float %arg, 0.000000e+00
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float32_oeq_vs_olt(float %arg) {
+; CHECK-LABEL: test_float32_oeq_vs_olt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r1, [test_float32_oeq_vs_olt_param_0];
+; CHECK-NEXT: setp.eq.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: @%p1 bra $L__BB4_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r2, 1;
+; CHECK-NEXT: $L__BB4_2: // %merge1
+; CHECK-NEXT: setp.lt.f32 %p2, %r1, 0f00000000;
+; CHECK-NEXT: @%p2 bra $L__BB4_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r2, 0;
+; CHECK-NEXT: $L__BB4_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp oeq float %arg, 0.000000e+00
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp olt float %arg, 0.000000e+00
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
diff --git a/llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-float64.ll b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-float64.ll
new file mode 100644
index 0000000000000..1c6ab1ef6f093
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-float64.ll
@@ -0,0 +1,219 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -march=nvptx64 -mcpu=sm_100 -O2 < %s | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @test_float64_ueq_vs_une(double %arg) {
+; CHECK-LABEL: test_float64_ueq_vs_une(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [test_float64_ueq_vs_une_param_0];
+; CHECK-NEXT: setp.equ.f64 %p1, %rd1, 0d0000000000000000;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB0_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB0_2: // %merge1
+; CHECK-NEXT: setp.neu.f64 %p2, %rd1, 0d0000000000000000;
+; CHECK-NEXT: @%p2 bra $L__BB0_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB0_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ueq double %arg, 0.000000e+00
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp une double %arg, 0.000000e+00
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float64_oeq_vs_one(double %arg) {
+; CHECK-LABEL: test_float64_oeq_vs_one(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [test_float64_oeq_vs_one_param_0];
+; CHECK-NEXT: setp.eq.f64 %p1, %rd1, 0d0000000000000000;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB1_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB1_2: // %merge1
+; CHECK-NEXT: setp.ne.f64 %p2, %rd1, 0d0000000000000000;
+; CHECK-NEXT: @%p2 bra $L__BB1_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB1_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp oeq double %arg, 0.000000e+00
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp one double %arg, 0.000000e+00
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float64_olt_vs_ogt(double %arg) {
+; CHECK-LABEL: test_float64_olt_vs_ogt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [test_float64_olt_vs_ogt_param_0];
+; CHECK-NEXT: setp.lt.f64 %p1, %rd1, 0d0000000000000000;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB2_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB2_2: // %merge1
+; CHECK-NEXT: setp.gt.f64 %p2, %rd1, 0d0000000000000000;
+; CHECK-NEXT: @%p2 bra $L__BB2_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB2_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp olt double %arg, 0.000000e+00
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ogt double %arg, 0.000000e+00
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float64_ult_vs_ugt(double %arg) {
+; CHECK-LABEL: test_float64_ult_vs_ugt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [test_float64_ult_vs_ugt_param_0];
+; CHECK-NEXT: setp.ltu.f64 %p1, %rd1, 0d0000000000000000;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB3_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB3_2: // %merge1
+; CHECK-NEXT: setp.gtu.f64 %p2, %rd1, 0d0000000000000000;
+; CHECK-NEXT: @%p2 bra $L__BB3_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB3_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp ult double %arg, 0.000000e+00
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp ugt double %arg, 0.000000e+00
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_float64_oeq_vs_olt(double %arg) {
+; CHECK-LABEL: test_float64_oeq_vs_olt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd1, [test_float64_oeq_vs_olt_param_0];
+; CHECK-NEXT: setp.eq.f64 %p1, %rd1, 0d0000000000000000;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB4_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB4_2: // %merge1
+; CHECK-NEXT: setp.lt.f64 %p2, %rd1, 0d0000000000000000;
+; CHECK-NEXT: @%p2 bra $L__BB4_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB4_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = fcmp oeq double %arg, 0.000000e+00
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = fcmp olt double %arg, 0.000000e+00
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
diff --git a/llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-int16.ll b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-int16.ll
new file mode 100644
index 0000000000000..e9656967225e3
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-int16.ll
@@ -0,0 +1,224 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -march=nvptx64 -mcpu=sm_100 -O2 < %s | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @test_int16_slt_vs_sgt(i16 %a, i16 %b) {
+; CHECK-LABEL: test_int16_slt_vs_sgt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_int16_slt_vs_sgt_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_int16_slt_vs_sgt_param_0];
+; CHECK-NEXT: setp.lt.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB0_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB0_2: // %merge1
+; CHECK-NEXT: setp.gt.s16 %p2, %rs1, %rs2;
+; CHECK-NEXT: @%p2 bra $L__BB0_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB0_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp slt i16 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp sgt i16 %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int16_ult_vs_ugt(i16 %a, i16 %b) {
+; CHECK-LABEL: test_int16_ult_vs_ugt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_int16_ult_vs_ugt_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_int16_ult_vs_ugt_param_0];
+; CHECK-NEXT: setp.lt.u16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB1_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB1_2: // %merge1
+; CHECK-NEXT: setp.gt.u16 %p2, %rs1, %rs2;
+; CHECK-NEXT: @%p2 bra $L__BB1_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB1_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp ult i16 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp ugt i16 %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int16_slt_vs_uge(i16 %a, i16 %b) {
+; CHECK-LABEL: test_int16_slt_vs_uge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_int16_slt_vs_uge_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_int16_slt_vs_uge_param_0];
+; CHECK-NEXT: setp.lt.s16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB2_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB2_2: // %merge1
+; CHECK-NEXT: setp.ge.u16 %p2, %rs1, %rs2;
+; CHECK-NEXT: @%p2 bra $L__BB2_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB2_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp slt i16 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp uge i16 %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int16_eq_vs_slt(i16 %a, i16 %b) {
+; CHECK-LABEL: test_int16_eq_vs_slt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_int16_eq_vs_slt_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_int16_eq_vs_slt_param_0];
+; CHECK-NEXT: setp.eq.b16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB3_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB3_2: // %merge1
+; CHECK-NEXT: setp.lt.s16 %p2, %rs1, %rs2;
+; CHECK-NEXT: @%p2 bra $L__BB3_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB3_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp eq i16 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp slt i16 %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int16_ult_vs_sge(i16 %a, i16 %b) {
+; CHECK-LABEL: test_int16_ult_vs_sge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b16 %rs2, [test_int16_ult_vs_sge_param_1];
+; CHECK-NEXT: ld.param.b16 %rs1, [test_int16_ult_vs_sge_param_0];
+; CHECK-NEXT: setp.lt.u16 %p1, %rs1, %rs2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB4_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB4_2: // %merge1
+; CHECK-NEXT: setp.ge.s16 %p2, %rs1, %rs2;
+; CHECK-NEXT: @%p2 bra $L__BB4_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB4_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp ult i16 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp sge i16 %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
diff --git a/llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-int32.ll b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-int32.ll
new file mode 100644
index 0000000000000..3820ea527dc05
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-int32.ll
@@ -0,0 +1,219 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -march=nvptx64 -mcpu=sm_100 -O2 < %s | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @test_int32_slt_vs_sgt(i32 %a, i32 %b) {
+; CHECK-LABEL: test_int32_slt_vs_sgt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r2, [test_int32_slt_vs_sgt_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_int32_slt_vs_sgt_param_0];
+; CHECK-NEXT: setp.lt.s32 %p1, %r1, %r2;
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: @%p1 bra $L__BB0_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r3, 1;
+; CHECK-NEXT: $L__BB0_2: // %merge1
+; CHECK-NEXT: setp.gt.s32 %p2, %r1, %r2;
+; CHECK-NEXT: @%p2 bra $L__BB0_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: $L__BB0_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp slt i32 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp sgt i32 %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int32_ult_vs_ugt(i32 %a, i32 %b) {
+; CHECK-LABEL: test_int32_ult_vs_ugt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r2, [test_int32_ult_vs_ugt_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_int32_ult_vs_ugt_param_0];
+; CHECK-NEXT: setp.lt.u32 %p1, %r1, %r2;
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: @%p1 bra $L__BB1_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r3, 1;
+; CHECK-NEXT: $L__BB1_2: // %merge1
+; CHECK-NEXT: setp.gt.u32 %p2, %r1, %r2;
+; CHECK-NEXT: @%p2 bra $L__BB1_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: $L__BB1_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp ult i32 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp ugt i32 %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int32_slt_vs_uge(i32 %a, i32 %b) {
+; CHECK-LABEL: test_int32_slt_vs_uge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r2, [test_int32_slt_vs_uge_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_int32_slt_vs_uge_param_0];
+; CHECK-NEXT: setp.lt.s32 %p1, %r1, %r2;
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: @%p1 bra $L__BB2_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r3, 1;
+; CHECK-NEXT: $L__BB2_2: // %merge1
+; CHECK-NEXT: setp.ge.u32 %p2, %r1, %r2;
+; CHECK-NEXT: @%p2 bra $L__BB2_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: $L__BB2_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp slt i32 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp uge i32 %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int32_eq_vs_slt(i32 %a, i32 %b) {
+; CHECK-LABEL: test_int32_eq_vs_slt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r2, [test_int32_eq_vs_slt_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_int32_eq_vs_slt_param_0];
+; CHECK-NEXT: setp.eq.b32 %p1, %r1, %r2;
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: @%p1 bra $L__BB3_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r3, 1;
+; CHECK-NEXT: $L__BB3_2: // %merge1
+; CHECK-NEXT: setp.lt.s32 %p2, %r1, %r2;
+; CHECK-NEXT: @%p2 bra $L__BB3_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: $L__BB3_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp eq i32 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp slt i32 %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int32_ult_vs_sge(i32 %a, i32 %b) {
+; CHECK-LABEL: test_int32_ult_vs_sge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b32 %r2, [test_int32_ult_vs_sge_param_1];
+; CHECK-NEXT: ld.param.b32 %r1, [test_int32_ult_vs_sge_param_0];
+; CHECK-NEXT: setp.lt.u32 %p1, %r1, %r2;
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: @%p1 bra $L__BB4_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r3, 1;
+; CHECK-NEXT: $L__BB4_2: // %merge1
+; CHECK-NEXT: setp.ge.s32 %p2, %r1, %r2;
+; CHECK-NEXT: @%p2 bra $L__BB4_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r3, 0;
+; CHECK-NEXT: $L__BB4_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp ult i32 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp sge i32 %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
diff --git a/llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-int64.ll b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-int64.ll
new file mode 100644
index 0000000000000..9989929f8f5db
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-no-inversion-int64.ll
@@ -0,0 +1,224 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -march=nvptx64 -mcpu=sm_100 -O2 < %s | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+define i32 @test_int64_slt_vs_sgt(i64 %a, i64 %b) {
+; CHECK-LABEL: test_int64_slt_vs_sgt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd2, [test_int64_slt_vs_sgt_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_int64_slt_vs_sgt_param_0];
+; CHECK-NEXT: setp.lt.s64 %p1, %rd1, %rd2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB0_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB0_2: // %merge1
+; CHECK-NEXT: setp.gt.s64 %p2, %rd1, %rd2;
+; CHECK-NEXT: @%p2 bra $L__BB0_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB0_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp slt i64 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp sgt i64 %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int64_ult_vs_ugt(i64 %a, i64 %b) {
+; CHECK-LABEL: test_int64_ult_vs_ugt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd2, [test_int64_ult_vs_ugt_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_int64_ult_vs_ugt_param_0];
+; CHECK-NEXT: setp.lt.u64 %p1, %rd1, %rd2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB1_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB1_2: // %merge1
+; CHECK-NEXT: setp.gt.u64 %p2, %rd1, %rd2;
+; CHECK-NEXT: @%p2 bra $L__BB1_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB1_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp ult i64 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp ugt i64 %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int64_slt_vs_uge(i64 %a, i64 %b) {
+; CHECK-LABEL: test_int64_slt_vs_uge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd2, [test_int64_slt_vs_uge_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_int64_slt_vs_uge_param_0];
+; CHECK-NEXT: setp.lt.s64 %p1, %rd1, %rd2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB2_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB2_2: // %merge1
+; CHECK-NEXT: setp.ge.u64 %p2, %rd1, %rd2;
+; CHECK-NEXT: @%p2 bra $L__BB2_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB2_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp slt i64 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp uge i64 %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int64_eq_vs_slt(i64 %a, i64 %b) {
+; CHECK-LABEL: test_int64_eq_vs_slt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd2, [test_int64_eq_vs_slt_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_int64_eq_vs_slt_param_0];
+; CHECK-NEXT: setp.eq.b64 %p1, %rd1, %rd2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB3_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB3_2: // %merge1
+; CHECK-NEXT: setp.lt.s64 %p2, %rd1, %rd2;
+; CHECK-NEXT: @%p2 bra $L__BB3_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB3_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp eq i64 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp slt i64 %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
+
+define i32 @test_int64_ult_vs_sge(i64 %a, i64 %b) {
+; CHECK-LABEL: test_int64_ult_vs_sge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd2, [test_int64_ult_vs_sge_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_int64_ult_vs_sge_param_0];
+; CHECK-NEXT: setp.lt.u64 %p1, %rd1, %rd2;
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: @%p1 bra $L__BB4_2;
+; CHECK-NEXT: // %bb.1: // %then
+; CHECK-NEXT: mov.b32 %r1, 1;
+; CHECK-NEXT: $L__BB4_2: // %merge1
+; CHECK-NEXT: setp.ge.s64 %p2, %rd1, %rd2;
+; CHECK-NEXT: @%p2 bra $L__BB4_4;
+; CHECK-NEXT: // %bb.3: // %else
+; CHECK-NEXT: mov.b32 %r1, 0;
+; CHECK-NEXT: $L__BB4_4: // %merge2
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %cmp = icmp ult i64 %a, %b
+ br i1 %cmp, label %merge1, label %then
+
+then:
+ %tmp = load i32, ptr addrspace(1) null, align 4
+ br label %merge1
+
+merge1:
+ %phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
+ %cmp2 = icmp sge i64 %a, %b
+ br i1 %cmp2, label %merge2, label %else
+
+else:
+ br label %merge2
+
+merge2:
+ %phi2 = phi i32 [ %phi1, %merge1 ], [ 0, %else ]
+ ret i32 %phi2
+}
>From b0b3e5082509eb482bcec712ca837b3ed927a62f Mon Sep 17 00:00:00 2001
From: Modi Mo <mmo at nvidia.com>
Date: Mon, 20 Apr 2026 18:43:48 -0700
Subject: [PATCH 2/2] update rollback logic and add test exercising it
---
llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp | 36 +++++-----
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 2 +-
...-cse-predicate-inversion-multiple-users.ll | 16 +++--
...chine-cse-predicate-inversion-rollback.mir | 66 +++++++++++++++++++
4 files changed, 93 insertions(+), 27 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-rollback.mir
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index 495dd2518bf4d..d9900e98fe694 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -344,30 +344,28 @@ MachineInstr *NVPTXInstrInfo::commuteInstructionImpl(MachineInstr &MI,
if (!isIntegerSetp(MI) && !isScalarFloatSetp(MI))
return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
- invertScalarCompareInstr(MI);
-
// For now all users must be invertible conditional branches.
// TODO: Support other users such as selects.
- bool AllInverted = true;
MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ SmallVector<MachineBasicBlock *, 4> BranchMBBs;
for (MachineInstr &UseMI :
MRI.use_nodbg_instructions(MI.getOperand(0).getReg())) {
- if (!(UseMI.isConditionalBranch() &&
- invertPredicateBranchInstr(*UseMI.getParent()))) {
- AllInverted = false;
- break;
- }
+ if (!UseMI.isConditionalBranch())
+ return nullptr;
+ BranchMBBs.push_back(UseMI.getParent());
}
- if (!AllInverted) {
- for (MachineInstr &UseMI :
- MRI.use_nodbg_instructions(MI.getOperand(0).getReg())) {
- if (!(UseMI.isConditionalBranch() &&
- invertPredicateBranchInstr(*UseMI.getParent())))
- break;
- }
- invertScalarCompareInstr(MI);
- return nullptr;
- }
- return &MI;
+ invertScalarCompareInstr(MI);
+ auto *Failed = llvm::find_if(BranchMBBs, [this](MachineBasicBlock *MBB) {
+ return !invertPredicateBranchInstr(*MBB);
+ });
+ if (Failed == BranchMBBs.end())
+ return &MI;
+
+ // Couldn't invert one of the branches. Re-invert the branches that precede
+ // the failing one (already flipped above), then undo the compare-mode flip.
+ for (MachineBasicBlock *MBB : make_range(BranchMBBs.begin(), Failed))
+ invertPredicateBranchInstr(*MBB);
+ invertScalarCompareInstr(MI);
+ return nullptr;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 969c1ffbe510e..5ca7941a77a7d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -73,7 +73,7 @@ def FTZFlag : OperandWithDefaultOps<i1, (ops (getFTZFlag (i1 0)))> {
let PrintMethod = "printFTZFlag";
}
-def BranchFlag : OperandWithDefaultOps<i32, (ops (i32 0))> {
+def BranchFlag : OperandWithDefaultOps<i32, (ops (i1 0))> {
let PrintMethod = "printNegatedPredicate";
}
diff --git a/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-multiple-users.ll b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-multiple-users.ll
index bf4a77f599061..7b71053758147 100644
--- a/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-multiple-users.ll
+++ b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-multiple-users.ll
@@ -7,7 +7,7 @@ target triple = "nvptx64-nvidia-cuda"
define i32 @test_multiple_users(i32 %a, i32 %b) {
; CHECK-LABEL: test_multiple_users(
; CHECK: {
-; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
@@ -19,8 +19,9 @@ define i32 @test_multiple_users(i32 %a, i32 %b) {
; CHECK-NEXT: // %bb.1: // %then
; CHECK-NEXT: mov.b32 %r5, 1;
; CHECK-NEXT: $L__BB0_2: // %merge1
-; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
-; CHECK-NEXT: @!%p1 bra $L__BB0_4;
+; CHECK-NEXT: setp.ne.b32 %p2, %r2, %r3;
+; CHECK-NEXT: selp.b32 %r1, 1, 0, %p2;
+; CHECK-NEXT: @%p1 bra $L__BB0_4;
; CHECK-NEXT: // %bb.3: // %else
; CHECK-NEXT: mov.b32 %r5, 0;
; CHECK-NEXT: $L__BB0_4: // %merge2
@@ -28,8 +29,8 @@ define i32 @test_multiple_users(i32 %a, i32 %b) {
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
; CHECK-NEXT: ret;
entry:
- %cmp = icmp eq i32 %a, %b
- br i1 %cmp, label %merge1, label %then
+ %cmp1 = icmp eq i32 %a, %b
+ br i1 %cmp1, label %merge1, label %then
then:
%tmp = load i32, ptr addrspace(1) null, align 4
@@ -37,8 +38,9 @@ then:
merge1:
%phi1 = phi i32 [ 1, %then ], [ 0, %entry ]
- %val = select i1 %cmp, i32 1, i32 0
- br i1 %cmp, label %else, label %merge2
+ %cmp2 = icmp ne i32 %a, %b
+ %val = select i1 %cmp2, i32 1, i32 0
+ br i1 %cmp2, label %else, label %merge2
else:
br label %merge2
diff --git a/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-rollback.mir b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-rollback.mir
new file mode 100644
index 0000000000000..0a4e99da61939
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/machine-cse-predicate-inversion-rollback.mir
@@ -0,0 +1,66 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=nvptx64 -mcpu=sm_100 -run-pass=machine-cse -o - %s | FileCheck %s
+
+---
+name: rollback
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: rollback
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:b32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:b32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[SETP_i32rr:%[0-9]+]]:b1 = SETP_i32rr [[DEF]], [[DEF1]], 0
+ ; CHECK-NEXT: CBranch [[SETP_i32rr]], %bb.4, 0
+ ; CHECK-NEXT: GOTO %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[SETP_i32rr1:%[0-9]+]]:b1 = SETP_i32rr [[DEF]], [[DEF1]], 1
+ ; CHECK-NEXT: CBranch [[SETP_i32rr1]], %bb.4, 0
+ ; CHECK-NEXT: GOTO %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: CBranch [[SETP_i32rr1]], %bb.4, 0
+ ; CHECK-NEXT: CBranch [[SETP_i32rr1]], %bb.4, 0
+ ; CHECK-NEXT: GOTO %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: Return
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.4:
+ ; CHECK-NEXT: Return
+ bb.0:
+ successors: %bb.1, %bb.4
+
+ %0:b32 = IMPLICIT_DEF
+ %1:b32 = IMPLICIT_DEF
+ %2:b1 = SETP_i32rr %0, %1, 0
+ CBranch %2, %bb.4, 0
+ GOTO %bb.1
+
+ bb.1:
+ successors: %bb.2, %bb.4
+
+ %3:b1 = SETP_i32rr %0, %1, 1
+ CBranch %3, %bb.4, 0
+ GOTO %bb.2
+
+ bb.2:
+ successors: %bb.3, %bb.4
+
+ CBranch %3, %bb.4, 0
+ CBranch %3, %bb.4, 0
+ GOTO %bb.3
+
+ bb.3:
+ Return
+
+ bb.4:
+ Return
+
+...
More information about the llvm-branch-commits
mailing list