[llvm] [NVPTX] Lower i1 select with logical ops in the general case (PR #135868)
Alex MacLean via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 16 07:47:42 PDT 2025
https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/135868
From b521b6b4a8c2ba38247ec7b75905348ecba7846c Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Fri, 11 Apr 2025 18:46:13 +0000
Subject: [PATCH 1/3] pre-commit tests
---
llvm/test/CodeGen/NVPTX/bug22246.ll | 22 ++++-
llvm/test/CodeGen/NVPTX/i1-select.ll | 130 +++++++++++++++++++++++++++
2 files changed, 150 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/i1-select.ll
diff --git a/llvm/test/CodeGen/NVPTX/bug22246.ll b/llvm/test/CodeGen/NVPTX/bug22246.ll
index e0051d33f0b30..bdaffa5d6056e 100644
--- a/llvm/test/CodeGen/NVPTX/bug22246.ll
+++ b/llvm/test/CodeGen/NVPTX/bug22246.ll
@@ -1,13 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
-; CHECK-LABEL: _Z3foobbbPb
define void @_Z3foobbbPb(i1 zeroext %p1, i1 zeroext %p2, i1 zeroext %p3, ptr nocapture %output) {
+; CHECK-LABEL: _Z3foobbbPb(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.u8 %rs1, [_Z3foobbbPb_param_0];
+; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT: setp.eq.b16 %p1, %rs2, 1;
+; CHECK-NEXT: ld.param.u8 %r1, [_Z3foobbbPb_param_2];
+; CHECK-NEXT: ld.param.u8 %r2, [_Z3foobbbPb_param_1];
+; CHECK-NEXT: selp.b32 %r3, %r2, %r1, %p1;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r3;
+; CHECK-NEXT: and.b16 %rs4, %rs3, 1;
+; CHECK-NEXT: ld.param.u64 %rd1, [_Z3foobbbPb_param_3];
+; CHECK-NEXT: st.u8 [%rd1], %rs4;
+; CHECK-NEXT: ret;
entry:
-; CHECK: selp.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %p{{[0-9]+}}
%.sink.v = select i1 %p1, i1 %p2, i1 %p3
%frombool5 = zext i1 %.sink.v to i8
store i8 %frombool5, ptr %output, align 1
diff --git a/llvm/test/CodeGen/NVPTX/i1-select.ll b/llvm/test/CodeGen/NVPTX/i1-select.ll
new file mode 100644
index 0000000000000..53e1dcface25d
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/i1-select.ll
@@ -0,0 +1,130 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+
+target triple = "nvptx-nvidia-cuda"
+
+define i32 @test_select_i1_trunc(i32 %a, i32 %b, i32 %c, i32 %true, i32 %false) {
+; CHECK-LABEL: test_select_i1_trunc(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b32 %r<10>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_select_i1_trunc_param_0];
+; CHECK-NEXT: and.b32 %r2, %r1, 1;
+; CHECK-NEXT: setp.ne.b32 %p1, %r2, 0;
+; CHECK-NEXT: ld.param.u32 %r3, [test_select_i1_trunc_param_1];
+; CHECK-NEXT: ld.param.u32 %r4, [test_select_i1_trunc_param_2];
+; CHECK-NEXT: ld.param.u32 %r5, [test_select_i1_trunc_param_3];
+; CHECK-NEXT: selp.b32 %r6, %r3, %r4, %p1;
+; CHECK-NEXT: and.b32 %r7, %r6, 1;
+; CHECK-NEXT: setp.ne.b32 %p2, %r7, 0;
+; CHECK-NEXT: ld.param.u32 %r8, [test_select_i1_trunc_param_4];
+; CHECK-NEXT: selp.b32 %r9, %r5, %r8, %p2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r9;
+; CHECK-NEXT: ret;
+ %a_trunc = trunc i32 %a to i1
+ %b_trunc = trunc i32 %b to i1
+ %c_trunc = trunc i32 %c to i1
+ %select_i1 = select i1 %a_trunc, i1 %b_trunc, i1 %c_trunc
+ %select_ret = select i1 %select_i1, i32 %true, i32 %false
+ ret i32 %select_ret
+}
+
+define i32 @test_select_i1_trunc_2(i64 %a, i16 %b, i32 %c, i32 %true, i32 %false) {
+; CHECK-LABEL: test_select_i1_trunc_2(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b32 %r<8>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_select_i1_trunc_2_param_0];
+; CHECK-NEXT: and.b64 %rd2, %rd1, 1;
+; CHECK-NEXT: setp.ne.b64 %p1, %rd2, 0;
+; CHECK-NEXT: ld.param.u32 %r1, [test_select_i1_trunc_2_param_2];
+; CHECK-NEXT: ld.param.u32 %r2, [test_select_i1_trunc_2_param_3];
+; CHECK-NEXT: ld.param.u16 %r3, [test_select_i1_trunc_2_param_1];
+; CHECK-NEXT: selp.b32 %r4, %r3, %r1, %p1;
+; CHECK-NEXT: and.b32 %r5, %r4, 1;
+; CHECK-NEXT: setp.ne.b32 %p2, %r5, 0;
+; CHECK-NEXT: ld.param.u32 %r6, [test_select_i1_trunc_2_param_4];
+; CHECK-NEXT: selp.b32 %r7, %r2, %r6, %p2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r7;
+; CHECK-NEXT: ret;
+ %a_trunc = trunc i64 %a to i1
+ %b_trunc = trunc i16 %b to i1
+ %c_trunc = trunc i32 %c to i1
+ %select_i1 = select i1 %a_trunc, i1 %b_trunc, i1 %c_trunc
+ %select_ret = select i1 %select_i1, i32 %true, i32 %false
+ ret i32 %select_ret
+}
+
+define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %false) {
+; CHECK-LABEL: test_select_i1_basic(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<5>;
+; CHECK-NEXT: .reg .b32 %r<11>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_select_i1_basic_param_0];
+; CHECK-NEXT: setp.eq.s32 %p1, %r1, 0;
+; CHECK-NEXT: ld.param.u32 %r2, [test_select_i1_basic_param_1];
+; CHECK-NEXT: setp.eq.s32 %p2, %r2, 0;
+; CHECK-NEXT: ld.param.u32 %r3, [test_select_i1_basic_param_2];
+; CHECK-NEXT: setp.eq.s32 %p3, %r3, 0;
+; CHECK-NEXT: ld.param.u32 %r4, [test_select_i1_basic_param_3];
+; CHECK-NEXT: selp.b32 %r5, -1, 0, %p3;
+; CHECK-NEXT: selp.b32 %r6, -1, 0, %p2;
+; CHECK-NEXT: selp.b32 %r7, %r6, %r5, %p1;
+; CHECK-NEXT: and.b32 %r8, %r7, 1;
+; CHECK-NEXT: setp.ne.b32 %p4, %r8, 0;
+; CHECK-NEXT: ld.param.u32 %r9, [test_select_i1_basic_param_4];
+; CHECK-NEXT: selp.b32 %r10, %r4, %r9, %p4;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r10;
+; CHECK-NEXT: ret;
+ %b1 = icmp eq i32 %v1, 0
+ %b2 = icmp eq i32 %v2, 0
+ %b3 = icmp eq i32 %v3, 0
+ %select_i1 = select i1 %b1, i1 %b2, i1 %b3
+ %select_ret = select i1 %select_i1, i32 %true, i32 %false
+ ret i32 %select_ret
+}
+
+define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %false) {
+; CHECK-LABEL: test_select_i1_basic_folding(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<8>;
+; CHECK-NEXT: .reg .b32 %r<11>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [test_select_i1_basic_folding_param_0];
+; CHECK-NEXT: setp.eq.s32 %p1, %r1, 0;
+; CHECK-NEXT: ld.param.u32 %r2, [test_select_i1_basic_folding_param_1];
+; CHECK-NEXT: setp.eq.s32 %p2, %r2, 0;
+; CHECK-NEXT: ld.param.u32 %r3, [test_select_i1_basic_folding_param_2];
+; CHECK-NEXT: setp.eq.s32 %p3, %r3, 0;
+; CHECK-NEXT: ld.param.u32 %r4, [test_select_i1_basic_folding_param_3];
+; CHECK-NEXT: xor.pred %p4, %p1, %p2;
+; CHECK-NEXT: ld.param.u32 %r5, [test_select_i1_basic_folding_param_4];
+; CHECK-NEXT: and.pred %p5, %p4, %p3;
+; CHECK-NEXT: selp.b32 %r6, -1, 0, %p5;
+; CHECK-NEXT: selp.b32 %r7, -1, 0, %p3;
+; CHECK-NEXT: selp.b32 %r8, %r6, %r7, %p2;
+; CHECK-NEXT: and.b32 %r9, %r8, 1;
+; CHECK-NEXT: setp.ne.b32 %p6, %r9, 0;
+; CHECK-NEXT: xor.pred %p7, %p6, %p2;
+; CHECK-NEXT: selp.b32 %r10, %r4, %r5, %p7;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r10;
+; CHECK-NEXT: ret;
+ %b1 = icmp eq i32 %v1, 0
+ %b2 = icmp eq i32 %v2, 0
+ %b3 = icmp eq i32 %v3, 0
+ %b4 = xor i1 %b1, %b2
+ %b5 = and i1 %b4, %b3
+ %select_i1 = select i1 %b2, i1 %b5, i1 %b3
+ %b6 = xor i1 %select_i1, %b2
+ %select_ret = select i1 %b6, i32 %true, i32 %false
+ ret i32 %select_ret
+}
From abc14791cd6a315f2ee93f4ad0ba9e3a63e33e64 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Tue, 15 Apr 2025 16:22:38 +0000
Subject: [PATCH 2/3] [NVPTX] Lower i1 select with logical ops in the general case
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 52 ++--
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 2 -
llvm/test/CodeGen/NVPTX/bug22246.ll | 14 +-
llvm/test/CodeGen/NVPTX/i1-select.ll | 76 +++--
llvm/test/CodeGen/NVPTX/i128.ll | 300 ++++++++++----------
5 files changed, 222 insertions(+), 222 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 9bde2a976e164..abe4c27009698 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2850,6 +2850,40 @@ static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG,
return DAG.getSelect(DL, Ty, IsInf, X, Sub);
}
+static SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) {
+ assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
+
+ SDValue Cond = Op->getOperand(0);
+ SDValue TrueVal = Op->getOperand(1);
+ SDValue FalseVal = Op->getOperand(2);
+ SDLoc DL(Op);
+
+ // If both operands are truncated, we push the select through the truncates.
+ if (TrueVal.getOpcode() == ISD::TRUNCATE &&
+ FalseVal.getOpcode() == ISD::TRUNCATE) {
+ TrueVal = TrueVal.getOperand(0);
+ FalseVal = FalseVal.getOperand(0);
+
+ EVT VT = TrueVal.getSimpleValueType().bitsLE(FalseVal.getSimpleValueType())
+ ? TrueVal.getValueType()
+ : FalseVal.getValueType();
+ TrueVal = DAG.getAnyExtOrTrunc(TrueVal, DL, VT);
+ FalseVal = DAG.getAnyExtOrTrunc(FalseVal, DL, VT);
+ SDValue Select = DAG.getSelect(DL, VT, Cond, TrueVal, FalseVal);
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
+ }
+
+ // Otherwise, expand the select into a series of logical operations. These
+ // often can be folded into other operations either by us or ptxas.
+ TrueVal = DAG.getFreeze(TrueVal);
+ FalseVal = DAG.getFreeze(FalseVal);
+ SDValue And1 = DAG.getNode(ISD::AND, DL, MVT::i1, Cond, TrueVal);
+ SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
+ SDValue And2 = DAG.getNode(ISD::AND, DL, MVT::i1, NotCond, FalseVal);
+ SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i1, And1, And2);
+ return Or;
+}
+
SDValue
NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -2889,7 +2923,7 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRL_PARTS:
return LowerShiftRightParts(Op, DAG);
case ISD::SELECT:
- return LowerSelect(Op, DAG);
+ return lowerSELECT(Op, DAG);
case ISD::FROUND:
return LowerFROUND(Op, DAG);
case ISD::FCOPYSIGN:
@@ -3056,22 +3090,6 @@ SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachinePointerInfo(SV));
}
-SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
- SDValue Op0 = Op->getOperand(0);
- SDValue Op1 = Op->getOperand(1);
- SDValue Op2 = Op->getOperand(2);
- SDLoc DL(Op.getNode());
-
- assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
-
- Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
- Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
- SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
- SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
-
- return Trunc;
-}
-
SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (Op.getValueType() == MVT::i1)
return LowerLOADi1(Op, DAG);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index dd90746f6d9d6..7a8bf3bf33a94 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -324,8 +324,6 @@ class NVPTXTargetLowering : public TargetLowering {
SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSelect(SDValue Op, SelectionDAG &DAG) const;
-
SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/NVPTX/bug22246.ll b/llvm/test/CodeGen/NVPTX/bug22246.ll
index bdaffa5d6056e..321d30c38b183 100644
--- a/llvm/test/CodeGen/NVPTX/bug22246.ll
+++ b/llvm/test/CodeGen/NVPTX/bug22246.ll
@@ -9,21 +9,19 @@ define void @_Z3foobbbPb(i1 zeroext %p1, i1 zeroext %p2, i1 zeroext %p3, ptr noc
; CHECK-LABEL: _Z3foobbbPb(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
-; CHECK-NEXT: .reg .b16 %rs<5>;
-; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b16 %rs<7>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: ld.param.u8 %rs1, [_Z3foobbbPb_param_0];
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
; CHECK-NEXT: setp.eq.b16 %p1, %rs2, 1;
-; CHECK-NEXT: ld.param.u8 %r1, [_Z3foobbbPb_param_2];
-; CHECK-NEXT: ld.param.u8 %r2, [_Z3foobbbPb_param_1];
-; CHECK-NEXT: selp.b32 %r3, %r2, %r1, %p1;
-; CHECK-NEXT: cvt.u16.u32 %rs3, %r3;
-; CHECK-NEXT: and.b16 %rs4, %rs3, 1;
+; CHECK-NEXT: ld.param.u8 %rs3, [_Z3foobbbPb_param_1];
+; CHECK-NEXT: ld.param.u8 %rs4, [_Z3foobbbPb_param_2];
+; CHECK-NEXT: selp.b16 %rs5, %rs3, %rs4, %p1;
+; CHECK-NEXT: and.b16 %rs6, %rs5, 1;
; CHECK-NEXT: ld.param.u64 %rd1, [_Z3foobbbPb_param_3];
-; CHECK-NEXT: st.u8 [%rd1], %rs4;
+; CHECK-NEXT: st.u8 [%rd1], %rs6;
; CHECK-NEXT: ret;
entry:
%.sink.v = select i1 %p1, i1 %p2, i1 %p3
diff --git a/llvm/test/CodeGen/NVPTX/i1-select.ll b/llvm/test/CodeGen/NVPTX/i1-select.ll
index 53e1dcface25d..d24b06c4d721c 100644
--- a/llvm/test/CodeGen/NVPTX/i1-select.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-select.ll
@@ -36,22 +36,23 @@ define i32 @test_select_i1_trunc_2(i64 %a, i16 %b, i32 %c, i32 %true, i32 %false
; CHECK-LABEL: test_select_i1_trunc_2(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<3>;
-; CHECK-NEXT: .reg .b32 %r<8>;
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [test_select_i1_trunc_2_param_0];
; CHECK-NEXT: and.b64 %rd2, %rd1, 1;
; CHECK-NEXT: setp.ne.b64 %p1, %rd2, 0;
-; CHECK-NEXT: ld.param.u32 %r1, [test_select_i1_trunc_2_param_2];
-; CHECK-NEXT: ld.param.u32 %r2, [test_select_i1_trunc_2_param_3];
-; CHECK-NEXT: ld.param.u16 %r3, [test_select_i1_trunc_2_param_1];
-; CHECK-NEXT: selp.b32 %r4, %r3, %r1, %p1;
-; CHECK-NEXT: and.b32 %r5, %r4, 1;
-; CHECK-NEXT: setp.ne.b32 %p2, %r5, 0;
-; CHECK-NEXT: ld.param.u32 %r6, [test_select_i1_trunc_2_param_4];
-; CHECK-NEXT: selp.b32 %r7, %r2, %r6, %p2;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r7;
+; CHECK-NEXT: ld.param.u16 %rs1, [test_select_i1_trunc_2_param_1];
+; CHECK-NEXT: ld.param.u16 %rs2, [test_select_i1_trunc_2_param_2];
+; CHECK-NEXT: ld.param.u32 %r1, [test_select_i1_trunc_2_param_3];
+; CHECK-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1;
+; CHECK-NEXT: and.b16 %rs4, %rs3, 1;
+; CHECK-NEXT: setp.ne.b16 %p2, %rs4, 0;
+; CHECK-NEXT: ld.param.u32 %r2, [test_select_i1_trunc_2_param_4];
+; CHECK-NEXT: selp.b32 %r3, %r1, %r2, %p2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-NEXT: ret;
%a_trunc = trunc i64 %a to i1
%b_trunc = trunc i16 %b to i1
@@ -64,25 +65,23 @@ define i32 @test_select_i1_trunc_2(i64 %a, i16 %b, i32 %c, i32 %true, i32 %false
define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %false) {
; CHECK-LABEL: test_select_i1_basic(
; CHECK: {
-; CHECK-NEXT: .reg .pred %p<5>;
-; CHECK-NEXT: .reg .b32 %r<11>;
+; CHECK-NEXT: .reg .pred %p<4>;
+; CHECK-NEXT: .reg .b32 %r<12>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test_select_i1_basic_param_0];
-; CHECK-NEXT: setp.eq.s32 %p1, %r1, 0;
; CHECK-NEXT: ld.param.u32 %r2, [test_select_i1_basic_param_1];
-; CHECK-NEXT: setp.eq.s32 %p2, %r2, 0;
-; CHECK-NEXT: ld.param.u32 %r3, [test_select_i1_basic_param_2];
-; CHECK-NEXT: setp.eq.s32 %p3, %r3, 0;
-; CHECK-NEXT: ld.param.u32 %r4, [test_select_i1_basic_param_3];
-; CHECK-NEXT: selp.b32 %r5, -1, 0, %p3;
-; CHECK-NEXT: selp.b32 %r6, -1, 0, %p2;
-; CHECK-NEXT: selp.b32 %r7, %r6, %r5, %p1;
-; CHECK-NEXT: and.b32 %r8, %r7, 1;
-; CHECK-NEXT: setp.ne.b32 %p4, %r8, 0;
-; CHECK-NEXT: ld.param.u32 %r9, [test_select_i1_basic_param_4];
-; CHECK-NEXT: selp.b32 %r10, %r4, %r9, %p4;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r10;
+; CHECK-NEXT: or.b32 %r4, %r1, %r2;
+; CHECK-NEXT: setp.ne.s32 %p1, %r1, 0;
+; CHECK-NEXT: ld.param.u32 %r5, [test_select_i1_basic_param_2];
+; CHECK-NEXT: setp.eq.s32 %p2, %r5, 0;
+; CHECK-NEXT: ld.param.u32 %r7, [test_select_i1_basic_param_3];
+; CHECK-NEXT: setp.eq.s32 %p3, %r4, 0;
+; CHECK-NEXT: ld.param.u32 %r8, [test_select_i1_basic_param_4];
+; CHECK-NEXT: selp.b32 %r9, %r7, %r8, %p2;
+; CHECK-NEXT: selp.b32 %r10, %r9, %r8, %p1;
+; CHECK-NEXT: selp.b32 %r11, %r7, %r10, %p3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r11;
; CHECK-NEXT: ret;
%b1 = icmp eq i32 %v1, 0
%b2 = icmp eq i32 %v2, 0
@@ -95,28 +94,27 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals
define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %false) {
; CHECK-LABEL: test_select_i1_basic_folding(
; CHECK: {
-; CHECK-NEXT: .reg .pred %p<8>;
-; CHECK-NEXT: .reg .b32 %r<11>;
+; CHECK-NEXT: .reg .pred %p<13>;
+; CHECK-NEXT: .reg .b32 %r<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test_select_i1_basic_folding_param_0];
; CHECK-NEXT: setp.eq.s32 %p1, %r1, 0;
; CHECK-NEXT: ld.param.u32 %r2, [test_select_i1_basic_folding_param_1];
-; CHECK-NEXT: setp.eq.s32 %p2, %r2, 0;
+; CHECK-NEXT: setp.ne.s32 %p2, %r2, 0;
+; CHECK-NEXT: setp.eq.s32 %p3, %r2, 0;
; CHECK-NEXT: ld.param.u32 %r3, [test_select_i1_basic_folding_param_2];
-; CHECK-NEXT: setp.eq.s32 %p3, %r3, 0;
+; CHECK-NEXT: setp.eq.s32 %p4, %r3, 0;
; CHECK-NEXT: ld.param.u32 %r4, [test_select_i1_basic_folding_param_3];
-; CHECK-NEXT: xor.pred %p4, %p1, %p2;
+; CHECK-NEXT: xor.pred %p6, %p1, %p3;
; CHECK-NEXT: ld.param.u32 %r5, [test_select_i1_basic_folding_param_4];
-; CHECK-NEXT: and.pred %p5, %p4, %p3;
-; CHECK-NEXT: selp.b32 %r6, -1, 0, %p5;
-; CHECK-NEXT: selp.b32 %r7, -1, 0, %p3;
-; CHECK-NEXT: selp.b32 %r8, %r6, %r7, %p2;
-; CHECK-NEXT: and.b32 %r9, %r8, 1;
-; CHECK-NEXT: setp.ne.b32 %p6, %r9, 0;
-; CHECK-NEXT: xor.pred %p7, %p6, %p2;
-; CHECK-NEXT: selp.b32 %r10, %r4, %r5, %p7;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r10;
+; CHECK-NEXT: and.pred %p7, %p6, %p4;
+; CHECK-NEXT: and.pred %p9, %p2, %p4;
+; CHECK-NEXT: and.pred %p10, %p3, %p7;
+; CHECK-NEXT: or.pred %p11, %p10, %p9;
+; CHECK-NEXT: xor.pred %p12, %p11, %p3;
+; CHECK-NEXT: selp.b32 %r6, %r4, %r5, %p12;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r6;
; CHECK-NEXT: ret;
%b1 = icmp eq i32 %v1, 0
%b2 = icmp eq i32 %v2, 0
diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll
index bf6189c280191..64786e601c4b5 100644
--- a/llvm/test/CodeGen/NVPTX/i128.ll
+++ b/llvm/test/CodeGen/NVPTX/i128.ll
@@ -5,8 +5,8 @@
define i128 @srem_i128(i128 %lhs, i128 %rhs) {
; CHECK-LABEL: srem_i128(
; CHECK: {
-; CHECK-NEXT: .reg .pred %p<19>;
-; CHECK-NEXT: .reg .b32 %r<16>;
+; CHECK-NEXT: .reg .pred %p<20>;
+; CHECK-NEXT: .reg .b32 %r<12>;
; CHECK-NEXT: .reg .b64 %rd<127>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
@@ -45,51 +45,48 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: selp.b64 %rd65, %rd62, %rd64, %p7;
; CHECK-NEXT: sub.cc.s64 %rd66, %rd61, %rd65;
; CHECK-NEXT: subc.cc.s64 %rd67, %rd117, 0;
-; CHECK-NEXT: setp.eq.s64 %p8, %rd67, 0;
-; CHECK-NEXT: setp.ne.s64 %p9, %rd67, 0;
-; CHECK-NEXT: selp.b32 %r5, -1, 0, %p9;
-; CHECK-NEXT: setp.gt.u64 %p10, %rd66, 127;
-; CHECK-NEXT: selp.b32 %r6, -1, 0, %p10;
-; CHECK-NEXT: selp.b32 %r7, %r6, %r5, %p8;
-; CHECK-NEXT: and.b32 %r8, %r7, 1;
-; CHECK-NEXT: setp.ne.b32 %p11, %r8, 0;
-; CHECK-NEXT: or.pred %p12, %p5, %p11;
+; CHECK-NEXT: setp.gt.u64 %p8, %rd66, 127;
+; CHECK-NEXT: setp.eq.s64 %p9, %rd67, 0;
+; CHECK-NEXT: and.pred %p10, %p9, %p8;
+; CHECK-NEXT: setp.ne.s64 %p11, %rd67, 0;
+; CHECK-NEXT: or.pred %p12, %p10, %p11;
+; CHECK-NEXT: or.pred %p13, %p5, %p12;
; CHECK-NEXT: xor.b64 %rd68, %rd66, 127;
; CHECK-NEXT: or.b64 %rd69, %rd68, %rd67;
-; CHECK-NEXT: setp.eq.s64 %p13, %rd69, 0;
-; CHECK-NEXT: selp.b64 %rd126, 0, %rd4, %p12;
-; CHECK-NEXT: selp.b64 %rd125, 0, %rd3, %p12;
-; CHECK-NEXT: or.pred %p14, %p12, %p13;
-; CHECK-NEXT: @%p14 bra $L__BB0_5;
+; CHECK-NEXT: setp.eq.s64 %p14, %rd69, 0;
+; CHECK-NEXT: selp.b64 %rd126, 0, %rd4, %p13;
+; CHECK-NEXT: selp.b64 %rd125, 0, %rd3, %p13;
+; CHECK-NEXT: or.pred %p15, %p13, %p14;
+; CHECK-NEXT: @%p15 bra $L__BB0_5;
; CHECK-NEXT: // %bb.3: // %udiv-bb1
; CHECK-NEXT: add.cc.s64 %rd119, %rd66, 1;
; CHECK-NEXT: addc.cc.s64 %rd120, %rd67, 0;
; CHECK-NEXT: or.b64 %rd72, %rd119, %rd120;
-; CHECK-NEXT: setp.eq.s64 %p15, %rd72, 0;
-; CHECK-NEXT: cvt.u32.u64 %r9, %rd66;
-; CHECK-NEXT: sub.s32 %r10, 127, %r9;
-; CHECK-NEXT: shl.b64 %rd73, %rd4, %r10;
-; CHECK-NEXT: sub.s32 %r11, 64, %r10;
-; CHECK-NEXT: shr.u64 %rd74, %rd3, %r11;
+; CHECK-NEXT: setp.eq.s64 %p16, %rd72, 0;
+; CHECK-NEXT: cvt.u32.u64 %r5, %rd66;
+; CHECK-NEXT: sub.s32 %r6, 127, %r5;
+; CHECK-NEXT: shl.b64 %rd73, %rd4, %r6;
+; CHECK-NEXT: sub.s32 %r7, 64, %r6;
+; CHECK-NEXT: shr.u64 %rd74, %rd3, %r7;
; CHECK-NEXT: or.b64 %rd75, %rd73, %rd74;
-; CHECK-NEXT: sub.s32 %r12, 63, %r9;
-; CHECK-NEXT: shl.b64 %rd76, %rd3, %r12;
-; CHECK-NEXT: setp.gt.s32 %p16, %r10, 63;
-; CHECK-NEXT: selp.b64 %rd124, %rd76, %rd75, %p16;
-; CHECK-NEXT: shl.b64 %rd123, %rd3, %r10;
+; CHECK-NEXT: sub.s32 %r8, 63, %r5;
+; CHECK-NEXT: shl.b64 %rd76, %rd3, %r8;
+; CHECK-NEXT: setp.gt.s32 %p17, %r6, 63;
+; CHECK-NEXT: selp.b64 %rd124, %rd76, %rd75, %p17;
+; CHECK-NEXT: shl.b64 %rd123, %rd3, %r6;
; CHECK-NEXT: mov.b64 %rd114, %rd117;
-; CHECK-NEXT: @%p15 bra $L__BB0_4;
+; CHECK-NEXT: @%p16 bra $L__BB0_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
-; CHECK-NEXT: cvt.u32.u64 %r13, %rd119;
-; CHECK-NEXT: shr.u64 %rd79, %rd3, %r13;
-; CHECK-NEXT: sub.s32 %r14, 64, %r13;
-; CHECK-NEXT: shl.b64 %rd80, %rd4, %r14;
+; CHECK-NEXT: cvt.u32.u64 %r9, %rd119;
+; CHECK-NEXT: shr.u64 %rd79, %rd3, %r9;
+; CHECK-NEXT: sub.s32 %r10, 64, %r9;
+; CHECK-NEXT: shl.b64 %rd80, %rd4, %r10;
; CHECK-NEXT: or.b64 %rd81, %rd79, %rd80;
-; CHECK-NEXT: add.s32 %r15, %r13, -64;
-; CHECK-NEXT: shr.u64 %rd82, %rd4, %r15;
-; CHECK-NEXT: setp.gt.s32 %p17, %r13, 63;
-; CHECK-NEXT: selp.b64 %rd121, %rd82, %rd81, %p17;
-; CHECK-NEXT: shr.u64 %rd122, %rd4, %r13;
+; CHECK-NEXT: add.s32 %r11, %r9, -64;
+; CHECK-NEXT: shr.u64 %rd82, %rd4, %r11;
+; CHECK-NEXT: setp.gt.s32 %p18, %r9, 63;
+; CHECK-NEXT: selp.b64 %rd121, %rd82, %rd81, %p18;
+; CHECK-NEXT: shr.u64 %rd122, %rd4, %r9;
; CHECK-NEXT: add.cc.s64 %rd35, %rd5, -1;
; CHECK-NEXT: addc.cc.s64 %rd36, %rd6, -1;
; CHECK-NEXT: mov.b64 %rd114, 0;
@@ -119,8 +116,8 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: add.cc.s64 %rd119, %rd119, -1;
; CHECK-NEXT: addc.cc.s64 %rd120, %rd120, -1;
; CHECK-NEXT: or.b64 %rd98, %rd119, %rd120;
-; CHECK-NEXT: setp.eq.s64 %p18, %rd98, 0;
-; CHECK-NEXT: @%p18 bra $L__BB0_4;
+; CHECK-NEXT: setp.eq.s64 %p19, %rd98, 0;
+; CHECK-NEXT: @%p19 bra $L__BB0_4;
; CHECK-NEXT: bra.uni $L__BB0_2;
; CHECK-NEXT: $L__BB0_4: // %udiv-loop-exit
; CHECK-NEXT: shr.u64 %rd99, %rd123, 63;
@@ -149,8 +146,8 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
define i128 @urem_i128(i128 %lhs, i128 %rhs) {
; CHECK-LABEL: urem_i128(
; CHECK: {
-; CHECK-NEXT: .reg .pred %p<17>;
-; CHECK-NEXT: .reg .b32 %r<16>;
+; CHECK-NEXT: .reg .pred %p<18>;
+; CHECK-NEXT: .reg .b32 %r<12>;
; CHECK-NEXT: .reg .b64 %rd<113>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
@@ -178,51 +175,48 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: mov.b64 %rd103, 0;
; CHECK-NEXT: sub.cc.s64 %rd56, %rd50, %rd54;
; CHECK-NEXT: subc.cc.s64 %rd57, %rd103, 0;
-; CHECK-NEXT: setp.eq.s64 %p6, %rd57, 0;
-; CHECK-NEXT: setp.ne.s64 %p7, %rd57, 0;
-; CHECK-NEXT: selp.b32 %r5, -1, 0, %p7;
-; CHECK-NEXT: setp.gt.u64 %p8, %rd56, 127;
-; CHECK-NEXT: selp.b32 %r6, -1, 0, %p8;
-; CHECK-NEXT: selp.b32 %r7, %r6, %r5, %p6;
-; CHECK-NEXT: and.b32 %r8, %r7, 1;
-; CHECK-NEXT: setp.ne.b32 %p9, %r8, 0;
-; CHECK-NEXT: or.pred %p10, %p3, %p9;
+; CHECK-NEXT: setp.gt.u64 %p6, %rd56, 127;
+; CHECK-NEXT: setp.eq.s64 %p7, %rd57, 0;
+; CHECK-NEXT: and.pred %p8, %p7, %p6;
+; CHECK-NEXT: setp.ne.s64 %p9, %rd57, 0;
+; CHECK-NEXT: or.pred %p10, %p8, %p9;
+; CHECK-NEXT: or.pred %p11, %p3, %p10;
; CHECK-NEXT: xor.b64 %rd58, %rd56, 127;
; CHECK-NEXT: or.b64 %rd59, %rd58, %rd57;
-; CHECK-NEXT: setp.eq.s64 %p11, %rd59, 0;
-; CHECK-NEXT: selp.b64 %rd112, 0, %rd42, %p10;
-; CHECK-NEXT: selp.b64 %rd111, 0, %rd41, %p10;
-; CHECK-NEXT: or.pred %p12, %p10, %p11;
-; CHECK-NEXT: @%p12 bra $L__BB1_5;
+; CHECK-NEXT: setp.eq.s64 %p12, %rd59, 0;
+; CHECK-NEXT: selp.b64 %rd112, 0, %rd42, %p11;
+; CHECK-NEXT: selp.b64 %rd111, 0, %rd41, %p11;
+; CHECK-NEXT: or.pred %p13, %p11, %p12;
+; CHECK-NEXT: @%p13 bra $L__BB1_5;
; CHECK-NEXT: // %bb.3: // %udiv-bb1
; CHECK-NEXT: add.cc.s64 %rd105, %rd56, 1;
; CHECK-NEXT: addc.cc.s64 %rd106, %rd57, 0;
; CHECK-NEXT: or.b64 %rd62, %rd105, %rd106;
-; CHECK-NEXT: setp.eq.s64 %p13, %rd62, 0;
-; CHECK-NEXT: cvt.u32.u64 %r9, %rd56;
-; CHECK-NEXT: sub.s32 %r10, 127, %r9;
-; CHECK-NEXT: shl.b64 %rd63, %rd42, %r10;
-; CHECK-NEXT: sub.s32 %r11, 64, %r10;
-; CHECK-NEXT: shr.u64 %rd64, %rd41, %r11;
+; CHECK-NEXT: setp.eq.s64 %p14, %rd62, 0;
+; CHECK-NEXT: cvt.u32.u64 %r5, %rd56;
+; CHECK-NEXT: sub.s32 %r6, 127, %r5;
+; CHECK-NEXT: shl.b64 %rd63, %rd42, %r6;
+; CHECK-NEXT: sub.s32 %r7, 64, %r6;
+; CHECK-NEXT: shr.u64 %rd64, %rd41, %r7;
; CHECK-NEXT: or.b64 %rd65, %rd63, %rd64;
-; CHECK-NEXT: sub.s32 %r12, 63, %r9;
-; CHECK-NEXT: shl.b64 %rd66, %rd41, %r12;
-; CHECK-NEXT: setp.gt.s32 %p14, %r10, 63;
-; CHECK-NEXT: selp.b64 %rd110, %rd66, %rd65, %p14;
-; CHECK-NEXT: shl.b64 %rd109, %rd41, %r10;
+; CHECK-NEXT: sub.s32 %r8, 63, %r5;
+; CHECK-NEXT: shl.b64 %rd66, %rd41, %r8;
+; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63;
+; CHECK-NEXT: selp.b64 %rd110, %rd66, %rd65, %p15;
+; CHECK-NEXT: shl.b64 %rd109, %rd41, %r6;
; CHECK-NEXT: mov.b64 %rd100, %rd103;
-; CHECK-NEXT: @%p13 bra $L__BB1_4;
+; CHECK-NEXT: @%p14 bra $L__BB1_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
-; CHECK-NEXT: cvt.u32.u64 %r13, %rd105;
-; CHECK-NEXT: shr.u64 %rd69, %rd41, %r13;
-; CHECK-NEXT: sub.s32 %r14, 64, %r13;
-; CHECK-NEXT: shl.b64 %rd70, %rd42, %r14;
+; CHECK-NEXT: cvt.u32.u64 %r9, %rd105;
+; CHECK-NEXT: shr.u64 %rd69, %rd41, %r9;
+; CHECK-NEXT: sub.s32 %r10, 64, %r9;
+; CHECK-NEXT: shl.b64 %rd70, %rd42, %r10;
; CHECK-NEXT: or.b64 %rd71, %rd69, %rd70;
-; CHECK-NEXT: add.s32 %r15, %r13, -64;
-; CHECK-NEXT: shr.u64 %rd72, %rd42, %r15;
-; CHECK-NEXT: setp.gt.s32 %p15, %r13, 63;
-; CHECK-NEXT: selp.b64 %rd107, %rd72, %rd71, %p15;
-; CHECK-NEXT: shr.u64 %rd108, %rd42, %r13;
+; CHECK-NEXT: add.s32 %r11, %r9, -64;
+; CHECK-NEXT: shr.u64 %rd72, %rd42, %r11;
+; CHECK-NEXT: setp.gt.s32 %p16, %r9, 63;
+; CHECK-NEXT: selp.b64 %rd107, %rd72, %rd71, %p16;
+; CHECK-NEXT: shr.u64 %rd108, %rd42, %r9;
; CHECK-NEXT: add.cc.s64 %rd33, %rd3, -1;
; CHECK-NEXT: addc.cc.s64 %rd34, %rd4, -1;
; CHECK-NEXT: mov.b64 %rd100, 0;
@@ -252,8 +246,8 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: add.cc.s64 %rd105, %rd105, -1;
; CHECK-NEXT: addc.cc.s64 %rd106, %rd106, -1;
; CHECK-NEXT: or.b64 %rd88, %rd105, %rd106;
-; CHECK-NEXT: setp.eq.s64 %p16, %rd88, 0;
-; CHECK-NEXT: @%p16 bra $L__BB1_4;
+; CHECK-NEXT: setp.eq.s64 %p17, %rd88, 0;
+; CHECK-NEXT: @%p17 bra $L__BB1_4;
; CHECK-NEXT: bra.uni $L__BB1_2;
; CHECK-NEXT: $L__BB1_4: // %udiv-loop-exit
; CHECK-NEXT: shr.u64 %rd89, %rd109, 63;
@@ -313,8 +307,8 @@ define i128 @urem_i128_pow2k(i128 %lhs) {
define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
; CHECK-LABEL: sdiv_i128(
; CHECK: {
-; CHECK-NEXT: .reg .pred %p<19>;
-; CHECK-NEXT: .reg .b32 %r<16>;
+; CHECK-NEXT: .reg .pred %p<20>;
+; CHECK-NEXT: .reg .b32 %r<12>;
; CHECK-NEXT: .reg .b64 %rd<122>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
@@ -354,51 +348,48 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: selp.b64 %rd66, %rd63, %rd65, %p7;
; CHECK-NEXT: sub.cc.s64 %rd67, %rd62, %rd66;
; CHECK-NEXT: subc.cc.s64 %rd68, %rd112, 0;
-; CHECK-NEXT: setp.eq.s64 %p8, %rd68, 0;
-; CHECK-NEXT: setp.ne.s64 %p9, %rd68, 0;
-; CHECK-NEXT: selp.b32 %r5, -1, 0, %p9;
-; CHECK-NEXT: setp.gt.u64 %p10, %rd67, 127;
-; CHECK-NEXT: selp.b32 %r6, -1, 0, %p10;
-; CHECK-NEXT: selp.b32 %r7, %r6, %r5, %p8;
-; CHECK-NEXT: and.b32 %r8, %r7, 1;
-; CHECK-NEXT: setp.ne.b32 %p11, %r8, 0;
-; CHECK-NEXT: or.pred %p12, %p5, %p11;
+; CHECK-NEXT: setp.gt.u64 %p8, %rd67, 127;
+; CHECK-NEXT: setp.eq.s64 %p9, %rd68, 0;
+; CHECK-NEXT: and.pred %p10, %p9, %p8;
+; CHECK-NEXT: setp.ne.s64 %p11, %rd68, 0;
+; CHECK-NEXT: or.pred %p12, %p10, %p11;
+; CHECK-NEXT: or.pred %p13, %p5, %p12;
; CHECK-NEXT: xor.b64 %rd69, %rd67, 127;
; CHECK-NEXT: or.b64 %rd70, %rd69, %rd68;
-; CHECK-NEXT: setp.eq.s64 %p13, %rd70, 0;
-; CHECK-NEXT: selp.b64 %rd121, 0, %rd2, %p12;
-; CHECK-NEXT: selp.b64 %rd120, 0, %rd1, %p12;
-; CHECK-NEXT: or.pred %p14, %p12, %p13;
-; CHECK-NEXT: @%p14 bra $L__BB4_5;
+; CHECK-NEXT: setp.eq.s64 %p14, %rd70, 0;
+; CHECK-NEXT: selp.b64 %rd121, 0, %rd2, %p13;
+; CHECK-NEXT: selp.b64 %rd120, 0, %rd1, %p13;
+; CHECK-NEXT: or.pred %p15, %p13, %p14;
+; CHECK-NEXT: @%p15 bra $L__BB4_5;
; CHECK-NEXT: // %bb.3: // %udiv-bb1
; CHECK-NEXT: add.cc.s64 %rd114, %rd67, 1;
; CHECK-NEXT: addc.cc.s64 %rd115, %rd68, 0;
; CHECK-NEXT: or.b64 %rd73, %rd114, %rd115;
-; CHECK-NEXT: setp.eq.s64 %p15, %rd73, 0;
-; CHECK-NEXT: cvt.u32.u64 %r9, %rd67;
-; CHECK-NEXT: sub.s32 %r10, 127, %r9;
-; CHECK-NEXT: shl.b64 %rd74, %rd2, %r10;
-; CHECK-NEXT: sub.s32 %r11, 64, %r10;
-; CHECK-NEXT: shr.u64 %rd75, %rd1, %r11;
+; CHECK-NEXT: setp.eq.s64 %p16, %rd73, 0;
+; CHECK-NEXT: cvt.u32.u64 %r5, %rd67;
+; CHECK-NEXT: sub.s32 %r6, 127, %r5;
+; CHECK-NEXT: shl.b64 %rd74, %rd2, %r6;
+; CHECK-NEXT: sub.s32 %r7, 64, %r6;
+; CHECK-NEXT: shr.u64 %rd75, %rd1, %r7;
; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75;
-; CHECK-NEXT: sub.s32 %r12, 63, %r9;
-; CHECK-NEXT: shl.b64 %rd77, %rd1, %r12;
-; CHECK-NEXT: setp.gt.s32 %p16, %r10, 63;
-; CHECK-NEXT: selp.b64 %rd119, %rd77, %rd76, %p16;
-; CHECK-NEXT: shl.b64 %rd118, %rd1, %r10;
+; CHECK-NEXT: sub.s32 %r8, 63, %r5;
+; CHECK-NEXT: shl.b64 %rd77, %rd1, %r8;
+; CHECK-NEXT: setp.gt.s32 %p17, %r6, 63;
+; CHECK-NEXT: selp.b64 %rd119, %rd77, %rd76, %p17;
+; CHECK-NEXT: shl.b64 %rd118, %rd1, %r6;
; CHECK-NEXT: mov.b64 %rd109, %rd112;
-; CHECK-NEXT: @%p15 bra $L__BB4_4;
+; CHECK-NEXT: @%p16 bra $L__BB4_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
-; CHECK-NEXT: cvt.u32.u64 %r13, %rd114;
-; CHECK-NEXT: shr.u64 %rd80, %rd1, %r13;
-; CHECK-NEXT: sub.s32 %r14, 64, %r13;
-; CHECK-NEXT: shl.b64 %rd81, %rd2, %r14;
+; CHECK-NEXT: cvt.u32.u64 %r9, %rd114;
+; CHECK-NEXT: shr.u64 %rd80, %rd1, %r9;
+; CHECK-NEXT: sub.s32 %r10, 64, %r9;
+; CHECK-NEXT: shl.b64 %rd81, %rd2, %r10;
; CHECK-NEXT: or.b64 %rd82, %rd80, %rd81;
-; CHECK-NEXT: add.s32 %r15, %r13, -64;
-; CHECK-NEXT: shr.u64 %rd83, %rd2, %r15;
-; CHECK-NEXT: setp.gt.s32 %p17, %r13, 63;
-; CHECK-NEXT: selp.b64 %rd116, %rd83, %rd82, %p17;
-; CHECK-NEXT: shr.u64 %rd117, %rd2, %r13;
+; CHECK-NEXT: add.s32 %r11, %r9, -64;
+; CHECK-NEXT: shr.u64 %rd83, %rd2, %r11;
+; CHECK-NEXT: setp.gt.s32 %p18, %r9, 63;
+; CHECK-NEXT: selp.b64 %rd116, %rd83, %rd82, %p18;
+; CHECK-NEXT: shr.u64 %rd117, %rd2, %r9;
; CHECK-NEXT: add.cc.s64 %rd35, %rd3, -1;
; CHECK-NEXT: addc.cc.s64 %rd36, %rd4, -1;
; CHECK-NEXT: mov.b64 %rd109, 0;
@@ -428,8 +419,8 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: add.cc.s64 %rd114, %rd114, -1;
; CHECK-NEXT: addc.cc.s64 %rd115, %rd115, -1;
; CHECK-NEXT: or.b64 %rd99, %rd114, %rd115;
-; CHECK-NEXT: setp.eq.s64 %p18, %rd99, 0;
-; CHECK-NEXT: @%p18 bra $L__BB4_4;
+; CHECK-NEXT: setp.eq.s64 %p19, %rd99, 0;
+; CHECK-NEXT: @%p19 bra $L__BB4_4;
; CHECK-NEXT: bra.uni $L__BB4_2;
; CHECK-NEXT: $L__BB4_4: // %udiv-loop-exit
; CHECK-NEXT: shr.u64 %rd100, %rd118, 63;
@@ -452,8 +443,8 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
; CHECK-LABEL: udiv_i128(
; CHECK: {
-; CHECK-NEXT: .reg .pred %p<17>;
-; CHECK-NEXT: .reg .b32 %r<16>;
+; CHECK-NEXT: .reg .pred %p<18>;
+; CHECK-NEXT: .reg .b32 %r<12>;
; CHECK-NEXT: .reg .b64 %rd<107>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
@@ -481,51 +472,48 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: mov.b64 %rd97, 0;
; CHECK-NEXT: sub.cc.s64 %rd56, %rd50, %rd54;
; CHECK-NEXT: subc.cc.s64 %rd57, %rd97, 0;
-; CHECK-NEXT: setp.eq.s64 %p6, %rd57, 0;
-; CHECK-NEXT: setp.ne.s64 %p7, %rd57, 0;
-; CHECK-NEXT: selp.b32 %r5, -1, 0, %p7;
-; CHECK-NEXT: setp.gt.u64 %p8, %rd56, 127;
-; CHECK-NEXT: selp.b32 %r6, -1, 0, %p8;
-; CHECK-NEXT: selp.b32 %r7, %r6, %r5, %p6;
-; CHECK-NEXT: and.b32 %r8, %r7, 1;
-; CHECK-NEXT: setp.ne.b32 %p9, %r8, 0;
-; CHECK-NEXT: or.pred %p10, %p3, %p9;
+; CHECK-NEXT: setp.gt.u64 %p6, %rd56, 127;
+; CHECK-NEXT: setp.eq.s64 %p7, %rd57, 0;
+; CHECK-NEXT: and.pred %p8, %p7, %p6;
+; CHECK-NEXT: setp.ne.s64 %p9, %rd57, 0;
+; CHECK-NEXT: or.pred %p10, %p8, %p9;
+; CHECK-NEXT: or.pred %p11, %p3, %p10;
; CHECK-NEXT: xor.b64 %rd58, %rd56, 127;
; CHECK-NEXT: or.b64 %rd59, %rd58, %rd57;
-; CHECK-NEXT: setp.eq.s64 %p11, %rd59, 0;
-; CHECK-NEXT: selp.b64 %rd106, 0, %rd42, %p10;
-; CHECK-NEXT: selp.b64 %rd105, 0, %rd41, %p10;
-; CHECK-NEXT: or.pred %p12, %p10, %p11;
-; CHECK-NEXT: @%p12 bra $L__BB5_5;
+; CHECK-NEXT: setp.eq.s64 %p12, %rd59, 0;
+; CHECK-NEXT: selp.b64 %rd106, 0, %rd42, %p11;
+; CHECK-NEXT: selp.b64 %rd105, 0, %rd41, %p11;
+; CHECK-NEXT: or.pred %p13, %p11, %p12;
+; CHECK-NEXT: @%p13 bra $L__BB5_5;
; CHECK-NEXT: // %bb.3: // %udiv-bb1
; CHECK-NEXT: add.cc.s64 %rd99, %rd56, 1;
; CHECK-NEXT: addc.cc.s64 %rd100, %rd57, 0;
; CHECK-NEXT: or.b64 %rd62, %rd99, %rd100;
-; CHECK-NEXT: setp.eq.s64 %p13, %rd62, 0;
-; CHECK-NEXT: cvt.u32.u64 %r9, %rd56;
-; CHECK-NEXT: sub.s32 %r10, 127, %r9;
-; CHECK-NEXT: shl.b64 %rd63, %rd42, %r10;
-; CHECK-NEXT: sub.s32 %r11, 64, %r10;
-; CHECK-NEXT: shr.u64 %rd64, %rd41, %r11;
+; CHECK-NEXT: setp.eq.s64 %p14, %rd62, 0;
+; CHECK-NEXT: cvt.u32.u64 %r5, %rd56;
+; CHECK-NEXT: sub.s32 %r6, 127, %r5;
+; CHECK-NEXT: shl.b64 %rd63, %rd42, %r6;
+; CHECK-NEXT: sub.s32 %r7, 64, %r6;
+; CHECK-NEXT: shr.u64 %rd64, %rd41, %r7;
; CHECK-NEXT: or.b64 %rd65, %rd63, %rd64;
-; CHECK-NEXT: sub.s32 %r12, 63, %r9;
-; CHECK-NEXT: shl.b64 %rd66, %rd41, %r12;
-; CHECK-NEXT: setp.gt.s32 %p14, %r10, 63;
-; CHECK-NEXT: selp.b64 %rd104, %rd66, %rd65, %p14;
-; CHECK-NEXT: shl.b64 %rd103, %rd41, %r10;
+; CHECK-NEXT: sub.s32 %r8, 63, %r5;
+; CHECK-NEXT: shl.b64 %rd66, %rd41, %r8;
+; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63;
+; CHECK-NEXT: selp.b64 %rd104, %rd66, %rd65, %p15;
+; CHECK-NEXT: shl.b64 %rd103, %rd41, %r6;
; CHECK-NEXT: mov.b64 %rd94, %rd97;
-; CHECK-NEXT: @%p13 bra $L__BB5_4;
+; CHECK-NEXT: @%p14 bra $L__BB5_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
-; CHECK-NEXT: cvt.u32.u64 %r13, %rd99;
-; CHECK-NEXT: shr.u64 %rd69, %rd41, %r13;
-; CHECK-NEXT: sub.s32 %r14, 64, %r13;
-; CHECK-NEXT: shl.b64 %rd70, %rd42, %r14;
+; CHECK-NEXT: cvt.u32.u64 %r9, %rd99;
+; CHECK-NEXT: shr.u64 %rd69, %rd41, %r9;
+; CHECK-NEXT: sub.s32 %r10, 64, %r9;
+; CHECK-NEXT: shl.b64 %rd70, %rd42, %r10;
; CHECK-NEXT: or.b64 %rd71, %rd69, %rd70;
-; CHECK-NEXT: add.s32 %r15, %r13, -64;
-; CHECK-NEXT: shr.u64 %rd72, %rd42, %r15;
-; CHECK-NEXT: setp.gt.s32 %p15, %r13, 63;
-; CHECK-NEXT: selp.b64 %rd101, %rd72, %rd71, %p15;
-; CHECK-NEXT: shr.u64 %rd102, %rd42, %r13;
+; CHECK-NEXT: add.s32 %r11, %r9, -64;
+; CHECK-NEXT: shr.u64 %rd72, %rd42, %r11;
+; CHECK-NEXT: setp.gt.s32 %p16, %r9, 63;
+; CHECK-NEXT: selp.b64 %rd101, %rd72, %rd71, %p16;
+; CHECK-NEXT: shr.u64 %rd102, %rd42, %r9;
; CHECK-NEXT: add.cc.s64 %rd33, %rd43, -1;
; CHECK-NEXT: addc.cc.s64 %rd34, %rd44, -1;
; CHECK-NEXT: mov.b64 %rd94, 0;
@@ -555,8 +543,8 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: add.cc.s64 %rd99, %rd99, -1;
; CHECK-NEXT: addc.cc.s64 %rd100, %rd100, -1;
; CHECK-NEXT: or.b64 %rd88, %rd99, %rd100;
-; CHECK-NEXT: setp.eq.s64 %p16, %rd88, 0;
-; CHECK-NEXT: @%p16 bra $L__BB5_4;
+; CHECK-NEXT: setp.eq.s64 %p17, %rd88, 0;
+; CHECK-NEXT: @%p17 bra $L__BB5_4;
; CHECK-NEXT: bra.uni $L__BB5_2;
; CHECK-NEXT: $L__BB5_4: // %udiv-loop-exit
; CHECK-NEXT: shr.u64 %rd89, %rd103, 63;
>From b09cb1768d690916773f7a880528e0e3ae9d4f1e Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Wed, 16 Apr 2025 14:47:30 +0000
Subject: [PATCH 3/3] update tests
---
llvm/test/CodeGen/NVPTX/bug22246.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/NVPTX/bug22246.ll b/llvm/test/CodeGen/NVPTX/bug22246.ll
index 321d30c38b183..0080aafcf5631 100644
--- a/llvm/test/CodeGen/NVPTX/bug22246.ll
+++ b/llvm/test/CodeGen/NVPTX/bug22246.ll
@@ -15,7 +15,7 @@ define void @_Z3foobbbPb(i1 zeroext %p1, i1 zeroext %p2, i1 zeroext %p3, ptr noc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: ld.param.u8 %rs1, [_Z3foobbbPb_param_0];
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
-; CHECK-NEXT: setp.eq.b16 %p1, %rs2, 1;
+; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
; CHECK-NEXT: ld.param.u8 %rs3, [_Z3foobbbPb_param_1];
; CHECK-NEXT: ld.param.u8 %rs4, [_Z3foobbbPb_param_2];
; CHECK-NEXT: selp.b16 %rs5, %rs3, %rs4, %p1;
More information about the llvm-commits mailing list