[llvm] [DAGCombine] Propagate truncate to operands (PR #98666)
Justin Fargnoli via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 23 16:14:06 PDT 2024
https://github.com/justinfargnoli updated https://github.com/llvm/llvm-project/pull/98666
>From d8372ca09e0969c441e34cb1799f75369ed3ddea Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Sun, 30 Jun 2024 18:50:30 -0700
Subject: [PATCH 01/32] Initial commit
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 54 ++++++++++++++++++++-
llvm/test/CodeGen/NVPTX/combine-truncate.ll | 10 ++++
2 files changed, 63 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/NVPTX/combine-truncate.ll
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index a2181b478c269..e0f4348975bfe 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -725,7 +725,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// We have some custom DAG combine patterns for these nodes
setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
- ISD::VSELECT});
+ ISD::TRUNCATE, ISD::VSELECT});
// setcc for f16x2 and bf16x2 needs special handling to prevent
// legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -5601,6 +5601,52 @@ static SDValue PerformREMCombine(SDNode *N,
return SDValue();
}
+// truncate (logic_op x, y) --> logic_op (truncate x), (truncate y)
+// This will reduce register pressure.
+static SDValue PerformTruncCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ if (DCI.isBeforeLegalizeOps()) {
+ SDValue LogicalOp = N->getOperand(0);
+ switch (LogicalOp.getOpcode()) {
+ default:
+ break;
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ EVT VT = N->getValueType(0);
+ EVT LogicalVT = LogicalOp.getValueType();
+ if (VT == MVT::i32 && LogicalVT == MVT::i64) {
+ const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
+ if (VT.isScalarInteger() ||
+ TLI.isOperationLegal(LogicalOp.getOpcode(), VT)) {
+ if (all_of(LogicalOp.getNode()->uses(), [](SDNode *U) {
+ return U->isMachineOpcode() ?
+ U->getMachineOpcode() == NVPTX::CVT_u32_u64 :
+ U->getOpcode() == ISD::TRUNCATE;
+ })) {
+
+ SDLoc DL(N);
+ SDValue None = DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE,
+ DL, MVT::i32);
+ SDNode *NarrowL = DCI.DAG.getMachineNode(
+ NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(0), None);
+ SDNode *NarrowR = DCI.DAG.getMachineNode(
+ NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(1), None);
+ return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT,
+ SDValue(NarrowL, 0), SDValue(NarrowR, 0));
+ }
+ }
+ }
+ break;
+ }
+ }
+ }
+ return SDValue();
+}
+
enum OperandSignedness {
Signed = 0,
Unsigned,
@@ -6017,6 +6063,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::UREM:
case ISD::SREM:
return PerformREMCombine(N, DCI, OptLevel);
+ case ISD::TRUNCATE:
+ return PerformTruncCombine(N, DCI);
case ISD::SETCC:
return PerformSETCCCombine(N, DCI, STI.getSmVersion());
case ISD::LOAD:
@@ -6034,6 +6082,10 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::VSELECT:
return PerformVSELECTCombine(N, DCI);
}
+
+ if (N->isMachineOpcode() && N->getMachineOpcode() == NVPTX::CVT_u32_u64)
+ return PerformTruncCombine(N, DCI);
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/NVPTX/combine-truncate.ll b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
new file mode 100644
index 0000000000000..f43887064acc0
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s | FileCheck %s
+
+define i32 @foo(i64 %a, i64 %b) {
+; CHECK: or.b32
+; CHECK-NOT: or.b64
+entry:
+ %or = or i64 %a, %b
+ %trunc = trunc i64 %or to i32
+ ret i32 %trunc
+}
>From a6b26498b147bc51dcd953db2328bd1134cb16f8 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 8 Jul 2024 14:07:53 -0700
Subject: [PATCH 02/32] Prefer early return
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 71 +++++++++++----------
1 file changed, 36 insertions(+), 35 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index e0f4348975bfe..2303a74cb1c0f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5605,45 +5605,46 @@ static SDValue PerformREMCombine(SDNode *N,
// This will reduce register pressure.
static SDValue PerformTruncCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
- if (DCI.isBeforeLegalizeOps()) {
- SDValue LogicalOp = N->getOperand(0);
- switch (LogicalOp.getOpcode()) {
- default:
- break;
- case ISD::ADD:
- case ISD::SUB:
- case ISD::MUL:
- case ISD::AND:
- case ISD::OR:
- case ISD::XOR: {
- EVT VT = N->getValueType(0);
- EVT LogicalVT = LogicalOp.getValueType();
- if (VT == MVT::i32 && LogicalVT == MVT::i64) {
- const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
- if (VT.isScalarInteger() ||
- TLI.isOperationLegal(LogicalOp.getOpcode(), VT)) {
- if (all_of(LogicalOp.getNode()->uses(), [](SDNode *U) {
- return U->isMachineOpcode() ?
- U->getMachineOpcode() == NVPTX::CVT_u32_u64 :
- U->getOpcode() == ISD::TRUNCATE;
- })) {
-
- SDLoc DL(N);
- SDValue None = DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE,
- DL, MVT::i32);
- SDNode *NarrowL = DCI.DAG.getMachineNode(
- NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(0), None);
- SDNode *NarrowR = DCI.DAG.getMachineNode(
- NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(1), None);
- return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT,
- SDValue(NarrowL, 0), SDValue(NarrowR, 0));
- }
+ if (!DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SDValue LogicalOp = N->getOperand(0);
+ switch (LogicalOp.getOpcode()) {
+ default:
+ break;
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ EVT VT = N->getValueType(0);
+ EVT LogicalVT = LogicalOp.getValueType();
+ if (VT == MVT::i32 && LogicalVT == MVT::i64) {
+ const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
+ if (VT.isScalarInteger() ||
+ TLI.isOperationLegal(LogicalOp.getOpcode(), VT)) {
+ if (all_of(LogicalOp.getNode()->uses(), [](SDNode *U) {
+ return U->isMachineOpcode() ?
+ U->getMachineOpcode() == NVPTX::CVT_u32_u64 :
+ U->getOpcode() == ISD::TRUNCATE;
+ })) {
+ SDLoc DL(N);
+ SDValue None = DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE,
+ DL, MVT::i32);
+ SDNode *NarrowL = DCI.DAG.getMachineNode(
+ NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(0), None);
+ SDNode *NarrowR = DCI.DAG.getMachineNode(
+ NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(1), None);
+ return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT,
+ SDValue(NarrowL, 0), SDValue(NarrowR, 0));
}
}
- break;
- }
}
+ break;
}
+ }
+
return SDValue();
}
>From 3f3adc2a1a8188bf5c1873e13c5dab0c2a068465 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 8 Jul 2024 14:08:15 -0700
Subject: [PATCH 03/32] clang-format
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 2303a74cb1c0f..bdd48e59f4666 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5602,7 +5602,7 @@ static SDValue PerformREMCombine(SDNode *N,
}
// truncate (logic_op x, y) --> logic_op (truncate x), (truncate y)
-// This will reduce register pressure.
+// This will reduce register pressure.
static SDValue PerformTruncCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
if (!DCI.isBeforeLegalizeOps())
@@ -5610,7 +5610,7 @@ static SDValue PerformTruncCombine(SDNode *N,
SDValue LogicalOp = N->getOperand(0);
switch (LogicalOp.getOpcode()) {
- default:
+ default:
break;
case ISD::ADD:
case ISD::SUB:
@@ -5625,19 +5625,19 @@ static SDValue PerformTruncCombine(SDNode *N,
if (VT.isScalarInteger() ||
TLI.isOperationLegal(LogicalOp.getOpcode(), VT)) {
if (all_of(LogicalOp.getNode()->uses(), [](SDNode *U) {
- return U->isMachineOpcode() ?
- U->getMachineOpcode() == NVPTX::CVT_u32_u64 :
- U->getOpcode() == ISD::TRUNCATE;
- })) {
+ return U->isMachineOpcode()
+ ? U->getMachineOpcode() == NVPTX::CVT_u32_u64
+ : U->getOpcode() == ISD::TRUNCATE;
+ })) {
SDLoc DL(N);
- SDValue None = DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE,
- DL, MVT::i32);
+ SDValue None =
+ DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
SDNode *NarrowL = DCI.DAG.getMachineNode(
NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(0), None);
SDNode *NarrowR = DCI.DAG.getMachineNode(
NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(1), None);
return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT,
- SDValue(NarrowL, 0), SDValue(NarrowR, 0));
+ SDValue(NarrowL, 0), SDValue(NarrowR, 0));
}
}
}
>From 3db69a8bdc79449a2829e71a36fb7e9c23c1d678 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 8 Jul 2024 14:17:55 -0700
Subject: [PATCH 04/32] Add negative test for `ISD::TRUNCATE`
---
llvm/test/CodeGen/NVPTX/combine-truncate.ll | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/NVPTX/combine-truncate.ll b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
index f43887064acc0..3d0827606dbc2 100644
--- a/llvm/test/CodeGen/NVPTX/combine-truncate.ll
+++ b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
@@ -1,6 +1,7 @@
; RUN: llc < %s | FileCheck %s
-define i32 @foo(i64 %a, i64 %b) {
+define i32 @trunc(i64 %a, i64 %b) {
+; CHECK-LABEL: trunc
; CHECK: or.b32
; CHECK-NOT: or.b64
entry:
@@ -8,3 +9,14 @@ entry:
%trunc = trunc i64 %or to i32
ret i32 %trunc
}
+
+define i32 @trunc_not(i64 %a, i64 %b, ptr %p) {
+; CHECK-LABEL: trunc_not
+; CHECK: or.b64
+; CHECK-NOT: or.b32
+entry:
+ %or = or i64 %a, %b
+ %trunc = trunc i64 %or to i32
+ store i64 %or, ptr %p
+ ret i32 %trunc
+}
>From 0ffe6132893ab3e3b3383a28ea60a5498fe53cb2 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Thu, 11 Jul 2024 19:42:56 -0700
Subject: [PATCH 05/32] Add cvt and cvt_not test
---
llvm/test/CodeGen/NVPTX/combine-truncate.ll | 90 ++++++++++++++++++---
1 file changed, 79 insertions(+), 11 deletions(-)
diff --git a/llvm/test/CodeGen/NVPTX/combine-truncate.ll b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
index 3d0827606dbc2..1e0d63ca24880 100644
--- a/llvm/test/CodeGen/NVPTX/combine-truncate.ll
+++ b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
@@ -1,22 +1,90 @@
-; RUN: llc < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 | %ptxas-verify %}
+
+target triple = "nvptx64-nvidia-cuda"
define i32 @trunc(i64 %a, i64 %b) {
-; CHECK-LABEL: trunc
-; CHECK: or.b32
-; CHECK-NOT: or.b64
-entry:
+; CHECK-LABEL: trunc(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [trunc_param_0];
+; CHECK-NEXT: ld.param.u64 %rd2, [trunc_param_1];
+; CHECK-NEXT: cvt.u32.u64 %r1, %rd2;
+; CHECK-NEXT: cvt.u32.u64 %r2, %rd1;
+; CHECK-NEXT: or.b32 %r3, %r2, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-NEXT: ret;
%or = or i64 %a, %b
%trunc = trunc i64 %or to i32
ret i32 %trunc
}
-define i32 @trunc_not(i64 %a, i64 %b, ptr %p) {
-; CHECK-LABEL: trunc_not
-; CHECK: or.b64
-; CHECK-NOT: or.b32
-entry:
+define i32 @trunc_not(i64 %a, i64 %b) {
+; CHECK-LABEL: trunc_not(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [trunc_not_param_0];
+; CHECK-NEXT: ld.param.u64 %rd2, [trunc_not_param_1];
+; CHECK-NEXT: or.b64 %rd3, %rd1, %rd2;
+; CHECK-NEXT: cvt.u32.u64 %r1, %rd3;
+; CHECK-NEXT: mov.u64 %rd4, 0;
+; CHECK-NEXT: st.u64 [%rd4], %rd3;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: ret;
%or = or i64 %a, %b
%trunc = trunc i64 %or to i32
- store i64 %or, ptr %p
+ store i64 %or, ptr null
+ ret i32 %trunc
+}
+
+define i32 @trunc_cvt(i64 %a, i64 %b) {
+; CHECK-LABEL: trunc_cvt(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [trunc_cvt_param_0];
+; CHECK-NEXT: ld.param.u64 %rd2, [trunc_cvt_param_1];
+; CHECK-NEXT: cvt.u32.u64 %r1, %rd2;
+; CHECK-NEXT: cvt.u32.u64 %r2, %rd1;
+; CHECK-NEXT: add.s32 %r3, %r2, %r1;
+; CHECK-NEXT: or.b32 %r4, %r3, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4;
+; CHECK-NEXT: ret;
+ %add = add i64 %a, %b
+ %or = or i64 %add, %a
+ %trunc = trunc i64 %or to i32
+ ret i32 %trunc
+}
+
+define i32 @trunc_cvt_not(i64 %a, i64 %b) {
+; CHECK-LABEL: trunc_cvt_not(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [trunc_cvt_not_param_0];
+; CHECK-NEXT: ld.param.u64 %rd2, [trunc_cvt_not_param_1];
+; CHECK-NEXT: mov.u64 %rd3, 0;
+; CHECK-NEXT: st.u64 [%rd3], %rd2;
+; CHECK-NEXT: cvt.u32.u64 %r1, %rd2;
+; CHECK-NEXT: cvt.u32.u64 %r2, %rd1;
+; CHECK-NEXT: add.s32 %r3, %r2, %r1;
+; CHECK-NEXT: or.b32 %r4, %r3, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4;
+; CHECK-NEXT: ret;
+ %add = add i64 %a, %b
+ store i64 %b, ptr null
+ %or = or i64 %add, %a
+ %trunc = trunc i64 %or to i32
ret i32 %trunc
}
>From d574becb7709de80bf59470f3bf47f29350a2f00 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Thu, 11 Jul 2024 19:52:32 -0700
Subject: [PATCH 06/32] Prefer early return
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 44 ++++++++++-----------
1 file changed, 22 insertions(+), 22 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index bdd48e59f4666..51a6c62077442 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5620,28 +5620,28 @@ static SDValue PerformTruncCombine(SDNode *N,
case ISD::XOR: {
EVT VT = N->getValueType(0);
EVT LogicalVT = LogicalOp.getValueType();
- if (VT == MVT::i32 && LogicalVT == MVT::i64) {
- const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
- if (VT.isScalarInteger() ||
- TLI.isOperationLegal(LogicalOp.getOpcode(), VT)) {
- if (all_of(LogicalOp.getNode()->uses(), [](SDNode *U) {
- return U->isMachineOpcode()
- ? U->getMachineOpcode() == NVPTX::CVT_u32_u64
- : U->getOpcode() == ISD::TRUNCATE;
- })) {
- SDLoc DL(N);
- SDValue None =
- DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
- SDNode *NarrowL = DCI.DAG.getMachineNode(
- NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(0), None);
- SDNode *NarrowR = DCI.DAG.getMachineNode(
- NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(1), None);
- return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT,
- SDValue(NarrowL, 0), SDValue(NarrowR, 0));
- }
- }
- }
- break;
+ if (VT != MVT::i32 || LogicalVT != MVT::i64)
+ break;
+ const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
+ if (!VT.isScalarInteger() &&
+ !TLI.isOperationLegal(LogicalOp.getOpcode(), VT))
+ break;
+ if (!all_of(LogicalOp.getNode()->uses(), [](SDNode *U) {
+ return U->isMachineOpcode()
+ ? U->getMachineOpcode() == NVPTX::CVT_u32_u64
+ : U->getOpcode() == ISD::TRUNCATE;
+ }))
+ break;
+
+ SDLoc DL(N);
+ SDValue CVTNone =
+ DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
+ SDNode *NarrowL = DCI.DAG.getMachineNode(
+ NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(0), CVTNone);
+ SDNode *NarrowR = DCI.DAG.getMachineNode(
+ NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(1), CVTNone);
+ return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT,
+ SDValue(NarrowL, 0), SDValue(NarrowR, 0));
}
}
>From 503a162d5b75bb992d924b30c8bc78dfe8d5c9dc Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Thu, 11 Jul 2024 19:53:08 -0700
Subject: [PATCH 07/32] clang-format
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 20 ++++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 51a6c62077442..4ee888a711af2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5620,28 +5620,28 @@ static SDValue PerformTruncCombine(SDNode *N,
case ISD::XOR: {
EVT VT = N->getValueType(0);
EVT LogicalVT = LogicalOp.getValueType();
- if (VT != MVT::i32 || LogicalVT != MVT::i64)
+ if (VT != MVT::i32 || LogicalVT != MVT::i64)
break;
const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
- if (!VT.isScalarInteger() &&
+ if (!VT.isScalarInteger() &&
!TLI.isOperationLegal(LogicalOp.getOpcode(), VT))
break;
if (!all_of(LogicalOp.getNode()->uses(), [](SDNode *U) {
return U->isMachineOpcode()
- ? U->getMachineOpcode() == NVPTX::CVT_u32_u64
- : U->getOpcode() == ISD::TRUNCATE;
+ ? U->getMachineOpcode() == NVPTX::CVT_u32_u64
+ : U->getOpcode() == ISD::TRUNCATE;
}))
break;
SDLoc DL(N);
SDValue CVTNone =
DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
- SDNode *NarrowL = DCI.DAG.getMachineNode(
- NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(0), CVTNone);
- SDNode *NarrowR = DCI.DAG.getMachineNode(
- NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(1), CVTNone);
- return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT,
- SDValue(NarrowL, 0), SDValue(NarrowR, 0));
+ SDNode *NarrowL = DCI.DAG.getMachineNode(NVPTX::CVT_u32_u64, DL, VT,
+ LogicalOp.getOperand(0), CVTNone);
+ SDNode *NarrowR = DCI.DAG.getMachineNode(NVPTX::CVT_u32_u64, DL, VT,
+ LogicalOp.getOperand(1), CVTNone);
+ return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT, SDValue(NarrowL, 0),
+ SDValue(NarrowR, 0));
}
}
>From 23842ca9ba967abc5b21356bfb3061b963147880 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Thu, 11 Jul 2024 19:55:20 -0700
Subject: [PATCH 08/32] Correct trunc_cvt_not test
---
llvm/test/CodeGen/NVPTX/combine-truncate.ll | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/llvm/test/CodeGen/NVPTX/combine-truncate.ll b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
index 1e0d63ca24880..30e415ebe9527 100644
--- a/llvm/test/CodeGen/NVPTX/combine-truncate.ll
+++ b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
@@ -68,22 +68,22 @@ define i32 @trunc_cvt(i64 %a, i64 %b) {
define i32 @trunc_cvt_not(i64 %a, i64 %b) {
; CHECK-LABEL: trunc_cvt_not(
; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [trunc_cvt_not_param_0];
; CHECK-NEXT: ld.param.u64 %rd2, [trunc_cvt_not_param_1];
-; CHECK-NEXT: mov.u64 %rd3, 0;
-; CHECK-NEXT: st.u64 [%rd3], %rd2;
-; CHECK-NEXT: cvt.u32.u64 %r1, %rd2;
+; CHECK-NEXT: add.s64 %rd3, %rd1, %rd2;
+; CHECK-NEXT: mov.u64 %rd4, 0;
+; CHECK-NEXT: st.u64 [%rd4], %rd3;
+; CHECK-NEXT: cvt.u32.u64 %r1, %rd3;
; CHECK-NEXT: cvt.u32.u64 %r2, %rd1;
-; CHECK-NEXT: add.s32 %r3, %r2, %r1;
-; CHECK-NEXT: or.b32 %r4, %r3, %r2;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4;
+; CHECK-NEXT: or.b32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
; CHECK-NEXT: ret;
%add = add i64 %a, %b
- store i64 %b, ptr null
+ store i64 %add, ptr null
%or = or i64 %add, %a
%trunc = trunc i64 %or to i32
ret i32 %trunc
>From 571b61f519e6d66345d0e737eb8543cf51b73be6 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 15 Jul 2024 14:48:10 -0700
Subject: [PATCH 09/32] Delete NVPTXISelLowering implementation
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 54 +--------------------
1 file changed, 1 insertion(+), 53 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 4ee888a711af2..9429d95a46c5d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -725,7 +725,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// We have some custom DAG combine patterns for these nodes
setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
- ISD::TRUNCATE, ISD::VSELECT});
+ ISD::VSELECT});
// setcc for f16x2 and bf16x2 needs special handling to prevent
// legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -5601,53 +5601,6 @@ static SDValue PerformREMCombine(SDNode *N,
return SDValue();
}
-// truncate (logic_op x, y) --> logic_op (truncate x), (truncate y)
-// This will reduce register pressure.
-static SDValue PerformTruncCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- if (!DCI.isBeforeLegalizeOps())
- return SDValue();
-
- SDValue LogicalOp = N->getOperand(0);
- switch (LogicalOp.getOpcode()) {
- default:
- break;
- case ISD::ADD:
- case ISD::SUB:
- case ISD::MUL:
- case ISD::AND:
- case ISD::OR:
- case ISD::XOR: {
- EVT VT = N->getValueType(0);
- EVT LogicalVT = LogicalOp.getValueType();
- if (VT != MVT::i32 || LogicalVT != MVT::i64)
- break;
- const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
- if (!VT.isScalarInteger() &&
- !TLI.isOperationLegal(LogicalOp.getOpcode(), VT))
- break;
- if (!all_of(LogicalOp.getNode()->uses(), [](SDNode *U) {
- return U->isMachineOpcode()
- ? U->getMachineOpcode() == NVPTX::CVT_u32_u64
- : U->getOpcode() == ISD::TRUNCATE;
- }))
- break;
-
- SDLoc DL(N);
- SDValue CVTNone =
- DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
- SDNode *NarrowL = DCI.DAG.getMachineNode(NVPTX::CVT_u32_u64, DL, VT,
- LogicalOp.getOperand(0), CVTNone);
- SDNode *NarrowR = DCI.DAG.getMachineNode(NVPTX::CVT_u32_u64, DL, VT,
- LogicalOp.getOperand(1), CVTNone);
- return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT, SDValue(NarrowL, 0),
- SDValue(NarrowR, 0));
- }
- }
-
- return SDValue();
-}
-
enum OperandSignedness {
Signed = 0,
Unsigned,
@@ -6064,8 +6017,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::UREM:
case ISD::SREM:
return PerformREMCombine(N, DCI, OptLevel);
- case ISD::TRUNCATE:
- return PerformTruncCombine(N, DCI);
case ISD::SETCC:
return PerformSETCCCombine(N, DCI, STI.getSmVersion());
case ISD::LOAD:
@@ -6084,9 +6035,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return PerformVSELECTCombine(N, DCI);
}
- if (N->isMachineOpcode() && N->getMachineOpcode() == NVPTX::CVT_u32_u64)
- return PerformTruncCombine(N, DCI);
-
return SDValue();
}
>From e7015f3a10cb90f6e045b899b487ff5f1f3f8483 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 15 Jul 2024 15:12:58 -0700
Subject: [PATCH 10/32] Perform optimization via DAGCombine + TLI
---
llvm/include/llvm/CodeGen/TargetLowering.h | 2 ++
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 5 +++--
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 4 ++++
llvm/test/CodeGen/NVPTX/combine-truncate.ll | 20 +++++++------------
4 files changed, 16 insertions(+), 15 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 55b60b01e5827..49e25d2df6600 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2975,6 +2975,8 @@ class TargetLoweringBase {
return isTruncateFree(Val.getValueType(), VT2);
}
+ virtual bool shouldReduceRegisterPressure() const { return false; }
+
virtual bool isProfitableToHoist(Instruction *I) const { return true; }
/// Return true if the extension represented by \p I is free.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 765f1e1f5f68c..30126d51bf706 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5797,7 +5797,7 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
}
// logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
- if (HandOpcode == ISD::TRUNCATE) {
+ if (HandOpcode == ISD::TRUNCATE && !TLI.shouldReduceRegisterPressure()) {
// If both operands have other uses, this transform would create extra
// instructions without eliminating anything.
if (!N0.hasOneUse() && !N1.hasOneUse())
@@ -15155,7 +15155,8 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
case ISD::OR:
case ISD::XOR:
if (!LegalOperations && N0.hasOneUse() &&
- (isConstantOrConstantVector(N0.getOperand(0), true) ||
+ (TLI.shouldReduceRegisterPressure() ||
+ isConstantOrConstantVector(N0.getOperand(0), true) ||
isConstantOrConstantVector(N0.getOperand(1), true))) {
// TODO: We already restricted this to pre-legalization, but for vectors
// we are extra cautious to not create an unsupported operation.
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 63262961b363e..43891369424de 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -502,6 +502,10 @@ class NVPTXTargetLowering : public TargetLowering {
DstTy->getPrimitiveSizeInBits() == 32;
}
+ bool shouldReduceRegisterPressure() const override {
+ return true;
+ }
+
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
EVT VT) const override {
if (VT.isVector())
diff --git a/llvm/test/CodeGen/NVPTX/combine-truncate.ll b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
index 30e415ebe9527..a2b4c444e920f 100644
--- a/llvm/test/CodeGen/NVPTX/combine-truncate.ll
+++ b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
@@ -8,14 +8,11 @@ define i32 @trunc(i64 %a, i64 %b) {
; CHECK-LABEL: trunc(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<4>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [trunc_param_0];
-; CHECK-NEXT: ld.param.u64 %rd2, [trunc_param_1];
-; CHECK-NEXT: cvt.u32.u64 %r1, %rd2;
-; CHECK-NEXT: cvt.u32.u64 %r2, %rd1;
-; CHECK-NEXT: or.b32 %r3, %r2, %r1;
+; CHECK-NEXT: ld.param.u32 %r1, [trunc_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [trunc_param_1];
+; CHECK-NEXT: or.b32 %r3, %r1, %r2;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
; CHECK-NEXT: ret;
%or = or i64 %a, %b
@@ -48,15 +45,12 @@ define i32 @trunc_cvt(i64 %a, i64 %b) {
; CHECK-LABEL: trunc_cvt(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<5>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [trunc_cvt_param_0];
-; CHECK-NEXT: ld.param.u64 %rd2, [trunc_cvt_param_1];
-; CHECK-NEXT: cvt.u32.u64 %r1, %rd2;
-; CHECK-NEXT: cvt.u32.u64 %r2, %rd1;
-; CHECK-NEXT: add.s32 %r3, %r2, %r1;
-; CHECK-NEXT: or.b32 %r4, %r3, %r2;
+; CHECK-NEXT: ld.param.u32 %r1, [trunc_cvt_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [trunc_cvt_param_1];
+; CHECK-NEXT: add.s32 %r3, %r1, %r2;
+; CHECK-NEXT: or.b32 %r4, %r3, %r1;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4;
; CHECK-NEXT: ret;
%add = add i64 %a, %b
>From dc79b8fc72efce087b5351e7555010637258dd2e Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Tue, 16 Jul 2024 14:01:32 -0700
Subject: [PATCH 11/32] Update variadics-backend.ll
---
llvm/test/CodeGen/NVPTX/variadics-backend.ll | 65 +++++++++-----------
1 file changed, 30 insertions(+), 35 deletions(-)
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index 0e0c89d3e0214..59619fa009d1e 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -11,8 +11,8 @@
define dso_local i32 @variadics1(i32 noundef %first, ...) {
; CHECK-PTX-LABEL: variadics1(
; CHECK-PTX: {
-; CHECK-PTX-NEXT: .reg .b32 %r<11>;
-; CHECK-PTX-NEXT: .reg .b64 %rd<11>;
+; CHECK-PTX-NEXT: .reg .b32 %r<12>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<8>;
; CHECK-PTX-NEXT: .reg .f64 %fd<7>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
@@ -26,23 +26,21 @@ define dso_local i32 @variadics1(i32 noundef %first, ...) {
; CHECK-PTX-NEXT: add.s32 %r7, %r5, %r6;
; CHECK-PTX-NEXT: add.s64 %rd2, %rd1, 19;
; CHECK-PTX-NEXT: and.b64 %rd3, %rd2, -8;
-; CHECK-PTX-NEXT: ld.u64 %rd4, [%rd3];
-; CHECK-PTX-NEXT: cvt.u64.u32 %rd5, %r7;
-; CHECK-PTX-NEXT: add.s64 %rd6, %rd5, %rd4;
-; CHECK-PTX-NEXT: cvt.u32.u64 %r8, %rd6;
-; CHECK-PTX-NEXT: add.s64 %rd7, %rd3, 15;
-; CHECK-PTX-NEXT: and.b64 %rd8, %rd7, -8;
-; CHECK-PTX-NEXT: ld.f64 %fd1, [%rd8];
-; CHECK-PTX-NEXT: cvt.rn.f64.s32 %fd2, %r8;
+; CHECK-PTX-NEXT: ld.u32 %r8, [%rd3];
+; CHECK-PTX-NEXT: add.s32 %r9, %r7, %r8;
+; CHECK-PTX-NEXT: add.s64 %rd4, %rd3, 15;
+; CHECK-PTX-NEXT: and.b64 %rd5, %rd4, -8;
+; CHECK-PTX-NEXT: ld.f64 %fd1, [%rd5];
+; CHECK-PTX-NEXT: cvt.rn.f64.s32 %fd2, %r9;
; CHECK-PTX-NEXT: add.rn.f64 %fd3, %fd2, %fd1;
-; CHECK-PTX-NEXT: cvt.rzi.s32.f64 %r9, %fd3;
-; CHECK-PTX-NEXT: add.s64 %rd9, %rd8, 15;
-; CHECK-PTX-NEXT: and.b64 %rd10, %rd9, -8;
-; CHECK-PTX-NEXT: ld.f64 %fd4, [%rd10];
-; CHECK-PTX-NEXT: cvt.rn.f64.s32 %fd5, %r9;
+; CHECK-PTX-NEXT: cvt.rzi.s32.f64 %r10, %fd3;
+; CHECK-PTX-NEXT: add.s64 %rd6, %rd5, 15;
+; CHECK-PTX-NEXT: and.b64 %rd7, %rd6, -8;
+; CHECK-PTX-NEXT: ld.f64 %fd4, [%rd7];
+; CHECK-PTX-NEXT: cvt.rn.f64.s32 %fd5, %r10;
; CHECK-PTX-NEXT: add.rn.f64 %fd6, %fd5, %fd4;
-; CHECK-PTX-NEXT: cvt.rzi.s32.f64 %r10, %fd6;
-; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r10;
+; CHECK-PTX-NEXT: cvt.rzi.s32.f64 %r11, %fd6;
+; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r11;
; CHECK-PTX-NEXT: ret;
entry:
%vlist = alloca ptr, align 8
@@ -152,8 +150,8 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) {
; CHECK-PTX-NEXT: .reg .b64 %SP;
; CHECK-PTX-NEXT: .reg .b64 %SPL;
; CHECK-PTX-NEXT: .reg .b16 %rs<6>;
-; CHECK-PTX-NEXT: .reg .b32 %r<7>;
-; CHECK-PTX-NEXT: .reg .b64 %rd<11>;
+; CHECK-PTX-NEXT: .reg .b32 %r<8>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<8>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot2;
@@ -175,13 +173,11 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) {
; CHECK-PTX-NEXT: shl.b16 %rs4, %rs3, 8;
; CHECK-PTX-NEXT: or.b16 %rs5, %rs4, %rs2;
; CHECK-PTX-NEXT: st.u16 [%SP+0], %rs5;
-; CHECK-PTX-NEXT: ld.u64 %rd8, [%rd3+8];
-; CHECK-PTX-NEXT: add.s32 %r4, %r1, %r2;
-; CHECK-PTX-NEXT: add.s32 %r5, %r4, %r3;
-; CHECK-PTX-NEXT: cvt.u64.u32 %rd9, %r5;
-; CHECK-PTX-NEXT: add.s64 %rd10, %rd9, %rd8;
-; CHECK-PTX-NEXT: cvt.u32.u64 %r6, %rd10;
-; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r6;
+; CHECK-PTX-NEXT: ld.u32 %r4, [%rd3+8];
+; CHECK-PTX-NEXT: add.s32 %r5, %r1, %r2;
+; CHECK-PTX-NEXT: add.s32 %r6, %r5, %r3;
+; CHECK-PTX-NEXT: add.s32 %r7, %r6, %r4;
+; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r7;
; CHECK-PTX-NEXT: ret;
entry:
%vlist = alloca ptr, align 8
@@ -347,20 +343,19 @@ entry:
define dso_local i32 @variadics4(ptr noundef byval(%struct.S2) align 8 %first, ...) {
; CHECK-PTX-LABEL: variadics4(
; CHECK-PTX: {
-; CHECK-PTX-NEXT: .reg .b32 %r<2>;
-; CHECK-PTX-NEXT: .reg .b64 %rd<9>;
+; CHECK-PTX-NEXT: .reg .b32 %r<6>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<4>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: ld.param.u64 %rd1, [variadics4_param_1];
; CHECK-PTX-NEXT: add.s64 %rd2, %rd1, 7;
; CHECK-PTX-NEXT: and.b64 %rd3, %rd2, -8;
-; CHECK-PTX-NEXT: ld.u64 %rd4, [%rd3];
-; CHECK-PTX-NEXT: ld.param.u64 %rd5, [variadics4_param_0];
-; CHECK-PTX-NEXT: ld.param.u64 %rd6, [variadics4_param_0+8];
-; CHECK-PTX-NEXT: add.s64 %rd7, %rd5, %rd6;
-; CHECK-PTX-NEXT: add.s64 %rd8, %rd7, %rd4;
-; CHECK-PTX-NEXT: cvt.u32.u64 %r1, %rd8;
-; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-PTX-NEXT: ld.u32 %r1, [%rd3];
+; CHECK-PTX-NEXT: ld.param.u32 %r2, [variadics4_param_0];
+; CHECK-PTX-NEXT: ld.param.u32 %r3, [variadics4_param_0+8];
+; CHECK-PTX-NEXT: add.s32 %r4, %r2, %r3;
+; CHECK-PTX-NEXT: add.s32 %r5, %r4, %r1;
+; CHECK-PTX-NEXT: st.param.b32 [func_retval0+0], %r5;
; CHECK-PTX-NEXT: ret;
entry:
%vlist = alloca ptr, align 8
>From 680ac60d4e67194e6ba9c2ee8ed14fbe2b49cdc9 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Tue, 16 Jul 2024 17:49:33 -0700
Subject: [PATCH 12/32] Save work
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 30 ++++++++++++++++++++-
1 file changed, 29 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 9429d95a46c5d..8581a775862ce 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -725,7 +725,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// We have some custom DAG combine patterns for these nodes
setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
- ISD::VSELECT});
+ ISD::VSELECT, ISD::SELECT});
// setcc for f16x2 and bf16x2 needs special handling to prevent
// legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -5999,6 +5999,32 @@ static SDValue PerformLOADCombine(SDNode *N,
DL);
}
+// This transformations was once reliably performed by instcombine, but thanks
+// to poison semantics they are no longer safe for LLVM IR, perform them here
+// instead.
+static SDValue PerformSELECTCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ return SDValue();
+ if (!(N->getValueType(0) == MVT::i1))
+ return SDValue();
+
+ unsigned Opcode = N->getOpcode();
+ SDValue SecondOperand;
+ if (auto Const = dyn_cast<ConstantSDNode>(N->getOperand(2)); Const && Const->isZero()) {
+ // (select cond, x, false) -> (and cond, x)
+ Opcode = ISD::AND;
+ SecondOperand = N->getOperand(1);
+ } else if (auto Const = dyn_cast<ConstantSDNode>(N->getOperand(1)); Const && Const->isOne()) {
+ // (select cond, true, x) -> (or cond, x)
+ Opcode = ISD::OR;
+ SecondOperand = N->getOperand(2);
+ } else {
+ return SDValue();
+ }
+
+ return DCI.DAG.getNode(Opcode, SDLoc(N), MVT::i1, N->getOperand(0), SecondOperand);
+}
+
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -6021,6 +6047,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return PerformSETCCCombine(N, DCI, STI.getSmVersion());
case ISD::LOAD:
return PerformLOADCombine(N, DCI);
+ case ISD::SELECT:
+ return PerformSELECTCombine(N, DCI);
case NVPTXISD::StoreRetval:
case NVPTXISD::StoreRetvalV2:
case NVPTXISD::StoreRetvalV4:
>From 7fb23e555f4b3764df64005978b91dc0e8fd787a Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Tue, 16 Jul 2024 17:53:12 -0700
Subject: [PATCH 13/32] Modify boolean-patterns.ll
---
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 6 ------
llvm/test/CodeGen/NVPTX/boolean-patterns.ll | 6 ++----
2 files changed, 2 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index f37822f764bed..85c1db4182113 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1547,12 +1547,6 @@ defm XOR : BITWISE<"xor", xor>;
def : Pat<(mul Int1Regs:$a, Int1Regs:$b), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>;
def : Pat<(mul Int1Regs:$a, (i1 imm:$b)), (ANDb1ri Int1Regs:$a, imm:$b)>;
-// These transformations were once reliably performed by instcombine, but thanks
-// to poison semantics they are no longer safe for LLVM IR, perform them here
-// instead.
-def : Pat<(select Int1Regs:$a, Int1Regs:$b, 0), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>;
-def : Pat<(select Int1Regs:$a, 1, Int1Regs:$b), (ORb1rr Int1Regs:$a, Int1Regs:$b)>;
-
// Lower logical v2i16/v4i8 ops as bitwise ops on b32.
foreach vt = [v2i16, v4i8] in {
def: Pat<(or (vt Int32Regs:$a), (vt Int32Regs:$b)),
diff --git a/llvm/test/CodeGen/NVPTX/boolean-patterns.ll b/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
index d38880599d1e6..d0b9376368239 100644
--- a/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
+++ b/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
@@ -18,16 +18,14 @@ define i1 @m2and_ri(i1 %a) {
; CHECK-LABEL: select2or
define i1 @select2or(i1 %a, i1 %b) {
-; CHECK: or.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK-NOT: selp
+; CHECK: or.pred %p{{[0-9]+}}, %p{{[0-9]+}}, %p{{[0-9]+}}
%r = select i1 %a, i1 1, i1 %b
ret i1 %r
}
; CHECK-LABEL: select2and
define i1 @select2and(i1 %a, i1 %b) {
-; CHECK: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK-NOT: selp
+; CHECK: and.pred %p{{[0-9]+}}, %p{{[0-9]+}}, %p{{[0-9]+}}
%r = select i1 %a, i1 %b, i1 0
ret i1 %r
}
>From 96f17d39827042479e7e805157e34922d1901876 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Tue, 16 Jul 2024 18:51:45 -0700
Subject: [PATCH 14/32] Revert "Save work"
This reverts commit 680ac60d4e67194e6ba9c2ee8ed14fbe2b49cdc9.
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 30 +--------------------
1 file changed, 1 insertion(+), 29 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 8581a775862ce..9429d95a46c5d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -725,7 +725,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// We have some custom DAG combine patterns for these nodes
setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
- ISD::VSELECT, ISD::SELECT});
+ ISD::VSELECT});
// setcc for f16x2 and bf16x2 needs special handling to prevent
// legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -5999,32 +5999,6 @@ static SDValue PerformLOADCombine(SDNode *N,
DL);
}
-// This transformations was once reliably performed by instcombine, but thanks
-// to poison semantics they are no longer safe for LLVM IR, perform them here
-// instead.
-static SDValue PerformSELECTCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- return SDValue();
- if (!(N->getValueType(0) == MVT::i1))
- return SDValue();
-
- unsigned Opcode = N->getOpcode();
- SDValue SecondOperand;
- if (auto Const = dyn_cast<ConstantSDNode>(N->getOperand(2)); Const && Const->isZero()) {
- // (select cond, x, false) -> (and cond, x)
- Opcode = ISD::AND;
- SecondOperand = N->getOperand(1);
- } else if (auto Const = dyn_cast<ConstantSDNode>(N->getOperand(1)); Const && Const->isOne()) {
- // (select cond, true, x) -> (or cond, x)
- Opcode = ISD::OR;
- SecondOperand = N->getOperand(2);
- } else {
- return SDValue();
- }
-
- return DCI.DAG.getNode(Opcode, SDLoc(N), MVT::i1, N->getOperand(0), SecondOperand);
-}
-
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -6047,8 +6021,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return PerformSETCCCombine(N, DCI, STI.getSmVersion());
case ISD::LOAD:
return PerformLOADCombine(N, DCI);
- case ISD::SELECT:
- return PerformSELECTCombine(N, DCI);
case NVPTXISD::StoreRetval:
case NVPTXISD::StoreRetvalV2:
case NVPTXISD::StoreRetvalV4:
>From 6cb1f01f9492cae976953d12129c4da93cba37b5 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 17 Jul 2024 10:37:24 -0700
Subject: [PATCH 15/32] Save work
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 30126d51bf706..3f49c8862003d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15199,6 +15199,17 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
break;
}
+ switch (N0.getOpcode()) {
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ // if ()
+ break;
+ }
+
return SDValue();
}
>From 71b3718196993e12f81674f5e0806c6b535273cb Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 17 Jul 2024 10:38:25 -0700
Subject: [PATCH 16/32] Remove improper TLI use
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 3f49c8862003d..06b0f5b467fd7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15155,8 +15155,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
case ISD::OR:
case ISD::XOR:
if (!LegalOperations && N0.hasOneUse() &&
- (TLI.shouldReduceRegisterPressure() ||
- isConstantOrConstantVector(N0.getOperand(0), true) ||
+ (isConstantOrConstantVector(N0.getOperand(0), true) ||
isConstantOrConstantVector(N0.getOperand(1), true))) {
// TODO: We already restricted this to pre-legalization, but for vectors
// we are extra cautious to not create an unsupported operation.
>From 6064c42bab8c415711316d8dff08f1951326354c Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 17 Jul 2024 10:39:53 -0700
Subject: [PATCH 17/32] Remove white space
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 9429d95a46c5d..a2181b478c269 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -6034,7 +6034,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::VSELECT:
return PerformVSELECTCombine(N, DCI);
}
-
return SDValue();
}
>From 96551dd7f42082dd19c763de1e2a612363b7d94b Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 17 Jul 2024 10:40:27 -0700
Subject: [PATCH 18/32] clang-format
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 4 +---
2 files changed, 2 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 06b0f5b467fd7..5dfca48dd2879 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15206,7 +15206,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
case ISD::OR:
case ISD::XOR:
// if ()
- break;
+ break;
}
return SDValue();
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 43891369424de..6b3a9b5fffe95 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -502,9 +502,7 @@ class NVPTXTargetLowering : public TargetLowering {
DstTy->getPrimitiveSizeInBits() == 32;
}
- bool shouldReduceRegisterPressure() const override {
- return true;
- }
+ bool shouldReduceRegisterPressure() const override { return true; }
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
EVT VT) const override {
>From 6a3b3b9776b2d6caffc7b307ed5f118820a59214 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 17 Jul 2024 11:41:20 -0700
Subject: [PATCH 19/32] Implement transform in DAG combine
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 25 ++++++++++++-------
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 6 +++++
2 files changed, 22 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 5dfca48dd2879..c506d99634f66 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15198,15 +15198,22 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
break;
}
- switch (N0.getOpcode()) {
- case ISD::ADD:
- case ISD::SUB:
- case ISD::MUL:
- case ISD::AND:
- case ISD::OR:
- case ISD::XOR:
- // if ()
- break;
+ if (TLI.shouldReduceRegisterPressure()) {
+ switch (N0.getOpcode()) {
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ if (!(N0.hasOneUse() && VT.isScalarInteger()))
+ break;
+ if (LegalOperations && !TLI.isOperationLegal(N0.getOpcode(), VT))
+ break;
+ SDValue NarrowL = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
+ SDValue NarrowR = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
+ return DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR);
+ }
}
return SDValue();
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 85c1db4182113..f37822f764bed 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1547,6 +1547,12 @@ defm XOR : BITWISE<"xor", xor>;
def : Pat<(mul Int1Regs:$a, Int1Regs:$b), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>;
def : Pat<(mul Int1Regs:$a, (i1 imm:$b)), (ANDb1ri Int1Regs:$a, imm:$b)>;
+// These transformations were once reliably performed by instcombine, but thanks
+// to poison semantics they are no longer safe for LLVM IR, perform them here
+// instead.
+def : Pat<(select Int1Regs:$a, Int1Regs:$b, 0), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>;
+def : Pat<(select Int1Regs:$a, 1, Int1Regs:$b), (ORb1rr Int1Regs:$a, Int1Regs:$b)>;
+
// Lower logical v2i16/v4i8 ops as bitwise ops on b32.
foreach vt = [v2i16, v4i8] in {
def: Pat<(or (vt Int32Regs:$a), (vt Int32Regs:$b)),
>From 4448e33339e2573e4e427a4906cb9aa8535525e9 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 15:36:45 -0700
Subject: [PATCH 20/32] Save work
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 6b3a9b5fffe95..18f50b0597e34 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -496,12 +496,20 @@ class NVPTXTargetLowering : public TargetLowering {
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override {
// Truncating 64-bit to 32-bit is free in SASS.
- if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
+ if (!(SrcTy->isIntegerTy() && DstTy->isIntegerTy()))
return false;
return SrcTy->getPrimitiveSizeInBits() == 64 &&
DstTy->getPrimitiveSizeInBits() == 32;
}
+ bool isTruncateFree(EVT FromVT, EVT ToVT) const override {
+ // Truncating 64-bit to 32-bit is free in SASS.
+ if (!(FromVT.isScalarInteger() && ToVT.isScalarInteger()))
+ return false;
+ return FromVT.getFixedSizeInBits() == 64 &&
+ ToVT.getFixedSizeInBits() == 32;
+ }
+
bool shouldReduceRegisterPressure() const override { return true; }
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
>From c92f77fa8d49f6888a254ea803069dca914b5450 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 15:37:01 -0700
Subject: [PATCH 21/32] Remove isTruncateFree
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 8 --------
1 file changed, 8 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 18f50b0597e34..c61fedbe2ddfa 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -502,14 +502,6 @@ class NVPTXTargetLowering : public TargetLowering {
DstTy->getPrimitiveSizeInBits() == 32;
}
- bool isTruncateFree(EVT FromVT, EVT ToVT) const override {
- // Truncating 64-bit to 32-bit is free in SASS.
- if (!(FromVT.isScalarInteger() && ToVT.isScalarInteger()))
- return false;
- return FromVT.getFixedSizeInBits() == 64 &&
- ToVT.getFixedSizeInBits() == 32;
- }
-
bool shouldReduceRegisterPressure() const override { return true; }
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
>From f39a2f7d3dd913740f20fe27cc4541d98209d104 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 15:45:56 -0700
Subject: [PATCH 22/32] Use ptr in test
---
llvm/test/CodeGen/NVPTX/combine-truncate.ll | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/test/CodeGen/NVPTX/combine-truncate.ll b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
index a2b4c444e920f..221b89c7812e7 100644
--- a/llvm/test/CodeGen/NVPTX/combine-truncate.ll
+++ b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
@@ -59,7 +59,7 @@ define i32 @trunc_cvt(i64 %a, i64 %b) {
ret i32 %trunc
}
-define i32 @trunc_cvt_not(i64 %a, i64 %b) {
+define i32 @trunc_cvt_not(i64 %a, i64 %b, ptr %p) {
; CHECK-LABEL: trunc_cvt_not(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<4>;
@@ -69,7 +69,7 @@ define i32 @trunc_cvt_not(i64 %a, i64 %b) {
; CHECK-NEXT: ld.param.u64 %rd1, [trunc_cvt_not_param_0];
; CHECK-NEXT: ld.param.u64 %rd2, [trunc_cvt_not_param_1];
; CHECK-NEXT: add.s64 %rd3, %rd1, %rd2;
-; CHECK-NEXT: mov.u64 %rd4, 0;
+; CHECK-NEXT: ld.param.u64 %rd4, [trunc_cvt_not_param_2];
; CHECK-NEXT: st.u64 [%rd4], %rd3;
; CHECK-NEXT: cvt.u32.u64 %r1, %rd3;
; CHECK-NEXT: cvt.u32.u64 %r2, %rd1;
@@ -77,7 +77,7 @@ define i32 @trunc_cvt_not(i64 %a, i64 %b) {
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
; CHECK-NEXT: ret;
%add = add i64 %a, %b
- store i64 %add, ptr null
+ store i64 %add, ptr %p
%or = or i64 %add, %a
%trunc = trunc i64 %or to i32
ret i32 %trunc
>From d77c79918e03e345b4f345d1e056b267885f57b7 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 15:55:31 -0700
Subject: [PATCH 23/32] Only run on free truncs
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 8 ++++++++
llvm/test/CodeGen/NVPTX/combine-truncate.ll | 16 ++++++++++++++++
3 files changed, 25 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c506d99634f66..2b05f75e07f85 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15206,7 +15206,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
case ISD::AND:
case ISD::OR:
case ISD::XOR:
- if (!(N0.hasOneUse() && VT.isScalarInteger()))
+ if (!(N0.hasOneUse() && VT.isScalarInteger() && TLI.isTruncateFree(SrcVT, VT)))
break;
if (LegalOperations && !TLI.isOperationLegal(N0.getOpcode(), VT))
break;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index c61fedbe2ddfa..18f50b0597e34 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -502,6 +502,14 @@ class NVPTXTargetLowering : public TargetLowering {
DstTy->getPrimitiveSizeInBits() == 32;
}
+ bool isTruncateFree(EVT FromVT, EVT ToVT) const override {
+ // Truncating 64-bit to 32-bit is free in SASS.
+ if (!(FromVT.isScalarInteger() && ToVT.isScalarInteger()))
+ return false;
+ return FromVT.getFixedSizeInBits() == 64 &&
+ ToVT.getFixedSizeInBits() == 32;
+ }
+
bool shouldReduceRegisterPressure() const override { return true; }
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
diff --git a/llvm/test/CodeGen/NVPTX/combine-truncate.ll b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
index 221b89c7812e7..1c1c091e39bb4 100644
--- a/llvm/test/CodeGen/NVPTX/combine-truncate.ll
+++ b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
@@ -82,3 +82,19 @@ define i32 @trunc_cvt_not(i64 %a, i64 %b, ptr %p) {
%trunc = trunc i64 %or to i32
ret i32 %trunc
}
+
+define i16 @trunc_i32_to_i16_not(i32 %a, i32 %b) {
+; CHECK-LABEL: trunc_i32_to_i16_not(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u16 %r1, [trunc_i32_to_i16_not_param_0];
+; CHECK-NEXT: ld.param.u16 %r2, [trunc_i32_to_i16_not_param_1];
+; CHECK-NEXT: or.b32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
+; CHECK-NEXT: ret;
+ %or = or i32 %a, %b
+ %trunc = trunc i32 %or to i16
+ ret i16 %trunc
+}
>From b5905aad0b27fc680edfbd357eb11b21ef62f2f0 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 16:04:56 -0700
Subject: [PATCH 24/32] Add comment
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2b05f75e07f85..9f66e36554f5a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5797,6 +5797,10 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
}
// logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
+ //
+ // For targets that are particulaly senesitve to register pressure it's preferable to
+ // increase the number of truncate instructions in order to decrease the bit
+ // width of the logic_op.
if (HandOpcode == ISD::TRUNCATE && !TLI.shouldReduceRegisterPressure()) {
// If both operands have other uses, this transform would create extra
// instructions without eliminating anything.
>From 147da4311bb286d63a810634d319f7668b78586e Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 16:05:32 -0700
Subject: [PATCH 25/32] clang-format
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 11 ++++++-----
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 3 +--
2 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 9f66e36554f5a..d4e1cd758ce49 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5797,10 +5797,10 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
}
// logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
- //
- // For targets that are particulaly senesitve to register pressure it's preferable to
- // increase the number of truncate instructions in order to decrease the bit
- // width of the logic_op.
+ //
+ // For targets that are particulaly senesitve to register pressure it's
+ // preferable to increase the number of truncate instructions in order to
+ // decrease the bit width of the logic_op.
if (HandOpcode == ISD::TRUNCATE && !TLI.shouldReduceRegisterPressure()) {
// If both operands have other uses, this transform would create extra
// instructions without eliminating anything.
@@ -15210,7 +15210,8 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
case ISD::AND:
case ISD::OR:
case ISD::XOR:
- if (!(N0.hasOneUse() && VT.isScalarInteger() && TLI.isTruncateFree(SrcVT, VT)))
+ if (!(N0.hasOneUse() && VT.isScalarInteger() &&
+ TLI.isTruncateFree(SrcVT, VT)))
break;
if (LegalOperations && !TLI.isOperationLegal(N0.getOpcode(), VT))
break;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 18f50b0597e34..a400843cc3ad6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -506,8 +506,7 @@ class NVPTXTargetLowering : public TargetLowering {
// Truncating 64-bit to 32-bit is free in SASS.
if (!(FromVT.isScalarInteger() && ToVT.isScalarInteger()))
return false;
- return FromVT.getFixedSizeInBits() == 64 &&
- ToVT.getFixedSizeInBits() == 32;
+ return FromVT.getFixedSizeInBits() == 64 && ToVT.getFixedSizeInBits() == 32;
}
bool shouldReduceRegisterPressure() const override { return true; }
>From f29c7a58b52d9522e82e51ff13d71c02beec6359 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 16:06:47 -0700
Subject: [PATCH 26/32] explicitly mention GPU in comment
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d4e1cd758ce49..c6f490c77d7df 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5798,7 +5798,7 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
// logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
//
- // For targets that are particulaly senesitve to register pressure it's
+ // For targets that are particulaly senesitve to register pressure (e.g. GPUs) it's
// preferable to increase the number of truncate instructions in order to
// decrease the bit width of the logic_op.
if (HandOpcode == ISD::TRUNCATE && !TLI.shouldReduceRegisterPressure()) {
>From 12418d6674c9ec83dc75c6446d79d170671a2cf1 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 16:06:59 -0700
Subject: [PATCH 27/32] clang-format
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c6f490c77d7df..a4712a24d1a0b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5798,8 +5798,8 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
// logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
//
- // For targets that are particulaly senesitve to register pressure (e.g. GPUs) it's
- // preferable to increase the number of truncate instructions in order to
+ // For targets that are particulaly senesitve to register pressure (e.g. GPUs)
+ // it's preferable to increase the number of truncate instructions in order to
// decrease the bit width of the logic_op.
if (HandOpcode == ISD::TRUNCATE && !TLI.shouldReduceRegisterPressure()) {
// If both operands have other uses, this transform would create extra
>From 45de2d60ac3c58c143fa3ec9d83eff1465a1631c Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 16:11:56 -0700
Subject: [PATCH 28/32] Add comment to TLI function
---
llvm/include/llvm/CodeGen/TargetLowering.h | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 49e25d2df6600..a777c292148a5 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2975,6 +2975,8 @@ class TargetLoweringBase {
return isTruncateFree(Val.getValueType(), VT2);
}
+ // Return true if the target will accepts tradeoffs (e.g. increase the number
+ // of instructions) to reduce register pressure.
virtual bool shouldReduceRegisterPressure() const { return false; }
virtual bool isProfitableToHoist(Instruction *I) const { return true; }
>From 09515c4c649bf183a199b6fc71a349a3780f625e Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 16:12:11 -0700
Subject: [PATCH 29/32] clang-format
---
llvm/include/llvm/CodeGen/TargetLowering.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index a777c292148a5..8f5975434688d 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2975,7 +2975,7 @@ class TargetLoweringBase {
return isTruncateFree(Val.getValueType(), VT2);
}
- // Return true if the target will accepts tradeoffs (e.g. increase the number
+ // Return true if the target will accepts tradeoffs (e.g. increase the number
// of instructions) to reduce register pressure.
virtual bool shouldReduceRegisterPressure() const { return false; }
>From 7b801c19a0ac21b07883ee58ba550865a4e25075 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 16:41:32 -0700
Subject: [PATCH 30/32] Update boolean-patterns.ll
---
llvm/test/CodeGen/NVPTX/boolean-patterns.ll | 72 ++++++++++++++++++---
1 file changed, 63 insertions(+), 9 deletions(-)
diff --git a/llvm/test/CodeGen/NVPTX/boolean-patterns.ll b/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
index d0b9376368239..de0437cb96e39 100644
--- a/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
+++ b/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
@@ -1,31 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
-; CHECK-LABEL: m2and_rr
+target triple = "nvptx64-nvidia-cuda"
+
define i1 @m2and_rr(i1 %a, i1 %b) {
-; CHECK: and.pred %p{{[0-9]+}}, %p{{[0-9]+}}, %p{{[0-9]+}}
-; CHECK-NOT: mul
+; CHECK-LABEL: m2and_rr(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<4>;
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u8 %rs1, [m2and_rr_param_1];
+; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT: setp.eq.b16 %p1, %rs2, 1;
+; CHECK-NEXT: ld.param.u8 %rs3, [m2and_rr_param_0];
+; CHECK-NEXT: and.b16 %rs4, %rs3, 1;
+; CHECK-NEXT: setp.eq.b16 %p2, %rs4, 1;
+; CHECK-NEXT: and.pred %p3, %p2, %p1;
+; CHECK-NEXT: selp.u32 %r1, 1, 0, %p3;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: ret;
%r = mul i1 %a, %b
ret i1 %r
}
-; CHECK-LABEL: m2and_ri
define i1 @m2and_ri(i1 %a) {
-; CHECK-NOT: mul
+; CHECK-LABEL: m2and_ri(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u8 %r1, [m2and_ri_param_0];
+; CHECK-NEXT: and.b32 %r2, %r1, 1;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT: ret;
%r = mul i1 %a, 1
ret i1 %r
}
-; CHECK-LABEL: select2or
define i1 @select2or(i1 %a, i1 %b) {
-; CHECK: or.pred %p{{[0-9]+}}, %p{{[0-9]+}}, %p{{[0-9]+}}
+; CHECK-LABEL: select2or(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<4>;
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u8 %rs1, [select2or_param_1];
+; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT: setp.eq.b16 %p1, %rs2, 1;
+; CHECK-NEXT: ld.param.u8 %rs3, [select2or_param_0];
+; CHECK-NEXT: and.b16 %rs4, %rs3, 1;
+; CHECK-NEXT: setp.eq.b16 %p2, %rs4, 1;
+; CHECK-NEXT: or.pred %p3, %p2, %p1;
+; CHECK-NEXT: selp.u32 %r1, 1, 0, %p3;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: ret;
%r = select i1 %a, i1 1, i1 %b
ret i1 %r
}
-; CHECK-LABEL: select2and
define i1 @select2and(i1 %a, i1 %b) {
-; CHECK: and.pred %p{{[0-9]+}}, %p{{[0-9]+}}, %p{{[0-9]+}}
+; CHECK-LABEL: select2and(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<4>;
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u8 %rs1, [select2and_param_1];
+; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT: setp.eq.b16 %p1, %rs2, 1;
+; CHECK-NEXT: ld.param.u8 %rs3, [select2and_param_0];
+; CHECK-NEXT: and.b16 %rs4, %rs3, 1;
+; CHECK-NEXT: setp.eq.b16 %p2, %rs4, 1;
+; CHECK-NEXT: and.pred %p3, %p2, %p1;
+; CHECK-NEXT: selp.u32 %r1, 1, 0, %p3;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: ret;
%r = select i1 %a, i1 %b, i1 0
ret i1 %r
}
>From c80e6cdaf48505170441c3d4f0383d03557223d5 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Tue, 23 Jul 2024 16:12:00 -0700
Subject: [PATCH 31/32] Use isNarrowingProfitable instead of
shouldReduceRegisterPressure
---
llvm/include/llvm/CodeGen/TargetLowering.h | 4 --
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 18 ++++-----
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 8 ++--
llvm/test/CodeGen/NVPTX/boolean-patterns.ll | 40 ++++++++-----------
llvm/test/CodeGen/X86/apx/and.ll | 6 +--
...-merge-scalar-constmask-interleavedbits.ll | 16 ++++----
llvm/test/CodeGen/X86/vec_saddo.ll | 8 +++-
llvm/test/CodeGen/X86/vec_uaddo.ll | 8 +++-
8 files changed, 50 insertions(+), 58 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 8f5975434688d..55b60b01e5827 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2975,10 +2975,6 @@ class TargetLoweringBase {
return isTruncateFree(Val.getValueType(), VT2);
}
- // Return true if the target will accepts tradeoffs (e.g. increase the number
- // of instructions) to reduce register pressure.
- virtual bool shouldReduceRegisterPressure() const { return false; }
-
virtual bool isProfitableToHoist(Instruction *I) const { return true; }
/// Return true if the extension represented by \p I is free.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a4712a24d1a0b..5476c2450089c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5797,11 +5797,7 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
}
// logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
- //
- // For targets that are particulaly senesitve to register pressure (e.g. GPUs)
- // it's preferable to increase the number of truncate instructions in order to
- // decrease the bit width of the logic_op.
- if (HandOpcode == ISD::TRUNCATE && !TLI.shouldReduceRegisterPressure()) {
+ if (HandOpcode == ISD::TRUNCATE) {
// If both operands have other uses, this transform would create extra
// instructions without eliminating anything.
if (!N0.hasOneUse() && !N1.hasOneUse())
@@ -5813,9 +5809,14 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
if (LegalOperations && !TLI.isOperationLegal(LogicOpcode, XVT))
return SDValue();
// Be extra careful sinking truncate. If it's free, there's no benefit in
- // widening a binop. Also, don't create a logic op on an illegal type.
+ // widening a binop.
if (TLI.isZExtFree(VT, XVT) && TLI.isTruncateFree(XVT, VT))
return SDValue();
+ // Prevent an infinite loop if the target prefers the inverse
+ // transformation.
+ if (TLI.isNarrowingProfitable(XVT, VT))
+ return SDValue();
+ // Don't create a logic op on an illegal type.
if (!TLI.isTypeLegal(XVT))
return SDValue();
SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
@@ -15202,7 +15203,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
break;
}
- if (TLI.shouldReduceRegisterPressure()) {
+ if (TLI.isNarrowingProfitable(SrcVT, VT)) {
switch (N0.getOpcode()) {
case ISD::ADD:
case ISD::SUB:
@@ -15210,8 +15211,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
case ISD::AND:
case ISD::OR:
case ISD::XOR:
- if (!(N0.hasOneUse() && VT.isScalarInteger() &&
- TLI.isTruncateFree(SrcVT, VT)))
+ if (!(N0.hasOneUse() && VT.isScalarInteger()))
break;
if (LegalOperations && !TLI.isOperationLegal(N0.getOpcode(), VT))
break;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index a400843cc3ad6..c79b3729b9846 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -502,15 +502,13 @@ class NVPTXTargetLowering : public TargetLowering {
DstTy->getPrimitiveSizeInBits() == 32;
}
- bool isTruncateFree(EVT FromVT, EVT ToVT) const override {
+ bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override {
// Truncating 64-bit to 32-bit is free in SASS.
- if (!(FromVT.isScalarInteger() && ToVT.isScalarInteger()))
+ if (!(SrcVT.isScalarInteger() && DestVT.isScalarInteger()))
return false;
- return FromVT.getFixedSizeInBits() == 64 && ToVT.getFixedSizeInBits() == 32;
+ return SrcVT.getFixedSizeInBits() == 64 && DestVT.getFixedSizeInBits() == 32;
}
- bool shouldReduceRegisterPressure() const override { return true; }
-
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
EVT VT) const override {
if (VT.isVector())
diff --git a/llvm/test/CodeGen/NVPTX/boolean-patterns.ll b/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
index de0437cb96e39..c30ebccdf9199 100644
--- a/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
+++ b/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
@@ -43,20 +43,16 @@ define i1 @m2and_ri(i1 %a) {
define i1 @select2or(i1 %a, i1 %b) {
; CHECK-LABEL: select2or(
; CHECK: {
-; CHECK-NEXT: .reg .pred %p<4>;
-; CHECK-NEXT: .reg .b16 %rs<5>;
-; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u8 %rs1, [select2or_param_1];
-; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
-; CHECK-NEXT: setp.eq.b16 %p1, %rs2, 1;
-; CHECK-NEXT: ld.param.u8 %rs3, [select2or_param_0];
-; CHECK-NEXT: and.b16 %rs4, %rs3, 1;
-; CHECK-NEXT: setp.eq.b16 %p2, %rs4, 1;
-; CHECK-NEXT: or.pred %p3, %p2, %p1;
-; CHECK-NEXT: selp.u32 %r1, 1, 0, %p3;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: ld.param.u8 %rs1, [select2or_param_0];
+; CHECK-NEXT: ld.param.u8 %rs2, [select2or_param_1];
+; CHECK-NEXT: or.b16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs3;
+; CHECK-NEXT: and.b32 %r2, %r1, 1;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
; CHECK-NEXT: ret;
%r = select i1 %a, i1 1, i1 %b
ret i1 %r
@@ -65,20 +61,16 @@ define i1 @select2or(i1 %a, i1 %b) {
define i1 @select2and(i1 %a, i1 %b) {
; CHECK-LABEL: select2and(
; CHECK: {
-; CHECK-NEXT: .reg .pred %p<4>;
-; CHECK-NEXT: .reg .b16 %rs<5>;
-; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u8 %rs1, [select2and_param_1];
-; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
-; CHECK-NEXT: setp.eq.b16 %p1, %rs2, 1;
-; CHECK-NEXT: ld.param.u8 %rs3, [select2and_param_0];
-; CHECK-NEXT: and.b16 %rs4, %rs3, 1;
-; CHECK-NEXT: setp.eq.b16 %p2, %rs4, 1;
-; CHECK-NEXT: and.pred %p3, %p2, %p1;
-; CHECK-NEXT: selp.u32 %r1, 1, 0, %p3;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: ld.param.u8 %rs1, [select2and_param_0];
+; CHECK-NEXT: ld.param.u8 %rs2, [select2and_param_1];
+; CHECK-NEXT: and.b16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs3;
+; CHECK-NEXT: and.b32 %r2, %r1, 1;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
; CHECK-NEXT: ret;
%r = select i1 %a, i1 %b, i1 0
ret i1 %r
diff --git a/llvm/test/CodeGen/X86/apx/and.ll b/llvm/test/CodeGen/X86/apx/and.ll
index 51858ad591605..130d1c2ead5ed 100644
--- a/llvm/test/CodeGen/X86/apx/and.ll
+++ b/llvm/test/CodeGen/X86/apx/and.ll
@@ -5,14 +5,12 @@
define i8 @and8rr(i8 noundef %a, i8 noundef %b) {
; CHECK-LABEL: and8rr:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: andl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x21,0xf7]
-; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: andb %sil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x20,0xf7]
; CHECK-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: and8rr:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} andl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x21,0xf7]
-; NF-NEXT: # kill: def $al killed $al killed $eax
+; NF-NEXT: {nf} andb %sil, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x20,0xf7]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%and = and i8 %a, %b
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll
index c4c4e5ed1fdde..2e2c152c5506a 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll
@@ -114,19 +114,19 @@ define i64 @out64_constmask(i64 %x, i64 %y) {
define i8 @in8_constmask(i8 %x, i8 %y) {
; CHECK-NOBMI-LABEL: in8_constmask:
; CHECK-NOBMI: # %bb.0:
-; CHECK-NOBMI-NEXT: movl %esi, %eax
-; CHECK-NOBMI-NEXT: xorl %esi, %edi
-; CHECK-NOBMI-NEXT: andb $85, %dil
-; CHECK-NOBMI-NEXT: xorb %dil, %al
+; CHECK-NOBMI-NEXT: movl %edi, %eax
+; CHECK-NOBMI-NEXT: xorb %sil, %al
+; CHECK-NOBMI-NEXT: andb $85, %al
+; CHECK-NOBMI-NEXT: xorb %sil, %al
; CHECK-NOBMI-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NOBMI-NEXT: retq
;
; CHECK-BMI-LABEL: in8_constmask:
; CHECK-BMI: # %bb.0:
-; CHECK-BMI-NEXT: movl %esi, %eax
-; CHECK-BMI-NEXT: xorl %esi, %edi
-; CHECK-BMI-NEXT: andb $85, %dil
-; CHECK-BMI-NEXT: xorb %dil, %al
+; CHECK-BMI-NEXT: movl %edi, %eax
+; CHECK-BMI-NEXT: xorb %sil, %al
+; CHECK-BMI-NEXT: andb $85, %al
+; CHECK-BMI-NEXT: xorb %sil, %al
; CHECK-BMI-NEXT: # kill: def $al killed $al killed $eax
; CHECK-BMI-NEXT: retq
%n0 = xor i8 %x, %y
diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index 460c5fe11f82a..0aea89e4acf01 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -1045,12 +1045,16 @@ define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
;
; AVX512-LABEL: saddo_v4i1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX512-NEXT: vpslld $31, %xmm0, %xmm2
; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k0
+; AVX512-NEXT: vpslld $31, %xmm1, %xmm2
+; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k0
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $12, %k0, %k0
+; AVX512-NEXT: kshiftrw $12, %k0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index bac118095331c..34e2811cb9bb5 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -1098,12 +1098,16 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
;
; AVX512-LABEL: uaddo_v4i1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX512-NEXT: vpslld $31, %xmm0, %xmm2
; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k0
+; AVX512-NEXT: vpslld $31, %xmm1, %xmm2
+; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k0
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $12, %k0, %k0
+; AVX512-NEXT: kshiftrw $12, %k0, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: movb %al, (%rdi)
; AVX512-NEXT: retq
>From 98cc5439361abc9188383d19b582d1bd22e0cb4d Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Tue, 23 Jul 2024 16:13:40 -0700
Subject: [PATCH 32/32] clang-format
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 3 ++-
2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 5476c2450089c..d5d990762a490 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5812,7 +5812,7 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
// widening a binop.
if (TLI.isZExtFree(VT, XVT) && TLI.isTruncateFree(XVT, VT))
return SDValue();
- // Prevent an infinite loop if the target prefers the inverse
+ // Prevent an infinite loop if the target prefers the inverse
// transformation.
if (TLI.isNarrowingProfitable(XVT, VT))
return SDValue();
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index c79b3729b9846..0f57a902db507 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -506,7 +506,8 @@ class NVPTXTargetLowering : public TargetLowering {
// Truncating 64-bit to 32-bit is free in SASS.
if (!(SrcVT.isScalarInteger() && DestVT.isScalarInteger()))
return false;
- return SrcVT.getFixedSizeInBits() == 64 && DestVT.getFixedSizeInBits() == 32;
+ return SrcVT.getFixedSizeInBits() == 64 &&
+ DestVT.getFixedSizeInBits() == 32;
}
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
More information about the llvm-commits
mailing list