[llvm] [NVPTX] Custom lower ADDRSPACECAST (PR #125607)
Justin Fargnoli via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 7 09:20:22 PST 2025
https://github.com/justinfargnoli updated https://github.com/llvm/llvm-project/pull/125607
>From a924b645859569f60258b23fb75df42af46d7b1c Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Sun, 2 Feb 2025 19:40:45 -0800
Subject: [PATCH 1/4] [NVPTX] Custom lower ADDRSPACECAST
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 21 ++++++++++++++++++++-
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 1 +
llvm/test/CodeGen/NVPTX/addrspacecast.ll | 14 ++++++++++++++
3 files changed, 35 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 9e7e1dbcea25d11..1b1f74d3412e41c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -989,6 +989,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16}, Expand);
}
+ setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
+
// No FPOW or FREM in PTX.
// Now deduce the information based on the above mentioned
@@ -2652,6 +2654,8 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
case ISD::FRAMEADDR:
return SDValue();
+ case ISD::ADDRSPACECAST:
+ return LowerADDRSPACECAST(Op, DAG);
case ISD::GlobalAddress:
return LowerGlobalAddress(Op, DAG);
case ISD::INTRINSIC_W_CHAIN:
@@ -2726,7 +2730,6 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FMUL:
// Used only for bf16 on SM80, where we select fma for non-ftz operation
return PromoteBinOpIfF32FTZ(Op, DAG);
-
default:
llvm_unreachable("Custom lowering not defined for operation");
}
@@ -2767,6 +2770,22 @@ unsigned NVPTXTargetLowering::getJumpTableEncoding() const {
return MachineJumpTableInfo::EK_Inline;
}
+SDValue NVPTXTargetLowering::LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
+
+ EVT OperandVT = Op.getOperand(0).getValueType();
+ unsigned SrcAS = N->getSrcAddressSpace();
+ EVT ResultVT = Op.getValueType();
+ unsigned DestAS = N->getDestAddressSpace();
+
+ if (SrcAS == llvm::ADDRESS_SPACE_GENERIC || DestAS == llvm::ADDRESS_SPACE_GENERIC)
+ return Op;
+
+ SDValue ToGeneric = DAG.getAddrSpaceCast(DL, OperandVT, Op.getOperand(0), SrcAS, llvm::ADDRESS_SPACE_GENERIC);
+ return DAG.getAddrSpaceCast(DL, ResultVT, ToGeneric, llvm::ADDRESS_SPACE_GENERIC, DestAS);
+}
+
// This function is almost a copy of SelectionDAG::expandVAArg().
// The only diff is that this one produces loads from local address space.
SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 5adf69d621552f3..74ec14ba5f8e320 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -264,6 +264,7 @@ class NVPTXTargetLowering : public TargetLowering {
const NVPTXSubtarget &STI; // cache the subtarget here
SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
+ SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/NVPTX/addrspacecast.ll b/llvm/test/CodeGen/NVPTX/addrspacecast.ll
index 23428b3728674eb..e3ebb2f458d46a2 100644
--- a/llvm/test/CodeGen/NVPTX/addrspacecast.ll
+++ b/llvm/test/CodeGen/NVPTX/addrspacecast.ll
@@ -99,6 +99,20 @@ define i32 @conv8(ptr %ptr) {
ret i32 %val
}
+; ALL-LABEL: conv9
+define i32 @conv9(ptr addrspace(1) %ptr) {
+; CLS32: cvta.global.u32
+; CLS32: cvta.to.shared.u32
+; CLS64: cvta.global.u64
+; CLS64: cvta.to.shared.u64
+; PTRCONV: cvt.u32.u64
+; NOPTRCONV-NOT: cvt.u32.u64
+; ALL: ld.shared.u32
+ %specptr = addrspacecast ptr addrspace(1) %ptr to ptr addrspace(3)
+ %val = load i32, ptr addrspace(3) %specptr
+ ret i32 %val
+}
+
; Check that we support addrspacecast when splitting the vector
; result (<2 x ptr> => 2 x <1 x ptr>).
; This also checks that scalarization works for addrspacecast
>From a3be75c30f87a0d5b8638b4f0bd228f68eb86eed Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Sun, 2 Feb 2025 19:43:20 -0800
Subject: [PATCH 2/4] Fix whitespace
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 1b1f74d3412e41c..0f92c3579539600 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2730,6 +2730,7 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FMUL:
// Used only for bf16 on SM80, where we select fma for non-ftz operation
return PromoteBinOpIfF32FTZ(Op, DAG);
+
default:
llvm_unreachable("Custom lowering not defined for operation");
}
>From 9d3df673f90fae1f5c6faebe0b423f86b589c683 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Sun, 2 Feb 2025 19:43:50 -0800
Subject: [PATCH 3/4] clang-format
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 0f92c3579539600..e5071c67560c806 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2771,7 +2771,8 @@ unsigned NVPTXTargetLowering::getJumpTableEncoding() const {
return MachineJumpTableInfo::EK_Inline;
}
-SDValue NVPTXTargetLowering::LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const {
+SDValue NVPTXTargetLowering::LowerADDRSPACECAST(SDValue Op,
+ SelectionDAG &DAG) const {
SDLoc DL(Op);
AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
@@ -2780,11 +2781,14 @@ SDValue NVPTXTargetLowering::LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) c
EVT ResultVT = Op.getValueType();
unsigned DestAS = N->getDestAddressSpace();
- if (SrcAS == llvm::ADDRESS_SPACE_GENERIC || DestAS == llvm::ADDRESS_SPACE_GENERIC)
+ if (SrcAS == llvm::ADDRESS_SPACE_GENERIC ||
+ DestAS == llvm::ADDRESS_SPACE_GENERIC)
return Op;
- SDValue ToGeneric = DAG.getAddrSpaceCast(DL, OperandVT, Op.getOperand(0), SrcAS, llvm::ADDRESS_SPACE_GENERIC);
- return DAG.getAddrSpaceCast(DL, ResultVT, ToGeneric, llvm::ADDRESS_SPACE_GENERIC, DestAS);
+ SDValue ToGeneric = DAG.getAddrSpaceCast(DL, OperandVT, Op.getOperand(0),
+ SrcAS, llvm::ADDRESS_SPACE_GENERIC);
+ return DAG.getAddrSpaceCast(DL, ResultVT, ToGeneric,
+ llvm::ADDRESS_SPACE_GENERIC, DestAS);
}
// This function is almost a copy of SelectionDAG::expandVAArg().
>From 6808e7d92e24e38c12a6125da98ad4060d4cbf45 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Fri, 7 Feb 2025 09:19:57 -0800
Subject: [PATCH 4/4] Return UNDEF on invalid addrspacecast
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 17 ++++-------------
llvm/test/CodeGen/NVPTX/addrspacecast.ll | 21 +++++++++------------
2 files changed, 13 insertions(+), 25 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index e5071c67560c806..44f62e213c9d3e5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2773,22 +2773,13 @@ unsigned NVPTXTargetLowering::getJumpTableEncoding() const {
SDValue NVPTXTargetLowering::LowerADDRSPACECAST(SDValue Op,
SelectionDAG &DAG) const {
- SDLoc DL(Op);
AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
-
- EVT OperandVT = Op.getOperand(0).getValueType();
unsigned SrcAS = N->getSrcAddressSpace();
- EVT ResultVT = Op.getValueType();
unsigned DestAS = N->getDestAddressSpace();
-
- if (SrcAS == llvm::ADDRESS_SPACE_GENERIC ||
- DestAS == llvm::ADDRESS_SPACE_GENERIC)
- return Op;
-
- SDValue ToGeneric = DAG.getAddrSpaceCast(DL, OperandVT, Op.getOperand(0),
- SrcAS, llvm::ADDRESS_SPACE_GENERIC);
- return DAG.getAddrSpaceCast(DL, ResultVT, ToGeneric,
- llvm::ADDRESS_SPACE_GENERIC, DestAS);
+ if (SrcAS != llvm::ADDRESS_SPACE_GENERIC &&
+ DestAS != llvm::ADDRESS_SPACE_GENERIC)
+ return DAG.getUNDEF(Op.getValueType());
+ return Op;
}
// This function is almost a copy of SelectionDAG::expandVAArg().
diff --git a/llvm/test/CodeGen/NVPTX/addrspacecast.ll b/llvm/test/CodeGen/NVPTX/addrspacecast.ll
index e3ebb2f458d46a2..0aa66d1fc45f3ab 100644
--- a/llvm/test/CodeGen/NVPTX/addrspacecast.ll
+++ b/llvm/test/CodeGen/NVPTX/addrspacecast.ll
@@ -1,15 +1,15 @@
-; RUN: llc -O0 < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s -check-prefixes=ALL,CLS32,G32
-; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -check-prefixes=ALL,NOPTRCONV,CLS64,G64
-; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 --nvptx-short-ptr| FileCheck %s -check-prefixes=ALL,PTRCONV,CLS64,G64
+; RUN: llc -O0 < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s -check-prefixes=ALL,CLS32
+; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s -check-prefixes=ALL,NOPTRCONV,CLS64
+; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | FileCheck %s -check-prefixes=ALL,PTRCONV,CLS64
; RUN: %if ptxas && !ptxas-12.0 %{ llc -O0 < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %}
; RUN: %if ptxas %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
; RUN: %if ptxas %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_20 --nvptx-short-ptr | %ptxas-verify %}
; ALL-LABEL: conv1
define i32 @conv1(ptr addrspace(1) %ptr) {
-; G32: cvta.global.u32
+; CLS32: cvta.global.u32
; ALL-NOT: cvt.u64.u32
-; G64: cvta.global.u64
+; CLS64: cvta.global.u64
; ALL: ld.u32
%genptr = addrspacecast ptr addrspace(1) %ptr to ptr
%val = load i32, ptr %genptr
@@ -101,13 +101,10 @@ define i32 @conv8(ptr %ptr) {
; ALL-LABEL: conv9
define i32 @conv9(ptr addrspace(1) %ptr) {
-; CLS32: cvta.global.u32
-; CLS32: cvta.to.shared.u32
-; CLS64: cvta.global.u64
-; CLS64: cvta.to.shared.u64
-; PTRCONV: cvt.u32.u64
-; NOPTRCONV-NOT: cvt.u32.u64
-; ALL: ld.shared.u32
+; CLS32: // implicit-def: %[[ADDR:r[0-9]+]]
+; PTRCONV: // implicit-def: %[[ADDR:r[0-9]+]]
+; NOPTRCONV: // implicit-def: %[[ADDR:rd[0-9]+]]
+; ALL: ld.shared.u32 %r{{[0-9]+}}, [%[[ADDR]]]
%specptr = addrspacecast ptr addrspace(1) %ptr to ptr addrspace(3)
%val = load i32, ptr addrspace(3) %specptr
ret i32 %val
More information about the llvm-commits mailing list