[PATCH] D75817: [NVPTX] Fix instruction selection for addresses in case of addrspacecasts
Thomas Faingnaert via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Sat Mar 7 13:33:35 PST 2020
thomasfaingnaert created this revision.
thomasfaingnaert added a reviewer: jholewinski.
Herald added a subscriber: hiraditya.
Herald added a project: LLVM.
The issue
=========
Consider the two following LLVM IR functions, both of which just store a value at an offset from a pointer. The only difference is the order of the `GEP` and `addrspacecast`:
target triple = "nvptx64-nvidia-cuda"
define void @bad(i64) {
%ptr = inttoptr i64 %0 to i16*
%gep = getelementptr i16, i16* %ptr, i64 16
%asc = addrspacecast i16* %gep to i16 addrspace(1)*
store i16 0, i16 addrspace(1)* %asc, align 16
ret void
}
define void @good(i64) {
%ptr = inttoptr i64 %0 to i16*
%asc = addrspacecast i16* %ptr to i16 addrspace(1)*
%gep = getelementptr i16, i16 addrspace(1)* %asc, i64 16
store i16 0, i16 addrspace(1)* %gep, align 16
ret void
}
This gets compiled to the following PTX by the NVPTX backend:
//
// Generated by LLVM NVPTX Back-End
//
.version 3.2
.target sm_20
.address_size 64
// .globl bad // -- Begin function bad
// @bad
.visible .func bad(
.param .b64 bad_param_0
)
{
.reg .b16 %rs<2>;
.reg .b64 %rd<4>;
// %bb.0:
ld.param.u64 %rd1, [bad_param_0];
add.s64 %rd2, %rd1, 32;
cvta.to.global.u64 %rd3, %rd2;
mov.u16 %rs1, 0;
st.global.u16 [%rd3], %rs1;
ret;
// -- End function
}
// .globl good // -- Begin function good
.visible .func good(
.param .b64 good_param_0
) // @good
{
.reg .b16 %rs<2>;
.reg .b64 %rd<3>;
// %bb.0:
ld.param.u64 %rd1, [good_param_0];
cvta.to.global.u64 %rd2, %rd1;
mov.u16 %rs1, 0;
st.global.u16 [%rd2+32], %rs1;
ret;
// -- End function
}
In the case where the `GEP` precedes the `addrspacecast` (the `bad` function), the backend emits an explicit `ADD` instruction (`add.s64 %rd2, %rd1, 32`), rather than folding the addition into the addressing mode of the store (compare `st.global.u16 [%rd2+32]` in `good`).
This is because instruction selection doesn't take into account possible `addrspacecast`s.
Proposed fix
============
To fix this, I check if the address is an `AddrSpaceCastSDNode`, and if so, whether there is an `ADD` node behind it.
In that case, I transform `addrspacecast(add(x, y))` to `add(addrspacecast(x), y)`, allowing the `ADD` to be fused in the memory operation.
Repository:
rG LLVM Github Monorepo
https://reviews.llvm.org/D75817
Files:
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
llvm/test/CodeGen/NVPTX/addrspace-offsets.ll
Index: llvm/test/CodeGen/NVPTX/addrspace-offsets.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/addrspace-offsets.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+define void @addrspacecast_offset(i64) {
+ %ptr = inttoptr i64 %0 to i16*
+
+; CHECK: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ %gep1 = getelementptr i16, i16* %ptr, i64 0
+ %asc1 = addrspacecast i16* %gep1 to i16 addrspace(1)*
+ store i16 0, i16 addrspace(1)* %asc1, align 16
+
+; CHECK: st.global.u16 [%rd{{[0-9]+}}+32], %rs{{[0-9]+}}
+ %gep2 = getelementptr i16, i16* %ptr, i64 16
+ %asc2 = addrspacecast i16* %gep2 to i16 addrspace(1)*
+ store i16 0, i16 addrspace(1)* %asc2, align 16
+
+; CHECK: st.global.u16 [%rd{{[0-9]+}}+64], %rs{{[0-9]+}}
+ %gep3 = getelementptr i16, i16* %ptr, i64 32
+ %asc3 = addrspacecast i16* %gep3 to i16 addrspace(1)*
+ store i16 0, i16 addrspace(1)* %asc3, align 16
+
+; CHECK: st.global.u16 [%rd{{[0-9]+}}+96], %rs{{[0-9]+}}
+ %gep4 = getelementptr i16, i16* %ptr, i64 48
+ %asc4 = addrspacecast i16* %gep4 to i16 addrspace(1)*
+ store i16 0, i16 addrspace(1)* %asc4, align 16
+
+ ret void
+}
Index: llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -3657,6 +3657,37 @@
return true;
}
}
+
+ // Handle addrspacecast nodes
+ if (AddrSpaceCastSDNode *ASCN = dyn_cast<AddrSpaceCastSDNode>(Addr)) {
+ // Check if there's an add behind the addrspacecast
+ SDNode *ADDN = dyn_cast<SDNode>(ASCN->getOperand(0));
+
+ if (ADDN && (ADDN->getOpcode() == ISD::ADD)) {
+ // Yes, so create new addrspacecast and add nodes.
+ // We transform addrspacecast(add(x, y)) to add(addrspacecast(x), y).
+ SDValue new_ASCN = CurDAG->getAddrSpaceCast(
+ SDLoc(ASCN), ASCN->getValueType(0), ADDN->getOperand(0),
+ ASCN->getSrcAddressSpace(), ASCN->getDestAddressSpace());
+ SDValue new_ADDN =
+ CurDAG->getNode(ISD::ADD, SDLoc(ADDN), ADDN->getValueType(0),
+ new_ASCN, ADDN->getOperand(1));
+
+ // Replace the old addrspacecast by the new add, effectively swapping the
+ // order of the addrspacecast and add.
+ ReplaceUses(Addr, new_ADDN);
+
+ // Instruction selection is not called for this new addrspacecast node, so
+ // call it manually.
+ SelectAddrSpaceCast(cast<SDNode>(new_ASCN));
+
+ // Finally, run instruction selection for the address on these new nodes.
+ // This will trigger the add case above, and move the addition to the
+ // addressing mode for the memory operation.
+ return SelectADDRri_imp(OpNode, new_ADDN, Base, Offset, mvt);
+ }
+ }
+
return false;
}
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D75817.248964.patch
Type: text/x-patch
Size: 2901 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20200307/0bcc3e19/attachment.bin>
More information about the llvm-commits
mailing list