[PATCH] D75817: [NVPTX] Fix instruction selection for addresses in case of addrspacecasts
Thomas Faingnaert via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Sat Mar 7 13:33:35 PST 2020
thomasfaingnaert created this revision.
thomasfaingnaert added a reviewer: jholewinski.
Herald added a subscriber: hiraditya.
Herald added a project: LLVM.
The issue
=========
Consider the two following LLVM IR functions, both of which just store a value at an offset from a pointer. The only difference is the order of the `GEP` and `addrspacecast`:
target triple = "nvptx64-nvidia-cuda"
define void @bad(i64) {
%ptr = inttoptr i64 %0 to i16*
%gep = getelementptr i16, i16* %ptr, i64 16
%asc = addrspacecast i16* %gep to i16 addrspace(1)*
store i16 0, i16 addrspace(1)* %asc, align 16
ret void
}
define void @good(i64) {
%ptr = inttoptr i64 %0 to i16*
%asc = addrspacecast i16* %ptr to i16 addrspace(1)*
%gep = getelementptr i16, i16 addrspace(1)* %asc, i64 16
store i16 0, i16 addrspace(1)* %gep, align 16
ret void
}
This gets compiled to the following PTX by the NVPTX backend:
//
// Generated by LLVM NVPTX Back-End
//
.version 3.2
.target sm_20
.address_size 64
// .globl bad // -- Begin function bad
// @bad
.visible .func bad(
.param .b64 bad_param_0
)
{
.reg .b16 %rs<2>;
.reg .b64 %rd<4>;
// %bb.0:
ld.param.u64 %rd1, [bad_param_0];
add.s64 %rd2, %rd1, 32;
cvta.to.global.u64 %rd3, %rd2;
mov.u16 %rs1, 0;
st.global.u16 [%rd3], %rs1;
ret;
// -- End function
}
// .globl good // -- Begin function good
.visible .func good(
.param .b64 good_param_0
) // @good
{
.reg .b16 %rs<2>;
.reg .b64 %rd<3>;
// %bb.0:
ld.param.u64 %rd1, [good_param_0];
cvta.to.global.u64 %rd2, %rd1;
mov.u16 %rs1, 0;
st.global.u16 [%rd2+32], %rs1;
ret;
// -- End function
}
In the case where the `GEP` precedes the `addrspacecast` (the `bad` function), the backend emits an explicit `ADD` instruction (`add.s64 %rd2, %rd1, 32`), rather than folding the addition into the addressing mode of the store (compare `st.global.u16 [%rd2+32]` in `good`).
This is because instruction selection doesn't take into account possible `addrspacecast`s.
Proposed fix
============
To fix this, I check if the address is an `AddrSpaceCastSDNode`, and if so, whether there is an `ADD` node behind it.
In that case, I transform `addrspacecast(add(x, y))` to `add(addrspacecast(x), y)`, allowing the `ADD` to be fused in the memory operation.
Repository:
rG LLVM Github Monorepo
https://reviews.llvm.org/D75817
Files:
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
llvm/test/CodeGen/NVPTX/addrspace-offsets.ll
Index: llvm/test/CodeGen/NVPTX/addrspace-offsets.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/NVPTX/addrspace-offsets.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+define void @addrspacecast_offset(i64) {
+ %ptr = inttoptr i64 %0 to i16*
+
+; CHECK: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ %gep1 = getelementptr i16, i16* %ptr, i64 0
+ %asc1 = addrspacecast i16* %gep1 to i16 addrspace(1)*
+ store i16 0, i16 addrspace(1)* %asc1, align 16
+
+; CHECK: st.global.u16 [%rd{{[0-9]+}}+32], %rs{{[0-9]+}}
+ %gep2 = getelementptr i16, i16* %ptr, i64 16
+ %asc2 = addrspacecast i16* %gep2 to i16 addrspace(1)*
+ store i16 0, i16 addrspace(1)* %asc2, align 16
+
+; CHECK: st.global.u16 [%rd{{[0-9]+}}+64], %rs{{[0-9]+}}
+ %gep3 = getelementptr i16, i16* %ptr, i64 32
+ %asc3 = addrspacecast i16* %gep3 to i16 addrspace(1)*
+ store i16 0, i16 addrspace(1)* %asc3, align 16
+
+; CHECK: st.global.u16 [%rd{{[0-9]+}}+96], %rs{{[0-9]+}}
+ %gep4 = getelementptr i16, i16* %ptr, i64 48
+ %asc4 = addrspacecast i16* %gep4 to i16 addrspace(1)*
+ store i16 0, i16 addrspace(1)* %asc4, align 16
+
+ ret void
+}
Index: llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -3657,6 +3657,37 @@
return true;
}
}
+
+ // Handle addrspacecast nodes
+ if (AddrSpaceCastSDNode *ASCN = dyn_cast<AddrSpaceCastSDNode>(Addr)) {
+ // Check if there's an add behind the addrspacecast
+ SDNode *ADDN = dyn_cast<SDNode>(ASCN->getOperand(0));
+
+ if (ADDN && (ADDN->getOpcode() == ISD::ADD)) {
+ // Yes, so create new addrspacecast and add nodes.
+ // We transform addrspacecast(add(x, y)) to add(addrspacecast(x), y).
+ SDValue new_ASCN = CurDAG->getAddrSpaceCast(
+ SDLoc(ASCN), ASCN->getValueType(0), ADDN->getOperand(0),
+ ASCN->getSrcAddressSpace(), ASCN->getDestAddressSpace());
+ SDValue new_ADDN =
+ CurDAG->getNode(ISD::ADD, SDLoc(ADDN), ADDN->getValueType(0),
+ new_ASCN, ADDN->getOperand(1));
+
+ // Replace the old addrspacecast by the new add, effectively swapping the
+ // order of the addrspacecast and add.
+ ReplaceUses(Addr, new_ADDN);
+
+ // Instruction selection is not called for this new addrspacecast node, so
+ // call it manually.
+ SelectAddrSpaceCast(cast<SDNode>(new_ASCN));
+
+ // Finally, run instruction selection for the address on these new nodes.
+ // This will trigger the add case above, and move the addition to the
+ // addressing mode for the memory operation.
+ return SelectADDRri_imp(OpNode, new_ADDN, Base, Offset, mvt);
+ }
+ }
+
return false;
}
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D75817.248964.patch
Type: text/x-patch
Size: 2901 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20200307/0bcc3e19/attachment.bin>
More information about the llvm-commits
mailing list