[llvm] r216279 - R600/SI: Use READ2/WRITE2 instructions for 64-bit mem ops with 32-bit alignment

Tom Stellard thomas.stellard at amd.com
Fri Aug 22 11:49:35 PDT 2014


Author: tstellar
Date: Fri Aug 22 13:49:35 2014
New Revision: 216279

URL: http://llvm.org/viewvc/llvm-project?rev=216279&view=rev
Log:
R600/SI: Use READ2/WRITE2 instructions for 64-bit mem ops with 32-bit alignment

Modified:
    llvm/trunk/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
    llvm/trunk/lib/Target/R600/AMDGPUInstructions.td
    llvm/trunk/lib/Target/R600/SIInstrInfo.td
    llvm/trunk/lib/Target/R600/SIInstructions.td
    llvm/trunk/test/CodeGen/R600/unaligned-load-store.ll

Modified: llvm/trunk/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/AMDGPUISelDAGToDAG.cpp?rev=216279&r1=216278&r2=216279&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/AMDGPUISelDAGToDAG.cpp (original)
+++ llvm/trunk/lib/Target/R600/AMDGPUISelDAGToDAG.cpp Fri Aug 22 13:49:35 2014
@@ -91,6 +91,8 @@ private:
   bool isDSOffsetLegal(const SDValue &Base, unsigned Offset,
                        unsigned OffsetBits) const;
   bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
+  bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
+                                 SDValue &Offset1) const;
   void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                    SDValue &SOffset, SDValue &Offset, SDValue &Offen,
                    SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
@@ -782,6 +784,31 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1O
   return true;
 }
 
+bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
+                                                   SDValue &Offset0,
+                                                   SDValue &Offset1) const {
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    SDValue N0 = Addr.getOperand(0);
+    SDValue N1 = Addr.getOperand(1);
+    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+    unsigned DWordOffset0 = C1->getZExtValue() / 4;
+    unsigned DWordOffset1 = DWordOffset0 + 1;
+    // (add n0, c0)
+    if (isDSOffsetLegal(N0, DWordOffset1, 8)) {
+      Base = N0;
+      Offset0 = CurDAG->getTargetConstant(DWordOffset0, MVT::i8);
+      Offset1 = CurDAG->getTargetConstant(DWordOffset1, MVT::i8);
+      return true;
+    }
+  }
+
+  // default case
+  Base = Addr;
+  Offset0 = CurDAG->getTargetConstant(0, MVT::i8);
+  Offset1 = CurDAG->getTargetConstant(1, MVT::i8);
+  return true;
+}
+
 static SDValue wrapAddr64Rsrc(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) {
   return SDValue(DAG->getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::v4i32,
                                      Ptr), 0);

Modified: llvm/trunk/lib/Target/R600/AMDGPUInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/AMDGPUInstructions.td?rev=216279&r1=216278&r2=216279&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/AMDGPUInstructions.td (original)
+++ llvm/trunk/lib/Target/R600/AMDGPUInstructions.td Fri Aug 22 13:49:35 2014
@@ -282,6 +282,17 @@ def local_load : PatFrag<(ops node:$ptr)
     return isLocalLoad(dyn_cast<LoadSDNode>(N));
 }]>;
 
+class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{
+    return cast<MemSDNode>(N)->getAlignment() % 8 == 0;
+}]>;
+
+def local_load_aligned8bytes : Aligned8Bytes <
+  (ops node:$ptr), (local_load node:$ptr)
+>;
+
+def local_store_aligned8bytes : Aligned8Bytes <
+  (ops node:$val, node:$ptr), (local_store node:$val, node:$ptr)
+>;
 
 class local_binary_atomic_op<SDNode atomic_op> :
   PatFrag<(ops node:$ptr, node:$value),

Modified: llvm/trunk/lib/Target/R600/SIInstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/SIInstrInfo.td?rev=216279&r1=216278&r2=216279&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/SIInstrInfo.td (original)
+++ llvm/trunk/lib/Target/R600/SIInstrInfo.td Fri Aug 22 13:49:35 2014
@@ -192,6 +192,7 @@ def tfe : Operand <i1> {
 //===----------------------------------------------------------------------===//
 
 def DS1Addr1Offset : ComplexPattern<i32, 2, "SelectDS1Addr1Offset">;
+def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">;
 
 def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
 def MUBUFAddr64 : ComplexPattern<i64, 3, "SelectMUBUFAddr64">;

Modified: llvm/trunk/lib/Target/R600/SIInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/R600/SIInstructions.td?rev=216279&r1=216278&r2=216279&view=diff
==============================================================================
--- llvm/trunk/lib/Target/R600/SIInstructions.td (original)
+++ llvm/trunk/lib/Target/R600/SIInstructions.td Fri Aug 22 13:49:35 2014
@@ -2530,7 +2530,18 @@ def : DSReadPat <DS_READ_U8,  i32, az_ex
 def : DSReadPat <DS_READ_I16, i32, sextloadi16_local>;
 def : DSReadPat <DS_READ_U16, i32, az_extloadi16_local>;
 def : DSReadPat <DS_READ_B32, i32, local_load>;
-def : DSReadPat <DS_READ_B64, v2i32, local_load>;
+
+let AddedComplexity = 100 in {
+
+def : DSReadPat <DS_READ_B64, v2i32, local_load_aligned8bytes>;
+
+} // End AddedComplexity = 100
+
+def : Pat <
+  (v2i32 (local_load (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
+                                                    i8:$offset1))),
+  (DS_READ2_B32 (i1 0), $ptr, $offset0, $offset1)
+>;
 
 class DSWritePat <DS inst, ValueType vt, PatFrag frag> : Pat <
   (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)),
@@ -2540,7 +2551,18 @@ class DSWritePat <DS inst, ValueType vt,
 def : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>;
 def : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>;
 def : DSWritePat <DS_WRITE_B32, i32, local_store>;
-def : DSWritePat <DS_WRITE_B64, v2i32, local_store>;
+
+let AddedComplexity = 100 in {
+
+def : DSWritePat <DS_WRITE_B64, v2i32, local_store_aligned8bytes>;
+} // End AddedComplexity = 100
+
+def : Pat <
+  (local_store v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
+                                                            i8:$offset1)),
+  (DS_WRITE2_B32 (i1 0), $ptr, (EXTRACT_SUBREG $value, sub0),
+                        (EXTRACT_SUBREG $value, sub1), $offset0, $offset1)
+>;
 
 multiclass DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> {
   def : Pat <

Modified: llvm/trunk/test/CodeGen/R600/unaligned-load-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/R600/unaligned-load-store.ll?rev=216279&r1=216278&r2=216279&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/R600/unaligned-load-store.ll (original)
+++ llvm/trunk/test/CodeGen/R600/unaligned-load-store.ll Fri Aug 22 13:49:35 2014
@@ -32,9 +32,8 @@ define void @unaligned_load_store_v4i32(
   ret void
 }
 
-; FIXME: This should use ds_read2_b32
 ; SI-LABEL: @load_lds_i64_align_4
-; SI: DS_READ_B64
+; SI: DS_READ2_B32
 ; SI: S_ENDPGM
 define void @load_lds_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
   %val = load i64 addrspace(3)* %in, align 4
@@ -42,9 +41,61 @@ define void @load_lds_i64_align_4(i64 ad
   ret void
 }
 
+; SI-LABEL: @load_lds_i64_align_4_with_offset
+; SI: DS_READ2_B32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}}, 0x8, 0x9
+; SI: S_ENDPGM
+define void @load_lds_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+  %ptr = getelementptr i64 addrspace(3)* %in, i32 4
+  %val = load i64 addrspace(3)* %ptr, align 4
+  store i64 %val, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: @load_lds_i64_align_4_with_split_offset
+; This tests the case where the lo offset is 8-bits, but the hi offset is 9-bits
+; SI: DS_READ2_B32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}}, 0x0, 0x1
+; SI: S_ENDPGM
+define void @load_lds_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+  %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)*
+  %ptr255 = getelementptr i32 addrspace(3)* %ptr, i32 255
+  %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
+  %val = load i64 addrspace(3)* %ptri64, align 4
+  store i64 %val, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
 ; FIXME: Need to fix this case.
 ; define void @load_lds_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
 ;   %val = load i64 addrspace(3)* %in, align 1
 ;   store i64 %val, i64 addrspace(1)* %out, align 8
 ;   ret void
 ; }
+
+; SI-LABEL: @store_lds_i64_align_4
+; SI: DS_WRITE2_B32
+; SI: S_ENDPGM
+define void @store_lds_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
+  store i64 %val, i64 addrspace(3)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: @store_lds_i64_align_4_with_offset
+; DS_WRITE2_B32 v[{{[0-9]+}}], v[{{[0-9]+}}], v{{[0-9]}}, 0x8, 0x9
+; SI: S_ENDPGM
+define void @store_lds_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
+  %ptr = getelementptr i64 addrspace(3)* %out, i32 4
+  store i64 0, i64 addrspace(3)* %ptr, align 4
+  ret void
+}
+
+; SI-LABEL: @store_lds_i64_align_4_with_split_offset
+; This tests the case where the lo offset is 8-bits, but the hi offset is 9-bits
+; DS_WRITE2_B32 v[{{[0-9]+}}], v[{{[0-9]+}}], v{{[0-9]}}, 0x0, 0x1
+; SI: S_ENDPGM
+define void @store_lds_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
+  %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)*
+  %ptr255 = getelementptr i32 addrspace(3)* %ptr, i32 255
+  %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
+  store i64 0, i64 addrspace(3)* %out, align 4
+  ret void
+}





More information about the llvm-commits mailing list