[llvm] r266126 - AMDGPU: add llvm.amdgcn.buffer.load/store intrinsics

Nicolai Haehnle via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 12 14:18:10 PDT 2016


Author: nha
Date: Tue Apr 12 16:18:10 2016
New Revision: 266126

URL: http://llvm.org/viewvc/llvm-project?rev=266126&view=rev
Log:
AMDGPU: add llvm.amdgcn.buffer.load/store intrinsics

Summary:
They correspond to BUFFER_LOAD/STORE_DWORD[X2,X3,X4] and mostly behave like
llvm.amdgcn.buffer.load/store.format. They will be used by Mesa for SSBOs and
atomic counters, at least when robust buffer access behavior is desired.
(These instructions perform no format conversion and do buffer range checking
per component.)

As a side effect of sharing patterns with llvm.amdgcn.buffer.store.format,
it has become trivial to add support for the f32 and v2f32 variants of that
intrinsic, so the patch does so.

Also DAG-ify (and fix) some tests in which I noticed intermittent failures
while developing this patch.

Some tests were (temporarily) adjusted for the required mayLoad/hasSideEffects
changes to the BUFFER_STORE_DWORD* instructions. See also
http://reviews.llvm.org/D18291.

Reviewers: arsenm, tstellarAMD, mareko

Subscribers: arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D18292

Added:
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
Modified:
    llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
    llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
    llvm/trunk/test/CodeGen/AMDGPU/captured-frame-index.ll
    llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
    llvm/trunk/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
    llvm/trunk/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
    llvm/trunk/test/CodeGen/AMDGPU/sminmax.ll

Modified: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td?rev=266126&r1=266125&r2=266126&view=diff
==============================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td (original)
+++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td Tue Apr 12 16:18:10 2016
@@ -224,7 +224,7 @@ def int_amdgcn_image_atomic_cmpswap : In
    llvm_i1_ty],       // slc(imm)
   []>;
 
-def int_amdgcn_buffer_load_format : Intrinsic <
+class AMDGPUBufferLoad : Intrinsic <
   [llvm_anyfloat_ty],
   [llvm_v4i32_ty,     // rsrc(SGPR)
    llvm_i32_ty,       // vindex(VGPR)
@@ -232,16 +232,20 @@ def int_amdgcn_buffer_load_format : Intr
    llvm_i1_ty,        // glc(imm)
    llvm_i1_ty],       // slc(imm)
   [IntrReadMem]>;
+def int_amdgcn_buffer_load_format : AMDGPUBufferLoad;
+def int_amdgcn_buffer_load : AMDGPUBufferLoad;
 
-def int_amdgcn_buffer_store_format : Intrinsic <
+class AMDGPUBufferStore : Intrinsic <
   [],
-  [llvm_anyfloat_ty,  // vdata(VGPR) -- can currently only select v4f32
+  [llvm_anyfloat_ty,  // vdata(VGPR) -- can currently only select f32, v2f32, v4f32
    llvm_v4i32_ty,     // rsrc(SGPR)
    llvm_i32_ty,       // vindex(VGPR)
    llvm_i32_ty,       // offset(SGPR/VGPR/imm)
    llvm_i1_ty,        // glc(imm)
    llvm_i1_ty],       // slc(imm)
   []>;
+def int_amdgcn_buffer_store_format : AMDGPUBufferStore;
+def int_amdgcn_buffer_store : AMDGPUBufferStore;
 
 class AMDGPUBufferAtomic : Intrinsic <
   [llvm_i32_ty],

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstructions.td?rev=266126&r1=266125&r2=266126&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td Tue Apr 12 16:18:10 2016
@@ -996,6 +996,11 @@ defm BUFFER_STORE_SHORT : MUBUF_Store_He
   mubuf<0x1a>, "buffer_store_short", VGPR_32, i32, truncstorei16_global
 >;
 
+// Without mayLoad and hasSideEffects, TableGen complains about the pattern
+// matching llvm.amdgcn.buffer.store. Eventually, we'll want a WriteOnly
+// property to express the effects of this intrinsic more precisely, see
+// http://reviews.llvm.org/D18291
+let mayLoad = 1, hasSideEffects = 1 in {
 defm BUFFER_STORE_DWORD : MUBUF_Store_Helper <
   mubuf<0x1c>, "buffer_store_dword", VGPR_32, i32, global_store
 >;
@@ -1007,6 +1012,7 @@ defm BUFFER_STORE_DWORDX2 : MUBUF_Store_
 defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
   mubuf<0x1e, 0x1f>, "buffer_store_dwordx4", VReg_128, v4i32, global_store
 >;
+}
 
 defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic <
   mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global
@@ -2140,41 +2146,36 @@ def : Pat <
 // buffer_load/store_format patterns
 //===----------------------------------------------------------------------===//
 
-multiclass MUBUF_LoadIntrinsicPat<ValueType vt, string opcode> {
+multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
+                                  string opcode> {
   def : Pat<
-    (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, 0,
-                                       (MUBUFIntrinsicOffset i32:$soffset,
-                                                             i16:$offset),
-                                       imm:$glc, imm:$slc)),
+    (vt (name v4i32:$rsrc, 0,
+              (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+              imm:$glc, imm:$slc)),
     (!cast<MUBUF>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
       (as_i1imm $glc), (as_i1imm $slc), 0)
   >;
 
   def : Pat<
-    (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$vindex,
-                                       (MUBUFIntrinsicOffset i32:$soffset,
-                                                             i16:$offset),
-                                       imm:$glc, imm:$slc)),
+    (vt (name v4i32:$rsrc, i32:$vindex,
+              (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+              imm:$glc, imm:$slc)),
     (!cast<MUBUF>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
       (as_i1imm $glc), (as_i1imm $slc), 0)
   >;
 
   def : Pat<
-    (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, 0,
-                                       (MUBUFIntrinsicVOffset i32:$soffset,
-                                                              i16:$offset,
-                                                              i32:$voffset),
-                                       imm:$glc, imm:$slc)),
+    (vt (name v4i32:$rsrc, 0,
+              (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+              imm:$glc, imm:$slc)),
     (!cast<MUBUF>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
       (as_i1imm $glc), (as_i1imm $slc), 0)
   >;
 
   def : Pat<
-    (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$vindex,
-                                       (MUBUFIntrinsicVOffset i32:$soffset,
-                                                              i16:$offset,
-                                                              i32:$voffset),
-                                       imm:$glc, imm:$slc)),
+    (vt (name v4i32:$rsrc, i32:$vindex,
+              (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+              imm:$glc, imm:$slc)),
     (!cast<MUBUF>(opcode # _BOTHEN)
       (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
       $rsrc, $soffset, (as_i16imm $offset),
@@ -2182,50 +2183,59 @@ multiclass MUBUF_LoadIntrinsicPat<ValueT
   >;
 }
 
-defm : MUBUF_LoadIntrinsicPat<f32, "BUFFER_LOAD_FORMAT_X">;
-defm : MUBUF_LoadIntrinsicPat<v2f32, "BUFFER_LOAD_FORMAT_XY">;
-defm : MUBUF_LoadIntrinsicPat<v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, f32, "BUFFER_LOAD_DWORD">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
 
-def : Pat<
-  (int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, 0,
-                                  (MUBUFIntrinsicOffset i32:$soffset,
-                                                        i16:$offset),
-                                  imm:$glc, imm:$slc),
-  (BUFFER_STORE_FORMAT_XYZW_OFFSET $vdata, $rsrc, $soffset, (as_i16imm $offset),
-    (as_i1imm $glc), (as_i1imm $slc), 0)
->;
+multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
+                                   string opcode> {
+  def : Pat<
+    (name vt:$vdata, v4i32:$rsrc, 0,
+          (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+          imm:$glc, imm:$slc),
+    (!cast<MUBUF>(opcode # _OFFSET) $vdata, $rsrc, $soffset, (as_i16imm $offset),
+                                    (as_i1imm $glc), (as_i1imm $slc), 0)
+  >;
 
-def : Pat<
-  (int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, i32:$vindex,
-                                  (MUBUFIntrinsicOffset i32:$soffset,
-                                                        i16:$offset),
-                                  imm:$glc, imm:$slc),
-  (BUFFER_STORE_FORMAT_XYZW_IDXEN $vdata, $vindex, $rsrc, $soffset,
-    (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0)
->;
+  def : Pat<
+    (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
+          (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+          imm:$glc, imm:$slc),
+    (!cast<MUBUF>(opcode # _IDXEN) $vdata, $vindex, $rsrc, $soffset,
+                                   (as_i16imm $offset), (as_i1imm $glc),
+                                   (as_i1imm $slc), 0)
+  >;
 
-def : Pat<
-  (int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, 0,
-                                  (MUBUFIntrinsicVOffset i32:$soffset,
-                                                         i16:$offset,
-                                                         i32:$voffset),
-                                  imm:$glc, imm:$slc),
-  (BUFFER_STORE_FORMAT_XYZW_OFFEN $vdata, $voffset, $rsrc, $soffset,
-    (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0)
->;
+  def : Pat<
+    (name vt:$vdata, v4i32:$rsrc, 0,
+          (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+          imm:$glc, imm:$slc),
+    (!cast<MUBUF>(opcode # _OFFEN) $vdata, $voffset, $rsrc, $soffset,
+                                   (as_i16imm $offset), (as_i1imm $glc),
+                                   (as_i1imm $slc), 0)
+  >;
 
-def : Pat<
-  (int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, i32:$vindex,
-                                  (MUBUFIntrinsicVOffset i32:$soffset,
-                                                         i16:$offset,
-                                                         i32:$voffset),
-                                  imm:$glc, imm:$slc),
-  (BUFFER_STORE_FORMAT_XYZW_BOTHEN
-    $vdata,
-    (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
-    $rsrc, $soffset, (as_i16imm $offset),
-    (as_i1imm $glc), (as_i1imm $slc), 0)
->;
+  def : Pat<
+    (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
+          (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+          imm:$glc, imm:$slc),
+    (!cast<MUBUF>(opcode # _BOTHEN)
+      $vdata,
+      (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+      $rsrc, $soffset, (as_i16imm $offset),
+      (as_i1imm $glc), (as_i1imm $slc), 0)
+  >;
+}
+
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, f32, "BUFFER_STORE_FORMAT_X">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, f32, "BUFFER_STORE_DWORD">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
 
 //===----------------------------------------------------------------------===//
 // buffer_atomic patterns

Modified: llvm/trunk/test/CodeGen/AMDGPU/captured-frame-index.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/captured-frame-index.ll?rev=266126&r1=266125&r2=266126&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/captured-frame-index.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/captured-frame-index.ll Tue Apr 12 16:18:10 2016
@@ -1,9 +1,9 @@
 ; RUN: llc -march=amdgcn -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}stored_fi_to_lds:
-; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
-; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO1]]
+; GCN-DAG: s_load_dword [[LDSPTR:s[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ZERO1]]
 ; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
 ; GCN: ds_write_b32  [[VLDSPTR]], [[ZERO0]]

Modified: llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll?rev=266126&r1=266125&r2=266126&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll Tue Apr 12 16:18:10 2016
@@ -33,8 +33,8 @@ define void @load_v2i8_to_v2f32(<2 x flo
 ; SI-NOT: bfe
 ; SI-NOT: v_cvt_f32_ubyte3_e32
 ; SI-DAG: v_cvt_f32_ubyte2_e32
-; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
-; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
+; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]],
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]],
 ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
 define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4

Modified: llvm/trunk/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/extractelt-to-trunc.ll?rev=266126&r1=266125&r2=266126&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/extractelt-to-trunc.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/extractelt-to-trunc.ll Tue Apr 12 16:18:10 2016
@@ -32,7 +32,6 @@ define void @bitcast_fp_to_vector_extrac
 ; GCN-LABEL: {{^}}bitcast_int_to_fpvector_extract_0:
 ; GCN: buffer_load_dwordx2
 ; GCN: v_add_i32
-; GCN: v_addc_u32
 ; GCN: buffer_store_dword
 define void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
    %a = load i64, i64 addrspace(1)* %in

Added: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll?rev=266126&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll Tue Apr 12 16:18:10 2016
@@ -0,0 +1,108 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}buffer_load:
+;CHECK: buffer_load_dwordx4 v[0:3], s[0:3], 0
+;CHECK: buffer_load_dwordx4 v[4:7], s[0:3], 0 glc
+;CHECK: buffer_load_dwordx4 v[8:11], s[0:3], 0 slc
+;CHECK: s_waitcnt
+define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
+  %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
+  %data_slc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
+  %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
+  %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
+  %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
+  ret {<4 x float>, <4 x float>, <4 x float>} %r2
+}
+
+;CHECK-LABEL: {{^}}buffer_load_immoffs:
+;CHECK: buffer_load_dwordx4 v[0:3], s[0:3], 0 offset:42
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
+;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1fff
+;CHECK: buffer_load_dwordx4 v[0:3], s[0:3], [[OFFSET]] offset:1
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 8192, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_idx:
+;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ofs:
+;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
+;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:58
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
+main_body:
+  %ofs = add i32 %1, 58
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_both:
+;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_both_reversed:
+;CHECK: v_mov_b32_e32 v2, v0
+;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_x1:
+;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt
+define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+main_body:
+  %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0)
+  ret float %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_x2:
+;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt
+define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+main_body:
+  %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0)
+  ret <2 x float> %data
+}
+
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
+declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
+
+attributes #0 = { nounwind readonly }

Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll?rev=266126&r1=266125&r2=266126&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll Tue Apr 12 16:18:10 2016
@@ -70,6 +70,24 @@ main_body:
   ret void
 }
 
+;CHECK-LABEL: {{^}}buffer_store_x1:
+;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.format.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x2:
+;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
+  ret void
+}
+
+declare void @llvm.amdgcn.buffer.store.format.f32(float, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0
 declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0
 declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1
 

Added: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll?rev=266126&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll Tue Apr 12 16:18:10 2016
@@ -0,0 +1,95 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}buffer_store:
+;CHECK: buffer_store_dwordx4 v[0:3], s[0:3], 0
+;CHECK: buffer_store_dwordx4 v[4:7], s[0:3], 0 glc
+;CHECK: buffer_store_dwordx4 v[8:11], s[0:3], 0 slc
+define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_immoffs:
+;CHECK: buffer_store_dwordx4 v[0:3], s[0:3], 0 offset:42
+define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_idx:
+;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_ofs:
+;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
+define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_both:
+;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen
+define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_both_reversed:
+;CHECK: v_mov_b32_e32 v6, v4
+;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen
+define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i1 0, i1 0)
+  ret void
+}
+
+; Ideally, the register allocator would avoid the wait here
+;
+;CHECK-LABEL: {{^}}buffer_store_wait:
+;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0) expcnt(0)
+;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x1:
+;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x2:
+;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 {
+main_body:
+  call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
+  ret void
+}
+
+declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }

Modified: llvm/trunk/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll?rev=266126&r1=266125&r2=266126&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll Tue Apr 12 16:18:10 2016
@@ -11,8 +11,8 @@ declare void @llvm.amdgcn.s.barrier() #1
 
 ; FUNC-LABEL: @reorder_local_load_global_store_local_load
 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
+; CI-NEXT: buffer_store_dword
 ; CI-NEXT: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
-; CI: buffer_store_dword
 define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
   %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
 
@@ -71,9 +71,9 @@ define void @no_reorder_barrier_local_lo
 }
 
 ; FUNC-LABEL: @reorder_constant_load_global_store_constant_load
-; CI: buffer_store_dword
 ; CI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
 ; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; CI-DAG: buffer_store_dword
 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x2
 ; CI: buffer_store_dword
@@ -184,11 +184,11 @@ define void @reorder_local_offsets(i32 a
 }
 
 ; FUNC-LABEL: @reorder_global_offsets
+; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
 ; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
 ; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404
-; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
-; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
 ; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
+; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
 ; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404
 ; CI: buffer_store_dword
 ; CI: s_endpgm

Modified: llvm/trunk/test/CodeGen/AMDGPU/sminmax.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/sminmax.ll?rev=266126&r1=266125&r2=266126&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sminmax.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/sminmax.ll Tue Apr 12 16:18:10 2016
@@ -46,11 +46,11 @@ define void @s_abs_v2i32(<2 x i32> addrs
 }
 
 ; FUNC-LABEL: {{^}}v_abs_v2i32:
-; GCN: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
-; GCN: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
 
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
 
 ; GCN: v_add_i32
 ; GCN: v_add_i32
@@ -97,15 +97,15 @@ define void @s_abs_v4i32(<4 x i32> addrs
 }
 
 ; FUNC-LABEL: {{^}}v_abs_v4i32:
-; GCN: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
-; GCN: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
-; GCN: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 0, [[SRC2:v[0-9]+]]
-; GCN: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]]
-
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]]
+; GCN-DAG: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 0, [[SRC2:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]]
+
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]]
 
 ; GCN: v_add_i32
 ; GCN: v_add_i32




More information about the llvm-commits mailing list