[llvm] r253974 - AMDGPU: Split x8 and x16 vector loads instead of scalarize

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 24 04:05:05 PST 2015


Author: arsenm
Date: Tue Nov 24 06:05:03 2015
New Revision: 253974

URL: http://llvm.org/viewvc/llvm-project?rev=253974&view=rev
Log:
AMDGPU: Split x8 and x16 vector loads instead of scalarize

The one regression in the builtin tests is in the read2 test, which now
(again) has many extra copies, but this should be solved once the pass
is replaced with a DAG combine.

Modified:
    llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
    llvm/trunk/test/CodeGen/AMDGPU/ds_read2_superreg.ll
    llvm/trunk/test/CodeGen/AMDGPU/global-extload-i32.ll
    llvm/trunk/test/CodeGen/AMDGPU/half.ll
    llvm/trunk/test/CodeGen/AMDGPU/load.ll
    llvm/trunk/test/CodeGen/AMDGPU/merge-stores.ll
    llvm/trunk/test/CodeGen/AMDGPU/reorder-stores.ll
    llvm/trunk/test/CodeGen/AMDGPU/salu-to-valu.ll

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=253974&r1=253973&r2=253974&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Tue Nov 24 06:05:03 2015
@@ -394,6 +394,16 @@ AMDGPUTargetLowering::AMDGPUTargetLoweri
 
   setFsqrtIsCheap(true);
 
+  // We want to find all load dependencies for long chains of stores to enable
+  // merging into very wide vectors. The problem is with vectors with > 4
+  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
+  // vectors are a legal type, even though we usually have to split the
+  // loads. When we can more precisely specify load legality per address
+  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
+  // smarter so that they can figure out what to do in 2 iterations without all
+  // N > 4 stores on the same chain.
+  GatherAllAliasesMaxDepth = 16;
+
   // FIXME: Need to really handle these.
   MaxStoresPerMemcpy  = 4096;
   MaxStoresPerMemmove = 4096;

Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=253974&r1=253973&r2=253974&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Tue Nov 24 06:05:03 2015
@@ -1178,10 +1178,14 @@ SDValue SITargetLowering::LowerLOAD(SDVa
            "Custom lowering for non-i32 vectors hasn't been implemented.");
     unsigned NumElements = Op.getValueType().getVectorNumElements();
     assert(NumElements != 2 && "v2 loads are supported for all address spaces.");
+
     switch (Load->getAddressSpace()) {
       default: break;
       case AMDGPUAS::GLOBAL_ADDRESS:
       case AMDGPUAS::PRIVATE_ADDRESS:
+        if (NumElements >= 8)
+          return SplitVectorLoad(Op, DAG);
+
         // v4 loads are supported for private and global memory.
         if (NumElements <= 4)
           break;
@@ -1409,7 +1413,7 @@ SDValue SITargetLowering::LowerSTORE(SDV
     return Ret;
 
   if (VT.isVector() && VT.getVectorNumElements() >= 8)
-      return ScalarizeVectorStore(Op, DAG);
+      return SplitVectorStore(Op, DAG);
 
   if (VT == MVT::i1)
     return DAG.getTruncStore(Store->getChain(), DL,

Modified: llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll?rev=253974&r1=253973&r2=253974&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll Tue Nov 24 06:05:03 2015
@@ -137,14 +137,8 @@ define void @load_v7i8_to_v7f32(<7 x flo
 ; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]]
 ; SI-NOT: bfe
 ; SI-NOT: lshr
-; SI: buffer_store_dword
-; SI: buffer_store_dword
-; SI: buffer_store_dword
-; SI: buffer_store_dword
-; SI: buffer_store_dword
-; SI: buffer_store_dword
-; SI: buffer_store_dword
-; SI: buffer_store_dword
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
 define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8
   %cvt = uitofp <8 x i8> %load to <8 x float>

Modified: llvm/trunk/test/CodeGen/AMDGPU/ds_read2_superreg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ds_read2_superreg.ll?rev=253974&r1=253973&r2=253974&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ds_read2_superreg.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ds_read2_superreg.ll Tue Nov 24 06:05:03 2015
@@ -116,19 +116,18 @@ define void @simple_read2_v4f32_superreg
   ret void
 }
 
+; FIXME: Extra moves shuffling superregister
 ; CI-LABEL: {{^}}simple_read2_v8f32_superreg:
-; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:7 offset1:6{{$}}
-; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:4{{$}}
-; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
-; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
-; CI: buffer_store_dword
-; CI: buffer_store_dword
-; CI: buffer_store_dword
-; CI: buffer_store_dword
-; CI: buffer_store_dword
-; CI: buffer_store_dword
-; CI: buffer_store_dword
-; CI: buffer_store_dword
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT7:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:7{{$}}
+; CI: v_mov_b32
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT6:[0-9]+]]:[[REG_ELT5:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:1{{$}}
+; CI: v_mov_b32
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT4:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:6 offset1:5{{$}}
+; CI: v_mov_b32
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4{{$}}
+; CI: v_mov_b32
+; CI: buffer_store_dwordx4
+; CI: buffer_store_dwordx4
 ; CI: s_endpgm
 define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -139,41 +138,30 @@ define void @simple_read2_v8f32_superreg
   ret void
 }
 
+; FIXME: Extra moves shuffling superregister
 ; CI-LABEL: {{^}}simple_read2_v16f32_superreg:
-; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:15 offset1:14{{$}}
-; CI-NOT: v_mov_b32
-; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:13 offset1:12{{$}}
-; CI-NOT: v_mov_b32
-; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:11 offset1:10{{$}}
-; CI-NOT: v_mov_b32
-; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:9 offset1:8{{$}}
-; CI-NOT: v_mov_b32
-; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:7 offset1:6{{$}}
-; CI-NOT: v_mov_b32
-; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:4{{$}}
-; CI-NOT: v_mov_b32
-; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
-; CI-NOT: v_mov_b32
-; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
-; CI-NOT: v_mov_b32
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT11:[0-9]+]]:[[REG_ELT15:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:7{{$}}
+; CI: v_mov_b32
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:11 offset1:15{{$}}
+; CI: v_mov_b32
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT12:[0-9]+]]:[[REG_ELT10:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:1{{$}}
+; CI: v_mov_b32
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT9:[0-9]+]]:[[REG_ELT8:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:6 offset1:5{{$}}
+; CI: v_mov_b32
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:10 offset1:9{{$}}
+; CI: v_mov_b32
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:14 offset1:13{{$}}
+; CI: v_mov_b32
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:12 offset1:8{{$}}
+; CI: v_mov_b32
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4{{$}}
+; CI: v_mov_b32
 
 ; CI: s_waitcnt lgkmcnt(0)
-; CI: buffer_store_dword
-; CI: buffer_store_dword
-; CI: buffer_store_dword
-; CI: buffer_store_dword
-; CI: buffer_store_dword
-; CI: buffer_store_dword
-; CI: buffer_store_dword
-; CI: buffer_store_dword
-; CI: buffer_store_dword
-; CI: buffer_store_dword
-; CI: buffer_store_dword
-; CI: buffer_store_dword
-; CI: buffer_store_dword
-; CI: buffer_store_dword
-; CI: buffer_store_dword
-; CI: buffer_store_dword
+; CI: buffer_store_dwordx4
+; CI: buffer_store_dwordx4
+; CI: buffer_store_dwordx4
+; CI: buffer_store_dwordx4
 ; CI: s_endpgm
 define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1

Modified: llvm/trunk/test/CodeGen/AMDGPU/global-extload-i32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/global-extload-i32.ll?rev=253974&r1=253973&r2=253974&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/global-extload-i32.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/global-extload-i32.ll Tue Nov 24 06:05:03 2015
@@ -106,14 +106,8 @@ define void @sextload_global_v4i32_to_v4
 }
 
 ; FUNC-LABEL: {{^}}zextload_global_v8i32_to_v8i64:
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
 ; SI-DAG: buffer_store_dwordx2
 ; SI-DAG: buffer_store_dwordx2
 ; SI-DAG: buffer_store_dwordx2
@@ -131,14 +125,8 @@ define void @zextload_global_v8i32_to_v8
 }
 
 ; FUNC-LABEL: {{^}}sextload_global_v8i32_to_v8i64:
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
 
 ; SI-DAG: v_ashrrev_i32
 ; SI-DAG: v_ashrrev_i32
@@ -166,22 +154,10 @@ define void @sextload_global_v8i32_to_v8
 }
 
 ; FUNC-LABEL: {{^}}sextload_global_v16i32_to_v16i64:
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
 
 ; SI-DAG: v_ashrrev_i32
 ; SI-DAG: v_ashrrev_i32
@@ -219,22 +195,10 @@ define void @sextload_global_v16i32_to_v
 }
 
 ; FUNC-LABEL: {{^}}zextload_global_v16i32_to_v16i64
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
 
 ; SI: buffer_store_dwordx2
 ; SI: buffer_store_dwordx2
@@ -262,41 +226,15 @@ define void @zextload_global_v16i32_to_v
 }
 
 ; FUNC-LABEL: {{^}}sextload_global_v32i32_to_v32i64:
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+
 
 ; SI-DAG: v_ashrrev_i32
 ; SI-DAG: v_ashrrev_i32
@@ -376,41 +314,14 @@ define void @sextload_global_v32i32_to_v
 }
 
 ; FUNC-LABEL: {{^}}zextload_global_v32i32_to_v32i64:
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
 
 ; SI-DAG: buffer_store_dwordx2
 ; SI-DAG: buffer_store_dwordx2

Modified: llvm/trunk/test/CodeGen/AMDGPU/half.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/half.ll?rev=253974&r1=253973&r2=253974&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/half.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/half.ll Tue Nov 24 06:05:03 2015
@@ -105,6 +105,26 @@ define void @extload_v4f16_to_v4f32_arg(
 }
 
 ; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
 define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 {
   %ext = fpext <8 x half> %arg to <8 x float>
   store <8 x float> %ext, <8 x float> addrspace(1)* %out
@@ -298,6 +318,46 @@ define void @global_extload_v8f16_to_v8f
 }
 
 ; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f32:
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+; GCN: buffer_load_ushort
+
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+; GCN: v_cvt_f32_f16_e32
+
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+
+; GCN: s_endpgm
 define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
   %val = load <16 x half>, <16 x half> addrspace(1)* %in
   %cvt = fpext <16 x half> %val to <16 x float>
@@ -426,14 +486,8 @@ define void @global_truncstore_v4f32_to_
 }
 
 ; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16:
-; GCN: buffer_load_dword
-; GCN: buffer_load_dword
-; GCN: buffer_load_dword
-; GCN: buffer_load_dword
-; GCN: buffer_load_dword
-; GCN: buffer_load_dword
-; GCN: buffer_load_dword
-; GCN: buffer_load_dword
+; GCN: buffer_load_dwordx4
+; GCN: buffer_load_dwordx4
 ; GCN: v_cvt_f16_f32_e32
 ; GCN: v_cvt_f16_f32_e32
 ; GCN: v_cvt_f16_f32_e32
@@ -459,22 +513,10 @@ define void @global_truncstore_v8f32_to_
 }
 
 ; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16:
-; GCN: buffer_load_dword
-; GCN: buffer_load_dword
-; GCN: buffer_load_dword
-; GCN: buffer_load_dword
-; GCN: buffer_load_dword
-; GCN: buffer_load_dword
-; GCN: buffer_load_dword
-; GCN: buffer_load_dword
-; GCN: buffer_load_dword
-; GCN: buffer_load_dword
-; GCN: buffer_load_dword
-; GCN: buffer_load_dword
-; GCN: buffer_load_dword
-; GCN: buffer_load_dword
-; GCN: buffer_load_dword
-; GCN: buffer_load_dword
+; GCN: buffer_load_dwordx4
+; GCN: buffer_load_dwordx4
+; GCN: buffer_load_dwordx4
+; GCN: buffer_load_dwordx4
 ; GCN-DAG: v_cvt_f16_f32_e32
 ; GCN-DAG: v_cvt_f16_f32_e32
 ; GCN-DAG: v_cvt_f16_f32_e32

Modified: llvm/trunk/test/CodeGen/AMDGPU/load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/load.ll?rev=253974&r1=253973&r2=253974&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/load.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/load.ll Tue Nov 24 06:05:03 2015
@@ -277,15 +277,9 @@ entry:
 ; FUNC-LABEL: {{^}}load_v8i32:
 ; R600: VTX_READ_128
 ; R600: VTX_READ_128
-; XXX: We should be using DWORDX4 instructions on SI.
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
+
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
 define void @load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) {
 entry:
   %0 = load <8 x i32>, <8 x i32> addrspace(1)* %in
@@ -298,23 +292,11 @@ entry:
 ; R600: VTX_READ_128
 ; R600: VTX_READ_128
 ; R600: VTX_READ_128
-; XXX: We should be using DWORDX4 instructions on SI.
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
+
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
 define void @load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) {
 entry:
   %0 = load <16 x i32>, <16 x i32> addrspace(1)* %in

Modified: llvm/trunk/test/CodeGen/AMDGPU/merge-stores.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/merge-stores.ll?rev=253974&r1=253973&r2=253974&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/merge-stores.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/merge-stores.ll Tue Nov 24 06:05:03 2015
@@ -613,22 +613,9 @@ define void @merge_global_store_7_consta
   ret void
 }
 
-; FIXME: This should do 2 dwordx4 loads
 ; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
-
-; GCN-NOAA: buffer_store_dword v
-; GCN-NOAA: buffer_store_dword v
-; GCN-NOAA: buffer_store_dword v
-; GCN-NOAA: buffer_store_dword v
-; GCN-NOAA: buffer_store_dword v
-; GCN-NOAA: buffer_store_dword v
-; GCN-NOAA: buffer_store_dword v
-; GCN-NOAA: buffer_store_dword v
-
-; GCN-AA: buffer_store_dwordx4
-; GCN-AA: buffer_store_dwordx2
-; GCN-AA: buffer_store_dwordx2
-
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
 define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
   store i32 34, i32 addrspace(1)* %out, align 4

Modified: llvm/trunk/test/CodeGen/AMDGPU/reorder-stores.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/reorder-stores.ll?rev=253974&r1=253973&r2=253974&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/reorder-stores.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/reorder-stores.ll Tue Nov 24 06:05:03 2015
@@ -34,46 +34,16 @@ define void @no_reorder_scalarized_v2f64
 }
 
 ; SI-LABEL: {{^}}no_reorder_split_v8i32_global_load_store:
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
 
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
 
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-; SI: buffer_load_dword
-
-
-; SI: buffer_store_dword
-; SI: buffer_store_dword
-; SI: buffer_store_dword
-; SI: buffer_store_dword
-
-; SI: buffer_store_dword
-; SI: buffer_store_dword
-; SI: buffer_store_dword
-; SI: buffer_store_dword
-
-; SI: buffer_store_dword
-; SI: buffer_store_dword
-; SI: buffer_store_dword
-; SI: buffer_store_dword
-
-; SI: buffer_store_dword
-; SI: buffer_store_dword
-; SI: buffer_store_dword
-; SI: buffer_store_dword
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
 ; SI: s_endpgm
 define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind {
   %tmp1 = load <8 x i32>, <8 x i32> addrspace(1)* %x, align 32

Modified: llvm/trunk/test/CodeGen/AMDGPU/salu-to-valu.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/salu-to-valu.ll?rev=253974&r1=253973&r2=253974&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/salu-to-valu.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/salu-to-valu.ll Tue Nov 24 06:05:03 2015
@@ -162,14 +162,8 @@ entry:
 ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: buffer_store_dword
-; GCN: buffer_store_dword
-; GCN: buffer_store_dword
-; GCN: buffer_store_dword
-; GCN: buffer_store_dword
-; GCN: buffer_store_dword
-; GCN: buffer_store_dword
-; GCN: buffer_store_dword
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
 define void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 {
 entry:
   %tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -184,7 +178,7 @@ entry:
 ; FIXME: should use immediate offset instead of using s_add_i32 for adding to constant.
 ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16:
 
-; GCN: s_mov_b32 s[[OFFSET0:[0-9]+]], 0x13480{{$}}
+; GCN-DAG: s_mov_b32 s[[OFFSET0:[0-9]+]], 0x13480{{$}}
 ; SI-DAG: s_add_i32 s[[OFFSET1:[0-9]+]], s[[OFFSET0]], 16
 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET0]]:{{[0-9]+}}], 0 addr64{{$}}
 
@@ -197,6 +191,7 @@ entry:
 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET2]]:{{[0-9]+}}], 0 addr64{{$}}
 ; GCN-DAG: s_add_i32 s[[OFFSET3:[0-9]+]], s[[OFFSET2]], 16
 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET3]]:{{[0-9]+}}], 0 addr64{{$}}
+
 ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
@@ -205,14 +200,12 @@ entry:
 ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GCN: buffer_store_dword
-; GCN: buffer_store_dword
-; GCN: buffer_store_dword
-; GCN: buffer_store_dword
-; GCN: buffer_store_dword
-; GCN: buffer_store_dword
-; GCN: buffer_store_dword
-; GCN: buffer_store_dword
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+
+; GCN: s_endpgm
 define void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 {
 entry:
   %tmp = call i32 @llvm.r600.read.tidig.x() #0




More information about the llvm-commits mailing list