[llvm] r275870 - AMDGPU/R600: Replace barrier intrinsics

Mon Jul 18 13:11:06 PDT 2016

Thanks for the heads up! r275896.

Cheers,
Hans

On Mon, Jul 18, 2016 at 11:52 AM, Matt Arsenault via llvm-commits
<llvm-commits at lists.llvm.org> wrote:
> It looks like I just missed the branch point for this. This will need to be cherry-picked to avoid breaking the libclc build.
>
> -Matt
>
>> On Jul 18, 2016, at 11:35, Matt Arsenault via llvm-commits <llvm-commits at lists.llvm.org> wrote:
>>
>> Author: arsenm
>> Date: Mon Jul 18 13:34:59 2016
>> New Revision: 275870
>>
>> URL: http://llvm.org/viewvc/llvm-project?rev=275870&view=rev
>> Log:
>> AMDGPU/R600: Replace barrier intrinsics
>>
>> Added:
>>    llvm/trunk/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll
>>    llvm/trunk/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
>>      - copied, changed from r275869, llvm/trunk/test/CodeGen/AMDGPU/local-memory-two-objects.ll
>>    llvm/trunk/test/CodeGen/AMDGPU/local-memory.r600.ll
>>      - copied, changed from r275869, llvm/trunk/test/CodeGen/AMDGPU/local-memory-two-objects.ll
>> Removed:
>>    llvm/trunk/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll
>>    llvm/trunk/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll
>>    llvm/trunk/test/CodeGen/AMDGPU/local-memory-two-objects.ll
>> Modified:
>>    llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
>>    llvm/trunk/lib/Target/AMDGPU/AMDGPUIntrinsics.td
>>    llvm/trunk/lib/Target/AMDGPU/EvergreenInstructions.td
>>    llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
>>    llvm/trunk/test/CodeGen/AMDGPU/lds-output-queue.ll
>>    llvm/trunk/test/CodeGen/AMDGPU/local-memory.ll
>>    llvm/trunk/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
>>
>> Modified: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td?rev=275870&r1=275869&r2=275870&view=diff
>> ==============================================================================
>> --- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td (original)
>> +++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td Mon Jul 18 13:34:59 2016
>> @@ -43,6 +43,8 @@ defm int_r600_read_tidig : AMDGPUReadPre
>>
>> def int_r600_read_workdim : AMDGPUReadPreloadRegisterIntrinsic;
>>
>> +def int_r600_group_barrier : GCCBuiltin<"__builtin_r600_group_barrier">,
>> +  Intrinsic<[], [], [IntrConvergent]>;
>>
>> // AS 7 is PARAM_I_ADDRESS, used for kernel arguments
>> def int_r600_implicitarg_ptr :
>>
>> Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUIntrinsics.td
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUIntrinsics.td?rev=275870&r1=275869&r2=275870&view=diff
>> ==============================================================================
>> --- llvm/trunk/lib/Target/AMDGPU/AMDGPUIntrinsics.td (original)
>> +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUIntrinsics.td Mon Jul 18 13:34:59 2016
>> @@ -30,10 +30,6 @@ let TargetPrefix = "AMDGPU", isTarget =
>>     [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]
>>   >;
>>
>> -  // Deprecated in favor of llvm.amdgcn.s.barrier
>> -  def int_AMDGPU_barrier_local  : Intrinsic<[], [], [IntrConvergent]>;
>> -  def int_AMDGPU_barrier_global : Intrinsic<[], [], [IntrConvergent]>;
>> -
>>   // Deprecated in favor of llvm.amdgcn.read.workdim
>>   def int_AMDGPU_read_workdim : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>;
>> }
>>
>> Modified: llvm/trunk/lib/Target/AMDGPU/EvergreenInstructions.td
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/EvergreenInstructions.td?rev=275870&r1=275869&r2=275870&view=diff
>> ==============================================================================
>> --- llvm/trunk/lib/Target/AMDGPU/EvergreenInstructions.td (original)
>> +++ llvm/trunk/lib/Target/AMDGPU/EvergreenInstructions.td Mon Jul 18 13:34:59 2016
>> @@ -394,7 +394,7 @@ def FLT_TO_UINT_eg : FLT_TO_UINT_Common<
>> def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>;
>>
>> def GROUP_BARRIER : InstR600 <
>> -    (outs), (ins), "  GROUP_BARRIER", [(int_AMDGPU_barrier_local), (int_AMDGPU_barrier_global)], AnyALU>,
>> +    (outs), (ins), "  GROUP_BARRIER", [(int_r600_group_barrier)], AnyALU>,
>>     R600ALU_Word0,
>>     R600ALU_Word1_OP2 <0x54> {
>>
>> @@ -423,11 +423,6 @@ def GROUP_BARRIER : InstR600 <
>>   let ALUInst = 1;
>> }
>>
>> -def : Pat <
>> -     (int_AMDGPU_barrier_global),
>> -     (GROUP_BARRIER)
>> ->;
>> -
>> //===----------------------------------------------------------------------===//
>> // LDS Instructions
>> //===----------------------------------------------------------------------===//
>>
>> Modified: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstructions.td?rev=275870&r1=275869&r2=275870&view=diff
>> ==============================================================================
>> --- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td (original)
>> +++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td Mon Jul 18 13:34:59 2016
>> @@ -2453,17 +2453,6 @@ def : Pat <
>>   (S_WAITCNT (as_i16imm $simm16))
>> >;
>>
>> -// FIXME: These should be removed eventually
>> -def : Pat <
>> -  (int_AMDGPU_barrier_global),
>> -  (S_BARRIER)
>> ->;
>> -
>> -def : Pat <
>> -  (int_AMDGPU_barrier_local),
>> -  (S_BARRIER)
>> ->;
>> -
>> //===----------------------------------------------------------------------===//
>> // VOP1 Patterns
>> //===----------------------------------------------------------------------===//
>>
>> Modified: llvm/trunk/test/CodeGen/AMDGPU/lds-output-queue.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/lds-output-queue.ll?rev=275870&r1=275869&r2=275870&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/AMDGPU/lds-output-queue.ll (original)
>> +++ llvm/trunk/test/CodeGen/AMDGPU/lds-output-queue.ll Mon Jul 18 13:34:59 2016
>> @@ -1,4 +1,4 @@
>> -; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s
>> +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s
>> ;
>> ; This test checks that the lds input queue will is empty at the end of
>> ; the ALU clause.
>> @@ -14,7 +14,7 @@ define void @lds_input_queue(i32 addrspa
>> entry:
>>   %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
>>   %1 = load i32, i32 addrspace(3)* %0
>> -  call void @llvm.AMDGPU.barrier.local()
>> +  call void @llvm.r600.group.barrier()
>>
>>   ; This will start a new clause for the vertex fetch
>>   %2 = load i32, i32 addrspace(1)* %in
>> @@ -23,7 +23,7 @@ entry:
>>   ret void
>> }
>>
>> -declare void @llvm.AMDGPU.barrier.local()
>> +declare void @llvm.r600.group.barrier() nounwind convergent
>>
>> ; The machine scheduler does not do proper alias analysis and assumes that
>> ; loads from global values (Note that a global value is different that a
>>
>> Removed: llvm/trunk/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll?rev=275869&view=auto
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll (original)
>> +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll (removed)
>> @@ -1,30 +0,0 @@
>> -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
>> -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
>> -
>> -; FUNC-LABEL: {{^}}test_barrier_global:
>> -; EG: GROUP_BARRIER
>> -; SI: buffer_store_dword
>> -; SI: s_waitcnt
>> -; SI: s_barrier
>> -
>> -define void @test_barrier_global(i32 addrspace(1)* %out) {
>> -entry:
>> -  %0 = call i32 @llvm.r600.read.tidig.x()
>> -  %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0
>> -  store i32 %0, i32 addrspace(1)* %1
>> -  call void @llvm.AMDGPU.barrier.global()
>> -  %2 = call i32 @llvm.r600.read.local.size.x()
>> -  %3 = sub i32 %2, 1
>> -  %4 = sub i32 %3, %0
>> -  %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4
>> -  %6 = load i32, i32 addrspace(1)* %5
>> -  store i32 %6, i32 addrspace(1)* %1
>> -  ret void
>> -}
>> -
>> -declare void @llvm.AMDGPU.barrier.global()
>> -
>> -declare i32 @llvm.r600.read.tidig.x() #0
>> -declare i32 @llvm.r600.read.local.size.x() #0
>> -
>> -attributes #0 = { readnone }
>>
>> Removed: llvm/trunk/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll?rev=275869&view=auto
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll (original)
>> +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll (removed)
>> @@ -1,31 +0,0 @@
>> -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
>> -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
>> -
>> -; FUNC-LABEL: {{^}}test_barrier_local:
>> -; EG: GROUP_BARRIER
>> -
>> -; SI: buffer_store_dword
>> -; SI: s_waitcnt
>> -; SI: s_barrier
>> -
>> -define void @test_barrier_local(i32 addrspace(1)* %out) {
>> -entry:
>> -  %0 = call i32 @llvm.r600.read.tidig.x()
>> -  %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0
>> -  store i32 %0, i32 addrspace(1)* %1
>> -  call void @llvm.AMDGPU.barrier.local()
>> -  %2 = call i32 @llvm.r600.read.local.size.x()
>> -  %3 = sub i32 %2, 1
>> -  %4 = sub i32 %3, %0
>> -  %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4
>> -  %6 = load i32, i32 addrspace(1)* %5
>> -  store i32 %6, i32 addrspace(1)* %1
>> -  ret void
>> -}
>> -
>> -declare void @llvm.AMDGPU.barrier.local()
>> -
>> -declare i32 @llvm.r600.read.tidig.x() #0
>> -declare i32 @llvm.r600.read.local.size.x() #0
>> -
>> -attributes #0 = { readnone }
>>
>> Added: llvm/trunk/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll?rev=275870&view=auto
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll (added)
>> +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll Mon Jul 18 13:34:59 2016
>> @@ -0,0 +1,31 @@
>> +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG  %s
>> +
>> +; EG-LABEL: {{^}}test_group_barrier:
>> +; EG: GROUP_BARRIER
>> +define void @test_group_barrier(i32 addrspace(1)* %out) #0 {
>> +entry:
>> +  %tmp = call i32 @llvm.r600.read.tidig.x()
>> +  %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp
>> +  store i32 %tmp, i32 addrspace(1)* %tmp1
>> +  call void @llvm.r600.group.barrier()
>> +  %tmp2 = call i32 @llvm.r600.read.local.size.x()
>> +  %tmp3 = sub i32 %tmp2, 1
>> +  %tmp4 = sub i32 %tmp3, %tmp
>> +  %tmp5 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp4
>> +  %tmp6 = load i32, i32 addrspace(1)* %tmp5
>> +  store i32 %tmp6, i32 addrspace(1)* %tmp1
>> +  ret void
>> +}
>> +
>> +; Function Attrs: convergent nounwind
>> +declare void @llvm.r600.group.barrier() #1
>> +
>> +; Function Attrs: nounwind readnone
>> +declare i32 @llvm.r600.read.tidig.x() #2
>> +
>> +; Function Attrs: nounwind readnone
>> +declare i32 @llvm.r600.read.local.size.x() #2
>> +
>> +attributes #0 = { nounwind }
>> +attributes #1 = { convergent nounwind }
>> +attributes #2 = { nounwind readnone }
>>
>> Removed: llvm/trunk/test/CodeGen/AMDGPU/local-memory-two-objects.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/local-memory-two-objects.ll?rev=275869&view=auto
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/AMDGPU/local-memory-two-objects.ll (original)
>> +++ llvm/trunk/test/CodeGen/AMDGPU/local-memory-two-objects.ll (removed)
>> @@ -1,80 +0,0 @@
>> -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
>> -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
>> -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
>> -
>> -@local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
>> -@local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
>> -
>> -
>> -; Check that the LDS size emitted correctly
>> -; EG: .long 166120
>> -; EG-NEXT: .long 8
>> -; GCN: .long 47180
>> -; GCN-NEXT: .long 32900
>> -
>> -
>> -; FUNC-LABEL: {{^}}local_memory_two_objects:
>> -
>> -; We would like to check the lds writes are using different
>> -; addresses, but due to variations in the scheduler, we can't do
>> -; this consistently on evergreen GPUs.
>> -; EG: LDS_WRITE
>> -; EG: LDS_WRITE
>> -
>> -; GROUP_BARRIER must be the last instruction in a clause
>> -; EG: GROUP_BARRIER
>> -; EG-NEXT: ALU clause
>> -
>> -; Make sure the lds reads are using different addresses, at different
>> -; constant offsets.
>> -; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
>> -; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
>> -
>> -
>> -; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
>> -; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*}} offset:16
>> -; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*$}}
>> -
>> -
>> -; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]]
>> -
>> -; SI-DAG: ds_write_b32 [[ADDRW]],
>> -; SI-DAG: ds_write_b32 [[ADDRW_OFF]],
>> -
>> -; GCN: s_barrier
>> -
>> -; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
>> -; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
>> -
>> -; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
>> -; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
>> -
>> -; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
>> -; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
>> -
>> -define void @local_memory_two_objects(i32 addrspace(1)* %out) {
>> -entry:
>> -  %x.i = call i32 @llvm.r600.read.tidig.x() #0
>> -  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
>> -  store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
>> -  %mul = shl nsw i32 %x.i, 1
>> -  %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
>> -  store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
>> -  %sub = sub nsw i32 3, %x.i
>> -  call void @llvm.AMDGPU.barrier.local()
>> -  %arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
>> -  %0 = load i32, i32 addrspace(3)* %arrayidx2, align 4
>> -  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i
>> -  store i32 %0, i32 addrspace(1)* %arrayidx3, align 4
>> -  %arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
>> -  %1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
>> -  %add = add nsw i32 %x.i, 4
>> -  %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add
>> -  store i32 %1, i32 addrspace(1)* %arrayidx5, align 4
>> -  ret void
>> -}
>> -
>> -declare i32 @llvm.r600.read.tidig.x() #0
>> -declare void @llvm.AMDGPU.barrier.local()
>> -
>> -attributes #0 = { readnone }
>>
>> Copied: llvm/trunk/test/CodeGen/AMDGPU/local-memory.amdgcn.ll (from r275869, llvm/trunk/test/CodeGen/AMDGPU/local-memory-two-objects.ll)
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/local-memory.amdgcn.ll?p2=llvm/trunk/test/CodeGen/AMDGPU/local-memory.amdgcn.ll&p1=llvm/trunk/test/CodeGen/AMDGPU/local-memory-two-objects.ll&r1=275869&r2=275870&rev=275870&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/AMDGPU/local-memory-two-objects.ll (original)
>> +++ llvm/trunk/test/CodeGen/AMDGPU/local-memory.amdgcn.ll Mon Jul 18 13:34:59 2016
>> @@ -1,41 +1,52 @@
>> -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
>> -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
>> -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
>> +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
>> +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
>> +
>> +@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
>> +
>> +; Check that the LDS size emitted correctly
>> +; SI: .long 47180
>> +; SI-NEXT: .long 65668
>> +; CI: .long 47180
>> +; CI-NEXT: .long 32900
>> +
>> +; GCN-LABEL: {{^}}local_memory:
>> +
>> +; GCN-NOT: s_wqm_b64
>> +; GCN: ds_write_b32
>> +
>> +; GCN: s_barrier
>> +
>> +; GCN: ds_read_b32 {{v[0-9]+}},
>> +define void @local_memory(i32 addrspace(1)* %out) #0 {
>> +entry:
>> +  %y.i = call i32 @llvm.amdgcn.workitem.id.x() #1
>> +  %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
>> +  store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
>> +  %add = add nsw i32 %y.i, 1
>> +  %cmp = icmp eq i32 %add, 16
>> +  %.add = select i1 %cmp, i32 0, i32 %add
>> +  call void @llvm.amdgcn.s.barrier()
>> +  %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
>> +  %tmp = load i32, i32 addrspace(3)* %arrayidx1, align 4
>> +  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i
>> +  store i32 %tmp, i32 addrspace(1)* %arrayidx2, align 4
>> +  ret void
>> +}
>>
>> @local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
>> @local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
>>
>> -
>> ; Check that the LDS size emitted correctly
>> ; EG: .long 166120
>> ; EG-NEXT: .long 8
>> ; GCN: .long 47180
>> ; GCN-NEXT: .long 32900
>>
>> -
>> -; FUNC-LABEL: {{^}}local_memory_two_objects:
>> -
>> -; We would like to check the lds writes are using different
>> -; addresses, but due to variations in the scheduler, we can't do
>> -; this consistently on evergreen GPUs.
>> -; EG: LDS_WRITE
>> -; EG: LDS_WRITE
>> -
>> -; GROUP_BARRIER must be the last instruction in a clause
>> -; EG: GROUP_BARRIER
>> -; EG-NEXT: ALU clause
>> -
>> -; Make sure the lds reads are using different addresses, at different
>> -; constant offsets.
>> -; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
>> -; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
>> -
>> -
>> +; GCN-LABEL: {{^}}local_memory_two_objects:
>> ; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
>> ; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*}} offset:16
>> ; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*$}}
>>
>> -
>> ; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]]
>>
>> ; SI-DAG: ds_write_b32 [[ADDRW]],
>> @@ -51,30 +62,31 @@
>>
>> ; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
>> ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
>> -
>> -define void @local_memory_two_objects(i32 addrspace(1)* %out) {
>> +define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
>> entry:
>> -  %x.i = call i32 @llvm.r600.read.tidig.x() #0
>> +  %x.i = call i32 @llvm.amdgcn.workitem.id.x()
>>   %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
>>   store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
>>   %mul = shl nsw i32 %x.i, 1
>>   %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
>>   store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
>>   %sub = sub nsw i32 3, %x.i
>> -  call void @llvm.AMDGPU.barrier.local()
>> +  call void @llvm.amdgcn.s.barrier()
>>   %arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
>> -  %0 = load i32, i32 addrspace(3)* %arrayidx2, align 4
>> +  %tmp = load i32, i32 addrspace(3)* %arrayidx2, align 4
>>   %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i
>> -  store i32 %0, i32 addrspace(1)* %arrayidx3, align 4
>> +  store i32 %tmp, i32 addrspace(1)* %arrayidx3, align 4
>>   %arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
>> -  %1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
>> +  %tmp1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
>>   %add = add nsw i32 %x.i, 4
>>   %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add
>> -  store i32 %1, i32 addrspace(1)* %arrayidx5, align 4
>> +  store i32 %tmp1, i32 addrspace(1)* %arrayidx5, align 4
>>   ret void
>> }
>>
>> -declare i32 @llvm.r600.read.tidig.x() #0
>> -declare void @llvm.AMDGPU.barrier.local()
>> +declare i32 @llvm.amdgcn.workitem.id.x() #1
>> +declare void @llvm.amdgcn.s.barrier() #2
>>
>> -attributes #0 = { readnone }
>> +attributes #0 = { nounwind }
>> +attributes #1 = { nounwind readnone }
>> +attributes #2 = { convergent nounwind }
>>
>> Modified: llvm/trunk/test/CodeGen/AMDGPU/local-memory.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/local-memory.ll?rev=275870&r1=275869&r2=275870&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/AMDGPU/local-memory.ll (original)
>> +++ llvm/trunk/test/CodeGen/AMDGPU/local-memory.ll Mon Jul 18 13:34:59 2016
>> @@ -1,57 +1,20 @@
>> -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
>> -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
>> +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
>> +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
>> ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
>>
>> @local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
>>
>> -
>> -; Check that the LDS size emitted correctly
>> -; EG: .long 166120
>> -; EG-NEXT: .long 128
>> -; SI: .long 47180
>> -; SI-NEXT: .long 65668
>> -; CI: .long 47180
>> -; CI-NEXT: .long 32900
>> -
>> -; FUNC-LABEL: {{^}}local_memory:
>> -
>> -; EG: LDS_WRITE
>> -; SI-NOT: s_wqm_b64
>> -; SI: ds_write_b32
>> -
>> -; GROUP_BARRIER must be the last instruction in a clause
>> -; EG: GROUP_BARRIER
>> -; EG-NEXT: ALU clause
>> -; SI: s_barrier
>> -
>> -; EG: LDS_READ_RET
>> -; SI: ds_read_b32 {{v[0-9]+}},
>> -
>> -define void @local_memory(i32 addrspace(1)* %out) {
>> -entry:
>> -  %y.i = call i32 @llvm.r600.read.tidig.x() #0
>> -  %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
>> -  store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
>> -  %add = add nsw i32 %y.i, 1
>> -  %cmp = icmp eq i32 %add, 16
>> -  %.add = select i1 %cmp, i32 0, i32 %add
>> -  call void @llvm.AMDGPU.barrier.local()
>> -  %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
>> -  %0 = load i32, i32 addrspace(3)* %arrayidx1, align 4
>> -  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i
>> -  store i32 %0, i32 addrspace(1)* %arrayidx2, align 4
>> -  ret void
>> -}
>> -
>> @lds = addrspace(3) global [512 x i32] undef, align 4
>>
>> -; On SI we need to make sure that the base offset is a register and not
>> -; an immediate.
>> +; On SI we need to make sure that the base offset is a register and
>> +; not an immediate.
>> +
>> ; FUNC-LABEL: {{^}}load_i32_local_const_ptr:
>> ; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0
>> ; GCN: ds_read_b32 v{{[0-9]+}}, v[[ZERO]] offset:4
>> +
>> ; R600: LDS_READ_RET
>> -define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
>> +define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
>> entry:
>>   %tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1
>>   %tmp1 = load i32, i32 addrspace(3)* %tmp0
>> @@ -67,7 +30,7 @@ entry:
>> ; R600: LDS_READ_RET
>> ; GCN-DAG: ds_read_b32
>> ; GCN-DAG: ds_read2_b32
>> -define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) {
>> +define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
>>   %scalar = load i32, i32 addrspace(3)* %in
>>   %tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)*
>>   %vec_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(3)* %tmp0, i32 2
>> @@ -78,7 +41,4 @@ define void @load_i32_v2i32_local(<2 x i
>>   ret void
>> }
>>
>> -declare i32 @llvm.r600.read.tidig.x() #0
>> -declare void @llvm.AMDGPU.barrier.local()
>> -
>> -attributes #0 = { readnone }
>> +attributes #0 = { nounwind }
>>
>> Copied: llvm/trunk/test/CodeGen/AMDGPU/local-memory.r600.ll (from r275869, llvm/trunk/test/CodeGen/AMDGPU/local-memory-two-objects.ll)
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/local-memory.r600.ll?p2=llvm/trunk/test/CodeGen/AMDGPU/local-memory.r600.ll&p1=llvm/trunk/test/CodeGen/AMDGPU/local-memory-two-objects.ll&r1=275869&r2=275870&rev=275870&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/AMDGPU/local-memory-two-objects.ll (original)
>> +++ llvm/trunk/test/CodeGen/AMDGPU/local-memory.r600.ll Mon Jul 18 13:34:59 2016
>> @@ -1,18 +1,45 @@
>> -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
>> -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
>> ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
>>
>> +@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
>> +
>> +; Check that the LDS size emitted correctly
>> +; EG: .long 166120
>> +; EG-NEXT: .long 128
>> +
>> +; FUNC-LABEL: {{^}}local_memory:
>> +
>> +; EG: LDS_WRITE
>> +
>> +; GROUP_BARRIER must be the last instruction in a clause
>> +; EG: GROUP_BARRIER
>> +; EG-NEXT: ALU clause
>> +
>> +; EG: LDS_READ_RET
>> +define void @local_memory(i32 addrspace(1)* %out) #0 {
>> +entry:
>> +  %y.i = call i32 @llvm.r600.read.tidig.x() #1
>> +  %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
>> +  store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
>> +  %add = add nsw i32 %y.i, 1
>> +  %cmp = icmp eq i32 %add, 16
>> +  %.add = select i1 %cmp, i32 0, i32 %add
>> +  call void @llvm.r600.group.barrier()
>> +  %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
>> +  %tmp = load i32, i32 addrspace(3)* %arrayidx1, align 4
>> +  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i
>> +  store i32 %tmp, i32 addrspace(1)* %arrayidx2, align 4
>> +  ret void
>> +}
>> +
>> @local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
>> @local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
>>
>> -
>> ; Check that the LDS size emitted correctly
>> ; EG: .long 166120
>> ; EG-NEXT: .long 8
>> ; GCN: .long 47180
>> ; GCN-NEXT: .long 32900
>>
>> -
>> ; FUNC-LABEL: {{^}}local_memory_two_objects:
>>
>> ; We would like to check the lds writes are using different
>> @@ -30,51 +57,31 @@
>> ; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
>> ; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
>>
>> -
>> -; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
>> -; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*}} offset:16
>> -; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*$}}
>> -
>> -
>> -; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]]
>> -
>> -; SI-DAG: ds_write_b32 [[ADDRW]],
>> -; SI-DAG: ds_write_b32 [[ADDRW_OFF]],
>> -
>> -; GCN: s_barrier
>> -
>> -; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
>> -; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
>> -
>> -; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
>> -; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
>> -
>> -; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
>> -; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
>> -
>> -define void @local_memory_two_objects(i32 addrspace(1)* %out) {
>> +define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
>> entry:
>> -  %x.i = call i32 @llvm.r600.read.tidig.x() #0
>> +  %x.i = call i32 @llvm.r600.read.tidig.x() #1
>>   %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
>>   store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
>>   %mul = shl nsw i32 %x.i, 1
>>   %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
>>   store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
>>   %sub = sub nsw i32 3, %x.i
>> -  call void @llvm.AMDGPU.barrier.local()
>> +  call void @llvm.r600.group.barrier()
>>   %arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
>> -  %0 = load i32, i32 addrspace(3)* %arrayidx2, align 4
>> +  %tmp = load i32, i32 addrspace(3)* %arrayidx2, align 4
>>   %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i
>> -  store i32 %0, i32 addrspace(1)* %arrayidx3, align 4
>> +  store i32 %tmp, i32 addrspace(1)* %arrayidx3, align 4
>>   %arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
>> -  %1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
>> +  %tmp1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
>>   %add = add nsw i32 %x.i, 4
>>   %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add
>> -  store i32 %1, i32 addrspace(1)* %arrayidx5, align 4
>> +  store i32 %tmp1, i32 addrspace(1)* %arrayidx5, align 4
>>   ret void
>> }
>>
>> -declare i32 @llvm.r600.read.tidig.x() #0
>> -declare void @llvm.AMDGPU.barrier.local()
>> +declare i32 @llvm.r600.read.tidig.x() #1
>> +declare void @llvm.r600.group.barrier() #2
>>
>> -attributes #0 = { readnone }
>> +attributes #0 = { nounwind }
>> +attributes #1 = { nounwind readnone }
>> +attributes #2 = { convergent nounwind }
>>
>> Modified: llvm/trunk/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll?rev=275870&r1=275869&r2=275870&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll (original)
>> +++ llvm/trunk/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll Mon Jul 18 13:34:59 2016
>> @@ -1,12 +1,9 @@
>> -; XFAIL: *
>> -; REQUIRES: asserts
>> -; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI
>> -; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI
>> +; RUN: llc -O0 -march=amdgcn -verify-machineinstrs -mattr=+vgpr-spilling < %s | FileCheck -check-prefix=GCN %s
>> +; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+vgpr-spilling < %s | FileCheck -check-prefix=GCN %s
>>
>> -declare void @llvm.AMDGPU.barrier.local() nounwind convergent
>> +declare void @llvm.amdgcn.s.barrier() nounwind convergent
>>
>> -
>> -; SI-LABEL: {{^}}main(
>> +; GCN-LABEL: {{^}}main:
>> define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
>> main_body:
>>   %0 = extractelement <4 x float> %reg1, i32 0
>> @@ -39,63 +36,63 @@ ENDIF:
>>   %temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %101, %Flow2 ]
>>   %15 = extractelement <4 x float> %reg1, i32 1
>>   %16 = extractelement <4 x float> %reg1, i32 3
>> -  %17 = load <4 x float>, <4 x float> addrspace(9)* null
>> +  %17 = load <4 x float>, <4 x float> addrspace(2)* null
>>   %18 = extractelement <4 x float> %17, i32 0
>>   %19 = fmul float %18, %0
>> -  %20 = load <4 x float>, <4 x float> addrspace(9)* null
>> +  %20 = load <4 x float>, <4 x float> addrspace(2)* null
>>   %21 = extractelement <4 x float> %20, i32 1
>>   %22 = fmul float %21, %0
>> -  %23 = load <4 x float>, <4 x float> addrspace(9)* null
>> +  %23 = load <4 x float>, <4 x float> addrspace(2)* null
>>   %24 = extractelement <4 x float> %23, i32 2
>>   %25 = fmul float %24, %0
>> -  %26 = load <4 x float>, <4 x float> addrspace(9)* null
>> +  %26 = load <4 x float>, <4 x float> addrspace(2)* null
>>   %27 = extractelement <4 x float> %26, i32 3
>>   %28 = fmul float %27, %0
>> -  %29 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
>> +  %29 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1)
>>   %30 = extractelement <4 x float> %29, i32 0
>>   %31 = fmul float %30, %15
>>   %32 = fadd float %31, %19
>> -  %33 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
>> +  %33 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1)
>>   %34 = extractelement <4 x float> %33, i32 1
>>   %35 = fmul float %34, %15
>>   %36 = fadd float %35, %22
>> -  %37 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
>> +  %37 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1)
>>   %38 = extractelement <4 x float> %37, i32 2
>>   %39 = fmul float %38, %15
>>   %40 = fadd float %39, %25
>> -  %41 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
>> +  %41 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1)
>>   %42 = extractelement <4 x float> %41, i32 3
>>   %43 = fmul float %42, %15
>>   %44 = fadd float %43, %28
>> -  %45 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
>> +  %45 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2)
>>   %46 = extractelement <4 x float> %45, i32 0
>>   %47 = fmul float %46, %1
>>   %48 = fadd float %47, %32
>> -  %49 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
>> +  %49 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2)
>>   %50 = extractelement <4 x float> %49, i32 1
>>   %51 = fmul float %50, %1
>>   %52 = fadd float %51, %36
>> -  %53 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
>> +  %53 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2)
>>   %54 = extractelement <4 x float> %53, i32 2
>>   %55 = fmul float %54, %1
>>   %56 = fadd float %55, %40
>> -  %57 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
>> +  %57 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2)
>>   %58 = extractelement <4 x float> %57, i32 3
>>   %59 = fmul float %58, %1
>>   %60 = fadd float %59, %44
>> -  %61 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
>> +  %61 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3)
>>   %62 = extractelement <4 x float> %61, i32 0
>>   %63 = fmul float %62, %16
>>   %64 = fadd float %63, %48
>> -  %65 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
>> +  %65 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3)
>>   %66 = extractelement <4 x float> %65, i32 1
>>   %67 = fmul float %66, %16
>>   %68 = fadd float %67, %52
>> -  %69 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
>> +  %69 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3)
>>   %70 = extractelement <4 x float> %69, i32 2
>>   %71 = fmul float %70, %16
>>   %72 = fadd float %71, %56
>> -  %73 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
>> +  %73 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3)
>>   %74 = extractelement <4 x float> %73, i32 3
>>   %75 = fmul float %74, %16
>>   %76 = fadd float %75, %60
>> @@ -103,12 +100,12 @@ ENDIF:
>>   %78 = insertelement <4 x float> %77, float %68, i32 1
>>   %79 = insertelement <4 x float> %78, float %72, i32 2
>>   %80 = insertelement <4 x float> %79, float %76, i32 3
>> -  call void @llvm.AMDGPU.barrier.local()
>> +  call void @llvm.amdgcn.s.barrier()
>>   %81 = insertelement <4 x float> undef, float %temp.0, i32 0
>>   %82 = insertelement <4 x float> %81, float %temp1.0, i32 1
>>   %83 = insertelement <4 x float> %82, float %temp2.0, i32 2
>>   %84 = insertelement <4 x float> %83, float %temp3.0, i32 3
>> -  call void @llvm.AMDGPU.barrier.local()
>> +  call void @llvm.amdgcn.s.barrier()
>>   ret void
>>
>> LOOP:                                             ; preds = %main_body, %Flow
>>
>>
>> _______________________________________________
>> llvm-commits mailing list
>> llvm-commits at lists.llvm.org
>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits