[llvm] r277504 - AMDGPU: Stay in WQM for non-intrinsic stores

Wed Aug 3 12:04:57 PDT 2016

Thanks on both counts!

On 03.08.2016 20:20, Hans Wennborg wrote:
> Merged in r277620.
>
> Thanks,
> Hans
>
> On Wed, Aug 3, 2016 at 10:50 AM, Tom Stellard <tom at stellard.net> wrote:
>> On Wed, Aug 03, 2016 at 09:04:55AM -0700, Hans Wennborg wrote:
>>> Sounds good to me if Tom approves.
>>>
>>
>> This is fine with me.
>>
>> -Tom
>>
>>> On Wed, Aug 3, 2016 at 5:08 AM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
>>>> Hi Hans,
>>>>
>>>> this is a bugfix that should go into the 3.9 release branch.
>>>>
>>>> Thanks,
>>>> Nicolai
>>>>
>>>>
>>>> On 02.08.2016 21:31, Nicolai Haehnle via llvm-commits wrote:
>>>>>
>>>>> Author: nha
>>>>> Date: Tue Aug  2 14:31:14 2016
>>>>> New Revision: 277504
>>>>>
>>>>> URL: http://llvm.org/viewvc/llvm-project?rev=277504&view=rev
>>>>> Log:
>>>>> AMDGPU: Stay in WQM for non-intrinsic stores
>>>>>
>>>>> Summary:
>>>>> Two types of stores are possible in pixel shaders: stores to memory that
>>>>> are explicitly requested at the API level, and stores that are an
>>>>> implementation detail of register spilling or lowering of arrays.
>>>>>
>>>>> For the first kind of store, we must ensure that helper pixels have no
>>>>> effect and hence WQM must be disabled. The second kind of store must
>>>>> always be executed, because the written value may be loaded again in a
>>>>> way that is relevant for helper pixels as well -- and there are no
>>>>> externally visible effects anyway.
>>>>>
>>>>> This is a candidate for the 3.9 release branch.
>>>>>
>>>>> Reviewers: arsenm, tstellarAMD, mareko
>>>>>
>>>>> Subscribers: arsenm, kzhuravl, llvm-commits
>>>>>
>>>>> Differential Revision: https://reviews.llvm.org/D22675
>>>>>
>>>>> Modified:
>>>>>     llvm/trunk/lib/Target/AMDGPU/SIDefines.h
>>>>>     llvm/trunk/lib/Target/AMDGPU/SIInstrFormats.td
>>>>>     llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
>>>>>     llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
>>>>>     llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
>>>>>     llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
>>>>>     llvm/trunk/test/CodeGen/AMDGPU/skip-if-dead.ll
>>>>>     llvm/trunk/test/CodeGen/AMDGPU/wqm.ll
>>>>>
>>>>> Modified: llvm/trunk/lib/Target/AMDGPU/SIDefines.h
>>>>> URL:
>>>>> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIDefines.h?rev=277504&r1=277503&r2=277504&view=diff
>>>>>
>>>>> ==============================================================================
>>>>> --- llvm/trunk/lib/Target/AMDGPU/SIDefines.h (original)
>>>>> +++ llvm/trunk/lib/Target/AMDGPU/SIDefines.h Tue Aug  2 14:31:14 2016
>>>>> @@ -41,7 +41,8 @@ enum {
>>>>>    WQM = 1 << 22,
>>>>>    VGPRSpill = 1 << 23,
>>>>>    VOPAsmPrefer32Bit = 1 << 24,
>>>>> -  Gather4 = 1 << 25
>>>>> +  Gather4 = 1 << 25,
>>>>> +  DisableWQM = 1 << 26
>>>>>  };
>>>>>  }
>>>>>
>>>>>
>>>>> Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrFormats.td
>>>>> URL:
>>>>> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrFormats.td?rev=277504&r1=277503&r2=277504&view=diff
>>>>>
>>>>> ==============================================================================
>>>>> --- llvm/trunk/lib/Target/AMDGPU/SIInstrFormats.td (original)
>>>>> +++ llvm/trunk/lib/Target/AMDGPU/SIInstrFormats.td Tue Aug  2 14:31:14
>>>>> 2016
>>>>> @@ -41,6 +41,8 @@ class InstSI <dag outs, dag ins, string
>>>>>    field bits<1> DS = 0;
>>>>>    field bits<1> MIMG = 0;
>>>>>    field bits<1> FLAT = 0;
>>>>> +
>>>>> +  // Whether WQM _must_ be enabled for this instruction.
>>>>>    field bits<1> WQM = 0;
>>>>>    field bits<1> VGPRSpill = 0;
>>>>>
>>>>> @@ -50,6 +52,9 @@ class InstSI <dag outs, dag ins, string
>>>>>
>>>>>    field bits<1> Gather4 = 0;
>>>>>
>>>>> +  // Whether WQM _must_ be disabled for this instruction.
>>>>> +  field bits<1> DisableWQM = 0;
>>>>> +
>>>>>    // These need to be kept in sync with the enum in SIInstrFlags.
>>>>>    let TSFlags{0} = VM_CNT;
>>>>>    let TSFlags{1} = EXP_CNT;
>>>>> @@ -81,6 +86,7 @@ class InstSI <dag outs, dag ins, string
>>>>>    let TSFlags{23} = VGPRSpill;
>>>>>    let TSFlags{24} = VOPAsmPrefer32Bit;
>>>>>    let TSFlags{25} = Gather4;
>>>>> +  let TSFlags{26} = DisableWQM;
>>>>>
>>>>>    let SchedRW = [Write32Bit];
>>>>>
>>>>>
>>>>> Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
>>>>> URL:
>>>>> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h?rev=277504&r1=277503&r2=277504&view=diff
>>>>>
>>>>> ==============================================================================
>>>>> --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h (original)
>>>>> +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h Tue Aug  2 14:31:14 2016
>>>>> @@ -340,6 +340,14 @@ public:
>>>>>      return get(Opcode).TSFlags & SIInstrFlags::WQM;
>>>>>    }
>>>>>
>>>>> +  static bool isDisableWQM(const MachineInstr &MI) {
>>>>> +    return MI.getDesc().TSFlags & SIInstrFlags::DisableWQM;
>>>>> +  }
>>>>> +
>>>>> +  bool isDisableWQM(uint16_t Opcode) const {
>>>>> +    return get(Opcode).TSFlags & SIInstrFlags::DisableWQM;
>>>>> +  }
>>>>> +
>>>>>    static bool isVGPRSpill(const MachineInstr &MI) {
>>>>>      return MI.getDesc().TSFlags & SIInstrFlags::VGPRSpill;
>>>>>    }
>>>>>
>>>>> Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
>>>>> URL:
>>>>> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td?rev=277504&r1=277503&r2=277504&view=diff
>>>>>
>>>>> ==============================================================================
>>>>> --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td (original)
>>>>> +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td Tue Aug  2 14:31:14 2016
>>>>> @@ -2723,6 +2723,10 @@ multiclass MUBUF_m <mubuf op, string opN
>>>>>    def "" : MUBUF_Pseudo <opName, outs, ins, pattern>,
>>>>>             MUBUFAddr64Table <0>;
>>>>>
>>>>> +  let DisableWQM = 1 in {
>>>>> +    def "_exact" : MUBUF_Pseudo <opName, outs, ins, []>;
>>>>> +  }
>>>>> +
>>>>>    let addr64 = 0, isCodeGenOnly = 0 in {
>>>>>      def _si : MUBUF_Real_si <op, opName, outs, ins, asm>;
>>>>>    }
>>>>> @@ -2793,7 +2797,8 @@ multiclass MUBUFAtomicOther_m <mubuf op,
>>>>>  multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc,
>>>>>                           ValueType vt, SDPatternOperator atomic> {
>>>>>
>>>>> -  let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1
>>>>> in {
>>>>> +  let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1,
>>>>> +      DisableWQM = 1 in {
>>>>>
>>>>>      // No return variants
>>>>>      let glc = 0, AsmMatchConverter = "cvtMubufAtomic" in {
>>>>> @@ -3197,6 +3202,7 @@ class MIMG_Store_Helper <bits<7> op, str
>>>>>    let mayStore = 1;
>>>>>    let hasSideEffects = 1;
>>>>>    let hasPostISelHook = 0;
>>>>> +  let DisableWQM = 1;
>>>>>  }
>>>>>
>>>>>  multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm,
>>>>> @@ -3228,6 +3234,7 @@ class MIMG_Atomic_Helper <string asm, Re
>>>>>    let mayStore = 1;
>>>>>    let hasSideEffects = 1;
>>>>>    let hasPostISelHook = 0;
>>>>> +  let DisableWQM = 1;
>>>>>    let Constraints = "$vdst = $vdata";
>>>>>    let AsmMatchConverter = "cvtMIMGAtomic";
>>>>>  }
>>>>>
>>>>> Modified: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
>>>>> URL:
>>>>> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstructions.td?rev=277504&r1=277503&r2=277504&view=diff
>>>>>
>>>>> ==============================================================================
>>>>> --- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td (original)
>>>>> +++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td Tue Aug  2 14:31:14
>>>>> 2016
>>>>> @@ -2050,7 +2050,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPat
>>>>>      (name vt:$vdata, v4i32:$rsrc, 0,
>>>>>            (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
>>>>>            imm:$glc, imm:$slc),
>>>>> -    (!cast<MUBUF>(opcode # _OFFSET) $vdata, $rsrc, $soffset, (as_i16imm
>>>>> $offset),
>>>>> +    (!cast<MUBUF>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset,
>>>>> (as_i16imm $offset),
>>>>>                                      (as_i1imm $glc), (as_i1imm $slc), 0)
>>>>>    >;
>>>>>
>>>>> @@ -2058,7 +2058,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPat
>>>>>      (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
>>>>>            (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
>>>>>            imm:$glc, imm:$slc),
>>>>> -    (!cast<MUBUF>(opcode # _IDXEN) $vdata, $vindex, $rsrc, $soffset,
>>>>> +    (!cast<MUBUF>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc,
>>>>> $soffset,
>>>>>                                     (as_i16imm $offset), (as_i1imm $glc),
>>>>>                                     (as_i1imm $slc), 0)
>>>>>    >;
>>>>> @@ -2067,7 +2067,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPat
>>>>>      (name vt:$vdata, v4i32:$rsrc, 0,
>>>>>            (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset,
>>>>> i32:$voffset),
>>>>>            imm:$glc, imm:$slc),
>>>>> -    (!cast<MUBUF>(opcode # _OFFEN) $vdata, $voffset, $rsrc, $soffset,
>>>>> +    (!cast<MUBUF>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc,
>>>>> $soffset,
>>>>>                                     (as_i16imm $offset), (as_i1imm $glc),
>>>>>                                     (as_i1imm $slc), 0)
>>>>>    >;
>>>>> @@ -2076,7 +2076,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPat
>>>>>      (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
>>>>>            (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset,
>>>>> i32:$voffset),
>>>>>            imm:$glc, imm:$slc),
>>>>> -    (!cast<MUBUF>(opcode # _BOTHEN)
>>>>> +    (!cast<MUBUF>(opcode # _BOTHEN_exact)
>>>>>        $vdata,
>>>>>        (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
>>>>>        $rsrc, $soffset, (as_i16imm $offset),
>>>>>
>>>>> Modified: llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
>>>>> URL:
>>>>> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp?rev=277504&r1=277503&r2=277504&view=diff
>>>>>
>>>>> ==============================================================================
>>>>> --- llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp (original)
>>>>> +++ llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp Tue Aug  2 14:31:14
>>>>> 2016
>>>>> @@ -185,7 +185,7 @@ char SIWholeQuadMode::scanInstructions(M
>>>>>
>>>>>        if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
>>>>>          Flags = StateWQM;
>>>>> -      } else if (MI.mayStore() && TII->usesVM_CNT(MI)) {
>>>>> +      } else if (TII->isDisableWQM(MI)) {
>>>>>          Flags = StateExact;
>>>>>        } else {
>>>>>          // Handle export instructions with the exec mask valid flag set
>>>>> @@ -237,9 +237,10 @@ void SIWholeQuadMode::propagateInstructi
>>>>>    InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling
>>>>> references
>>>>>    BlockInfo &BI = Blocks[MBB];
>>>>>
>>>>> -  // Control flow-type instructions that are followed by WQM computations
>>>>> -  // must themselves be in WQM.
>>>>> -  if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) &&
>>>>> MI.isTerminator()) {
>>>>> +  // Control flow-type instructions and stores to temporary memory that
>>>>> are
>>>>> +  // followed by WQM computations must themselves be in WQM.
>>>>> +  if ((II.OutNeeds & StateWQM) && !II.Needs &&
>>>>> +      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
>>>>>      Instructions[&MI].Needs = StateWQM;
>>>>>      II.Needs = StateWQM;
>>>>>    }
>>>>>
>>>>> Modified: llvm/trunk/test/CodeGen/AMDGPU/skip-if-dead.ll
>>>>> URL:
>>>>> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/skip-if-dead.ll?rev=277504&r1=277503&r2=277504&view=diff
>>>>>
>>>>> ==============================================================================
>>>>> --- llvm/trunk/test/CodeGen/AMDGPU/skip-if-dead.ll (original)
>>>>> +++ llvm/trunk/test/CodeGen/AMDGPU/skip-if-dead.ll Tue Aug  2 14:31:14
>>>>> 2016
>>>>> @@ -348,7 +348,6 @@ bb7:
>>>>>  ; CHECK: image_sample_c
>>>>>
>>>>>  ; CHECK: v_cmp_neq_f32_e32 vcc, 0,
>>>>> -; CHECK: s_and_b64 exec, exec,
>>>>>  ; CHECK: s_and_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
>>>>>  ; CHECK: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
>>>>>  ; CHECK: mask branch [[END:BB[0-9]+_[0-9]+]]
>>>>> @@ -385,6 +384,7 @@ bb9:
>>>>>
>>>>>  declare void @llvm.AMDGPU.kill(float) #0
>>>>>  declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>,
>>>>> <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
>>>>> +declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32,
>>>>> i1, i1) nounwind
>>>>>
>>>>>  attributes #0 = { nounwind }
>>>>>  attributes #1 = { nounwind readnone }
>>>>> \ No newline at end of file
>>>>>
>>>>> Modified: llvm/trunk/test/CodeGen/AMDGPU/wqm.ll
>>>>> URL:
>>>>> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/wqm.ll?rev=277504&r1=277503&r2=277504&view=diff
>>>>>
>>>>> ==============================================================================
>>>>> --- llvm/trunk/test/CodeGen/AMDGPU/wqm.ll (original)
>>>>> +++ llvm/trunk/test/CodeGen/AMDGPU/wqm.ll Tue Aug  2 14:31:14 2016
>>>>> @@ -41,14 +41,14 @@ main_body:
>>>>>  ;CHECK: store
>>>>>  ;CHECK-NOT: exec
>>>>>  ;CHECK: .size test3
>>>>> -define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32>
>>>>> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
>>>>> +define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32>
>>>>> inreg %sampler, <4 x i32> %c) {
>>>>>  main_body:
>>>>>    %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x
>>>>> i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0,
>>>>> i32 0, i32 0)
>>>>>    %tex.1 = bitcast <4 x float> %tex to <4 x i32>
>>>>>    %tex.2 = extractelement <4 x i32> %tex.1, i32 0
>>>>> -  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %tex.2
>>>>> -  %wr = extractelement <4 x float> %tex, i32 1
>>>>> -  store float %wr, float addrspace(1)* %gep
>>>>> +
>>>>> +  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %tex, <4 x i32>
>>>>> undef, i32 %tex.2, i32 0, i1 0, i1 0)
>>>>> +
>>>>>    ret <4 x float> %tex
>>>>>  }
>>>>>
>>>>> @@ -66,8 +66,9 @@ main_body:
>>>>>  define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32>
>>>>> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data)
>>>>> {
>>>>>  main_body:
>>>>>    %c.1 = mul i32 %c, %d
>>>>> -  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.1
>>>>> -  store float %data, float addrspace(1)* %gep
>>>>> +
>>>>> +  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, <4 x i32>
>>>>> undef, i32 %c.1, i32 0, i1 0, i1 0)
>>>>> +
>>>>>    %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32>
>>>>> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
>>>>> i32 0)
>>>>>    ret <4 x float> %tex
>>>>>  }
>>>>> @@ -89,7 +90,7 @@ main_body:
>>>>>  ;CHECK: s_mov_b64 exec, [[SAVED]]
>>>>>  ;CHECK: %IF
>>>>>  ;CHECK: image_sample
>>>>> -define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x
>>>>> i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float
>>>>> %data) {
>>>>> +define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x
>>>>> i32> inreg %sampler, i32 %c, i32 %z, float %data) {
>>>>>  main_body:
>>>>>    %cmp = icmp eq i32 %z, 0
>>>>>    br i1 %cmp, label %IF, label %ELSE
>>>>> @@ -100,8 +101,7 @@ IF:
>>>>>    br label %END
>>>>>
>>>>>  ELSE:
>>>>> -  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c
>>>>> -  store float %data, float addrspace(1)* %gep
>>>>> +  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef,
>>>>> i32 %c, i32 0, i1 0, i1 0)
>>>>>    br label %END
>>>>>
>>>>>  END:
>>>>> @@ -129,7 +129,7 @@ END:
>>>>>  ;CHECK: s_or_b64 exec, exec,
>>>>>  ;CHECK: v_mov_b32_e32 v0
>>>>>  ;CHECK: ; return
>>>>> -define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x
>>>>> i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float
>>>>> %data) {
>>>>> +define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x
>>>>> i32> inreg %sampler, i32 %c, i32 %z, float %data) {
>>>>>  main_body:
>>>>>    %cmp = icmp eq i32 %z, 0
>>>>>    br i1 %cmp, label %ELSE, label %IF
>>>>> @@ -140,8 +140,7 @@ IF:
>>>>>    br label %END
>>>>>
>>>>>  ELSE:
>>>>> -  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c
>>>>> -  store float %data, float addrspace(1)* %gep
>>>>> +  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef,
>>>>> i32 %c, i32 0, i1 0, i1 0)
>>>>>    br label %END
>>>>>
>>>>>  END:
>>>>> @@ -163,23 +162,20 @@ END:
>>>>>  ;CHECK: store
>>>>>  ;CHECK: s_wqm_b64 exec, exec
>>>>>  ;CHECK: v_cmp
>>>>> -define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc,
>>>>> <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2
>>>>> x float> %data, i32 %coord) {
>>>>> +define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc,
>>>>> <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
>>>>>  main_body:
>>>>>    %idx.1 = extractelement <3 x i32> %idx, i32 0
>>>>> -  %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
>>>>>    %data.1 = extractelement <2 x float> %data, i32 0
>>>>> -  store float %data.1, float addrspace(1)* %gep.1
>>>>> +  call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef,
>>>>> i32 %idx.1, i32 0, i1 0, i1 0)
>>>>>
>>>>>    ; The load that determines the branch (and should therefore be WQM) is
>>>>>    ; surrounded by stores that require disabled WQM.
>>>>>    %idx.2 = extractelement <3 x i32> %idx, i32 1
>>>>> -  %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
>>>>> -  %z = load float, float addrspace(1)* %gep.2
>>>>> +  %z = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32
>>>>> %idx.2, i32 0, i1 0, i1 0)
>>>>>
>>>>>    %idx.3 = extractelement <3 x i32> %idx, i32 2
>>>>> -  %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
>>>>>    %data.3 = extractelement <2 x float> %data, i32 1
>>>>> -  store float %data.3, float addrspace(1)* %gep.3
>>>>> +  call void @llvm.amdgcn.buffer.store.f32(float %data.3, <4 x i32> undef,
>>>>> i32 %idx.3, i32 0, i1 0, i1 0)
>>>>>
>>>>>    %cc = fcmp ogt float %z, 0.0
>>>>>    br i1 %cc, label %IF, label %ELSE
>>>>> @@ -210,24 +206,21 @@ END:
>>>>>  ;CHECK: load
>>>>>  ;CHECK: store
>>>>>  ;CHECK: v_cmp
>>>>> -define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x
>>>>> i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x
>>>>> float> %data, i32 %coord) {
>>>>> +define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x
>>>>> i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
>>>>>  main_body:
>>>>>    %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32>
>>>>> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
>>>>> i32 0)
>>>>>    %tex.1 = extractelement <4 x float> %tex, i32 0
>>>>>
>>>>>    %idx.1 = extractelement <3 x i32> %idx, i32 0
>>>>> -  %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
>>>>>    %data.1 = extractelement <2 x float> %data, i32 0
>>>>> -  store float %data.1, float addrspace(1)* %gep.1
>>>>> +  call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef,
>>>>> i32 %idx.1, i32 0, i1 0, i1 0)
>>>>>
>>>>>    %idx.2 = extractelement <3 x i32> %idx, i32 1
>>>>> -  %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
>>>>> -  %z = load float, float addrspace(1)* %gep.2
>>>>> +  %z = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32
>>>>> %idx.2, i32 0, i1 0, i1 0)
>>>>>
>>>>>    %idx.3 = extractelement <3 x i32> %idx, i32 2
>>>>> -  %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
>>>>>    %data.3 = extractelement <2 x float> %data, i32 1
>>>>> -  store float %data.3, float addrspace(1)* %gep.3
>>>>> +  call void @llvm.amdgcn.buffer.store.f32(float %data.3, <4 x i32> undef,
>>>>> i32 %idx.3, i32 0, i1 0, i1 0)
>>>>>
>>>>>    %cc = fcmp ogt float %z, 0.0
>>>>>    br i1 %cc, label %IF, label %ELSE
>>>>> @@ -258,15 +251,14 @@ END:
>>>>>  ;CHECK: s_mov_b64 exec, [[SAVE]]
>>>>>  ;CHECK: %END
>>>>>  ;CHECK: image_sample
>>>>> -define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc,
>>>>> <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %coord, i32
>>>>> %y, float %z) {
>>>>> +define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc,
>>>>> <4 x i32> inreg %sampler, i32 %coord, i32 %y, float %z) {
>>>>>  main_body:
>>>>>    %cond = icmp eq i32 %y, 0
>>>>>    br i1 %cond, label %IF, label %END
>>>>>
>>>>>  IF:
>>>>> -  %data = load float, float addrspace(1)* %ptr
>>>>> -  %gep = getelementptr float, float addrspace(1)* %ptr, i32 1
>>>>> -  store float %data, float addrspace(1)* %gep
>>>>> +  %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 0,
>>>>> i32 0, i1 0, i1 0)
>>>>> +  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef,
>>>>> i32 1, i32 0, i1 0, i1 0)
>>>>>    br label %END
>>>>>
>>>>>  END:
>>>>> @@ -282,13 +274,11 @@ END:
>>>>>  ;CHECK-NEXT: s_wqm_b64 exec, exec
>>>>>  ;CHECK: image_sample
>>>>>  ;CHECK: s_and_b64 exec, exec, [[ORIG]]
>>>>> -;SI: buffer_store_dword
>>>>> -;VI: flat_store_dword
>>>>> +;CHECK: buffer_store_dword
>>>>>  ;CHECK: s_wqm_b64 exec, exec
>>>>>  ;CHECK: v_cmpx_
>>>>>  ;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
>>>>> -;SI: buffer_store_dword
>>>>> -;VI: flat_store_dword
>>>>> +;CHECK: buffer_store_dword
>>>>>  ;CHECK: s_mov_b64 exec, [[SAVE]]
>>>>>  ;CHECK: image_sample
>>>>>  define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x
>>>>> i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x
>>>>> float> %data, i32 %coord, i32 %coord2, float %z) {
>>>>> @@ -296,16 +286,14 @@ main_body:
>>>>>    %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32>
>>>>> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
>>>>> i32 0)
>>>>>
>>>>>    %idx.0 = extractelement <2 x i32> %idx, i32 0
>>>>> -  %gep.0 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.0
>>>>>    %data.0 = extractelement <2 x float> %data, i32 0
>>>>> -  store float %data.0, float addrspace(1)* %gep.0
>>>>> +  call void @llvm.amdgcn.buffer.store.f32(float %data.0, <4 x i32> undef,
>>>>> i32 %idx.0, i32 0, i1 0, i1 0)
>>>>>
>>>>>    call void @llvm.AMDGPU.kill(float %z)
>>>>>
>>>>>    %idx.1 = extractelement <2 x i32> %idx, i32 1
>>>>> -  %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
>>>>>    %data.1 = extractelement <2 x float> %data, i32 1
>>>>> -  store float %data.1, float addrspace(1)* %gep.1
>>>>> +  call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef,
>>>>> i32 %idx.1, i32 0, i1 0, i1 0)
>>>>>
>>>>>    %tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x
>>>>> i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0,
>>>>> i32 0, i32 0)
>>>>>    %out = fadd <4 x float> %tex, %tex2
>>>>> @@ -321,16 +309,14 @@ main_body:
>>>>>  ; CHECK: s_wqm_b64 exec, exec
>>>>>  ; CHECK: image_sample
>>>>>  ; CHECK: s_and_b64 exec, exec, [[ORIG]]
>>>>> -; SI: buffer_store_dword
>>>>> -; VI: flat_store_dword
>>>>> +; CHECK: buffer_store_dword
>>>>>  ; CHECK-NOT: wqm
>>>>>  ; CHECK: v_cmpx_
>>>>> -define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x
>>>>> i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %idx, float %data,
>>>>> i32 %coord, i32 %coord2, float %z) {
>>>>> +define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x
>>>>> i32> inreg %sampler, i32 %idx, float %data, i32 %coord, i32 %coord2, float
>>>>> %z) {
>>>>>  main_body:
>>>>>    %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32>
>>>>> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
>>>>> i32 0)
>>>>>
>>>>> -  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %idx
>>>>> -  store float %data, float addrspace(1)* %gep
>>>>> +  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef,
>>>>> i32 0, i32 0, i1 0, i1 0)
>>>>>
>>>>>    call void @llvm.AMDGPU.kill(float %z)
>>>>>
>>>>> @@ -388,9 +374,53 @@ break:
>>>>>    ret <4 x float> %c.iv
>>>>>  }
>>>>>
>>>>> +; Only intrinsic stores need exact execution -- other stores do not have
>>>>> +; externally visible effects and may require WQM for correctness.
>>>>> +;
>>>>> +; CHECK-LABEL: {{^}}test_alloca:
>>>>> +; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
>>>>> +; CHECK: s_wqm_b64 exec, exec
>>>>> +
>>>>> +; CHECK: s_and_b64 exec, exec, [[LIVE]]
>>>>> +; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0
>>>>> +; CHECK: s_wqm_b64 exec, exec
>>>>> +; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}},
>>>>> {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
>>>>> +; CHECK: s_and_b64 exec, exec, [[LIVE]]
>>>>> +; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}},
>>>>> {{s\[[0-9]+:[0-9]+\]}}, 0 idxen
>>>>> +; CHECK: s_wqm_b64 exec, exec
>>>>> +; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}},
>>>>> {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
>>>>> +
>>>>> +; CHECK: image_sample
>>>>> +; CHECK: s_and_b64 exec, exec, [[LIVE]]
>>>>> +; CHECK: buffer_store_dwordx4
>>>>> +define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx)
>>>>> nounwind {
>>>>> +entry:
>>>>> +  %array = alloca [32 x i32], align 4
>>>>> +
>>>>> +  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef,
>>>>> i32 0, i32 0, i1 0, i1 0)
>>>>> +
>>>>> +  %s.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 0
>>>>> +  store volatile i32 %a, i32* %s.gep, align 4
>>>>> +
>>>>> +  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef,
>>>>> i32 1, i32 0, i1 0, i1 0)
>>>>> +
>>>>> +  %c.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 %idx
>>>>> +  %c = load i32, i32* %c.gep, align 4
>>>>> +
>>>>> +  %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32>
>>>>> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,
>>>>> i32 0)
>>>>> +
>>>>> +  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %t, <4 x i32>
>>>>> undef, i32 0, i32 0, i1 0, i1 0)
>>>>> +
>>>>> +  ret void
>>>>> +}
>>>>> +
>>>>> +
>>>>>  declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x
>>>>> i32>, i32, i1, i1, i1, i1) #1
>>>>> +declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32,
>>>>> i1, i1) #1
>>>>> +declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32,
>>>>> i32, i1, i1) #1
>>>>>
>>>>>  declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>,
>>>>> i32, i1, i1, i1, i1) #2
>>>>> +declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1)
>>>>> #2
>>>>>
>>>>>  declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>,
>>>>> i32, i32, i32, i32, i32, i32, i32, i32) #3
>>>>>  declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4
>>>>> x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
>>>>>
>>>>>
>>>>> _______________________________________________
>>>>> llvm-commits mailing list
>>>>> llvm-commits at lists.llvm.org
>>>>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
>>>>>
>>>>