[llvm-branch-commits] [llvm-branch] r277620 - Merging r277504:
Hans Wennborg via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Aug 3 11:13:01 PDT 2016
Author: hans
Date: Wed Aug 3 13:13:01 2016
New Revision: 277620
URL: http://llvm.org/viewvc/llvm-project?rev=277620&view=rev
Log:
Merging r277504:
------------------------------------------------------------------------
r277504 | nha | 2016-08-02 12:31:14 -0700 (Tue, 02 Aug 2016) | 20 lines
AMDGPU: Stay in WQM for non-intrinsic stores
Summary:
Two types of stores are possible in pixel shaders: stores to memory that are
explicitly requested at the API level, and stores that are an implementation
detail of register spilling or lowering of arrays.
For the first kind of store, we must ensure that helper pixels have no effect
and hence WQM must be disabled. The second kind of store must always be
executed, because the written value may be loaded again in a way that is
relevant for helper pixels as well -- and there are no externally visible
effects anyway.
This is a candidate for the 3.9 release branch.
Reviewers: arsenm, tstellarAMD, mareko
Subscribers: arsenm, kzhuravl, llvm-commits
Differential Revision: https://reviews.llvm.org/D22675
------------------------------------------------------------------------
Modified:
llvm/branches/release_39/ (props changed)
llvm/branches/release_39/lib/Target/AMDGPU/SIDefines.h
llvm/branches/release_39/lib/Target/AMDGPU/SIInstrFormats.td
llvm/branches/release_39/lib/Target/AMDGPU/SIInstrInfo.h
llvm/branches/release_39/lib/Target/AMDGPU/SIInstrInfo.td
llvm/branches/release_39/lib/Target/AMDGPU/SIInstructions.td
llvm/branches/release_39/lib/Target/AMDGPU/SIWholeQuadMode.cpp
llvm/branches/release_39/test/CodeGen/AMDGPU/skip-if-dead.ll
llvm/branches/release_39/test/CodeGen/AMDGPU/wqm.ll
Propchange: llvm/branches/release_39/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Wed Aug 3 13:13:01 2016
@@ -1,3 +1,3 @@
/llvm/branches/Apple/Pertwee:110850,110961
/llvm/branches/type-system-rewrite:133420-134817
-/llvm/trunk:155241,275868-275870,275879,275898,275928,275935,275946,275978,275981,276015,276077,276109,276119,276181,276209,276236-276237,276358,276364,276368,276389,276435,276438,276479,276510,276648,276740,276956,276980,277114,277135,277371,277500
+/llvm/trunk:155241,275868-275870,275879,275898,275928,275935,275946,275978,275981,276015,276077,276109,276119,276181,276209,276236-276237,276358,276364,276368,276389,276435,276438,276479,276510,276648,276740,276956,276980,277114,277135,277371,277500,277504
Modified: llvm/branches/release_39/lib/Target/AMDGPU/SIDefines.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_39/lib/Target/AMDGPU/SIDefines.h?rev=277620&r1=277619&r2=277620&view=diff
==============================================================================
--- llvm/branches/release_39/lib/Target/AMDGPU/SIDefines.h (original)
+++ llvm/branches/release_39/lib/Target/AMDGPU/SIDefines.h Wed Aug 3 13:13:01 2016
@@ -41,7 +41,8 @@ enum {
WQM = 1 << 22,
VGPRSpill = 1 << 23,
VOPAsmPrefer32Bit = 1 << 24,
- Gather4 = 1 << 25
+ Gather4 = 1 << 25,
+ DisableWQM = 1 << 26
};
}
Modified: llvm/branches/release_39/lib/Target/AMDGPU/SIInstrFormats.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_39/lib/Target/AMDGPU/SIInstrFormats.td?rev=277620&r1=277619&r2=277620&view=diff
==============================================================================
--- llvm/branches/release_39/lib/Target/AMDGPU/SIInstrFormats.td (original)
+++ llvm/branches/release_39/lib/Target/AMDGPU/SIInstrFormats.td Wed Aug 3 13:13:01 2016
@@ -41,6 +41,8 @@ class InstSI <dag outs, dag ins, string
field bits<1> DS = 0;
field bits<1> MIMG = 0;
field bits<1> FLAT = 0;
+
+ // Whether WQM _must_ be enabled for this instruction.
field bits<1> WQM = 0;
field bits<1> VGPRSpill = 0;
@@ -50,6 +52,9 @@ class InstSI <dag outs, dag ins, string
field bits<1> Gather4 = 0;
+ // Whether WQM _must_ be disabled for this instruction.
+ field bits<1> DisableWQM = 0;
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = VM_CNT;
let TSFlags{1} = EXP_CNT;
@@ -81,6 +86,7 @@ class InstSI <dag outs, dag ins, string
let TSFlags{23} = VGPRSpill;
let TSFlags{24} = VOPAsmPrefer32Bit;
let TSFlags{25} = Gather4;
+ let TSFlags{26} = DisableWQM;
let SchedRW = [Write32Bit];
Modified: llvm/branches/release_39/lib/Target/AMDGPU/SIInstrInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_39/lib/Target/AMDGPU/SIInstrInfo.h?rev=277620&r1=277619&r2=277620&view=diff
==============================================================================
--- llvm/branches/release_39/lib/Target/AMDGPU/SIInstrInfo.h (original)
+++ llvm/branches/release_39/lib/Target/AMDGPU/SIInstrInfo.h Wed Aug 3 13:13:01 2016
@@ -340,6 +340,14 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::WQM;
}
+ static bool isDisableWQM(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::DisableWQM;
+ }
+
+ bool isDisableWQM(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::DisableWQM;
+ }
+
static bool isVGPRSpill(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::VGPRSpill;
}
Modified: llvm/branches/release_39/lib/Target/AMDGPU/SIInstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_39/lib/Target/AMDGPU/SIInstrInfo.td?rev=277620&r1=277619&r2=277620&view=diff
==============================================================================
--- llvm/branches/release_39/lib/Target/AMDGPU/SIInstrInfo.td (original)
+++ llvm/branches/release_39/lib/Target/AMDGPU/SIInstrInfo.td Wed Aug 3 13:13:01 2016
@@ -2949,6 +2949,10 @@ multiclass MUBUF_m <mubuf op, string opN
def "" : MUBUF_Pseudo <opName, outs, ins, pattern>,
MUBUFAddr64Table <0>;
+ let DisableWQM = 1 in {
+ def "_exact" : MUBUF_Pseudo <opName, outs, ins, []>;
+ }
+
let addr64 = 0, isCodeGenOnly = 0 in {
def _si : MUBUF_Real_si <op, opName, outs, ins, asm>;
}
@@ -3019,7 +3023,8 @@ multiclass MUBUFAtomicOther_m <mubuf op,
multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc,
ValueType vt, SDPatternOperator atomic> {
- let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1 in {
+ let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1,
+ DisableWQM = 1 in {
// No return variants
let glc = 0, AsmMatchConverter = "cvtMubufAtomic" in {
@@ -3423,6 +3428,7 @@ class MIMG_Store_Helper <bits<7> op, str
let mayStore = 1;
let hasSideEffects = 1;
let hasPostISelHook = 0;
+ let DisableWQM = 1;
}
multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm,
@@ -3454,6 +3460,7 @@ class MIMG_Atomic_Helper <string asm, Re
let mayStore = 1;
let hasSideEffects = 1;
let hasPostISelHook = 0;
+ let DisableWQM = 1;
let Constraints = "$vdst = $vdata";
let AsmMatchConverter = "cvtMIMGAtomic";
}
Modified: llvm/branches/release_39/lib/Target/AMDGPU/SIInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_39/lib/Target/AMDGPU/SIInstructions.td?rev=277620&r1=277619&r2=277620&view=diff
==============================================================================
--- llvm/branches/release_39/lib/Target/AMDGPU/SIInstructions.td (original)
+++ llvm/branches/release_39/lib/Target/AMDGPU/SIInstructions.td Wed Aug 3 13:13:01 2016
@@ -2200,7 +2200,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPat
(name vt:$vdata, v4i32:$rsrc, 0,
(MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
imm:$glc, imm:$slc),
- (!cast<MUBUF>(opcode # _OFFSET) $vdata, $rsrc, $soffset, (as_i16imm $offset),
+ (!cast<MUBUF>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset),
(as_i1imm $glc), (as_i1imm $slc), 0)
>;
@@ -2208,7 +2208,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPat
(name vt:$vdata, v4i32:$rsrc, i32:$vindex,
(MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
imm:$glc, imm:$slc),
- (!cast<MUBUF>(opcode # _IDXEN) $vdata, $vindex, $rsrc, $soffset,
+ (!cast<MUBUF>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
(as_i16imm $offset), (as_i1imm $glc),
(as_i1imm $slc), 0)
>;
@@ -2217,7 +2217,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPat
(name vt:$vdata, v4i32:$rsrc, 0,
(MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
imm:$glc, imm:$slc),
- (!cast<MUBUF>(opcode # _OFFEN) $vdata, $voffset, $rsrc, $soffset,
+ (!cast<MUBUF>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
(as_i16imm $offset), (as_i1imm $glc),
(as_i1imm $slc), 0)
>;
@@ -2226,7 +2226,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPat
(name vt:$vdata, v4i32:$rsrc, i32:$vindex,
(MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
imm:$glc, imm:$slc),
- (!cast<MUBUF>(opcode # _BOTHEN)
+ (!cast<MUBUF>(opcode # _BOTHEN_exact)
$vdata,
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
$rsrc, $soffset, (as_i16imm $offset),
Modified: llvm/branches/release_39/lib/Target/AMDGPU/SIWholeQuadMode.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_39/lib/Target/AMDGPU/SIWholeQuadMode.cpp?rev=277620&r1=277619&r2=277620&view=diff
==============================================================================
--- llvm/branches/release_39/lib/Target/AMDGPU/SIWholeQuadMode.cpp (original)
+++ llvm/branches/release_39/lib/Target/AMDGPU/SIWholeQuadMode.cpp Wed Aug 3 13:13:01 2016
@@ -185,7 +185,7 @@ char SIWholeQuadMode::scanInstructions(M
if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
Flags = StateWQM;
- } else if (MI.mayStore() && TII->usesVM_CNT(MI)) {
+ } else if (TII->isDisableWQM(MI)) {
Flags = StateExact;
} else {
// Handle export instructions with the exec mask valid flag set
@@ -237,9 +237,10 @@ void SIWholeQuadMode::propagateInstructi
InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
BlockInfo &BI = Blocks[MBB];
- // Control flow-type instructions that are followed by WQM computations
- // must themselves be in WQM.
- if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) && MI.isTerminator()) {
+ // Control flow-type instructions and stores to temporary memory that are
+ // followed by WQM computations must themselves be in WQM.
+ if ((II.OutNeeds & StateWQM) && !II.Needs &&
+ (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
Instructions[&MI].Needs = StateWQM;
II.Needs = StateWQM;
}
Modified: llvm/branches/release_39/test/CodeGen/AMDGPU/skip-if-dead.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_39/test/CodeGen/AMDGPU/skip-if-dead.ll?rev=277620&r1=277619&r2=277620&view=diff
==============================================================================
--- llvm/branches/release_39/test/CodeGen/AMDGPU/skip-if-dead.ll (original)
+++ llvm/branches/release_39/test/CodeGen/AMDGPU/skip-if-dead.ll Wed Aug 3 13:13:01 2016
@@ -348,7 +348,6 @@ bb7:
; CHECK: image_sample_c
; CHECK: v_cmp_neq_f32_e32 vcc, 0,
-; CHECK: s_and_b64 exec, exec,
; CHECK: s_and_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
; CHECK: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
; CHECK: mask branch [[END:BB[0-9]+_[0-9]+]]
@@ -385,6 +384,7 @@ bb9:
declare void @llvm.AMDGPU.kill(float) #0
declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) nounwind
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
\ No newline at end of file
Modified: llvm/branches/release_39/test/CodeGen/AMDGPU/wqm.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/branches/release_39/test/CodeGen/AMDGPU/wqm.ll?rev=277620&r1=277619&r2=277620&view=diff
==============================================================================
--- llvm/branches/release_39/test/CodeGen/AMDGPU/wqm.ll (original)
+++ llvm/branches/release_39/test/CodeGen/AMDGPU/wqm.ll Wed Aug 3 13:13:01 2016
@@ -41,14 +41,14 @@ main_body:
;CHECK: store
;CHECK-NOT: exec
;CHECK: .size test3
-define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
+define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <4 x i32> %c) {
main_body:
%tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%tex.1 = bitcast <4 x float> %tex to <4 x i32>
%tex.2 = extractelement <4 x i32> %tex.1, i32 0
- %gep = getelementptr float, float addrspace(1)* %ptr, i32 %tex.2
- %wr = extractelement <4 x float> %tex, i32 1
- store float %wr, float addrspace(1)* %gep
+
+ call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i1 0, i1 0)
+
ret <4 x float> %tex
}
@@ -66,8 +66,9 @@ main_body:
define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
main_body:
%c.1 = mul i32 %c, %d
- %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.1
- store float %data, float addrspace(1)* %gep
+
+ call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i1 0, i1 0)
+
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
ret <4 x float> %tex
}
@@ -89,7 +90,7 @@ main_body:
;CHECK: s_mov_b64 exec, [[SAVED]]
;CHECK: %IF
;CHECK: image_sample
-define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) {
+define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
main_body:
%cmp = icmp eq i32 %z, 0
br i1 %cmp, label %IF, label %ELSE
@@ -100,8 +101,7 @@ IF:
br label %END
ELSE:
- %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c
- store float %data, float addrspace(1)* %gep
+ call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)
br label %END
END:
@@ -129,7 +129,7 @@ END:
;CHECK: s_or_b64 exec, exec,
;CHECK: v_mov_b32_e32 v0
;CHECK: ; return
-define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) {
+define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
main_body:
%cmp = icmp eq i32 %z, 0
br i1 %cmp, label %ELSE, label %IF
@@ -140,8 +140,7 @@ IF:
br label %END
ELSE:
- %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c
- store float %data, float addrspace(1)* %gep
+ call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)
br label %END
END:
@@ -163,23 +162,20 @@ END:
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmp
-define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
+define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
main_body:
%idx.1 = extractelement <3 x i32> %idx, i32 0
- %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
%data.1 = extractelement <2 x float> %data, i32 0
- store float %data.1, float addrspace(1)* %gep.1
+ call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)
; The load that determines the branch (and should therefore be WQM) is
; surrounded by stores that require disabled WQM.
%idx.2 = extractelement <3 x i32> %idx, i32 1
- %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
- %z = load float, float addrspace(1)* %gep.2
+ %z = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i1 0, i1 0)
%idx.3 = extractelement <3 x i32> %idx, i32 2
- %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
%data.3 = extractelement <2 x float> %data, i32 1
- store float %data.3, float addrspace(1)* %gep.3
+ call void @llvm.amdgcn.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i1 0, i1 0)
%cc = fcmp ogt float %z, 0.0
br i1 %cc, label %IF, label %ELSE
@@ -210,24 +206,21 @@ END:
;CHECK: load
;CHECK: store
;CHECK: v_cmp
-define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
+define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
main_body:
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%tex.1 = extractelement <4 x float> %tex, i32 0
%idx.1 = extractelement <3 x i32> %idx, i32 0
- %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
%data.1 = extractelement <2 x float> %data, i32 0
- store float %data.1, float addrspace(1)* %gep.1
+ call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)
%idx.2 = extractelement <3 x i32> %idx, i32 1
- %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
- %z = load float, float addrspace(1)* %gep.2
+ %z = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i1 0, i1 0)
%idx.3 = extractelement <3 x i32> %idx, i32 2
- %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
%data.3 = extractelement <2 x float> %data, i32 1
- store float %data.3, float addrspace(1)* %gep.3
+ call void @llvm.amdgcn.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i1 0, i1 0)
%cc = fcmp ogt float %z, 0.0
br i1 %cc, label %IF, label %ELSE
@@ -258,15 +251,14 @@ END:
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: %END
;CHECK: image_sample
-define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %coord, i32 %y, float %z) {
+define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %coord, i32 %y, float %z) {
main_body:
%cond = icmp eq i32 %y, 0
br i1 %cond, label %IF, label %END
IF:
- %data = load float, float addrspace(1)* %ptr
- %gep = getelementptr float, float addrspace(1)* %ptr, i32 1
- store float %data, float addrspace(1)* %gep
+ %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0)
br label %END
END:
@@ -282,13 +274,11 @@ END:
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
-;SI: buffer_store_dword
-;VI: flat_store_dword
+;CHECK: buffer_store_dword
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmpx_
;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
-;SI: buffer_store_dword
-;VI: flat_store_dword
+;CHECK: buffer_store_dword
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: image_sample
define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) {
@@ -296,16 +286,14 @@ main_body:
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%idx.0 = extractelement <2 x i32> %idx, i32 0
- %gep.0 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.0
%data.0 = extractelement <2 x float> %data, i32 0
- store float %data.0, float addrspace(1)* %gep.0
+ call void @llvm.amdgcn.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i1 0, i1 0)
call void @llvm.AMDGPU.kill(float %z)
%idx.1 = extractelement <2 x i32> %idx, i32 1
- %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
%data.1 = extractelement <2 x float> %data, i32 1
- store float %data.1, float addrspace(1)* %gep.1
+ call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)
%tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%out = fadd <4 x float> %tex, %tex2
@@ -321,16 +309,14 @@ main_body:
; CHECK: s_wqm_b64 exec, exec
; CHECK: image_sample
; CHECK: s_and_b64 exec, exec, [[ORIG]]
-; SI: buffer_store_dword
-; VI: flat_store_dword
+; CHECK: buffer_store_dword
; CHECK-NOT: wqm
; CHECK: v_cmpx_
-define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) {
+define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) {
main_body:
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
- %gep = getelementptr float, float addrspace(1)* %ptr, i32 %idx
- store float %data, float addrspace(1)* %gep
+ call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
call void @llvm.AMDGPU.kill(float %z)
@@ -388,9 +374,53 @@ break:
ret <4 x float> %c.iv
}
+; Only intrinsic stores need exact execution -- other stores do not have
+; externally visible effects and may require WQM for correctness.
+;
+; CHECK-LABEL: {{^}}test_alloca:
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+
+; CHECK: s_and_b64 exec, exec, [[LIVE]]
+; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
+; CHECK: s_and_b64 exec, exec, [[LIVE]]
+; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
+
+; CHECK: image_sample
+; CHECK: s_and_b64 exec, exec, [[LIVE]]
+; CHECK: buffer_store_dwordx4
+define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
+entry:
+ %array = alloca [32 x i32], align 4
+
+ call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
+
+ %s.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 0
+ store volatile i32 %a, i32* %s.gep, align 4
+
+ call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0)
+
+ %c.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 %idx
+ %c = load i32, i32* %c.gep, align 4
+
+ %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+
+ call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
+
+ ret void
+}
+
+
declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
+declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1
+declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2
declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
More information about the llvm-branch-commits
mailing list