[llvm] r338779 - [AMDGPU] Avoid using divergent value in mubuf addr64 descriptor
Tim Renouf via llvm-commits
llvm-commits@lists.llvm.org
Thu Aug 2 15:53:58 PDT 2018
Author: tpr
Date: Thu Aug 2 15:53:57 2018
New Revision: 338779
URL: http://llvm.org/viewvc/llvm-project?rev=338779&view=rev
Log:
[AMDGPU] Avoid using divergent value in mubuf addr64 descriptor
Summary:
This fixes a problem where a load from global+idx generated incorrect
code on gfx7 and earlier when the index is divergent: the divergent
value could end up in the scalar resource descriptor of a mubuf addr64
instruction, which is read uniformly, so only one lane's address was
honored.
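
The pattern in question is a global (addrspace(1)) load whose address is a
uniform base plus a per-lane index. A minimal IR sketch, modeled on the new
shader-addr64-nonuniform.ll test added below (names and the element type are
illustrative):

  @table = internal unnamed_addr addrspace(1) constant [6 x float] zeroinitializer

  ; In an amdgpu_ps function a non-inreg argument arrives in a VGPR, so
  ; %idx is divergent; @table is a uniform base.
  define amdgpu_ps float @global_plus_divergent_idx(i32 %idx) {
    %ext = sext i32 %idx to i64
    %gep = getelementptr [6 x float], [6 x float] addrspace(1)* @table, i64 0, i64 %ext
    %val = load float, float addrspace(1)* %gep, align 4
    ret float %val
  }

On gfx7 and earlier such a load selects to a MUBUF addr64 instruction, so the
divergent component must be routed to the vector address operand rather than
into the scalar buffer resource descriptor.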
Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, t-tye, llvm-commits
Differential Revision: https://reviews.llvm.org/D47383
Change-Id: Ib4d177d6254b1dd3f8ec0203fdddec94bd8bc5ed
Added:
llvm/trunk/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
llvm/trunk/test/CodeGen/AMDGPU/legalize-fp-load-invariant.ll
llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
llvm/trunk/test/CodeGen/AMDGPU/llvm.log.f16.ll
llvm/trunk/test/CodeGen/AMDGPU/llvm.log10.f16.ll
llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp?rev=338779&r1=338778&r2=338779&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp Thu Aug 2 15:53:57 2018
@@ -106,6 +106,8 @@ private:
bool isUniformBr(const SDNode *N) const;
+ MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;
+
SDNode *glueCopyToM0(SDNode *N) const;
const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
@@ -372,6 +374,22 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0
return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}
+MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
+ EVT VT) const {
+ SDNode *Lo = CurDAG->getMachineNode(
+ AMDGPU::S_MOV_B32, DL, MVT::i32,
+ CurDAG->getConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
+ SDNode *Hi =
+ CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
+ CurDAG->getConstant(Imm >> 32, DL, MVT::i32));
+ const SDValue Ops[] = {
+ CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
+ SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
+ SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
+
+ return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
+}
+
static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
switch (NumVectorElts) {
case 1:
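[Note: the new buildSMovImm64 helper materializes a 64-bit scalar immediate
as two 32-bit scalar moves combined into a 64-bit SGPR pair. Roughly, as a
pseudo-MIR sketch for the Imm = 0 case used by SelectMUBUF below (virtual
register names are illustrative):

  %lo:sreg_32 = S_MOV_B32 0        ; Imm & 0xFFFFFFFF
  %hi:sreg_32 = S_MOV_B32 0        ; Imm >> 32
  %imm:sreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1

The Select() case for 64-bit constants (next hunk) now reuses this helper
instead of open-coding the same sequence.]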
@@ -557,19 +575,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *
}
SDLoc DL(N);
- SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
- CurDAG->getConstant(Imm & 0xFFFFFFFF, DL,
- MVT::i32));
- SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
- CurDAG->getConstant(Imm >> 32, DL, MVT::i32));
- const SDValue Ops[] = {
- CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
- SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
- SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
- };
-
- ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
- N->getValueType(0), Ops));
+ ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
return;
}
case ISD::LOAD:
@@ -1014,55 +1020,72 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDV
Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ ConstantSDNode *C1 = nullptr;
+ SDValue N0 = Addr;
if (CurDAG->isBaseWithConstantOffset(Addr)) {
- SDValue N0 = Addr.getOperand(0);
- SDValue N1 = Addr.getOperand(1);
- ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+ C1 = cast<ConstantSDNode>(Addr.getOperand(1));
+ if (isUInt<32>(C1->getZExtValue()))
+ N0 = Addr.getOperand(0);
+ else
+ C1 = nullptr;
+ }
- if (N0.getOpcode() == ISD::ADD) {
- // (add (add N2, N3), C1) -> addr64
- SDValue N2 = N0.getOperand(0);
- SDValue N3 = N0.getOperand(1);
- Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
+ if (N0.getOpcode() == ISD::ADD) {
+ // (add N2, N3) -> addr64, or
+ // (add (add N2, N3), C1) -> addr64
+ SDValue N2 = N0.getOperand(0);
+ SDValue N3 = N0.getOperand(1);
+ Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
+
+ if (N2->isDivergent()) {
+ if (N3->isDivergent()) {
+ // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
+ // addr64, and construct the resource from a 0 address.
+ Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
+ VAddr = N0;
+ } else {
+ // N2 is divergent, N3 is not.
+ Ptr = N3;
+ VAddr = N2;
+ }
+ } else {
+ // N2 is not divergent.
Ptr = N2;
VAddr = N3;
- } else {
- // (add N0, C1) -> offset
- VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
- Ptr = N0;
}
-
- if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
- Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
- return true;
- }
-
- if (isUInt<32>(C1->getZExtValue())) {
- // Illegal offset, store it in soffset.
- Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
- SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
- CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
- 0);
- return true;
- }
- }
-
- if (Addr.getOpcode() == ISD::ADD) {
- // (add N0, N1) -> addr64
- SDValue N0 = Addr.getOperand(0);
- SDValue N1 = Addr.getOperand(1);
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+ } else if (N0->isDivergent()) {
+ // N0 is divergent. Use it as the addr64, and construct the resource from a
+ // 0 address.
+ Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
+ VAddr = N0;
Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
+ } else {
+ // N0 -> offset, or
+ // (N0 + C1) -> offset
+ VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
Ptr = N0;
- VAddr = N1;
+ }
+
+ if (!C1) {
+ // No offset.
Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
return true;
}
- // default case -> offset
- VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
- Ptr = Addr;
- Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+ if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
+ // Legal offset for instruction.
+ Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
+ return true;
+ }
+ // Illegal offset, store it in soffset.
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+ SOffset =
+ SDValue(CurDAG->getMachineNode(
+ AMDGPU::S_MOV_B32, DL, MVT::i32,
+ CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
+ 0);
return true;
}
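[Note: the net effect is that a divergent address component now always
travels in the 64-bit vaddr operand of an addr64 MUBUF, and the scalar
resource descriptor is built only from uniform values (or from a zero base
via buildSMovImm64 when the whole address is divergent). Matching the updated
MESA checks below, the selected code now looks roughly like this (register
numbers are illustrative):

  ; 64-bit address in vaddr (v[0:1]); srsrc s[8:11] holds only uniform values
  v_mov_b32_e32 v0, s6
  v_mov_b32_e32 v1, s7
  buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
]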
Modified: llvm/trunk/test/CodeGen/AMDGPU/legalize-fp-load-invariant.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/legalize-fp-load-invariant.ll?rev=338779&r1=338778&r2=338779&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/legalize-fp-load-invariant.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/legalize-fp-load-invariant.ll Thu Aug 2 15:53:57 2018
@@ -3,7 +3,7 @@
; Type legalization for illegal FP type results was dropping invariant
; and dereferenceable flags.
-; GCN: BUFFER_LOAD_USHORT_OFFSET killed %{{[0-9]+}}, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 2 from %ir.ptr, addrspace 4)
+; GCN: BUFFER_LOAD_USHORT{{.*}} :: (dereferenceable invariant load 2 from %ir.ptr, addrspace 4)
define half @legalize_f16_load(half addrspace(4)* dereferenceable(4) %ptr) {
%load = load half, half addrspace(4)* %ptr, !invariant.load !0
%add = fadd half %load, 1.0
Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll?rev=338779&r1=338778&r2=338779&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll Thu Aug 2 15:53:57 2018
@@ -59,10 +59,9 @@ define amdgpu_kernel void @opencl_kernel
; GCN-LABEL: {{^}}func_implicitarg_ptr:
; GCN: s_waitcnt
-; MESA: s_mov_b64 s[8:9], s[6:7]
-; MESA: s_mov_b32 s11, 0xf000
-; MESA: s_mov_b32 s10, -1
-; MESA: buffer_load_dword v0, off, s[8:11], 0
+; MESA: v_mov_b32_e32 v0, s6
+; MESA: v_mov_b32_e32 v1, s7
+; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; HSA: v_mov_b32_e32 v0, s6
; HSA: v_mov_b32_e32 v1, s7
; HSA: flat_load_dword v0, v[0:1]
@@ -77,10 +76,9 @@ define void @func_implicitarg_ptr() #0 {
; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr:
; GCN: s_waitcnt
-; MESA: s_mov_b64 s[8:9], s[6:7]
-; MESA: s_mov_b32 s11, 0xf000
-; MESA: s_mov_b32 s10, -1
-; MESA: buffer_load_dword v0, off, s[8:11], 0
+; MESA: v_mov_b32_e32 v0, s6
+; MESA: v_mov_b32_e32 v1, s7
+; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; HSA: v_mov_b32_e32 v0, s6
; HSA: v_mov_b32_e32 v1, s7
; HSA: flat_load_dword v0, v[0:1]
@@ -164,16 +162,15 @@ define void @opencl_func_call_implicitar
; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr:
; GCN: s_waitcnt
-; MESA: s_mov_b64 s[12:13], s[6:7]
-; MESA: s_mov_b32 s15, 0xf000
-; MESA: s_mov_b32 s14, -1
-; MESA: buffer_load_dword v0, off, s[12:15], 0
+; MESA: v_mov_b32_e32 v0, s6
+; MESA: v_mov_b32_e32 v1, s7
+; MESA: v_mov_b32_e32 v2, s8
+; MESA: v_mov_b32_e32 v3, s9
+; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; HSA: v_mov_b32_e32 v0, s6
; HSA: v_mov_b32_e32 v1, s7
; HSA: flat_load_dword v0, v[0:1]
-; MESA: s_mov_b32 s10, s14
-; MESA: s_mov_b32 s11, s15
-; MESA: buffer_load_dword v0, off, s[8:11], 0
+; MESA: buffer_load_dword v0, v[2:3], s[8:11], 0 addr64
; HSA: v_mov_b32_e32 v0, s8
; HSA: v_mov_b32_e32 v1, s9
; HSA: flat_load_dword v0, v[0:1]
@@ -191,16 +188,15 @@ define void @func_kernarg_implicitarg_pt
; GCN-LABEL: {{^}}opencl_func_kernarg_implicitarg_ptr:
; GCN: s_waitcnt
-; MESA: s_mov_b64 s[12:13], s[6:7]
-; MESA: s_mov_b32 s15, 0xf000
-; MESA: s_mov_b32 s14, -1
-; MESA: buffer_load_dword v0, off, s[12:15], 0
+; MESA: v_mov_b32_e32 v0, s6
+; MESA: v_mov_b32_e32 v1, s7
+; MESA: v_mov_b32_e32 v2, s8
+; MESA: v_mov_b32_e32 v3, s9
+; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; HSA: v_mov_b32_e32 v0, s6
; HSA: v_mov_b32_e32 v1, s7
; HSA: flat_load_dword v0, v[0:1]
-; MESA: s_mov_b32 s10, s14
-; MESA: s_mov_b32 s11, s15
-; MESA: buffer_load_dword v0, off, s[8:11], 0
+; MESA: buffer_load_dword v0, v[2:3], s[8:11], 0 addr64
; HSA: v_mov_b32_e32 v0, s8
; HSA: v_mov_b32_e32 v1, s9
; HSA: flat_load_dword v0, v[0:1]
Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.log.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.log.f16.ll?rev=338779&r1=338778&r2=338779&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.log.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.log.f16.ll Thu Aug 2 15:53:57 2018
@@ -9,7 +9,6 @@ declare <2 x half> @llvm.log.v2f16(<2 x
; SI: buffer_load_ushort v[[A_F16_0:[0-9]+]]
; VI: flat_load_ushort v[[A_F16_0:[0-9]+]]
; GFX9: global_load_ushort v[[A_F16_0:[0-9]+]]
-; SI: v_mov_b32_e32 v[[A_F32_1:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x3f317218, v[[R_F32_0]]
Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.log10.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.log10.f16.ll?rev=338779&r1=338778&r2=338779&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.log10.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.log10.f16.ll Thu Aug 2 15:53:57 2018
@@ -9,7 +9,6 @@ declare <2 x half> @llvm.log10.v2f16(<2
; SI: buffer_load_ushort v[[A_F16_0:[0-9]+]]
; VI: flat_load_ushort v[[A_F16_0:[0-9]+]]
; GFX9: global_load_ushort v[[A_F16_0:[0-9]+]]
-; SI: v_mov_b32_e32 v[[A_F32_1:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x3e9a209a, v[[R_F32_0]]
Added: llvm/trunk/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll?rev=338779&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll Thu Aug 2 15:53:57 2018
@@ -0,0 +1,104 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx700 -verify-machineinstrs <%s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SICI %s
+
+; Check that addrspace(1) (constant) loads with various combinations of
+; uniform, nonuniform and constant address components are all selected as
+; addr64 mubuf loads with no readfirstlane.
+
+ at indexable = internal unnamed_addr addrspace(1) constant [6 x <3 x float>] [<3 x float> <float 1.000000e+00, float 0.000000e+00, float 0.000000e+00>, <3 x float> <float 0.000000e+00, float 1.000000e+00, float 0.000000e+00>, <3 x float> <float 0.000000e+00, float 0.000000e+00, float 1.000000e+00>, <3 x float> <float 0.000000e+00, float 1.000000e+00, float 1.000000e+00>, <3 x float> <float 1.000000e+00, float 0.000000e+00, float 1.000000e+00>, <3 x float> <float 1.000000e+00, float 1.000000e+00, float 0.000000e+00>]
+
+; GCN-LABEL: {{^}}nonuniform_uniform:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dwordx4 {{.*}} addr64
+
+define amdgpu_ps float @nonuniform_uniform(i32 %arg18) {
+.entry:
+ %tmp31 = sext i32 %arg18 to i64
+ %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* @indexable, i64 0, i64 %tmp31
+ %tmp33 = load <3 x float>, <3 x float> addrspace(1)* %tmp32, align 16
+ %tmp34 = extractelement <3 x float> %tmp33, i32 0
+ ret float %tmp34
+}
+
+; GCN-LABEL: {{^}}uniform_nonuniform:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dwordx4 {{.*}} addr64
+
+define amdgpu_ps float @uniform_nonuniform(i32 inreg %offset, i32 %arg18) {
+.entry:
+ %tmp1 = zext i32 %arg18 to i64
+ %tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
+ %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset
+ %tmp33 = load <3 x float>, <3 x float> addrspace(1)* %tmp32, align 16
+ %tmp34 = extractelement <3 x float> %tmp33, i32 0
+ ret float %tmp34
+}
+
+; GCN-LABEL: {{^}}const_nonuniform:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dwordx4 {{.*}} addr64
+
+define amdgpu_ps float @const_nonuniform(i32 %arg18) {
+.entry:
+ %tmp1 = zext i32 %arg18 to i64
+ %tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
+ %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 1
+ %tmp33 = load <3 x float>, <3 x float> addrspace(1)* %tmp32, align 16
+ %tmp34 = extractelement <3 x float> %tmp33, i32 0
+ ret float %tmp34
+}
+
+; GCN-LABEL: {{^}}nonuniform_nonuniform:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dwordx4 {{.*}} addr64
+
+define amdgpu_ps float @nonuniform_nonuniform(i32 %offset, i32 %arg18) {
+.entry:
+ %tmp1 = zext i32 %arg18 to i64
+ %tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
+ %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset
+ %tmp33 = load <3 x float>, <3 x float> addrspace(1)* %tmp32, align 16
+ %tmp34 = extractelement <3 x float> %tmp33, i32 0
+ ret float %tmp34
+}
+
+; GCN-LABEL: {{^}}nonuniform_uniform_const:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dword {{.*}} addr64
+
+define amdgpu_ps float @nonuniform_uniform_const(i32 %arg18) {
+.entry:
+ %tmp31 = sext i32 %arg18 to i64
+ %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* @indexable, i64 0, i64 %tmp31, i64 1
+ %tmp33 = load float, float addrspace(1)* %tmp32, align 4
+ ret float %tmp33
+}
+
+; GCN-LABEL: {{^}}uniform_nonuniform_const:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dword {{.*}} addr64
+
+define amdgpu_ps float @uniform_nonuniform_const(i32 inreg %offset, i32 %arg18) {
+.entry:
+ %tmp1 = zext i32 %arg18 to i64
+ %tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
+ %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset, i32 1
+ %tmp33 = load float, float addrspace(1)* %tmp32, align 4
+ ret float %tmp33
+}
+
+; GCN-LABEL: {{^}}nonuniform_nonuniform_const:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dword {{.*}} addr64
+
+define amdgpu_ps float @nonuniform_nonuniform_const(i32 %offset, i32 %arg18) {
+.entry:
+ %tmp1 = zext i32 %arg18 to i64
+ %tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
+ %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset, i32 1
+ %tmp33 = load float, float addrspace(1)* %tmp32, align 4
+ ret float %tmp33
+}
+
Modified: llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll?rev=338779&r1=338778&r2=338779&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll Thu Aug 2 15:53:57 2018
@@ -201,8 +201,7 @@ exit:
; Initialize inner condition to false
; SI: BB{{[0-9]+_[0-9]+}}: ; %bb10.preheader
-; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0{{$}}
-; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]]
+; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], 0{{$}}
; Clear exec bits for workitems that load -1s
; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]: