[llvm] 716ca2e - [AMDGPU] Pre-sink IR input for some tests
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 21 06:27:45 PDT 2022
Author: Jay Foad
Date: 2022-07-21T14:25:44+01:00
New Revision: 716ca2e3ef37c9aa9517216042653d8a88d76ecf
URL: https://github.com/llvm/llvm-project/commit/716ca2e3ef37c9aa9517216042653d8a88d76ecf
DIFF: https://github.com/llvm/llvm-project/commit/716ca2e3ef37c9aa9517216042653d8a88d76ecf.diff
LOG: [AMDGPU] Pre-sink IR input for some tests
Edit the IR input for some codegen tests to simulate what the IR code
sinking pass would do to it. This makes the tests immune to the presence
or absence of the code sinking pass in the codegen pass pipeline, where
it does not belong.
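For illustration, here is a minimal, hypothetical example of the kind of
edit applied throughout this patch (the function below is not taken from
the patch itself): an instruction whose only use is in a single successor
block is moved from the entry block into that block, mirroring what the
IR code sinking pass would do.

; Before: %gep is computed unconditionally in the entry block,
; even though it is only used on the %use path.
define void @example(i32* %p, i1 %cc) {
entry:
  %gep = getelementptr i32, i32* %p, i32 7
  br i1 %cc, label %use, label %exit
use:
  store i32 0, i32* %gep
  br label %exit
exit:
  ret void
}

; After pre-sinking: %gep is computed only on the path that uses it,
; so the test output no longer depends on whether code sinking runs.
define void @example(i32* %p, i1 %cc) {
entry:
  br i1 %cc, label %use, label %exit
use:
  %gep = getelementptr i32, i32* %p, i32 7
  store i32 0, i32* %gep
  br label %exit
exit:
  ret void
}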
Differential Revision: https://reviews.llvm.org/D130169
Added:
Modified:
llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
llvm/test/CodeGen/AMDGPU/multilevel-break.ll
llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
llvm/test/CodeGen/AMDGPU/wqm.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
index 746f2591db7d..af883e8d6a91 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -168,11 +168,11 @@ define void @constrained_if_register_class() {
; CHECK-NEXT: s_setpc_b64 s[30:31]
bb:
%tmp = load i32, i32 addrspace(4)* @external_constant
- %ptr = load float*, float* addrspace(4)* @const.ptr
%tmp1 = icmp ne i32 %tmp, 0
br i1 %tmp1, label %bb12, label %bb2
bb2:
+ %ptr = load float*, float* addrspace(4)* @const.ptr
%tmp4 = load float, float* %ptr, align 4
%tmp5 = fcmp olt float %tmp4, 1.0
%tmp6 = or i1 %tmp5, false
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index a94719e66cea..908874e073e7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -1536,7 +1536,6 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
; GFX11_W64-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.out = getelementptr float, float addrspace(1)* %out, i32 2
%gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
%gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2
@@ -1555,6 +1554,7 @@ bb:
exit:
%cond = phi i1 [false, %entry], [%cmp1, %bb]
+ %gep.out = getelementptr float, float addrspace(1)* %out, i32 2
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cond)
store float %result, float addrspace(1)* %gep.out, align 4
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index 3c729182d1b0..fb5d81350808 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -793,7 +793,6 @@ bb:
bb9: ; preds = %bb12, %bb
%i10 = phi i64 [ %arg3, %bb ], [ %i13, %bb12 ]
- %i11 = icmp slt i64 %i10, 0
br i1 undef, label %bb14, label %bb12
bb12: ; preds = %bb58, %bb9
@@ -801,6 +800,7 @@ bb12: ; preds = %bb58, %bb9
br label %bb9
bb14: ; preds = %bb9
+ %i11 = icmp slt i64 %i10, 0
%i15 = load i64, i64 addrspace(1)* null, align 8
br label %bb16
@@ -825,23 +825,23 @@ bb16: ; preds = %bb58, %bb14
%i34 = getelementptr inbounds [16 x half], [16 x half] addrspace(1)* null, i64 %i24, i64 14
%i35 = bitcast half addrspace(1)* %i34 to <2 x half> addrspace(1)*
%i36 = load volatile <2 x half>, <2 x half> addrspace(1)* %i35, align 4
+ %i43 = load volatile <2 x float>, <2 x float> addrspace(3)* null, align 8
+ %i46 = load volatile <2 x float>, <2 x float> addrspace(3)* undef, align 32
+ fence syncscope("workgroup") acquire
+ br i1 %i11, label %bb58, label %bb51
+
+bb51: ; preds = %bb16
%i37 = fpext <2 x half> %arg4 to <2 x float>
%i39 = fpext <2 x half> %i27 to <2 x float>
%i40 = fpext <2 x half> %i30 to <2 x float>
%i41 = fpext <2 x half> %i33 to <2 x float>
%i42 = fpext <2 x half> %i36 to <2 x float>
- %i43 = load volatile <2 x float>, <2 x float> addrspace(3)* null, align 8
%i44 = fadd contract <2 x float> %i37, %i43
%i45 = fadd contract <2 x float> %i43, zeroinitializer
- %i46 = load volatile <2 x float>, <2 x float> addrspace(3)* undef, align 32
%i47 = fadd contract <2 x float> %i39, %i46
%i48 = fadd contract <2 x float> %i40, %i43
%i49 = fadd contract <2 x float> %i41, zeroinitializer
%i50 = fadd contract <2 x float> %i42, zeroinitializer
- fence syncscope("workgroup") acquire
- br i1 %i11, label %bb58, label %bb51
-
-bb51: ; preds = %bb16
%i52 = fadd contract <2 x float> %i18, %i44
%i53 = fadd contract <2 x float> %i19, %i45
%i54 = fadd contract <2 x float> %i20, %i47
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index a4e2af802b73..ecc951a9cacd 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -508,11 +508,11 @@ define amdgpu_kernel void @long_branch_hang(i32 addrspace(1)* nocapture %arg, i3
bb:
%tmp = icmp slt i32 %arg2, 9
%tmp6 = icmp eq i32 %arg1, 0
- %tmp7 = icmp sgt i32 %arg4, 0
%tmp8 = icmp sgt i32 %arg4, 5
br i1 %tmp8, label %bb9, label %bb13
bb9: ; preds = %bb
+ %tmp7 = icmp sgt i32 %arg4, 0
%tmp10 = and i1 %tmp7, %tmp
%tmp11 = icmp slt i32 %arg3, %arg4
%tmp12 = or i1 %tmp11, %tmp7
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
index e46cf5c5f910..26966ff49372 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
@@ -8,19 +8,19 @@
define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; OPT-LABEL: @test_sink_small_offset_global_atomic_csub_i32(
; OPT-NEXT: entry:
-; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[OUT:%.*]], i32 999999
-; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #3
+; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]]
; OPT-NEXT: [[CMP:%.*]] = icmp eq i32 [[TID]], 0
; OPT-NEXT: br i1 [[CMP]], label [[ENDIF:%.*]], label [[IF:%.*]]
; OPT: if:
-; OPT-NEXT: [[TMP0:%.*]] = bitcast i32 addrspace(1)* [[IN:%.*]] to i8 addrspace(1)*
-; OPT-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP0]], i64 28
-; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SUNKADDR]] to i32 addrspace(1)*
-; OPT-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* [[TMP1]], i32 2)
+; OPT-NEXT: [[IN_GEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[IN:%.*]], i32 7
+; OPT-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* [[IN_GEP]], i32 2)
; OPT-NEXT: br label [[ENDIF]]
; OPT: endif:
; OPT-NEXT: [[X:%.*]] = phi i32 [ [[VAL]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
+; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[OUT:%.*]], i32 999999
; OPT-NEXT: store i32 [[X]], i32 addrspace(1)* [[OUT_GEP]], align 4
+; OPT-NEXT: br label [[DONE:%.*]]
+; OPT: done:
; OPT-NEXT: ret void
;
; GCN-LABEL: test_sink_small_offset_global_atomic_csub_i32:
@@ -43,18 +43,18 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(i32 add
; GCN-NEXT: global_store_dword v1, v0, s[0:1] offset:252
; GCN-NEXT: s_endpgm
entry:
- %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 999999
- %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 7
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%cmp = icmp eq i32 %tid, 0
br i1 %cmp, label %endif, label %if
if:
+ %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 7
%val = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %in.gep, i32 2)
br label %endif
endif:
%x = phi i32 [ %val, %if ], [ 0, %entry ]
+ %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 999999
store i32 %x, i32 addrspace(1)* %out.gep
br label %done
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
index 8c496d552d71..66e4d59fe3d2 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
@@ -7,20 +7,20 @@
define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
; OPT-LABEL: @test_sink_small_offset_global_atomic_fadd_f32(
; OPT-NEXT: entry:
-; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr float, float addrspace(1)* [[OUT:%.*]], i32 999999
-; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) [[ATTR3:#.*]]
+; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]]
; OPT-NEXT: [[CMP:%.*]] = icmp eq i32 [[TID]], 0
; OPT-NEXT: br i1 [[CMP]], label [[ENDIF:%.*]], label [[IF:%.*]]
; OPT: if:
-; OPT-NEXT: [[TMP0:%.*]] = bitcast float addrspace(1)* [[IN:%.*]] to i8 addrspace(1)*
-; OPT-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP0]], i64 28
-; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SUNKADDR]] to float addrspace(1)*
-; OPT-NEXT: [[FADD2:%.*]] = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* [[TMP1]], float 2.000000e+00)
+; OPT-NEXT: [[IN_GEP:%.*]] = getelementptr float, float addrspace(1)* [[IN:%.*]], i32 7
+; OPT-NEXT: [[FADD2:%.*]] = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* [[IN_GEP]], float 2.000000e+00)
; OPT-NEXT: [[VAL:%.*]] = load volatile float, float addrspace(1)* undef, align 4
; OPT-NEXT: br label [[ENDIF]]
; OPT: endif:
; OPT-NEXT: [[X:%.*]] = phi float [ [[VAL]], [[IF]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
+; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr float, float addrspace(1)* [[OUT:%.*]], i32 999999
; OPT-NEXT: store float [[X]], float addrspace(1)* [[OUT_GEP]], align 4
+; OPT-NEXT: br label [[DONE:%.*]]
+; OPT: done:
; OPT-NEXT: ret void
;
; GCN-LABEL: test_sink_small_offset_global_atomic_fadd_f32:
@@ -45,19 +45,19 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(float a
; GCN-NEXT: global_store_dword v1, v0, s[0:1] offset:2300
; GCN-NEXT: s_endpgm
entry:
- %out.gep = getelementptr float, float addrspace(1)* %out, i32 999999
- %in.gep = getelementptr float, float addrspace(1)* %in, i32 7
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%cmp = icmp eq i32 %tid, 0
br i1 %cmp, label %endif, label %if
if:
+ %in.gep = getelementptr float, float addrspace(1)* %in, i32 7
%fadd2 = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %in.gep, float 2.0)
%val = load volatile float, float addrspace(1)* undef
br label %endif
endif:
%x = phi float [ %val, %if ], [ 0.0, %entry ]
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 999999
store float %x, float addrspace(1)* %out.gep
br label %done
diff --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
index c9d23d76fdfd..92682a4b0117 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
@@ -11,13 +11,13 @@
define protected amdgpu_kernel void @_Z11test_kernelPii(i32 addrspace(1)* nocapture %Ad.coerce, i32 %s) local_unnamed_addr #5 {
entry:
- %rem.lhs.trunc = trunc i32 %s to i16
- %rem4 = urem i16 %rem.lhs.trunc, 12
- %rem.zext = zext i16 %rem4 to i32
%cmp = icmp eq i32 %s, 3
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
+ %rem.lhs.trunc = trunc i32 %s to i16
+ %rem4 = urem i16 %rem.lhs.trunc, 12
+ %rem.zext = zext i16 %rem4 to i32
%idxprom = zext i32 %s to i64
%arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %Ad.coerce, i64 %idxprom
%div = lshr i32 %rem.zext, 3
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
index 1244ab28c14f..c04ab319cc8c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
@@ -254,12 +254,6 @@ define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inr
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ; return to shader part epilog
main_body:
- %c.bc = bitcast i32 %c to float
- %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
- %tex0 = extractelement <4 x float> %tex, i32 0
- %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
- %data.sample = extractelement <4 x float> %dtex, i32 0
-
%cmp = icmp eq i32 %z, 0
br i1 %cmp, label %IF, label %ELSE
@@ -271,6 +265,12 @@ IF:
br label %END
ELSE:
+ %c.bc = bitcast i32 %c to float
+ %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+ %tex0 = extractelement <4 x float> %tex, i32 0
+ %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+ %data.sample = extractelement <4 x float> %dtex, i32 0
+
call void @llvm.amdgcn.struct.buffer.store.f32(float %data.sample, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
br label %END
diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
index 6f8ebbcc73d1..cfe2357f0fa8 100644
--- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -10,36 +10,37 @@ define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
; OPT-NEXT: main_body:
; OPT-NEXT: br label [[LOOP_OUTER:%.*]]
; OPT: LOOP.outer:
-; OPT-NEXT: [[PHI_BROKEN2:%.*]] = phi i64 [ [[TMP9:%.*]], [[FLOW1:%.*]] ], [ 0, [[MAIN_BODY:%.*]] ]
+; OPT-NEXT: [[PHI_BROKEN2:%.*]] = phi i64 [ [[TMP10:%.*]], [[FLOW1:%.*]] ], [ 0, [[MAIN_BODY:%.*]] ]
; OPT-NEXT: [[TMP43:%.*]] = phi i32 [ 0, [[MAIN_BODY]] ], [ [[TMP4:%.*]], [[FLOW1]] ]
; OPT-NEXT: br label [[LOOP:%.*]]
; OPT: LOOP:
-; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP7:%.*]], [[FLOW:%.*]] ], [ 0, [[LOOP_OUTER]] ]
+; OPT-NEXT: [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP8:%.*]], [[FLOW:%.*]] ], [ 0, [[LOOP_OUTER]] ]
; OPT-NEXT: [[TMP0:%.*]] = phi i32 [ undef, [[LOOP_OUTER]] ], [ [[TMP4]], [[FLOW]] ]
-; OPT-NEXT: [[TMP45:%.*]] = phi i32 [ [[TMP43]], [[LOOP_OUTER]] ], [ [[TMP47:%.*]], [[FLOW]] ]
-; OPT-NEXT: [[TMP47]] = add i32 [[TMP45]], 1
+; OPT-NEXT: [[TMP45:%.*]] = phi i32 [ [[TMP43]], [[LOOP_OUTER]] ], [ [[TMP5:%.*]], [[FLOW]] ]
; OPT-NEXT: [[TMP48:%.*]] = icmp slt i32 [[TMP45]], [[UB:%.*]]
; OPT-NEXT: [[TMP1:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP48]])
; OPT-NEXT: [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP1]], 0
; OPT-NEXT: [[TMP3:%.*]] = extractvalue { i1, i64 } [[TMP1]], 1
; OPT-NEXT: br i1 [[TMP2]], label [[ENDIF:%.*]], label [[FLOW]]
; OPT: Flow:
-; OPT-NEXT: [[TMP4]] = phi i32 [ [[TMP47]], [[ENDIF]] ], [ [[TMP0]], [[LOOP]] ]
-; OPT-NEXT: [[TMP5:%.*]] = phi i1 [ [[TMP51:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ]
-; OPT-NEXT: [[TMP6:%.*]] = phi i1 [ [[TMP51_INV:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ]
+; OPT-NEXT: [[TMP4]] = phi i32 [ [[TMP47:%.*]], [[ENDIF]] ], [ [[TMP0]], [[LOOP]] ]
+; OPT-NEXT: [[TMP5]] = phi i32 [ [[TMP47]], [[ENDIF]] ], [ undef, [[LOOP]] ]
+; OPT-NEXT: [[TMP6:%.*]] = phi i1 [ [[TMP51:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ]
+; OPT-NEXT: [[TMP7:%.*]] = phi i1 [ [[TMP51_INV:%.*]], [[ENDIF]] ], [ true, [[LOOP]] ]
; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP3]])
-; OPT-NEXT: [[TMP7]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP6]], i64 [[PHI_BROKEN]])
-; OPT-NEXT: [[TMP8:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP7]])
-; OPT-NEXT: [[TMP9]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP5]], i64 [[PHI_BROKEN2]])
-; OPT-NEXT: br i1 [[TMP8]], label [[FLOW1]], label [[LOOP]]
+; OPT-NEXT: [[TMP8]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP7]], i64 [[PHI_BROKEN]])
+; OPT-NEXT: [[TMP9:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP8]])
+; OPT-NEXT: [[TMP10]] = call i64 @llvm.amdgcn.if.break.i64(i1 [[TMP6]], i64 [[PHI_BROKEN2]])
+; OPT-NEXT: br i1 [[TMP9]], label [[FLOW1]], label [[LOOP]]
; OPT: Flow1:
-; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP7]])
-; OPT-NEXT: [[TMP10:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP9]])
-; OPT-NEXT: br i1 [[TMP10]], label [[IF:%.*]], label [[LOOP_OUTER]]
+; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]])
+; OPT-NEXT: [[TMP11:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP10]])
+; OPT-NEXT: br i1 [[TMP11]], label [[IF:%.*]], label [[LOOP_OUTER]]
; OPT: IF:
-; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP9]])
+; OPT-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP10]])
; OPT-NEXT: ret void
; OPT: ENDIF:
+; OPT-NEXT: [[TMP47]] = add i32 [[TMP45]], 1
; OPT-NEXT: [[TMP51]] = icmp eq i32 [[TMP47]], [[CONT:%.*]]
; OPT-NEXT: [[TMP51_INV]] = xor i1 [[TMP51]], true
; OPT-NEXT: br label [[FLOW]]
@@ -98,7 +99,6 @@ LOOP.outer: ; preds = %ENDIF, %main_body
LOOP: ; preds = %ENDIF, %LOOP.outer
%tmp45 = phi i32 [ %tmp43, %LOOP.outer ], [ %tmp47, %ENDIF ]
- %tmp47 = add i32 %tmp45, 1
%tmp48 = icmp slt i32 %tmp45, %ub
br i1 %tmp48, label %ENDIF, label %IF
@@ -106,6 +106,7 @@ IF: ; preds = %LOOP
ret void
ENDIF: ; preds = %LOOP
+ %tmp47 = add i32 %tmp45, 1
%tmp51 = icmp eq i32 %tmp47, %cont
br i1 %tmp51, label %LOOP, label %LOOP.outer
}
diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
index f3089e79ad37..640f240f143f 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -190,16 +190,16 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
; GCN-NEXT: s_endpgm
; IR-LABEL: @nested_loop_conditions(
; IR-NEXT: bb:
+; IR-NEXT: [[MY_TMP1134:%.*]] = load volatile i32, i32 addrspace(1)* undef
+; IR-NEXT: [[MY_TMP1235:%.*]] = icmp slt i32 [[MY_TMP1134]], 9
+; IR-NEXT: br i1 [[MY_TMP1235]], label [[BB14_LR_PH:%.*]], label [[FLOW:%.*]]
+; IR: bb14.lr.ph:
; IR-NEXT: [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #4
; IR-NEXT: [[MY_TMP1:%.*]] = zext i32 [[MY_TMP]] to i64
; IR-NEXT: [[MY_TMP2:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[ARG:%.*]], i64 [[MY_TMP1]]
; IR-NEXT: [[MY_TMP3:%.*]] = load i64, i64 addrspace(1)* [[MY_TMP2]], align 16
; IR-NEXT: [[MY_TMP932:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* undef, align 16
; IR-NEXT: [[MY_TMP1033:%.*]] = extractelement <4 x i32> [[MY_TMP932]], i64 0
-; IR-NEXT: [[MY_TMP1134:%.*]] = load volatile i32, i32 addrspace(1)* undef
-; IR-NEXT: [[MY_TMP1235:%.*]] = icmp slt i32 [[MY_TMP1134]], 9
-; IR-NEXT: br i1 [[MY_TMP1235]], label [[BB14_LR_PH:%.*]], label [[FLOW:%.*]]
-; IR: bb14.lr.ph:
; IR-NEXT: br label [[BB14:%.*]]
; IR: Flow3:
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP21:%.*]])
@@ -277,17 +277,17 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
; IR-NEXT: store volatile i32 0, i32 addrspace(1)* undef
; IR-NEXT: ret void
bb:
+ %my.tmp1134 = load volatile i32, i32 addrspace(1)* undef
+ %my.tmp1235 = icmp slt i32 %my.tmp1134, 9
+ br i1 %my.tmp1235, label %bb14.lr.ph, label %bb13
+
+bb14.lr.ph: ; preds = %bb
%my.tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%my.tmp1 = zext i32 %my.tmp to i64
%my.tmp2 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %my.tmp1
%my.tmp3 = load i64, i64 addrspace(1)* %my.tmp2, align 16
%my.tmp932 = load <4 x i32>, <4 x i32> addrspace(1)* undef, align 16
%my.tmp1033 = extractelement <4 x i32> %my.tmp932, i64 0
- %my.tmp1134 = load volatile i32, i32 addrspace(1)* undef
- %my.tmp1235 = icmp slt i32 %my.tmp1134, 9
- br i1 %my.tmp1235, label %bb14.lr.ph, label %bb13
-
-bb14.lr.ph: ; preds = %bb
br label %bb14
bb4.bb13_crit_edge: ; preds = %bb21
diff --git a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
index 26d0c050c884..c0d276f5d88c 100644
--- a/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll
@@ -21,7 +21,8 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
; GFX10-NEXT: s_or_b32 s1, s0, s1
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s1
; GFX10-NEXT: s_cbranch_execz .LBB0_4
-; GFX10-NEXT: .LBB0_2: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: .LBB0_2: ; %bb
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_or_b32 s2, s2, exec_lo
; GFX10-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX10-NEXT: s_cbranch_execz .LBB0_1
@@ -50,20 +51,20 @@ define amdgpu_cs void @_amdgpu_cs_main(float %0, i32 %1) {
; GFX10-NEXT: s_inst_prefetch 0x2
; GFX10-NEXT: s_endpgm
branch1_true:
- br label %2
+ br label %bb
-2: ; preds = %branch2_merge, %branch1_true
+bb: ; preds = %branch2_merge, %branch1_true
%r1.8.vec.insert14.i1 = phi float [ 0.000000e+00, %branch1_true ], [ %0, %branch2_merge ]
- %3 = call float @llvm.amdgcn.image.sample.lz.3d.f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, float %r1.8.vec.insert14.i1, <8 x i32> zeroinitializer, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
- %4 = icmp eq i32 %1, 0
- br i1 %4, label %loop0_merge, label %branch2_merge
+ %i = icmp eq i32 %1, 0
+ br i1 %i, label %loop0_merge, label %branch2_merge
-branch2_merge: ; preds = %2
- %5 = call reassoc nnan nsz arcp contract afn float @llvm.fma.f32(float %3, float %0, float 0.000000e+00)
- %6 = fcmp ult float %5, 0.000000e+00
- br i1 %6, label %2, label %loop0_merge
+branch2_merge: ; preds = %bb
+ %i2 = call float @llvm.amdgcn.image.sample.lz.3d.f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, float %r1.8.vec.insert14.i1, <8 x i32> zeroinitializer, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
+ %i3 = call reassoc nnan nsz arcp contract afn float @llvm.fma.f32(float %i2, float %0, float 0.000000e+00)
+ %i4 = fcmp ult float %i3, 0.000000e+00
+ br i1 %i4, label %bb, label %loop0_merge
-loop0_merge: ; preds = %branch2_merge, %2
+loop0_merge: ; preds = %branch2_merge, %bb
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
index a0deb4572e0d..5514b91beea9 100644
--- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
+++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll
@@ -36,7 +36,6 @@ define amdgpu_cs void @if_then(<4 x i32> inreg %input, <4 x i32> inreg %output,
.entry:
%LocalInvocationId.i0 = extractelement <3 x i32> %LocalInvocationId, i32 0
%.not10002 = icmp eq i32 %LocalInvocationId.i0, 0
- %i530 = icmp ult i32 %LocalInvocationId.i0, 4
br i1 %.not10002, label %.merge, label %.bb0
.bb0:
@@ -44,6 +43,7 @@ define amdgpu_cs void @if_then(<4 x i32> inreg %input, <4 x i32> inreg %output,
.merge:
%src = phi i32 [ 0, %.entry ], [ 1, %.bb0 ]
+ %i530 = icmp ult i32 %LocalInvocationId.i0, 4
br i1 %i530, label %.end, label %.then
.then:
@@ -103,7 +103,6 @@ define amdgpu_cs void @if_else_vgpr_opt(<4 x i32> inreg %input, <4 x i32> inreg
.entry:
%LocalInvocationId.i0 = extractelement <3 x i32> %LocalInvocationId, i32 0
%.not10002 = icmp eq i32 %LocalInvocationId.i0, 0
- %i530 = icmp ult i32 %LocalInvocationId.i0, 4
br i1 %.not10002, label %.merge, label %.bb0
.bb0:
@@ -111,6 +110,7 @@ define amdgpu_cs void @if_else_vgpr_opt(<4 x i32> inreg %input, <4 x i32> inreg
.merge:
%src = phi i32 [ 0, %.entry ], [ 1, %.bb0 ]
+ %i530 = icmp ult i32 %LocalInvocationId.i0, 4
br i1 %i530, label %.then, label %.else
.then:
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index a9c96d3fbb66..ada6c1da04e2 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1374,10 +1374,10 @@ define amdgpu_ps void @no_skip_no_successors(float inreg %arg, float inreg %arg1
; GFX11-NEXT: s_endpgm
bb:
%tmp = fcmp ult float %arg1, 0.000000e+00
- %tmp2 = fcmp ult float %arg, 0x3FCF5C2900000000
br i1 %tmp, label %bb6, label %bb3
bb3: ; preds = %bb
+ %tmp2 = fcmp ult float %arg, 0x3FCF5C2900000000
br i1 %tmp2, label %bb5, label %bb4
bb4: ; preds = %bb3
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
index 3414e70d36e3..7cb57ab6b0c3 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -260,10 +260,10 @@ define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%cmp0 = icmp sgt i32 %cond0, 0
- %cmp1 = icmp sgt i32 %cond1, 0
br i1 %cmp0, label %bb2, label %bb9
bb2: ; preds = %bb
+ %cmp1 = icmp sgt i32 %cond1, 0
%tmp2 = sext i1 %cmp1 to i32
%tmp3 = add i32 %tmp2, %tmp
br i1 %cmp1, label %bb9, label %bb7
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index d1e867e526bc..dc85462631d4 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -2372,14 +2372,14 @@ define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
- %c.bc = bitcast i32 %c to float
- %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
- %tex0 = extractelement <4 x float> %tex, i32 0
- %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%cmp = icmp eq i32 %z, 0
br i1 %cmp, label %IF, label %ENDIF
IF:
+ %c.bc = bitcast i32 %c to float
+ %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
+ %tex0 = extractelement <4 x float> %tex, i32 0
+ %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%dataf = extractelement <4 x float> %dtex, i32 0
%data1 = fptosi float %dataf to i32
%data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
@@ -2909,14 +2909,14 @@ define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
- %c.bc = bitcast i32 %c to float
- %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
- %tex0 = extractelement <4 x float> %tex, i32 0
- %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%cmp = icmp eq i32 %z, 0
br i1 %cmp, label %IF, label %ENDIF
IF:
+ %c.bc = bitcast i32 %c to float
+ %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
+ %tex0 = extractelement <4 x float> %tex, i32 0
+ %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%dataf = extractelement <4 x float> %dtex, i32 0
%data1 = fptosi float %dataf to i32
%data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
@@ -2992,14 +2992,14 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
- %c.bc = bitcast i32 %c to float
- %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
- %tex0 = extractelement <4 x float> %tex, i32 0
- %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%cmp = icmp eq i32 %z, 0
br i1 %cmp, label %IF, label %ENDIF
IF:
+ %c.bc = bitcast i32 %c to float
+ %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
+ %tex0 = extractelement <4 x float> %tex, i32 0
+ %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%dataf = extractelement <4 x float> %dtex, i32 0
%data1 = fptosi float %dataf to i32
%data2 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data1, i32 2079)