[llvm] d8f651d - [AMDGPU] Enable structurizer workarounds by default
Sameer Sahasrabuddhe via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 9 00:45:21 PDT 2020
Author: Sameer Sahasrabuddhe
Date: 2020-06-09T13:14:15+05:30
New Revision: d8f651d3e8e2a49730a18926eb2325b7793638f8
URL: https://github.com/llvm/llvm-project/commit/d8f651d3e8e2a49730a18926eb2325b7793638f8
DIFF: https://github.com/llvm/llvm-project/commit/d8f651d3e8e2a49730a18926eb2325b7793638f8.diff
LOG: [AMDGPU] Enable structurizer workarounds by default
Reviewed By: nhaehnle
Differential Revision: https://reviews.llvm.org/D81211
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
llvm/test/CodeGen/AMDGPU/infinite-loop.ll
llvm/test/CodeGen/AMDGPU/multilevel-break.ll
llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 4ec903e37653..36c0096a47fe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -195,7 +195,7 @@ static cl::opt<bool> EnableScalarIRPasses(
static cl::opt<bool> EnableStructurizerWorkarounds(
"amdgpu-enable-structurizer-workarounds",
- cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(false),
+ cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
cl::Hidden);
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
index 6a759e7c1122..b2acc37493e4 100644
--- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -21,7 +21,6 @@ define amdgpu_kernel void @infinite_loop(i32 addrspace(1)* %out) {
; IR: loop:
; IR-NEXT: store volatile i32 999, i32 addrspace(1)* [[OUT:%.*]], align 4
; IR-NEXT: br label [[LOOP]]
-;
entry:
br label %loop
@@ -59,7 +58,6 @@ define amdgpu_kernel void @infinite_loop_ret(i32 addrspace(1)* %out) {
; IR-NEXT: br i1 true, label [[LOOP]], label [[UNIFIEDRETURNBLOCK]]
; IR: UnifiedReturnBlock:
; IR-NEXT: ret void
-;
entry:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%cond = icmp eq i32 %tmp, 1
@@ -119,7 +117,6 @@ define amdgpu_kernel void @infinite_loops(i32 addrspace(1)* %out) {
; IR-NEXT: br i1 true, label [[LOOP2]], label [[DUMMYRETURNBLOCK]]
; IR: DummyReturnBlock:
; IR-NEXT: ret void
-;
entry:
br i1 undef, label %loop1, label %loop2
@@ -140,33 +137,29 @@ define amdgpu_kernel void @infinite_loop_nest_ret(i32 addrspace(1)* %out) {
; SI-NEXT: s_cbranch_execz BB3_5
; SI-NEXT: ; %bb.1: ; %outer_loop.preheader
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0
-; SI-NEXT: s_mov_b64 s[2:3], 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: BB3_2: ; %outer_loop
; SI-NEXT: ; =>This Loop Header: Depth=1
; SI-NEXT: ; Child Loop BB3_3 Depth 2
-; SI-NEXT: s_and_b64 s[8:9], exec, vcc
-; SI-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3]
-; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: s_mov_b64 s[2:3], 0
; SI-NEXT: BB3_3: ; %inner_loop
; SI-NEXT: ; Parent Loop BB3_2 Depth=1
; SI-NEXT: ; => This Inner Loop Header: Depth=2
-; SI-NEXT: s_and_b64 s[10:11], exec, s[0:1]
-; SI-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
+; SI-NEXT: s_and_b64 s[8:9], exec, s[0:1]
+; SI-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3]
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; SI-NEXT: s_andn2_b64 exec, exec, s[2:3]
; SI-NEXT: s_cbranch_execnz BB3_3
-; SI-NEXT: ; %bb.4: ; %Flow
+; SI-NEXT: ; %bb.4: ; %loop.exit.guard
; SI-NEXT: ; in Loop: Header=BB3_2 Depth=1
-; SI-NEXT: s_or_b64 exec, exec, s[8:9]
-; SI-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; SI-NEXT: s_cbranch_execnz BB3_2
+; SI-NEXT: s_or_b64 exec, exec, s[2:3]
+; SI-NEXT: s_and_b64 vcc, exec, 0
+; SI-NEXT: s_cbranch_vccz BB3_2
; SI-NEXT: BB3_5: ; %UnifiedReturnBlock
; SI-NEXT: s_endpgm
; IR-LABEL: @infinite_loop_nest_ret(
@@ -184,7 +177,6 @@ define amdgpu_kernel void @infinite_loop_nest_ret(i32 addrspace(1)* %out) {
; IR-NEXT: br i1 [[COND3]], label [[INNER_LOOP]], label [[OUTER_LOOP]]
; IR: UnifiedReturnBlock:
; IR-NEXT: ret void
-;
entry:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%cond1 = icmp eq i32 %tmp, 1
diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
index 932e6ce11045..c34ae7a99df5 100644
--- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -46,52 +46,47 @@ define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
;
; GCN-LABEL: multi_else_break:
; GCN: ; %bb.0: ; %main_body
-; GCN-NEXT: s_mov_b64 s[2:3], 0
+; GCN-NEXT: s_mov_b64 s[0:1], 0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_branch BB0_2
-; GCN-NEXT: BB0_1: ; %Flow2
+; GCN-NEXT: BB0_1: ; %loop.exit.guard
; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_and_b64 s[0:1], exec, s[8:9]
-; GCN-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3]
-; GCN-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GCN-NEXT: s_and_b64 s[2:3], exec, s[2:3]
+; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN-NEXT: s_cbranch_execz BB0_6
; GCN-NEXT: BB0_2: ; %LOOP.outer
; GCN-NEXT: ; =>This Loop Header: Depth=1
; GCN-NEXT: ; Child Loop BB0_4 Depth 2
; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7
-; GCN-NEXT: ; implicit-def: $sgpr8_sgpr9
+; GCN-NEXT: ; implicit-def: $sgpr2_sgpr3
; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_branch BB0_4
; GCN-NEXT: BB0_3: ; %Flow
; GCN-NEXT: ; in Loop: Header=BB0_4 Depth=2
-; GCN-NEXT: s_or_b64 exec, exec, s[10:11]
-; GCN-NEXT: s_and_b64 s[0:1], exec, s[6:7]
-; GCN-NEXT: s_or_b64 s[4:5], s[0:1], s[4:5]
+; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-NEXT: s_and_b64 s[8:9], exec, s[6:7]
+; GCN-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN-NEXT: s_cbranch_execz BB0_1
; GCN-NEXT: BB0_4: ; %LOOP
; GCN-NEXT: ; Parent Loop BB0_2 Depth=1
; GCN-NEXT: ; => This Inner Loop Header: Depth=2
-; GCN-NEXT: v_mov_b32_e32 v2, v1
-; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v2
-; GCN-NEXT: v_cmp_lt_i32_e32 vcc, v2, v4
-; GCN-NEXT: s_or_b64 s[8:9], s[8:9], exec
+; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v1
+; GCN-NEXT: v_cmp_lt_i32_e32 vcc, v1, v4
+; GCN-NEXT: s_or_b64 s[2:3], s[2:3], exec
; GCN-NEXT: s_or_b64 s[6:7], s[6:7], exec
-; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc
+; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GCN-NEXT: s_cbranch_execz BB0_3
; GCN-NEXT: ; %bb.5: ; %ENDIF
; GCN-NEXT: ; in Loop: Header=BB0_4 Depth=2
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
-; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], v5, v1
-; GCN-NEXT: s_andn2_b64 s[8:9], s[8:9], exec
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, v5, v0
+; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec
-; GCN-NEXT: s_and_b64 s[12:13], vcc, exec
-; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; GCN-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[0:1]
-; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_and_b64 s[10:11], vcc, exec
+; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
; GCN-NEXT: s_branch BB0_3
; GCN-NEXT: BB0_6: ; %IF
; GCN-NEXT: s_endpgm
@@ -204,7 +199,10 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1
+; GCN-NEXT: s_mov_b64 s[6:7], -1
; GCN-NEXT: s_and_b64 vcc, exec, vcc
+; GCN-NEXT: ; implicit-def: $sgpr8_sgpr9
+; GCN-NEXT: s_mov_b64 s[10:11], -1
; GCN-NEXT: s_cbranch_vccnz BB1_6
; GCN-NEXT: ; %bb.3: ; %LeafBlock1
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
@@ -223,15 +221,11 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
; GCN-NEXT: BB1_5: ; %Flow3
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GCN-NEXT: s_mov_b64 s[10:11], 0
+; GCN-NEXT: BB1_6: ; %Flow
+; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GCN-NEXT: s_and_b64 vcc, exec, s[10:11]
; GCN-NEXT: s_cbranch_vccz BB1_1
-; GCN-NEXT: s_branch BB1_7
-; GCN-NEXT: BB1_6: ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT: s_mov_b64 s[8:9], 0
-; GCN-NEXT: s_mov_b64 s[6:7], -1
-; GCN-NEXT: s_and_b64 vcc, exec, -1
-; GCN-NEXT: s_cbranch_execz BB1_1
-; GCN-NEXT: BB1_7: ; %LeafBlock
+; GCN-NEXT: ; %bb.7: ; %LeafBlock
; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GCN-NEXT: s_and_b64 vcc, exec, vcc
@@ -247,9 +241,10 @@ define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
; GCN-NEXT: s_and_b64 s[10:11], vcc, exec
; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
; GCN-NEXT: s_branch BB1_1
-; GCN-NEXT: BB1_9: ; %Flow6
+; GCN-NEXT: BB1_9: ; %loop.exit.guard
; GCN-NEXT: s_or_b64 exec, exec, s[0:1]
; GCN-NEXT: s_and_saveexec_b64 s[0:1], s[4:5]
+; GCN-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GCN-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
index 66847f146bd3..8bdc05bafacd 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -14,28 +14,36 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* noca
; GCN-NEXT: s_load_dword s0, s[0:1], 0x9
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: s_mov_b32 m0, -1
+; GCN-NEXT: s_mov_b64 s[2:3], -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; GCN-NEXT: ds_read_b64 v[0:1], v0
-; GCN-NEXT: s_mov_b32 s0, 0
-; GCN-NEXT: s_and_b64 vcc, exec, 0
-; GCN-NEXT: BB0_1: ; %bb5
+; GCN-NEXT: s_and_b64 s[0:1], exec, -1
+; GCN-NEXT: s_branch BB0_2
+; GCN-NEXT: BB0_1: ; %bb10
+; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN-NEXT: s_cbranch_vccz BB0_4
+; GCN-NEXT: BB0_2: ; %bb5
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: s_cmp_lg_u32 s0, 1
-; GCN-NEXT: s_cbranch_scc0 BB0_3
-; GCN-NEXT: ; %bb.2: ; %bb10
-; GCN-NEXT: ; in Loop: Header=BB0_1 Depth=1
-; GCN-NEXT: ; implicit-def: $sgpr0
+; GCN-NEXT: s_mov_b64 vcc, s[0:1]
; GCN-NEXT: s_cbranch_vccnz BB0_1
-; GCN-NEXT: s_branch BB0_5
-; GCN-NEXT: BB0_3: ; %bb8
+; GCN-NEXT: ; %bb.3: ; in Loop: Header=BB0_2 Depth=1
+; GCN-NEXT: s_mov_b64 s[4:5], -1
+; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN-NEXT: s_cbranch_vccnz BB0_2
+; GCN-NEXT: BB0_4: ; %loop.exit.guard
+; GCN-NEXT: s_and_b64 vcc, exec, s[4:5]
+; GCN-NEXT: s_cbranch_vccz BB0_7
+; GCN-NEXT: ; %bb.5: ; %bb8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_read_b32 v0, v0
; GCN-NEXT: s_and_b64 vcc, exec, 0
-; GCN-NEXT: BB0_4: ; %bb9
+; GCN-NEXT: BB0_6: ; %bb9
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: s_cbranch_vccz BB0_4
-; GCN-NEXT: BB0_5: ; %DummyReturnBlock
+; GCN-NEXT: s_cbranch_vccz BB0_6
+; GCN-NEXT: BB0_7: ; %DummyReturnBlock
; GCN-NEXT: s_endpgm
; IR-LABEL: @reduced_nested_loop_conditions(
; IR-NEXT: bb:
@@ -84,7 +92,6 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* noca
; IR: bb23:
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP6]])
; IR-NEXT: ret void
-;
bb:
%my.tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%my.tmp1 = getelementptr inbounds i64, i64 addrspace(3)* %arg, i32 %my.tmp
@@ -268,7 +275,6 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
; IR-NEXT: call void @llvm.amdgcn.end.cf.i64(i64 [[TMP7]])
; IR-NEXT: store volatile i32 0, i32 addrspace(1)* undef
; IR-NEXT: ret void
-;
bb:
%my.tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%my.tmp1 = zext i32 %my.tmp to i64
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
index fd3d3857404f..7387e98ae864 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -166,45 +166,72 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
-; SI-NEXT: s_load_dword s4, s[0:1], 0xc
-; SI-NEXT: s_brev_b32 s5, 44
+; SI-NEXT: s_load_dword s8, s[0:1], 0xc
+; SI-NEXT: s_brev_b32 s9, 44
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s2, 0
-; SI-NEXT: v_cmp_lt_i32_e64 s[2:3], s3, 4
-; SI-NEXT: s_or_b64 s[8:9], s[0:1], s[2:3]
-; SI-NEXT: s_and_b64 s[0:1], exec, s[2:3]
-; SI-NEXT: s_and_b64 s[2:3], exec, s[8:9]
+; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], s2, 1
+; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], s3, 4
+; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s3, 3
+; SI-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3]
+; SI-NEXT: s_and_b64 s[0:1], exec, s[4:5]
+; SI-NEXT: s_and_b64 s[2:3], exec, s[2:3]
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cmp_lt_f32_e64 s[8:9], |v0|, s5
+; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s9
+; SI-NEXT: s_and_b64 s[4:5], exec, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, 3
-; SI-NEXT: BB3_1: ; %while.cond
+; SI-NEXT: s_branch BB3_4
+; SI-NEXT: BB3_1: ; %Flow6
+; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1
+; SI-NEXT: s_mov_b64 s[10:11], 0
+; SI-NEXT: BB3_2: ; %Flow5
+; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1
+; SI-NEXT: s_mov_b64 s[14:15], 0
+; SI-NEXT: BB3_3: ; %Flow
+; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1
+; SI-NEXT: s_and_b64 vcc, exec, s[12:13]
+; SI-NEXT: s_cbranch_vccnz BB3_8
+; SI-NEXT: BB3_4: ; %while.cond
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: s_mov_b64 s[14:15], -1
+; SI-NEXT: s_mov_b64 s[10:11], -1
+; SI-NEXT: s_mov_b64 s[12:13], -1
; SI-NEXT: s_mov_b64 vcc, s[0:1]
-; SI-NEXT: s_cbranch_vccz BB3_5
-; SI-NEXT: ; %bb.2: ; %convex.exit
-; SI-NEXT: ; in Loop: Header=BB3_1 Depth=1
+; SI-NEXT: s_cbranch_vccz BB3_3
+; SI-NEXT: ; %bb.5: ; %convex.exit
+; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1
+; SI-NEXT: s_mov_b64 s[10:11], -1
+; SI-NEXT: s_mov_b64 s[12:13], -1
; SI-NEXT: s_mov_b64 vcc, s[2:3]
-; SI-NEXT: s_cbranch_vccnz BB3_8
-; SI-NEXT: ; %bb.3: ; %if.end
-; SI-NEXT: ; in Loop: Header=BB3_1 Depth=1
-; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9]
-; SI-NEXT: s_cbranch_vccnz BB3_1
-; SI-NEXT: ; %bb.4: ; %if.else
-; SI-NEXT: ; in Loop: Header=BB3_1 Depth=1
+; SI-NEXT: s_cbranch_vccz BB3_2
+; SI-NEXT: ; %bb.6: ; %if.end
+; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1
+; SI-NEXT: s_mov_b64 s[12:13], -1
+; SI-NEXT: s_mov_b64 vcc, s[4:5]
+; SI-NEXT: s_cbranch_vccz BB3_1
+; SI-NEXT: ; %bb.7: ; %if.else
+; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1
+; SI-NEXT: s_mov_b64 s[12:13], 0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_branch BB3_1
-; SI-NEXT: BB3_5: ; %for.cond.preheader
+; SI-NEXT: BB3_8: ; %loop.exit.guard4
+; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1
+; SI-NEXT: s_and_b64 vcc, exec, s[10:11]
+; SI-NEXT: s_cbranch_vccz BB3_4
+; SI-NEXT: ; %bb.9: ; %loop.exit.guard
+; SI-NEXT: s_and_b64 vcc, exec, s[14:15]
+; SI-NEXT: s_cbranch_vccz BB3_13
+; SI-NEXT: ; %bb.10: ; %for.cond.preheader
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, 0x3e8
-; SI-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, s8, v0
; SI-NEXT: s_and_b64 vcc, exec, vcc
-; SI-NEXT: s_cbranch_vccz BB3_8
-; SI-NEXT: ; %bb.6: ; %for.body
+; SI-NEXT: s_cbranch_vccz BB3_13
+; SI-NEXT: ; %bb.11: ; %for.body
; SI-NEXT: s_and_b64 vcc, exec, 0
-; SI-NEXT: BB3_7: ; %self.loop
+; SI-NEXT: BB3_12: ; %self.loop
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_cbranch_vccz BB3_7
-; SI-NEXT: BB3_8: ; %DummyReturnBlock
+; SI-NEXT: s_cbranch_vccz BB3_12
+; SI-NEXT: BB3_13: ; %DummyReturnBlock
; SI-NEXT: s_endpgm
;
; FLAT-LABEL: loop_land_info_assert:
@@ -213,44 +240,71 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32
; FLAT-NEXT: s_mov_b32 s6, -1
; FLAT-NEXT: buffer_load_dword v0, off, s[4:7], 0
; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; FLAT-NEXT: s_load_dword s4, s[0:1], 0x30
-; FLAT-NEXT: s_brev_b32 s5, 44
+; FLAT-NEXT: s_load_dword s8, s[0:1], 0x30
+; FLAT-NEXT: s_brev_b32 s9, 44
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
-; FLAT-NEXT: v_cmp_gt_i32_e64 s[0:1], s2, 0
-; FLAT-NEXT: v_cmp_lt_i32_e64 s[2:3], s3, 4
-; FLAT-NEXT: s_or_b64 s[8:9], s[0:1], s[2:3]
-; FLAT-NEXT: s_and_b64 s[0:1], exec, s[2:3]
-; FLAT-NEXT: s_and_b64 s[2:3], exec, s[8:9]
+; FLAT-NEXT: v_cmp_lt_i32_e64 s[0:1], s2, 1
+; FLAT-NEXT: v_cmp_lt_i32_e64 s[4:5], s3, 4
+; FLAT-NEXT: v_cmp_gt_i32_e64 s[2:3], s3, 3
+; FLAT-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3]
+; FLAT-NEXT: s_and_b64 s[0:1], exec, s[4:5]
+; FLAT-NEXT: s_and_b64 s[2:3], exec, s[2:3]
; FLAT-NEXT: s_waitcnt vmcnt(0)
-; FLAT-NEXT: v_cmp_lt_f32_e64 s[8:9], |v0|, s5
+; FLAT-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s9
+; FLAT-NEXT: s_and_b64 s[4:5], exec, s[4:5]
; FLAT-NEXT: v_mov_b32_e32 v0, 3
-; FLAT-NEXT: BB3_1: ; %while.cond
+; FLAT-NEXT: s_branch BB3_4
+; FLAT-NEXT: BB3_1: ; %Flow6
+; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1
+; FLAT-NEXT: s_mov_b64 s[10:11], 0
+; FLAT-NEXT: BB3_2: ; %Flow5
+; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1
+; FLAT-NEXT: s_mov_b64 s[14:15], 0
+; FLAT-NEXT: BB3_3: ; %Flow
+; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1
+; FLAT-NEXT: s_and_b64 vcc, exec, s[12:13]
+; FLAT-NEXT: s_cbranch_vccnz BB3_8
+; FLAT-NEXT: BB3_4: ; %while.cond
; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1
+; FLAT-NEXT: s_mov_b64 s[14:15], -1
+; FLAT-NEXT: s_mov_b64 s[10:11], -1
+; FLAT-NEXT: s_mov_b64 s[12:13], -1
; FLAT-NEXT: s_mov_b64 vcc, s[0:1]
-; FLAT-NEXT: s_cbranch_vccz BB3_5
-; FLAT-NEXT: ; %bb.2: ; %convex.exit
-; FLAT-NEXT: ; in Loop: Header=BB3_1 Depth=1
+; FLAT-NEXT: s_cbranch_vccz BB3_3
+; FLAT-NEXT: ; %bb.5: ; %convex.exit
+; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1
+; FLAT-NEXT: s_mov_b64 s[10:11], -1
+; FLAT-NEXT: s_mov_b64 s[12:13], -1
; FLAT-NEXT: s_mov_b64 vcc, s[2:3]
-; FLAT-NEXT: s_cbranch_vccnz BB3_8
-; FLAT-NEXT: ; %bb.3: ; %if.end
-; FLAT-NEXT: ; in Loop: Header=BB3_1 Depth=1
-; FLAT-NEXT: s_andn2_b64 vcc, exec, s[8:9]
-; FLAT-NEXT: s_cbranch_vccnz BB3_1
-; FLAT-NEXT: ; %bb.4: ; %if.else
-; FLAT-NEXT: ; in Loop: Header=BB3_1 Depth=1
+; FLAT-NEXT: s_cbranch_vccz BB3_2
+; FLAT-NEXT: ; %bb.6: ; %if.end
+; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1
+; FLAT-NEXT: s_mov_b64 s[12:13], -1
+; FLAT-NEXT: s_mov_b64 vcc, s[4:5]
+; FLAT-NEXT: s_cbranch_vccz BB3_1
+; FLAT-NEXT: ; %bb.7: ; %if.else
+; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1
+; FLAT-NEXT: s_mov_b64 s[12:13], 0
; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0
; FLAT-NEXT: s_branch BB3_1
-; FLAT-NEXT: BB3_5: ; %for.cond.preheader
+; FLAT-NEXT: BB3_8: ; %loop.exit.guard4
+; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1
+; FLAT-NEXT: s_and_b64 vcc, exec, s[10:11]
+; FLAT-NEXT: s_cbranch_vccz BB3_4
+; FLAT-NEXT: ; %bb.9: ; %loop.exit.guard
+; FLAT-NEXT: s_and_b64 vcc, exec, s[14:15]
+; FLAT-NEXT: s_cbranch_vccz BB3_13
+; FLAT-NEXT: ; %bb.10: ; %for.cond.preheader
; FLAT-NEXT: v_mov_b32_e32 v0, 0x3e8
-; FLAT-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
+; FLAT-NEXT: v_cmp_lt_i32_e32 vcc, s8, v0
; FLAT-NEXT: s_and_b64 vcc, exec, vcc
-; FLAT-NEXT: s_cbranch_vccz BB3_8
-; FLAT-NEXT: ; %bb.6: ; %for.body
+; FLAT-NEXT: s_cbranch_vccz BB3_13
+; FLAT-NEXT: ; %bb.11: ; %for.body
; FLAT-NEXT: s_and_b64 vcc, exec, 0
-; FLAT-NEXT: BB3_7: ; %self.loop
+; FLAT-NEXT: BB3_12: ; %self.loop
; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1
-; FLAT-NEXT: s_cbranch_vccz BB3_7
-; FLAT-NEXT: BB3_8: ; %DummyReturnBlock
+; FLAT-NEXT: s_cbranch_vccz BB3_12
+; FLAT-NEXT: BB3_13: ; %DummyReturnBlock
; FLAT-NEXT: s_endpgm
entry:
%cmp = icmp sgt i32 %c0, 0
More information about the llvm-commits mailing list