[clang] [llvm] [AMDGPU] Support Wave Reduction for i16 types - 3 (PR #202912)
via cfe-commits
cfe-commits at lists.llvm.org
Wed Jun 10 03:11:22 PDT 2026
https://github.com/easyonaadit created https://github.com/llvm/llvm-project/pull/202912
[AMDGPU] Support Wave Reduction for i16 types - 3
Supported Ops: `and`, `or`, `xor`.
[AMDGPU] Add builtins for wave reduction intrinsics
Assisted by - Claude-sonnet:4.6
Add missing Sema tests
[AMDGPU] Support Wave Reduction for half types
Supported Ops: `fmin`, `fmax`, `fadd`, `fsub`.
>From 9931efe9a01ed69372d107a490a03c32a6f0aad9 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Tue, 28 Apr 2026 11:29:50 +0530
Subject: [PATCH 1/4] [AMDGPU] Support Wave Reduction for i16 types - 3
Supported Ops: `and`, `or`, `xor`.
>From c5a45715f7be504ca6f6cef728360f1cd82743d2 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Wed, 29 Apr 2026 10:07:32 +0530
Subject: [PATCH 2/4] [AMDGPU] Add builtins for wave reduction intrinsics
Assisted by - Claude-sonnet:4.6
---
clang/include/clang/Basic/BuiltinsAMDGPU.td | 9 +
clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 18 ++
clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 189 ++++++++++++++++++++
3 files changed, 216 insertions(+)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index 8eed188b0f4b2..94431a4c83153 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -553,6 +553,15 @@ def __builtin_amdgcn_is_invocable : AMDGPUBuiltin<"__amdgpu_feature_predicate_t(
//===----------------------------------------------------------------------===//
+def __builtin_amdgcn_wave_reduce_add_u16 : AMDGPUBuiltin<"unsigned short(unsigned short, _Constant int32_t)", [Const]>;
+def __builtin_amdgcn_wave_reduce_sub_u16 : AMDGPUBuiltin<"unsigned short(unsigned short, _Constant int32_t)", [Const]>;
+def __builtin_amdgcn_wave_reduce_min_i16 : AMDGPUBuiltin<"short(short, _Constant int32_t)", [Const]>;
+def __builtin_amdgcn_wave_reduce_min_u16 : AMDGPUBuiltin<"unsigned short(unsigned short, _Constant int32_t)", [Const]>;
+def __builtin_amdgcn_wave_reduce_max_i16 : AMDGPUBuiltin<"short(short, _Constant int32_t)", [Const]>;
+def __builtin_amdgcn_wave_reduce_max_u16 : AMDGPUBuiltin<"unsigned short(unsigned short, _Constant int32_t)", [Const]>;
+def __builtin_amdgcn_wave_reduce_and_b16 : AMDGPUBuiltin<"short(short, _Constant int32_t)", [Const]>;
+def __builtin_amdgcn_wave_reduce_or_b16 : AMDGPUBuiltin<"short(short, _Constant int32_t)", [Const]>;
+def __builtin_amdgcn_wave_reduce_xor_b16 : AMDGPUBuiltin<"short(short, _Constant int32_t)", [Const]>;
def __builtin_amdgcn_wave_reduce_add_u32 : AMDGPUBuiltin<"uint32_t(uint32_t, _Constant int32_t)", [Const]>;
def __builtin_amdgcn_wave_reduce_sub_u32 : AMDGPUBuiltin<"uint32_t(uint32_t, _Constant int32_t)", [Const]>;
def __builtin_amdgcn_wave_reduce_min_i32 : AMDGPUBuiltin<"int32_t(int32_t, _Constant int32_t)", [Const]>;
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 21f32b12c4fd1..cf0e1d3a05a2d 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -473,42 +473,51 @@ static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
switch (BuiltinID) {
default:
llvm_unreachable("Unknown BuiltinID for wave reduction");
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u16:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64:
return Intrinsic::amdgcn_wave_reduce_add;
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f64:
return Intrinsic::amdgcn_wave_reduce_fadd;
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u16:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64:
return Intrinsic::amdgcn_wave_reduce_sub;
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f64:
return Intrinsic::amdgcn_wave_reduce_fsub;
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i16:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64:
return Intrinsic::amdgcn_wave_reduce_min;
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f64:
return Intrinsic::amdgcn_wave_reduce_fmin;
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u16:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64:
return Intrinsic::amdgcn_wave_reduce_umin;
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i16:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64:
return Intrinsic::amdgcn_wave_reduce_max;
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f64:
return Intrinsic::amdgcn_wave_reduce_fmax;
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u16:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64:
return Intrinsic::amdgcn_wave_reduce_umax;
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b16:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b64:
return Intrinsic::amdgcn_wave_reduce_and;
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b16:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b64:
return Intrinsic::amdgcn_wave_reduce_or;
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b16:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b64:
return Intrinsic::amdgcn_wave_reduce_xor;
@@ -520,22 +529,31 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
llvm::SyncScope::ID SSID;
switch (BuiltinID) {
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u16:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f64:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u16:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f64:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i16:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u16:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f64:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i16:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u16:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f64:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b16:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b16:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b16:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 11fbfdde92fa1..c1e3a086cbe6b 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -944,6 +944,195 @@ void test_wave_reduce_max_u64_dpp(global int* out, long in)
*out = __builtin_amdgcn_wave_reduce_max_u64(in, 2);
}
+// CHECK-LABEL: @test_wave_reduce_add_u16_default
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.add.i16(
+void test_wave_reduce_add_u16_default(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_add_u16(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_add_u16_iterative
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.add.i16(
+void test_wave_reduce_add_u16_iterative(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_add_u16(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_add_u16_dpp
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.add.i16(
+void test_wave_reduce_add_u16_dpp(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_add_u16(in, 2);
+}
+
+// CHECK-LABEL: @test_wave_reduce_sub_u16_default
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.sub.i16(
+void test_wave_reduce_sub_u16_default(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_sub_u16(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_sub_u16_iterative
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.sub.i16(
+void test_wave_reduce_sub_u16_iterative(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_sub_u16(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_sub_u16_dpp
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.sub.i16(
+void test_wave_reduce_sub_u16_dpp(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_sub_u16(in, 2);
+}
+
+// CHECK-LABEL: @test_wave_reduce_min_i16_default
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.min.i16(
+void test_wave_reduce_min_i16_default(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_min_i16(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_min_i16_iterative
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.min.i16(
+void test_wave_reduce_min_i16_iterative(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_min_i16(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_min_i16_dpp
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.min.i16(
+void test_wave_reduce_min_i16_dpp(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_min_i16(in, 2);
+}
+
+// CHECK-LABEL: @test_wave_reduce_min_u16_default
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.umin.i16(
+void test_wave_reduce_min_u16_default(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_min_u16(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_min_u16_iterative
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.umin.i16(
+void test_wave_reduce_min_u16_iterative(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_min_u16(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_min_u16_dpp
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.umin.i16(
+void test_wave_reduce_min_u16_dpp(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_min_u16(in, 2);
+}
+
+// CHECK-LABEL: @test_wave_reduce_max_i16_default
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.max.i16(
+void test_wave_reduce_max_i16_default(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_max_i16(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_max_i16_iterative
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.max.i16(
+void test_wave_reduce_max_i16_iterative(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_max_i16(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_max_i16_dpp
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.max.i16(
+void test_wave_reduce_max_i16_dpp(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_max_i16(in, 2);
+}
+
+// CHECK-LABEL: @test_wave_reduce_max_u16_default
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.umax.i16(
+void test_wave_reduce_max_u16_default(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_max_u16(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_max_u16_iterative
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.umax.i16(
+void test_wave_reduce_max_u16_iterative(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_max_u16(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_max_u16_dpp
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.umax.i16(
+void test_wave_reduce_max_u16_dpp(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_max_u16(in, 2);
+}
+
+// CHECK-LABEL: @test_wave_reduce_and_b16_default
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.and.i16(
+void test_wave_reduce_and_b16_default(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_and_b16(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_and_b16_iterative
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.and.i16(
+void test_wave_reduce_and_b16_iterative(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_and_b16(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_and_b16_dpp
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.and.i16(
+void test_wave_reduce_and_b16_dpp(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_and_b16(in, 2);
+}
+
+// CHECK-LABEL: @test_wave_reduce_or_b16_default
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.or.i16(
+void test_wave_reduce_or_b16_default(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_or_b16(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_or_b16_iterative
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.or.i16(
+void test_wave_reduce_or_b16_iterative(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_or_b16(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_or_b16_dpp
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.or.i16(
+void test_wave_reduce_or_b16_dpp(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_or_b16(in, 2);
+}
+
+// CHECK-LABEL: @test_wave_reduce_xor_b16_default
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.xor.i16(
+void test_wave_reduce_xor_b16_default(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_xor_b16(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_xor_b16_iterative
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.xor.i16(
+void test_wave_reduce_xor_b16_iterative(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_xor_b16(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_xor_b16_dpp
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.wave.reduce.xor.i16(
+void test_wave_reduce_xor_b16_dpp(global short* out, short in)
+{
+ *out = __builtin_amdgcn_wave_reduce_xor_b16(in, 2);
+}
+
// CHECK-LABEL: @test_s_barrier
// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.barrier(
void test_s_barrier()
>From c06ef7f5587aecf94f5c0de9846caf1b798c4c08 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Mon, 4 May 2026 14:39:06 +0530
Subject: [PATCH 3/4] Add missing Sema tests
---
.../wave-reduce-builtins-validate-amdgpu.cl | 26 +++++++++++++++++++
1 file changed, 26 insertions(+)
diff --git a/clang/test/Sema/wave-reduce-builtins-validate-amdgpu.cl b/clang/test/Sema/wave-reduce-builtins-validate-amdgpu.cl
index 0f1565f1272c1..373c771c178a3 100644
--- a/clang/test/Sema/wave-reduce-builtins-validate-amdgpu.cl
+++ b/clang/test/Sema/wave-reduce-builtins-validate-amdgpu.cl
@@ -3,6 +3,32 @@
// Test that the second argument (strategy) must be a constant integer
+void test_wave_reduce_u16(unsigned short val, int strategy) {
+ (void)__builtin_amdgcn_wave_reduce_add_u16(val, 0);
+ (void)__builtin_amdgcn_wave_reduce_sub_u16(val, 1);
+ (void)__builtin_amdgcn_wave_reduce_min_u16(val, 0);
+ (void)__builtin_amdgcn_wave_reduce_max_u16(val, 0);
+
+ (void)__builtin_amdgcn_wave_reduce_add_u16(val, strategy); // expected-error {{argument to '__builtin_amdgcn_wave_reduce_add_u16' must be a constant integer}}
+ (void)__builtin_amdgcn_wave_reduce_sub_u16(val, strategy); // expected-error {{argument to '__builtin_amdgcn_wave_reduce_sub_u16' must be a constant integer}}
+ (void)__builtin_amdgcn_wave_reduce_min_u16(val, strategy); // expected-error {{argument to '__builtin_amdgcn_wave_reduce_min_u16' must be a constant integer}}
+ (void)__builtin_amdgcn_wave_reduce_max_u16(val, strategy); // expected-error {{argument to '__builtin_amdgcn_wave_reduce_max_u16' must be a constant integer}}
+}
+
+void test_wave_reduce_i16(short val, int strategy) {
+ (void)__builtin_amdgcn_wave_reduce_min_i16(val, 0);
+ (void)__builtin_amdgcn_wave_reduce_max_i16(val, 0);
+ (void)__builtin_amdgcn_wave_reduce_and_b16(val, 0);
+ (void)__builtin_amdgcn_wave_reduce_or_b16(val, 0);
+ (void)__builtin_amdgcn_wave_reduce_xor_b16(val, 0);
+
+ (void)__builtin_amdgcn_wave_reduce_min_i16(val, strategy); // expected-error {{argument to '__builtin_amdgcn_wave_reduce_min_i16' must be a constant integer}}
+ (void)__builtin_amdgcn_wave_reduce_max_i16(val, strategy); // expected-error {{argument to '__builtin_amdgcn_wave_reduce_max_i16' must be a constant integer}}
+ (void)__builtin_amdgcn_wave_reduce_and_b16(val, strategy); // expected-error {{argument to '__builtin_amdgcn_wave_reduce_and_b16' must be a constant integer}}
+ (void)__builtin_amdgcn_wave_reduce_or_b16(val, strategy); // expected-error {{argument to '__builtin_amdgcn_wave_reduce_or_b16' must be a constant integer}}
+ (void)__builtin_amdgcn_wave_reduce_xor_b16(val, strategy); // expected-error {{argument to '__builtin_amdgcn_wave_reduce_xor_b16' must be a constant integer}}
+}
+
void test_wave_reduce_u32(unsigned int val, int strategy) {
(void)__builtin_amdgcn_wave_reduce_add_u32(val, 0);
(void)__builtin_amdgcn_wave_reduce_sub_u32(val, 1);
>From 926675723f32477f8cb8f59bdecb83ea9ba4a568 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Wed, 10 Jun 2026 14:59:02 +0530
Subject: [PATCH 4/4] [AMDGPU] Support Wave Reduction for half types
Supported Ops: `fmin`, `fmax`, `fadd`, `fsub`.
---
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 18 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 23 +-
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll | 1801 +++++++++++---
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.fmax.ll | 2149 ++++++++++++-----
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.fmin.ll | 2149 ++++++++++++-----
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll | 1824 +++++++++++---
6 files changed, 6066 insertions(+), 1898 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e341392ae068a..baae6471642f7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -8171,10 +8171,14 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
}
case Intrinsic::amdgcn_wave_reduce_min:
case Intrinsic::amdgcn_wave_reduce_umin:
+ case Intrinsic::amdgcn_wave_reduce_fmin:
case Intrinsic::amdgcn_wave_reduce_max:
case Intrinsic::amdgcn_wave_reduce_umax:
+ case Intrinsic::amdgcn_wave_reduce_fmax:
case Intrinsic::amdgcn_wave_reduce_add:
+ case Intrinsic::amdgcn_wave_reduce_fadd:
case Intrinsic::amdgcn_wave_reduce_sub:
+ case Intrinsic::amdgcn_wave_reduce_fsub:
case Intrinsic::amdgcn_wave_reduce_and:
case Intrinsic::amdgcn_wave_reduce_or:
case Intrinsic::amdgcn_wave_reduce_xor: {
@@ -8182,18 +8186,26 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
if (MRI.getType(SrcReg) != LLT::scalar(16))
return true;
Register DstReg = MI.getOperand(0).getReg();
+ bool IsFPOp = IntrID == Intrinsic::amdgcn_wave_reduce_fmin ||
+ IntrID == Intrinsic::amdgcn_wave_reduce_fmax ||
+ IntrID == Intrinsic::amdgcn_wave_reduce_fadd ||
+ IntrID == Intrinsic::amdgcn_wave_reduce_fsub;
bool NeedsSignExt = IntrID == Intrinsic::amdgcn_wave_reduce_min ||
IntrID == Intrinsic::amdgcn_wave_reduce_max ||
IntrID == Intrinsic::amdgcn_wave_reduce_add ||
IntrID == Intrinsic::amdgcn_wave_reduce_sub;
- auto Ext = NeedsSignExt ? B.buildSExt(LLT::scalar(32), SrcReg)
- : B.buildZExt(LLT::scalar(32), SrcReg);
+ auto Ext = IsFPOp ? B.buildFPExt(LLT::scalar(32), SrcReg)
+ : NeedsSignExt ? B.buildSExt(LLT::scalar(32), SrcReg)
+ : B.buildZExt(LLT::scalar(32), SrcReg);
auto NewDst = MRI.createGenericVirtualRegister(LLT::scalar(32));
B.buildIntrinsic(IntrID, ArrayRef<Register>{NewDst},
/*hasSideEffects=*/false, /*isConvergent=*/true)
.addUse(Ext.getReg(0))
.addImm(MI.getOperand(3).getImm()); // strategy
- B.buildTrunc(DstReg, NewDst);
+ if (IsFPOp)
+ B.buildFPTrunc(DstReg, NewDst);
+ else
+ B.buildTrunc(DstReg, NewDst);
MI.eraseFromParent();
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a6dc58e6da263..da555eec89754 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10762,25 +10762,42 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
switch (IntrinsicID) {
case Intrinsic::amdgcn_wave_reduce_min:
case Intrinsic::amdgcn_wave_reduce_umin:
+ case Intrinsic::amdgcn_wave_reduce_fmin:
case Intrinsic::amdgcn_wave_reduce_max:
case Intrinsic::amdgcn_wave_reduce_umax:
+ case Intrinsic::amdgcn_wave_reduce_fmax:
case Intrinsic::amdgcn_wave_reduce_add:
+ case Intrinsic::amdgcn_wave_reduce_fadd:
case Intrinsic::amdgcn_wave_reduce_sub:
+ case Intrinsic::amdgcn_wave_reduce_fsub:
case Intrinsic::amdgcn_wave_reduce_and:
case Intrinsic::amdgcn_wave_reduce_or:
case Intrinsic::amdgcn_wave_reduce_xor: {
EVT SrcVT = Op.getOperand(1).getValueType();
- if (SrcVT == MVT::i16) {
+ if (SrcVT == MVT::i16 || SrcVT == MVT::f16) {
+ bool IsFPOp = IntrinsicID == Intrinsic::amdgcn_wave_reduce_fmin ||
+ IntrinsicID == Intrinsic::amdgcn_wave_reduce_fmax ||
+ IntrinsicID == Intrinsic::amdgcn_wave_reduce_fadd ||
+ IntrinsicID == Intrinsic::amdgcn_wave_reduce_fsub;
bool NeedsSignExt = IntrinsicID == Intrinsic::amdgcn_wave_reduce_min ||
IntrinsicID == Intrinsic::amdgcn_wave_reduce_max ||
IntrinsicID == Intrinsic::amdgcn_wave_reduce_add ||
IntrinsicID == Intrinsic::amdgcn_wave_reduce_sub;
- unsigned ExtOpc = NeedsSignExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
- SDValue ExtendedSrc = DAG.getNode(ExtOpc, DL, MVT::i32, Op.getOperand(1));
+ unsigned ExtOpc = IsFPOp ? ISD::FP_EXTEND
+ : NeedsSignExt ? ISD::SIGN_EXTEND
+ : ISD::ZERO_EXTEND;
+ auto SrcType = IsFPOp ? MVT::f16 : MVT::i16;
+ auto ExtType = IsFPOp ? MVT::f32 : MVT::i32;
+ SDValue ExtendedSrc = DAG.getNode(ExtOpc, DL, ExtType, Op.getOperand(1));
SDValue Strategy = Op.getOperand(2);
- SDValue Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
+ SDValue Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ExtType,
Op.getOperand(0), ExtendedSrc, Strategy);
- return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Result);
+ // FP_ROUND carries a second "trunc" flag operand (0: the value may change);
+ // TRUNCATE is unary, so it must not be handed that extra operand.
+ if (IsFPOp)
+ return DAG.getNode(ISD::FP_ROUND, DL, SrcType, Result,
+ DAG.getTargetConstant(0, DL, MVT::i32));
+ return DAG.getNode(ISD::TRUNCATE, DL, SrcType, Result);
}
return SDValue();
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll
index 2f95542150a83..d4f078e670f88 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fadd.ll
@@ -7,16 +7,759 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1064GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1032DAGISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1032GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL,GFX1164DAGISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL,GFX1164GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL,GFX1132DAGISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL,GFX1132GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL,GFX1164DAGISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL,GFX1164GISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL,GFX1132DAGISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL,GFX1132GISEL-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GFX12DAGISEL %s
+define amdgpu_kernel void @uniform_value_half(ptr addrspace(1) %out, half %in) {
+; GFX8DAGISEL-LABEL: uniform_value_half:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_brev_b32 s4, 1
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: v_add_f32_e32 v1, s4, v1
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: v_cvt_f16_f32_e32 v2, s4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: flat_store_short v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_half:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v1, s0
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX8GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8GISEL-NEXT: v_mul_f32_e32 v0, s2, v1
+; GFX8GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8GISEL-NEXT: v_cvt_f16_f32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_short v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_half:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_brev_b32 s4, 1
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v1, s5
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_cvt_f16_f32_e32 v1, s4
+; GFX9DAGISEL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_half:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v1, s0
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX9GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9GISEL-NEXT: v_mul_f32_e32 v0, s2, v1
+; GFX9GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: global_store_short v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value_half:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_clause 0x1
+; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_brev_b32 s4, 1
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v1, s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: v_add_f32_e64 v2, s4, s6
+; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_cvt_f16_f32_e32 v1, s4
+; GFX1064DAGISEL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value_half:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1064GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v1, s0
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX1064GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064GISEL-NEXT: v_mul_f32_e32 v0, s2, v1
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GFX1064GISEL-NEXT: global_store_short v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value_half:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_clause 0x1
+; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_brev_b32 s3, 1
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s2
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v1, s4
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s4
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032DAGISEL-NEXT: v_add_f32_e64 v2, s3, s5
+; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s3, v2
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_cvt_f16_f32_e32 v1, s3
+; GFX1032DAGISEL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value_half:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX1032GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s0, s1
+; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v1, s0
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032GISEL-NEXT: v_mul_f32_e32 v0, s2, v1
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_short v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-FAKE16-LABEL: uniform_value_half:
+; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-FAKE16-NEXT: s_clause 0x1
+; GFX1164DAGISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-FAKE16-NEXT: s_brev_b32 s4, 1
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-FAKE16-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-FAKE16-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s6, v1, s5
+; GFX1164DAGISEL-FAKE16-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-FAKE16-NEXT: v_add_f32_e64 v2, s4, s6
+; GFX1164DAGISEL-FAKE16-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1164DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX1164DAGISEL-FAKE16-NEXT: ; %bb.2:
+; GFX1164DAGISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, s4
+; GFX1164DAGISEL-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_endpgm
+;
+; GFX1164GISEL-FAKE16-LABEL: uniform_value_half:
+; GFX1164GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164GISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-FAKE16-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-FAKE16-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX1164GISEL-FAKE16-NEXT: v_cvt_f32_i32_e32 v1, s0
+; GFX1164GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX1164GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164GISEL-FAKE16-NEXT: v_mul_f32_e32 v0, s2, v1
+; GFX1164GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GFX1164GISEL-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX1164GISEL-FAKE16-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-FAKE16-LABEL: uniform_value_half:
+; GFX1132DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-FAKE16-NEXT: s_clause 0x1
+; GFX1132DAGISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-FAKE16-NEXT: s_brev_b32 s3, 1
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-FAKE16-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-FAKE16-NEXT: s_ctz_i32_b32 s4, s2
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_readlane_b32 s5, v1, s4
+; GFX1132DAGISEL-FAKE16-NEXT: s_bitset0_b32 s2, s4
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-FAKE16-NEXT: v_add_f32_e64 v2, s3, s5
+; GFX1132DAGISEL-FAKE16-NEXT: v_readfirstlane_b32 s3, v2
+; GFX1132DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX1132DAGISEL-FAKE16-NEXT: ; %bb.2:
+; GFX1132DAGISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, s3
+; GFX1132DAGISEL-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1132DAGISEL-FAKE16-NEXT: s_endpgm
+;
+; GFX1132GISEL-FAKE16-LABEL: uniform_value_half:
+; GFX1132GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1132GISEL-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x2c
+; GFX1132GISEL-FAKE16-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX1132GISEL-FAKE16-NEXT: s_bcnt1_i32_b32 s0, s1
+; GFX1132GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132GISEL-FAKE16-NEXT: v_cvt_f32_i32_e32 v1, s0
+; GFX1132GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-FAKE16-NEXT: v_dual_mul_f32 v0, s2, v1 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GFX1132GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX1132GISEL-FAKE16-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-TRUE16-LABEL: uniform_value_half:
+; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-TRUE16-NEXT: s_clause 0x1
+; GFX1164DAGISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-TRUE16-NEXT: s_brev_b32 s4, 1
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-TRUE16-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-TRUE16-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s6, v1, s5
+; GFX1164DAGISEL-TRUE16-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-TRUE16-NEXT: v_add_f32_e64 v2, s4, s6
+; GFX1164DAGISEL-TRUE16-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1164DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX1164DAGISEL-TRUE16-NEXT: ; %bb.2:
+; GFX1164DAGISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, s4
+; GFX1164DAGISEL-TRUE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX1164GISEL-TRUE16-LABEL: uniform_value_half:
+; GFX1164GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164GISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-TRUE16-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-TRUE16-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX1164GISEL-TRUE16-NEXT: v_cvt_f32_i32_e32 v1, s0
+; GFX1164GISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX1164GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, s2, v1
+; GFX1164GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s2
+; GFX1164GISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX1164GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-TRUE16-LABEL: uniform_value_half:
+; GFX1132DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-TRUE16-NEXT: s_clause 0x1
+; GFX1132DAGISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-TRUE16-NEXT: s_brev_b32 s3, 1
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-TRUE16-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-TRUE16-NEXT: s_ctz_i32_b32 s4, s2
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_readlane_b32 s5, v1, s4
+; GFX1132DAGISEL-TRUE16-NEXT: s_bitset0_b32 s2, s4
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-TRUE16-NEXT: v_add_f32_e64 v2, s3, s5
+; GFX1132DAGISEL-TRUE16-NEXT: v_readfirstlane_b32 s3, v2
+; GFX1132DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX1132DAGISEL-TRUE16-NEXT: ; %bb.2:
+; GFX1132DAGISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, s3
+; GFX1132DAGISEL-TRUE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1132DAGISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX1132GISEL-TRUE16-LABEL: uniform_value_half:
+; GFX1132GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1132GISEL-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x2c
+; GFX1132GISEL-TRUE16-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX1132GISEL-TRUE16-NEXT: s_bcnt1_i32_b32 s0, s1
+; GFX1132GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132GISEL-TRUE16-NEXT: v_cvt_f32_i32_e32 v1, s0
+; GFX1132GISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-TRUE16-NEXT: v_dual_mul_f32 v0, s2, v1 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s2
+; GFX1132GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX1132GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX12DAGISEL-LABEL: uniform_value_half:
+; GFX12DAGISEL: ; %bb.0: ; %entry
+; GFX12DAGISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3
+; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12DAGISEL-NEXT: s_cvt_f32_f16 s2, s2
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_2)
+; GFX12DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_2)
+; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12DAGISEL-NEXT: s_cvt_f16_f32 s2, s2
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX12DAGISEL-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX12DAGISEL-NEXT: s_endpgm
+ entry:
+ %result = call half @llvm.amdgcn.wave.reduce.fadd(half %in, i32 1)
+ store half %result, ptr addrspace(1) %out
+ ret void
+}
+
+; Wave-wide fadd reduction of a half value in a regular (non-kernel) function:
+; the result of llvm.amdgcn.wave.reduce.fadd(half %in, i32 1) is stored to %out.
+; In every checked configuration the expected code reads the input from v2,
+; widens it to f32, and then accumulates one active lane per loop iteration
+; (find-first-set on a copy of exec, v_readlane_b32, v_add_f32, clear that bit).
+; The accumulator is seeded by s_brev_b32 <sgpr>, 1, i.e. bit 31 only, which is
+; -0.0 when interpreted as f32. After the loop the sum is converted back to f16
+; and written with a 16-bit store.
+; NOTE(review): the meaning of the trailing "i32 1" operand is defined by the
+; intrinsic and is not visible in this test - confirm against the intrinsic docs.
+define void @divergent_value_half(ptr addrspace(1) %out, half %in) {
+; GFX8DAGISEL-LABEL: divergent_value_half:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT: s_brev_b32 s6, 1
+; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8DAGISEL-NEXT: v_add_f32_e32 v3, s6, v3
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: v_cvt_f16_f32_e32 v2, s6
+; GFX8DAGISEL-NEXT: flat_store_short v[0:1], v2
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_half:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT: s_brev_b32 s6, 1
+; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8
+; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8GISEL-NEXT: v_add_f32_e32 v3, s6, v3
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: v_cvt_f16_f32_e32 v2, s6
+; GFX8GISEL-NEXT: flat_store_short v[0:1], v2
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_half:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT: s_brev_b32 s6, 1
+; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9DAGISEL-NEXT: v_add_f32_e32 v3, s6, v3
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_cvt_f16_f32_e32 v2, s6
+; GFX9DAGISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_half:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT: s_brev_b32 s6, 1
+; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8
+; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9GISEL-NEXT: v_add_f32_e32 v3, s6, v3
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_cvt_f16_f32_e32 v2, s6
+; GFX9GISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_half:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT: s_brev_b32 s6, 1
+; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064DAGISEL-NEXT: v_add_f32_e64 v3, s6, s8
+; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_cvt_f16_f32_e32 v2, s6
+; GFX1064DAGISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_half:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT: s_brev_b32 s6, 1
+; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064GISEL-NEXT: v_add_f32_e64 v3, s6, s8
+; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_cvt_f16_f32_e32 v2, s6
+; GFX1064GISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_half:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT: s_brev_b32 s5, 1
+; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, v2, s6
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s6
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032DAGISEL-NEXT: v_add_f32_e64 v3, s5, s7
+; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_cvt_f16_f32_e32 v2, s5
+; GFX1032DAGISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_half:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032GISEL-NEXT: s_brev_b32 s5, 1
+; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4
+; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6
+; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032GISEL-NEXT: v_add_f32_e64 v3, s5, s7
+; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_cvt_f16_f32_e32 v2, s5
+; GFX1032GISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-FAKE16-LABEL: divergent_value_half:
+; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-FAKE16-NEXT: s_brev_b32 s2, 1
+; GFX1164DAGISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-FAKE16-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s4, v2, s3
+; GFX1164DAGISEL-FAKE16-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164DAGISEL-FAKE16-NEXT: v_add_f32_e64 v3, s2, s4
+; GFX1164DAGISEL-FAKE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164DAGISEL-FAKE16-NEXT: ; %bb.2:
+; GFX1164DAGISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, s2
+; GFX1164DAGISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1164DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-FAKE16-LABEL: divergent_value_half:
+; GFX1164GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1164GISEL-FAKE16-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-FAKE16-NEXT: s_brev_b32 s2, 1
+; GFX1164GISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-FAKE16-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-FAKE16-NEXT: v_readlane_b32 s4, v2, s3
+; GFX1164GISEL-FAKE16-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX1164GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-FAKE16-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164GISEL-FAKE16-NEXT: v_add_f32_e64 v3, s2, s4
+; GFX1164GISEL-FAKE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164GISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164GISEL-FAKE16-NEXT: ; %bb.2:
+; GFX1164GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, s2
+; GFX1164GISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1164GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-FAKE16-LABEL: divergent_value_half:
+; GFX1132DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-FAKE16-NEXT: s_brev_b32 s1, 1
+; GFX1132DAGISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-FAKE16-NEXT: s_ctz_i32_b32 s2, s0
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_readlane_b32 s3, v2, s2
+; GFX1132DAGISEL-FAKE16-NEXT: s_bitset0_b32 s0, s2
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132DAGISEL-FAKE16-NEXT: v_add_f32_e64 v3, s1, s3
+; GFX1132DAGISEL-FAKE16-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1132DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132DAGISEL-FAKE16-NEXT: ; %bb.2:
+; GFX1132DAGISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, s1
+; GFX1132DAGISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1132DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-FAKE16-LABEL: divergent_value_half:
+; GFX1132GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1132GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1132GISEL-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132GISEL-FAKE16-NEXT: s_brev_b32 s1, 1
+; GFX1132GISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-FAKE16-NEXT: s_ctz_i32_b32 s2, s0
+; GFX1132GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-FAKE16-NEXT: v_readlane_b32 s3, v2, s2
+; GFX1132GISEL-FAKE16-NEXT: s_bitset0_b32 s0, s2
+; GFX1132GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-FAKE16-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132GISEL-FAKE16-NEXT: v_add_f32_e64 v3, s1, s3
+; GFX1132GISEL-FAKE16-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1132GISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132GISEL-FAKE16-NEXT: ; %bb.2:
+; GFX1132GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, s1
+; GFX1132GISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1132GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-TRUE16-LABEL: divergent_value_half:
+; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-TRUE16-NEXT: s_brev_b32 s2, 1
+; GFX1164DAGISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-TRUE16-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s4, v2, s3
+; GFX1164DAGISEL-TRUE16-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164DAGISEL-TRUE16-NEXT: v_add_f32_e64 v3, s2, s4
+; GFX1164DAGISEL-TRUE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164DAGISEL-TRUE16-NEXT: ; %bb.2:
+; GFX1164DAGISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, s2
+; GFX1164DAGISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1164DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-TRUE16-LABEL: divergent_value_half:
+; GFX1164GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX1164GISEL-TRUE16-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-TRUE16-NEXT: s_brev_b32 s2, 1
+; GFX1164GISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-TRUE16-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-TRUE16-NEXT: v_readlane_b32 s4, v2, s3
+; GFX1164GISEL-TRUE16-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX1164GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-TRUE16-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164GISEL-TRUE16-NEXT: v_add_f32_e64 v3, s2, s4
+; GFX1164GISEL-TRUE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164GISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164GISEL-TRUE16-NEXT: ; %bb.2:
+; GFX1164GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, s2
+; GFX1164GISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1164GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-TRUE16-LABEL: divergent_value_half:
+; GFX1132DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-TRUE16-NEXT: s_brev_b32 s1, 1
+; GFX1132DAGISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-TRUE16-NEXT: s_ctz_i32_b32 s2, s0
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_readlane_b32 s3, v2, s2
+; GFX1132DAGISEL-TRUE16-NEXT: s_bitset0_b32 s0, s2
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132DAGISEL-TRUE16-NEXT: v_add_f32_e64 v3, s1, s3
+; GFX1132DAGISEL-TRUE16-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1132DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132DAGISEL-TRUE16-NEXT: ; %bb.2:
+; GFX1132DAGISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, s1
+; GFX1132DAGISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1132DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-TRUE16-LABEL: divergent_value_half:
+; GFX1132GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1132GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX1132GISEL-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132GISEL-TRUE16-NEXT: s_brev_b32 s1, 1
+; GFX1132GISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-TRUE16-NEXT: s_ctz_i32_b32 s2, s0
+; GFX1132GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-TRUE16-NEXT: v_readlane_b32 s3, v2, s2
+; GFX1132GISEL-TRUE16-NEXT: s_bitset0_b32 s0, s2
+; GFX1132GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-TRUE16-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132GISEL-TRUE16-NEXT: v_add_f32_e64 v3, s1, s3
+; GFX1132GISEL-TRUE16-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1132GISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132GISEL-TRUE16-NEXT: ; %bb.2:
+; GFX1132GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, s1
+; GFX1132GISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1132GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12DAGISEL-LABEL: divergent_value_half:
+; GFX12DAGISEL: ; %bb.0: ; %entry
+; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_expcnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12DAGISEL-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX12DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX12DAGISEL-NEXT: s_brev_b32 s1, 1
+; GFX12DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s2, s0
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: v_readlane_b32 s3, v2, s2
+; GFX12DAGISEL-NEXT: s_bitset0_b32 s0, s2
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GFX12DAGISEL-NEXT: v_add_f32_e64 v3, s1, s3
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v3
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX12DAGISEL-NEXT: ; %bb.2:
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12DAGISEL-NEXT: v_cvt_f16_f32_e32 v2.l, s1
+; GFX12DAGISEL-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12DAGISEL-NEXT: s_setpc_b64 s[30:31]
+ entry:
+ %result = call half @llvm.amdgcn.wave.reduce.fadd(half %in, i32 1)
+ store half %result, ptr addrspace(1) %out
+ ret void
+}
define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX8DAGISEL-LABEL: uniform_value_float:
; GFX8DAGISEL: ; %bb.0: ; %entry
; GFX8DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
@@ -48,7 +791,11 @@ define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in)
; GFX8GISEL-NEXT: s_endpgm
;
; GFX9DAGISEL-LABEL: uniform_value_float:
; GFX9DAGISEL: ; %bb.0: ; %entry
; GFX9DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
@@ -79,7 +826,11 @@ define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in)
;
; GFX1064DAGISEL-LABEL: uniform_value_float:
; GFX1064DAGISEL: ; %bb.0: ; %entry
; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX1064DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[0:1]
; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -110,7 +861,11 @@ define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in)
; GFX1032DAGISEL-LABEL: uniform_value_float:
; GFX1032DAGISEL: ; %bb.0: ; %entry
; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX1032DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032DAGISEL-NEXT: s_movLowering intrinsic: 3655
+; GFX1032DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX1032DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX1032DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX1032DAGISEL-NEXT: _b32 s0, exec_lo
; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s0
; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3
@@ -232,7 +987,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX8DAGISEL-NEXT: s_brev_b32 s6, 1
-; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8
@@ -240,7 +995,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX8DAGISEL-NEXT: v_add_f32_e32 v3, s6, v3
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8DAGISEL-NEXT: ; %bb.2:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
@@ -252,7 +1007,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX8GISEL-NEXT: s_brev_b32 s6, 1
-; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8
@@ -260,7 +1015,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX8GISEL-NEXT: v_add_f32_e32 v3, s6, v3
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8GISEL-NEXT: ; %bb.2:
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
@@ -272,7 +1027,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX9DAGISEL-NEXT: s_brev_b32 s6, 1
-; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8
@@ -280,7 +1035,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX9DAGISEL-NEXT: v_add_f32_e32 v3, s6, v3
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9DAGISEL-NEXT: ; %bb.2:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX9DAGISEL-NEXT: global_store_dword v[0:1], v2, off
@@ -292,7 +1047,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX9GISEL-NEXT: s_brev_b32 s6, 1
-; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8
@@ -300,7 +1055,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX9GISEL-NEXT: v_add_f32_e32 v3, s6, v3
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9GISEL-NEXT: ; %bb.2:
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX9GISEL-NEXT: global_store_dword v[0:1], v2, off
@@ -312,14 +1067,14 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX1064DAGISEL-NEXT: s_brev_b32 s6, 1
-; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX1064DAGISEL-NEXT: v_add_f32_e64 v3, s6, s8
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064DAGISEL-NEXT: ; %bb.2:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX1064DAGISEL-NEXT: global_store_dword v[0:1], v2, off
@@ -330,14 +1085,14 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX1064GISEL-NEXT: s_brev_b32 s6, 1
-; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX1064GISEL-NEXT: v_add_f32_e64 v3, s6, s8
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064GISEL-NEXT: ; %bb.2:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX1064GISEL-NEXT: global_store_dword v[0:1], v2, off
@@ -348,14 +1103,14 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
; GFX1032DAGISEL-NEXT: s_brev_b32 s5, 1
-; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4
; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, v2, s6
; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s6
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX1032DAGISEL-NEXT: v_add_f32_e64 v3, s5, s7
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032DAGISEL-NEXT: ; %bb.2:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5
; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v2, off
@@ -366,14 +1121,14 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo
; GFX1032GISEL-NEXT: s_brev_b32 s5, 1
-; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4
; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6
; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX1032GISEL-NEXT: v_add_f32_e64 v3, s5, s7
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032GISEL-NEXT: ; %bb.2:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s5
; GFX1032GISEL-NEXT: global_store_dword v[0:1], v2, off
@@ -384,7 +1139,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1164DAGISEL-NEXT: s_brev_b32 s2, 1
-; GFX1164DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -393,7 +1148,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1164DAGISEL-NEXT: v_add_f32_e64 v3, s2, s4
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164DAGISEL-NEXT: ; %bb.2:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX1164DAGISEL-NEXT: global_store_b32 v[0:1], v2, off
@@ -404,7 +1159,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1164GISEL-NEXT: s_brev_b32 s2, 1
-; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -413,7 +1168,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1164GISEL-NEXT: v_add_f32_e64 v3, s2, s4
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164GISEL-NEXT: ; %bb.2:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX1164GISEL-NEXT: global_store_b32 v[0:1], v2, off
@@ -424,7 +1179,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1132DAGISEL-NEXT: s_brev_b32 s1, 1
-; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s2, s0
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2
@@ -433,7 +1188,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1132DAGISEL-NEXT: v_add_f32_e64 v3, s1, s3
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v3
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132DAGISEL-NEXT: ; %bb.2:
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1
; GFX1132DAGISEL-NEXT: global_store_b32 v[0:1], v2, off
@@ -444,7 +1199,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1132GISEL-NEXT: s_brev_b32 s1, 1
-; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s0
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2
@@ -453,7 +1208,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1132GISEL-NEXT: v_add_f32_e64 v3, s1, s3
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v3
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132GISEL-NEXT: ; %bb.2:
; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s1
; GFX1132GISEL-NEXT: global_store_b32 v[0:1], v2, off
@@ -468,7 +1223,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
; GFX12DAGISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX12DAGISEL-NEXT: s_brev_b32 s1, 1
-; GFX12DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s2, s0
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -479,7 +1234,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX12DAGISEL-NEXT: v_add_f32_e64 v3, s1, s3
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v3
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX12DAGISEL-NEXT: ; %bb.2:
; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v2, s1
@@ -508,7 +1263,9 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %id.x) #0 {
; GFX8DAGISEL-NEXT: s_nop 1
; GFX8DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX8DAGISEL-NEXT: s_nop 1
-; GFX8DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf Lowering intrinsic: 3655
+; GFX8DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX8DAGISEL-NEXT: bank_mask:0xf
; GFX8DAGISEL-NEXT: s_nop 1
; GFX8DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:15 row_mask:0xf bank_mask:0xf
; GFX8DAGISEL-NEXT: s_nop 1
@@ -571,7 +1328,9 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %id.x) #0 {
; GFX9DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9DAGISEL-NEXT: s_nop 1
; GFX9DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9DAGISEL-NEXT: s_nop 1
+; GFX9DAGISEL-NEXT: s_nopLowering intrinsic: 3655
+; GFX9DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX9DAGISEL-NEXT: 1
; GFX9DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:15 row_mask:0xf bank_mask:0xf
; GFX9DAGISEL-NEXT: s_nop 1
; GFX9DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:31 row_mask:0xf bank_mask:0xf
@@ -632,7 +1391,9 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v5, -1, v5
; GFX1064DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v5, 32, v5
-; GFX1064DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_adLowering intrinsic: 3655
+; GFX1064DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX1064DAGISEL-NEXT: d_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v5, 4, v5
; GFX1064DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
@@ -710,7 +1471,9 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1032DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX1032DAGISEL-NEXT: v_add_f32_e32 v3, v3Lowering intrinsic: 3655
+; GFX1032DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX1032DAGISEL-NEXT: , v4
; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v3, 31
; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s4
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5
@@ -758,7 +1521,9 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1164DAGISEL: ; %bb.0: ; %entry
; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX1164DAGISEL-NEXT: s_clause 0x2 ; 12-byte Folded Spill
+; GFX1164DAGISEL-NEXT: s_clause 0x2 Lowering intrinsic: 3655
+; GFX1164DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX1164DAGISEL-NEXT: ; 12-byte Folded Spill
; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v3, s32
; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:4
; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:8
@@ -842,37 +1607,39 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1132DAGISEL-LABEL: divergent_value_float_dpp:
-; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX1132DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill
-; GFX1132DAGISEL-NEXT: scratch_store_b32 off, v3, s32
-; GFX1132DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:4
-; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v3, 0x80000000, v2, s0
-; GFX1132DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
-; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX1132DAGISEL-NEXT: v_readlane_b32 s1, v3, 31
-; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1
-; GFX1132DAGISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX1132DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload
-; GFX1132DAGISEL-NEXT: scratch_load_b32 v3, off, s32
-; GFX1132DAGISEL-NEXT: scratch_load_b32 v4, off, s32 offset:4
-; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1132DAGISEL-FAKE16-LABEL: divergent_value_float_dpp:
+; GFX1132DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-FAKE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
+; GFX1132DAGISEL-FAKE16-NEXT: sLowering intrinsic: 3655
+; GFX1132DAGISEL-FAKE16-NEXT: Lowering intrinsic: 3655
+; GFX1132DAGISEL-FAKE16-NEXT: cratch_store_b32 off, v3, s32
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_store_b32 off, v4, s32 offset:4
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-FAKE16-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0x80000000, v2, s0
+; GFX1132DAGISEL-FAKE16-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX1132DAGISEL-FAKE16-NEXT: v_readlane_b32 s1, v3, 31
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s1
+; GFX1132DAGISEL-FAKE16-NEXT: global_store_b32 v[0:1], v2, off
+; GFX1132DAGISEL-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-FAKE16-NEXT: s_clause 0x1 ; 8-byte Folded Reload
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_load_b32 v3, off, s32
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_load_b32 v4, off, s32 offset:4
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1132GISEL-LABEL: divergent_value_float_dpp:
; GFX1132GISEL: ; %bb.0: ; %entry
@@ -906,11 +1673,47 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31]
;
+; GFX1132DAGISEL-TRUE16-LABEL: divergent_value_float_dpp:
+; GFX1132DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded SpLowering intrinsic: 3655
+; GFX1132DAGISEL-TRUE16-NEXT: Lowering intrinsic: 3655
+; GFX1132DAGISEL-TRUE16-NEXT: ill
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_store_b32 off, v3, s32
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_store_b32 off, v4, s32 offset:4
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-TRUE16-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0x80000000, v2, s0
+; GFX1132DAGISEL-TRUE16-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX1132DAGISEL-TRUE16-NEXT: v_readlane_b32 s1, v3, 31
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v2, s1
+; GFX1132DAGISEL-TRUE16-NEXT: global_store_b32 v[0:1], v2, off
+; GFX1132DAGISEL-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Reload
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_load_b32 v3, off, s32
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_load_b32 v4, off, s32 offset:4
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX12DAGISEL-LABEL: divergent_value_float_dpp:
; GFX12DAGISEL: ; %bb.0: ; %entry
; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12DAGISEL-NEXT: s_wait_expcnt 0x0
-; GFX12DAGISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12DAGISEL-NEXT: s_waiLowering intrinsic: 3655
+; GFX12DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX12DAGISEL-NEXT: t_samplecnt 0x0
; GFX12DAGISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
; GFX12DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
@@ -1014,7 +1817,17 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX8DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX8DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX8DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword vLowering intrinsic: 3713
+; GFX8DAGISEL-NEXT: Lowering intrinsic: 3298
+; GFX8DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX8DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX8DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX8DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX8DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX8DAGISEL-NEXT: Lowering intrinsic: 3298
+; GFX8DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX8DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX8DAGISEL-NEXT: 6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX8DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX8DAGISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX8DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -1165,7 +1978,17 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX9DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX9DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte FoldLowering intrinsic: 3713
+; GFX9DAGISEL-NEXT: Lowering intrinsic: 3298
+; GFX9DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX9DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX9DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX9DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX9DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX9DAGISEL-NEXT: Lowering intrinsic: 3298
+; GFX9DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX9DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX9DAGISEL-NEXT: ed Reload
; GFX9DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX9DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX9DAGISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
@@ -1308,7 +2131,17 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX1064DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6
-; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s7
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s7Lowering intrinsic: 3713
+; GFX1064DAGISEL-NEXT: Lowering intrinsic: 3298
+; GFX1064DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX1064DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX1064DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX1064DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX1064DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX1064DAGISEL-NEXT: Lowering intrinsic: 3298
+; GFX1064DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX1064DAGISEL-NEXT: Lowering intrinsic: 3655
+; GFX1064DAGISEL-EMPTY:
; GFX1064DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX1064DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX1064DAGISEL-NEXT: s_clause 0x7 ; 32-byte Folded Reload
@@ -1503,81 +2336,87 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1164DAGISEL-LABEL: divergent_value_double_dpp:
-; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX1164DAGISEL-NEXT: s_clause 0x3 ; 28-byte Folded Spill
-; GFX1164DAGISEL-NEXT: scratch_store_b64 off, v[4:5], s32
-; GFX1164DAGISEL-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
-; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v6, s32 offset:16
-; GFX1164DAGISEL-NEXT: scratch_store_b64 off, v[7:8], s32 offset:20
-; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[0:1]
-; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v3, s[0:1]
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
-; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
-; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
-; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v6, -1, 0
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v6, -1, v6
-; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v6, 32, v6
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v6, 4, v6
-; GFX1164DAGISEL-NEXT: ds_permute_b32 v7, v6, v4
-; GFX1164DAGISEL-NEXT: ds_permute_b32 v8, v6, v5
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: v_add_f64 v[4:5], v[4:5], v[7:8]
-; GFX1164DAGISEL-NEXT: v_readlane_b32 s2, v4, 63
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_readlane_b32 s3, v5, 63
-; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
-; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
-; GFX1164DAGISEL-NEXT: s_clause 0x3 ; 28-byte Folded Reload
-; GFX1164DAGISEL-NEXT: scratch_load_b64 v[4:5], off, s32
-; GFX1164DAGISEL-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
-; GFX1164DAGISEL-NEXT: scratch_load_b32 v6, off, s32 offset:16
-; GFX1164DAGISEL-NEXT: scratch_load_b64 v[7:8], off, s32 offset:20
-; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1164DAGISEL-FAKE16-LABEL: divergent_value_double_dpp:
+; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-FAKE16-NEXT: s_clause 0x3 ; 28-byte Folded Spill
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_store_b32 off, v6, s32 offset:16
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_store_b64 off, v[7:8], s32 offset:20
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v3, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-FAKE16-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-FAKE16-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v6, -1, 0
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v6, -1, v6
+; GFX1164DAGISEL-FAKE16-NEXT: v_add_nc_u32_e32 v6, 32, v6
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mul_lo_u32 v6, 4, v6
+; GFX1164DAGISEL-FAKE16-NEXT: ds_permute_b32 v7, v6, v4
+; GFX1164DAGISEL-FAKE16-NEXT: ds_permute_b32 v8, v6, v5
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], v[7:8]
+; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s2, v4, 63
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s3, v5, 63
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164DAGISEL-FAKE16-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-FAKE16-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-FAKE16-NEXT: s_clause 0x3 ; 28-byte Folded Reload
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:16
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_load_b64 v[7:8], off, s32 offset:20
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1164GISEL-LABEL: divergent_value_double_dpp:
; GFX1164GISEL: ; %bb.0: ; %entry
@@ -1655,59 +2494,65 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1132DAGISEL-LABEL: divergent_value_double_dpp:
-; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX1132DAGISEL-NEXT: s_clause 0x1 ; 16-byte Folded Spill
-; GFX1132DAGISEL-NEXT: scratch_store_b64 off, v[4:5], s32
-; GFX1132DAGISEL-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
-; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s2
-; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v3, s2
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
-; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
-; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132DAGISEL-NEXT: v_readlane_b32 s0, v4, 31
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132DAGISEL-NEXT: v_readlane_b32 s1, v5, 31
-; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
-; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX1132DAGISEL-NEXT: s_clause 0x1 ; 16-byte Folded Reload
-; GFX1132DAGISEL-NEXT: scratch_load_b64 v[4:5], off, s32
-; GFX1132DAGISEL-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
-; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1132DAGISEL-FAKE16-LABEL: divergent_value_double_dpp:
+; GFX1132DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-FAKE16-NEXT: s_clause 0x1 ; 16-byte Folded Spill
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-FAKE16-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX1132DAGISEL-FAKE16-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v3, s2
+; GFX1132DAGISEL-FAKE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-FAKE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-FAKE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-FAKE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-FAKE16-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-FAKE16-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-FAKE16-NEXT: v_readlane_b32 s0, v4, 31
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132DAGISEL-FAKE16-NEXT: v_readlane_b32 s1, v5, 31
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-FAKE16-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-FAKE16-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-FAKE16-NEXT: s_clause 0x1 ; 16-byte Folded Reload
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1132GISEL-LABEL: divergent_value_double_dpp:
; GFX1132GISEL: ; %bb.0: ; %entry
@@ -1763,6 +2608,148 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31]
;
+; GFX1164DAGISEL-TRUE16-LABEL: divergent_value_double_dpp:
+; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-TRUE16-NEXT: s_clause 0x3 ; 28-byte Folded Spill
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_store_b32 off, v6, s32 offset:16
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_store_b64 off, v[7:8], s32 offset:20
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v3, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-TRUE16-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-TRUE16-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v6, -1, 0
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v6, -1, v6
+; GFX1164DAGISEL-TRUE16-NEXT: v_add_nc_u32_e32 v6, 32, v6
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mul_lo_u32 v6, 4, v6
+; GFX1164DAGISEL-TRUE16-NEXT: ds_permute_b32 v7, v6, v4
+; GFX1164DAGISEL-TRUE16-NEXT: ds_permute_b32 v8, v6, v5
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], v[7:8]
+; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s2, v4, 63
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s3, v5, 63
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164DAGISEL-TRUE16-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-TRUE16-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-TRUE16-NEXT: s_clause 0x3 ; 28-byte Folded Reload
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_load_b32 v6, off, s32 offset:16
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_load_b64 v[7:8], off, s32 offset:20
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-TRUE16-LABEL: divergent_value_double_dpp:
+; GFX1132DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-TRUE16-NEXT: s_clause 0x1 ; 16-byte Folded Spill
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-TRUE16-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX1132DAGISEL-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v3, s2
+; GFX1132DAGISEL-TRUE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-TRUE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-TRUE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-TRUE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-TRUE16-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-TRUE16-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-TRUE16-NEXT: v_readlane_b32 s0, v4, 31
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132DAGISEL-TRUE16-NEXT: v_readlane_b32 s1, v5, 31
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-TRUE16-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-TRUE16-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-TRUE16-NEXT: s_clause 0x1 ; 16-byte Folded Reload
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX12DAGISEL-LABEL: divergent_value_double_dpp:
; GFX12DAGISEL: ; %bb.0: ; %entry
; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1799,7 +2786,13 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX12DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX12DAGISEL-NEXT: v_add_f64_e32 v[4:5], v[4:5], v[6:7]
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX12DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
; GFX12DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -1836,7 +2829,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr6
; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -1844,11 +2837,11 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v0
-; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -1857,7 +2850,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX8DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s0, v0
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: .LBB4_4: ; %endif
+; GFX8DAGISEL-NEXT: .LBB6_4: ; %endif
; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -1874,7 +2867,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
; GFX8GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX8GISEL-NEXT: ; %bb.1: ; %else
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -1882,11 +2875,11 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: v_mul_f32_e32 v0, s0, v0
; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v0
-; GFX8GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT: .LBB6_2: ; %Flow
; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3]
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX8GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX8GISEL-NEXT: ; %bb.3: ; %if
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -1895,7 +2888,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX8GISEL-NEXT: v_mul_f32_e32 v0, s1, v0
; GFX8GISEL-NEXT: v_readfirstlane_b32 s0, v0
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: .LBB4_4: ; %endif
+; GFX8GISEL-NEXT: .LBB6_4: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -1912,7 +2905,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr6
; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -1920,11 +2913,11 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v0
-; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -1933,7 +2926,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX9DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s0, v0
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9DAGISEL-NEXT: .LBB4_4: ; %endif
+; GFX9DAGISEL-NEXT: .LBB6_4: ; %endif
; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -1949,7 +2942,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
; GFX9GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX9GISEL-NEXT: ; %bb.1: ; %else
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -1957,11 +2950,11 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: v_mul_f32_e32 v0, s0, v0
; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v0
-; GFX9GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT: .LBB6_2: ; %Flow
; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3]
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX9GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX9GISEL-NEXT: ; %bb.3: ; %if
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -1970,7 +2963,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX9GISEL-NEXT: v_mul_f32_e32 v0, s1, v0
; GFX9GISEL-NEXT: v_readfirstlane_b32 s0, v0
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9GISEL-NEXT: .LBB4_4: ; %endif
+; GFX9GISEL-NEXT: .LBB6_4: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -1986,7 +2979,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr6
; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -1994,11 +2987,11 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v0
-; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3]
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -2007,7 +3000,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1064DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX1064DAGISEL-NEXT: .LBB4_4: ; %endif
+; GFX1064DAGISEL-NEXT: .LBB6_4: ; %endif
; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -2023,7 +3016,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1064GISEL-NEXT: ; %bb.1: ; %else
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -2031,11 +3024,11 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: v_mul_f32_e32 v0, s0, v0
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v0
-; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3]
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1064GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1064GISEL-NEXT: ; %bb.3: ; %if
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -2044,7 +3037,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1064GISEL-NEXT: v_mul_f32_e32 v0, s1, v0
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX1064GISEL-NEXT: .LBB4_4: ; %endif
+; GFX1064GISEL-NEXT: .LBB6_4: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -2060,7 +3053,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr3
; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
@@ -2068,12 +3061,12 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s3, v0
-; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s2
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s3
; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
@@ -2081,7 +3074,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1032DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s1, v0
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s1
-; GFX1032DAGISEL-NEXT: .LBB4_4: ; %endif
+; GFX1032DAGISEL-NEXT: .LBB6_4: ; %endif
; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
@@ -2096,7 +3089,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1032GISEL-NEXT: ; implicit-def: $sgpr3
; GFX1032GISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032GISEL-NEXT: s_xor_b32 s2, exec_lo, s2
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1032GISEL-NEXT: ; %bb.1: ; %else
; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
@@ -2104,12 +3097,12 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: v_mul_f32_e32 v0, s0, v0
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s3, v0
-; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, s2
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s3
; GFX1032GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1032GISEL-NEXT: ; %bb.3: ; %if
; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
@@ -2117,7 +3110,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1032GISEL-NEXT: v_mul_f32_e32 v0, s1, v0
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s1, v0
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s1
-; GFX1032GISEL-NEXT: .LBB4_4: ; %endif
+; GFX1032GISEL-NEXT: .LBB6_4: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
@@ -2125,49 +3118,55 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX1032GISEL-NEXT: s_endpgm
;
-; GFX1164DAGISEL-LABEL: divergent_cfg_float:
-; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
-; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr6
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
-; GFX1164DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
-; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0
-; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s6, v0
-; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow
-; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s6
-; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_4
-; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0
-; GFX1164DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX1164DAGISEL-NEXT: .LBB4_4: ; %endif
-; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164DAGISEL-NEXT: s_endpgm
+; GFX1164DAGISEL-FAKE16-LABEL: divergent_cfg_float:
+; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX1164DAGISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-FAKE16-NEXT: ; implicit-def: $sgpr6
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-FAKE16-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164DAGISEL-FAKE16-NEXT: s_cbranch_execz .LBB6_2
+; GFX1164DAGISEL-FAKE16-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-FAKE16-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164DAGISEL-FAKE16-NEXT: v_cvt_f32_i32_e32 v0, s6
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mul_f32_e32 v0, s0, v0
+; GFX1164DAGISEL-FAKE16-NEXT: v_readfirstlane_b32 s6, v0
+; GFX1164DAGISEL-FAKE16-NEXT: .LBB6_2: ; %Flow
+; GFX1164DAGISEL-FAKE16-NEXT: s_or_saveexec_b64 s[2:3], s[2:3]
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164DAGISEL-FAKE16-NEXT: s_xor_b64 exec, exec, s[2:3]
+; GFX1164DAGISEL-FAKE16-NEXT: s_cbranch_execz .LBB6_4
+; GFX1164DAGISEL-FAKE16-NEXT: ; %bb.3: ; %if
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX1164DAGISEL-FAKE16-NEXT: v_mul_f32_e32 v0, s1, v0
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX1164DAGISEL-FAKE16-NEXT: .LBB6_4: ; %endif
+; GFX1164DAGISEL-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_endpgm
;
; GFX1164GISEL-LABEL: divergent_cfg_float:
; GFX1164GISEL: ; %bb.0: ; %entry
@@ -2178,7 +3177,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1164GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1164GISEL-NEXT: ; %bb.1: ; %else
; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2188,12 +3187,12 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_mul_f32_e32 v0, s0, v0
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s6, v0
-; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3]
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1164GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1164GISEL-NEXT: ; %bb.3: ; %if
; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -2204,7 +3203,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s0, v0
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX1164GISEL-NEXT: .LBB4_4: ; %endif
+; GFX1164GISEL-NEXT: .LBB6_4: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -2222,7 +3221,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX1132DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2232,13 +3231,13 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s3, v0
-; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s2
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s3
; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2249,7 +3248,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v0
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s1
-; GFX1132DAGISEL-NEXT: .LBB4_4: ; %endif
+; GFX1132DAGISEL-NEXT: .LBB6_4: ; %endif
; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, 0
@@ -2266,7 +3265,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1132GISEL-NEXT: s_xor_b32 s2, exec_lo, s2
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1132GISEL-NEXT: ; %bb.1: ; %else
; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2276,13 +3275,13 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_mul_f32_e32 v0, s0, v0
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s3, v0
-; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, s2
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s3
; GFX1132GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1132GISEL-NEXT: ; %bb.3: ; %if
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2293,7 +3292,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v0
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s1
-; GFX1132GISEL-NEXT: .LBB4_4: ; %endif
+; GFX1132GISEL-NEXT: .LBB6_4: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
@@ -2301,6 +3300,56 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1132GISEL-NEXT: s_endpgm
;
+; GFX1164DAGISEL-TRUE16-LABEL: divergent_cfg_float:
+; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX1164DAGISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-TRUE16-NEXT: ; implicit-def: $sgpr6
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-TRUE16-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164DAGISEL-TRUE16-NEXT: s_cbranch_execz .LBB6_2
+; GFX1164DAGISEL-TRUE16-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-TRUE16-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164DAGISEL-TRUE16-NEXT: v_cvt_f32_i32_e32 v0, s6
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mul_f32_e32 v0, s0, v0
+; GFX1164DAGISEL-TRUE16-NEXT: v_readfirstlane_b32 s6, v0
+; GFX1164DAGISEL-TRUE16-NEXT: .LBB6_2: ; %Flow
+; GFX1164DAGISEL-TRUE16-NEXT: s_or_saveexec_b64 s[2:3], s[2:3]
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164DAGISEL-TRUE16-NEXT: s_xor_b64 exec, exec, s[2:3]
+; GFX1164DAGISEL-TRUE16-NEXT: s_cbranch_execz .LBB6_4
+; GFX1164DAGISEL-TRUE16-NEXT: ; %bb.3: ; %if
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX1164DAGISEL-TRUE16-NEXT: v_mul_f32_e32 v0, s1, v0
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX1164DAGISEL-TRUE16-NEXT: .LBB6_4: ; %endif
+; GFX1164DAGISEL-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_endpgm
+;
; GFX12DAGISEL-LABEL: divergent_cfg_float:
; GFX12DAGISEL: ; %bb.0: ; %entry
; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
@@ -2310,7 +3359,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX12DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX12DAGISEL-NEXT: ; %bb.1: ; %else
; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2320,14 +3369,14 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s3, v0
-; GFX12DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX12DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s0, s2
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s3
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX12DAGISEL-NEXT: ; %bb.3: ; %if
; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -2340,7 +3389,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s1
-; GFX12DAGISEL-NEXT: .LBB4_4: ; %endif
+; GFX12DAGISEL-NEXT: .LBB6_4: ; %endif
; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, 0
@@ -2602,7 +3651,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
; GFX8DAGISEL-NEXT: s_brev_b32 s7, 1
; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX8DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s10, s[4:5]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s6
; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s10
@@ -2613,7 +3662,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v4
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s7, v5
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8DAGISEL-NEXT: ; %bb.2:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s7
@@ -2627,7 +3676,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX8GISEL-NEXT: s_mov_b32 s6, 0
; GFX8GISEL-NEXT: s_brev_b32 s7, 1
; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX8GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s10, s[4:5]
; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s6
; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s10
@@ -2638,7 +3687,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v4
; GFX8GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8GISEL-NEXT: ; %bb.2:
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s7
@@ -2652,7 +3701,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
; GFX9DAGISEL-NEXT: s_brev_b32 s7, 1
; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX9DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s10, s[4:5]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s6
; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s10
@@ -2663,7 +3712,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v4
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s7, v5
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9DAGISEL-NEXT: ; %bb.2:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s7
@@ -2677,7 +3726,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX9GISEL-NEXT: s_mov_b32 s6, 0
; GFX9GISEL-NEXT: s_brev_b32 s7, 1
; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX9GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s10, s[4:5]
; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s6
; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s10
@@ -2688,7 +3737,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v4
; GFX9GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9GISEL-NEXT: ; %bb.2:
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s7
@@ -2702,7 +3751,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
; GFX1064DAGISEL-NEXT: s_brev_b32 s7, 1
; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX1064DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s10, s[4:5]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s10
; GFX1064DAGISEL-NEXT: v_readlane_b32 s9, v3, s10
@@ -2711,7 +3760,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1064DAGISEL-NEXT: v_add_f64 v[4:5], s[8:9], s[6:7]
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v4
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s7, v5
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064DAGISEL-NEXT: ; %bb.2:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s7
@@ -2724,7 +3773,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
; GFX1064GISEL-NEXT: s_brev_b32 s7, 1
; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX1064GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s10, s[4:5]
; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s10
; GFX1064GISEL-NEXT: v_readlane_b32 s9, v3, s10
@@ -2733,7 +3782,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1064GISEL-NEXT: v_add_f64 v[4:5], s[8:9], s[6:7]
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v4
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064GISEL-NEXT: ; %bb.2:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s7
@@ -2746,7 +3795,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1032DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX1032DAGISEL-NEXT: s_brev_b32 s5, 1
; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6
; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1032DAGISEL-NEXT: v_readlane_b32 s9, v3, s7
@@ -2755,7 +3804,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1032DAGISEL-NEXT: v_add_f64 v[4:5], s[8:9], s[4:5]
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s4, v4
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v5
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032DAGISEL-NEXT: ; %bb.2:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2768,7 +3817,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1032GISEL-NEXT: s_mov_b32 s4, 0
; GFX1032GISEL-NEXT: s_brev_b32 s5, 1
; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6
; GFX1032GISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1032GISEL-NEXT: v_readlane_b32 s9, v3, s7
@@ -2777,7 +3826,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1032GISEL-NEXT: v_add_f64 v[4:5], s[8:9], s[4:5]
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s4, v4
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v5
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032GISEL-NEXT: ; %bb.2:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2790,7 +3839,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1164DAGISEL-NEXT: s_mov_b32 s2, 0
; GFX1164DAGISEL-NEXT: s_brev_b32 s3, 1
; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s6, s[0:1]
; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s6
@@ -2802,7 +3859,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v4
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s3, v5
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164DAGISEL-NEXT: ; %bb.2:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s3
@@ -2815,7 +3872,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1164GISEL-NEXT: s_mov_b32 s2, 0
; GFX1164GISEL-NEXT: s_brev_b32 s3, 1
; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s6, s[0:1]
; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s6
@@ -2827,7 +3884,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v4
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s3, v5
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164GISEL-NEXT: ; %bb.2:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s3
@@ -2840,7 +3897,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1132DAGISEL-NEXT: s_mov_b32 s0, 0
; GFX1132DAGISEL-NEXT: s_brev_b32 s1, 1
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -2852,7 +3909,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s0, v4
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v5
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132DAGISEL-NEXT: ; %bb.2:
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -2864,7 +3921,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
; GFX1132GISEL-NEXT: s_brev_b32 s1, 1
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132GISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -2876,7 +3933,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s0, v4
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v5
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132GISEL-NEXT: ; %bb.2:
; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -2892,7 +3949,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX12DAGISEL-NEXT: s_mov_b32 s0, 0
; GFX12DAGISEL-NEXT: s_brev_b32 s1, 1
; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX12DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -2905,7 +3962,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s0, v4
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v5
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX12DAGISEL-NEXT: ; %bb.2:
; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
@@ -2926,7 +3983,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX8DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -2935,13 +3992,13 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX8DAGISEL-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1]
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v0
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s7, v1
-; GFX8DAGISEL-NEXT: .LBB7_2: ; %Flow
+; GFX8DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s7
; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -2951,7 +4008,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s5, v1
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX8DAGISEL-NEXT: .LBB7_4: ; %endif
+; GFX8DAGISEL-NEXT: .LBB9_4: ; %endif
; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
@@ -2965,7 +4022,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX8GISEL-NEXT: ; %bb.1: ; %else
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -2974,13 +4031,13 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX8GISEL-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1]
; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v0
; GFX8GISEL-NEXT: v_readfirstlane_b32 s7, v1
-; GFX8GISEL-NEXT: .LBB7_2: ; %Flow
+; GFX8GISEL-NEXT: .LBB9_2: ; %Flow
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7
; GFX8GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX8GISEL-NEXT: ; %bb.3: ; %if
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
@@ -2992,7 +4049,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX8GISEL-NEXT: v_readfirstlane_b32 s5, v1
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX8GISEL-NEXT: .LBB7_4: ; %endif
+; GFX8GISEL-NEXT: .LBB9_4: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -3007,7 +4064,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX9DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -3016,13 +4073,13 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX9DAGISEL-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1]
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s4, v0
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s5, v1
-; GFX9DAGISEL-NEXT: .LBB7_2: ; %Flow
+; GFX9DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s5
; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -3032,7 +4089,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s5, v1
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX9DAGISEL-NEXT: .LBB7_4: ; %endif
+; GFX9DAGISEL-NEXT: .LBB9_4: ; %endif
; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX9DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3045,7 +4102,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX9GISEL-NEXT: ; %bb.1: ; %else
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -3054,13 +4111,13 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX9GISEL-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1]
; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v0
; GFX9GISEL-NEXT: v_readfirstlane_b32 s7, v1
-; GFX9GISEL-NEXT: .LBB7_2: ; %Flow
+; GFX9GISEL-NEXT: .LBB9_2: ; %Flow
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7
; GFX9GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX9GISEL-NEXT: ; %bb.3: ; %if
; GFX9GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
@@ -3072,7 +4129,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX9GISEL-NEXT: v_readfirstlane_b32 s5, v1
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX9GISEL-NEXT: .LBB7_4: ; %endif
+; GFX9GISEL-NEXT: .LBB9_4: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3087,7 +4144,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr8_sgpr9
; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1064DAGISEL-NEXT: s_mov_b64 s[8:9], exec
; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s8, s[8:9]
@@ -3096,13 +4153,13 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1064DAGISEL-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1]
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s8, v0
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s9, v1
-; GFX1064DAGISEL-NEXT: .LBB7_2: ; %Flow
+; GFX1064DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[4:5]
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s8
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s9
; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -3112,7 +4169,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s5, v1
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX1064DAGISEL-NEXT: .LBB7_4: ; %endif
+; GFX1064DAGISEL-NEXT: .LBB9_4: ; %endif
; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1064DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3125,7 +4182,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1064GISEL-NEXT: ; implicit-def: $sgpr8_sgpr9
; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX1064GISEL-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1064GISEL-NEXT: ; %bb.1: ; %else
; GFX1064GISEL-NEXT: s_mov_b64 s[8:9], exec
; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s8, s[8:9]
@@ -3134,13 +4191,13 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1064GISEL-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1]
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s8, v0
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s9, v1
-; GFX1064GISEL-NEXT: .LBB7_2: ; %Flow
+; GFX1064GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[6:7]
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s8
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s9
; GFX1064GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1064GISEL-NEXT: ; %bb.3: ; %if
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -3152,7 +4209,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s5, v1
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX1064GISEL-NEXT: .LBB7_4: ; %endif
+; GFX1064GISEL-NEXT: .LBB9_4: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1064GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3167,7 +4224,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4
@@ -3176,13 +4233,13 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1032DAGISEL-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1]
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s4, v0
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v1
-; GFX1032DAGISEL-NEXT: .LBB7_2: ; %Flow
+; GFX1032DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s5
; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
@@ -3192,7 +4249,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v1
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX1032DAGISEL-NEXT: .LBB7_4: ; %endif
+; GFX1032DAGISEL-NEXT: .LBB9_4: ; %endif
; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1032DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3205,7 +4262,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1032GISEL-NEXT: ; %bb.1: ; %else
; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo
; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s6, s6
@@ -3214,13 +4271,13 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1032GISEL-NEXT: v_mul_f64 v[0:1], s[2:3], v[0:1]
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s6, v0
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s7, v1
-; GFX1032GISEL-NEXT: .LBB7_2: ; %Flow
+; GFX1032GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7
; GFX1032GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1032GISEL-NEXT: ; %bb.3: ; %if
; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
@@ -3232,7 +4289,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v1
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX1032GISEL-NEXT: .LBB7_4: ; %endif
+; GFX1032GISEL-NEXT: .LBB9_4: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3249,7 +4306,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX1164DAGISEL-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1164DAGISEL-NEXT: s_mov_b64 s[8:9], exec
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3261,14 +4318,14 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s8, v0
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s9, v1
-; GFX1164DAGISEL-NEXT: .LBB7_2: ; %Flow
+; GFX1164DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[6:7]
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s8
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s9
; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3282,7 +4339,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX1164DAGISEL-NEXT: .LBB7_4: ; %endif
+; GFX1164DAGISEL-NEXT: .LBB9_4: ; %endif
; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1164DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -3297,7 +4354,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1164GISEL-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1164GISEL-NEXT: ; %bb.1: ; %else
; GFX1164GISEL-NEXT: s_mov_b64 s[8:9], exec
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3309,14 +4366,14 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s8, v0
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s9, v1
-; GFX1164GISEL-NEXT: .LBB7_2: ; %Flow
+; GFX1164GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[6:7]
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s8
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s9
; GFX1164GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1164GISEL-NEXT: ; %bb.3: ; %if
; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
@@ -3331,7 +4388,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX1164GISEL-NEXT: .LBB7_4: ; %endif
+; GFX1164GISEL-NEXT: .LBB9_4: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1164GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -3348,7 +4405,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX1132DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1132DAGISEL-NEXT: s_mov_b32 s6, exec_lo
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3360,13 +4417,13 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s6, v0
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s7, v1
-; GFX1132DAGISEL-NEXT: .LBB7_2: ; %Flow
+; GFX1132DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3378,7 +4435,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s5, v1
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1132DAGISEL-NEXT: .LBB7_4: ; %endif
+; GFX1132DAGISEL-NEXT: .LBB9_4: ; %endif
; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -3393,7 +4450,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1132GISEL-NEXT: ; %bb.1: ; %else
; GFX1132GISEL-NEXT: s_mov_b32 s6, exec_lo
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3405,13 +4462,13 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s6, v0
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s7, v1
-; GFX1132GISEL-NEXT: .LBB7_2: ; %Flow
+; GFX1132GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX1132GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1132GISEL-NEXT: ; %bb.3: ; %if
; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
@@ -3425,7 +4482,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s5, v1
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1132GISEL-NEXT: .LBB7_4: ; %endif
+; GFX1132GISEL-NEXT: .LBB9_4: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -3442,7 +4499,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX12DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX12DAGISEL-NEXT: ; %bb.1: ; %else
; GFX12DAGISEL-NEXT: s_mov_b32 s6, exec_lo
; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3454,7 +4511,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s6, v0
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s7, v1
-; GFX12DAGISEL-NEXT: .LBB7_2: ; %Flow
+; GFX12DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
@@ -3462,7 +4519,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX12DAGISEL-NEXT: ; %bb.3: ; %if
; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -3476,7 +4533,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s5, v1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12DAGISEL-NEXT: .LBB7_4: ; %endif
+; GFX12DAGISEL-NEXT: .LBB9_4: ; %endif
; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX12DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmax.ll
index 4e7a6ac5891fb..ec2fe44824cfd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmax.ll
@@ -7,46 +7,702 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL,GFX1164DAGISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL,GFX1164GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL,GFX1132DAGISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL,GFX1132GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL,GFX1164DAGISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL,GFX1164GISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL,GFX1132DAGISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL,GFX1132GISEL-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GFX12DAGISEL %s
-
-define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in) #0 {
-; GFX8DAGISEL-LABEL: uniform_value_float:
+define amdgpu_kernel void @uniform_value_half(ptr addrspace(1) %out, half %in) {
+; GFX8DAGISEL-LABEL: uniform_value_half:
; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0x7fc00000
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: v_max_f32_e32 v1, s4, v1
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: v_cvt_f16_f32_e32 v2, s4
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: flat_store_short v[0:1], v2
; GFX8DAGISEL-NEXT: s_endpgm
;
-; GFX8GISEL-LABEL: uniform_value_float:
+; GFX8GISEL-LABEL: uniform_value_half:
; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_cvt_f32_f16_e32 v0, s0
; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8GISEL-NEXT: v_cvt_f16_f32_e32 v2, s2
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: flat_store_short v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
-; GFX9DAGISEL-LABEL: uniform_value_float:
+; GFX9DAGISEL-LABEL: uniform_value_half:
; GFX9DAGISEL: ; %bb.0: ; %entry
; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0x7fc00000
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v1, s5
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: v_max_f32_e32 v2, s4, v2
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_cvt_f16_f32_e32 v1, s4
+; GFX9DAGISEL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9DAGISEL-NEXT: s_endpgm
;
+; GFX9GISEL-LABEL: uniform_value_half:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_short v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value_half:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_clause 0x1
+; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0x7fc00000
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v1, s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: v_max_f32_e64 v2, s4, s6
+; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_cvt_f16_f32_e32 v1, s4
+; GFX1064DAGISEL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value_half:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_clause 0x1
+; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX10GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GFX10GISEL-NEXT: global_store_short v1, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value_half:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_clause 0x1
+; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, 0x7fc00000
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s2
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v1, s4
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s4
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032DAGISEL-NEXT: v_max_f32_e64 v2, s3, s5
+; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s3, v2
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_cvt_f16_f32_e32 v1, s3
+; GFX1032DAGISEL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-FAKE16-LABEL: uniform_value_half:
+; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-FAKE16-NEXT: s_clause 0x1
+; GFX1164DAGISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b32 s4, 0x7fc00000
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-FAKE16-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-FAKE16-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s6, v1, s5
+; GFX1164DAGISEL-FAKE16-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-FAKE16-NEXT: v_max_f32_e64 v2, s4, s6
+; GFX1164DAGISEL-FAKE16-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1164DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX1164DAGISEL-FAKE16-NEXT: ; %bb.2:
+; GFX1164DAGISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, s4
+; GFX1164DAGISEL-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_endpgm
+;
+; GFX1164GISEL-FAKE16-LABEL: uniform_value_half:
+; GFX1164GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164GISEL-FAKE16-NEXT: s_clause 0x1
+; GFX1164GISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX1164GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GFX1164GISEL-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX1164GISEL-FAKE16-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-FAKE16-LABEL: uniform_value_half:
+; GFX1132DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-FAKE16-NEXT: s_clause 0x1
+; GFX1132DAGISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 s3, 0x7fc00000
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-FAKE16-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-FAKE16-NEXT: s_ctz_i32_b32 s4, s2
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_readlane_b32 s5, v1, s4
+; GFX1132DAGISEL-FAKE16-NEXT: s_bitset0_b32 s2, s4
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-FAKE16-NEXT: v_max_f32_e64 v2, s3, s5
+; GFX1132DAGISEL-FAKE16-NEXT: v_readfirstlane_b32 s3, v2
+; GFX1132DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX1132DAGISEL-FAKE16-NEXT: ; %bb.2:
+; GFX1132DAGISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, s3
+; GFX1132DAGISEL-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1132DAGISEL-FAKE16-NEXT: s_endpgm
+;
+; GFX1132GISEL-FAKE16-LABEL: uniform_value_half:
+; GFX1132GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1132GISEL-FAKE16-NEXT: s_clause 0x1
+; GFX1132GISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX1132GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GFX1132GISEL-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX1132GISEL-FAKE16-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-TRUE16-LABEL: uniform_value_half:
+; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-TRUE16-NEXT: s_clause 0x1
+; GFX1164DAGISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b32 s4, 0x7fc00000
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-TRUE16-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-TRUE16-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s6, v1, s5
+; GFX1164DAGISEL-TRUE16-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-TRUE16-NEXT: v_max_f32_e64 v2, s4, s6
+; GFX1164DAGISEL-TRUE16-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1164DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX1164DAGISEL-TRUE16-NEXT: ; %bb.2:
+; GFX1164DAGISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, s4
+; GFX1164DAGISEL-TRUE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX1164GISEL-TRUE16-LABEL: uniform_value_half:
+; GFX1164GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164GISEL-TRUE16-NEXT: s_clause 0x1
+; GFX1164GISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX1164GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s2
+; GFX1164GISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX1164GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-TRUE16-LABEL: uniform_value_half:
+; GFX1132DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-TRUE16-NEXT: s_clause 0x1
+; GFX1132DAGISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 s3, 0x7fc00000
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-TRUE16-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-TRUE16-NEXT: s_ctz_i32_b32 s4, s2
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_readlane_b32 s5, v1, s4
+; GFX1132DAGISEL-TRUE16-NEXT: s_bitset0_b32 s2, s4
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-TRUE16-NEXT: v_max_f32_e64 v2, s3, s5
+; GFX1132DAGISEL-TRUE16-NEXT: v_readfirstlane_b32 s3, v2
+; GFX1132DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX1132DAGISEL-TRUE16-NEXT: ; %bb.2:
+; GFX1132DAGISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, s3
+; GFX1132DAGISEL-TRUE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1132DAGISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX1132GISEL-TRUE16-LABEL: uniform_value_half:
+; GFX1132GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1132GISEL-TRUE16-NEXT: s_clause 0x1
+; GFX1132GISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX1132GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s2
+; GFX1132GISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX1132GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX12DAGISEL-LABEL: uniform_value_half:
+; GFX12DAGISEL: ; %bb.0: ; %entry
+; GFX12DAGISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12DAGISEL-NEXT: s_cvt_f32_f16 s2, s2
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX12DAGISEL-NEXT: s_cvt_f16_f32 s2, s2
+; GFX12DAGISEL-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX12DAGISEL-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX12DAGISEL-NEXT: s_endpgm
+ entry:
+ %result = call half @llvm.amdgcn.wave.reduce.fmax(half %in, i32 1)
+ store half %result, ptr addrspace(1) %out
+ ret void
+}
+
+define void @divergent_value_half(ptr addrspace(1) %out, half %in) {
+; GFX8DAGISEL-LABEL: divergent_value_half:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000
+; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8DAGISEL-NEXT: v_max_f32_e32 v3, s6, v3
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: v_cvt_f16_f32_e32 v2, s6
+; GFX8DAGISEL-NEXT: flat_store_short v[0:1], v2
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_half:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT: s_mov_b32 s6, 0x7fc00000
+; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8
+; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8GISEL-NEXT: v_max_f32_e32 v3, s6, v3
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: v_cvt_f16_f32_e32 v2, s6
+; GFX8GISEL-NEXT: flat_store_short v[0:1], v2
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_half:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000
+; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9DAGISEL-NEXT: v_max_f32_e32 v3, s6, v3
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_cvt_f16_f32_e32 v2, s6
+; GFX9DAGISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_half:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT: s_mov_b32 s6, 0x7fc00000
+; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8
+; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9GISEL-NEXT: v_max_f32_e32 v3, s6, v3
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_cvt_f16_f32_e32 v2, s6
+; GFX9GISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_half:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000
+; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064DAGISEL-NEXT: v_max_f32_e64 v3, s6, s8
+; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_cvt_f16_f32_e32 v2, s6
+; GFX1064DAGISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_half:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s6, 0x7fc00000
+; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064GISEL-NEXT: v_max_f32_e64 v3, s6, s8
+; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_cvt_f16_f32_e32 v2, s6
+; GFX1064GISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_half:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s5, 0x7fc00000
+; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, v2, s6
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s6
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032DAGISEL-NEXT: v_max_f32_e64 v3, s5, s7
+; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_cvt_f16_f32_e32 v2, s5
+; GFX1032DAGISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_half:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s5, 0x7fc00000
+; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4
+; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6
+; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032GISEL-NEXT: v_max_f32_e64 v3, s5, s7
+; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_cvt_f16_f32_e32 v2, s5
+; GFX1032GISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-FAKE16-LABEL: divergent_value_half:
+; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b32 s2, 0x7fc00000
+; GFX1164DAGISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-FAKE16-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s4, v2, s3
+; GFX1164DAGISEL-FAKE16-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164DAGISEL-FAKE16-NEXT: v_max_f32_e64 v3, s2, s4
+; GFX1164DAGISEL-FAKE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164DAGISEL-FAKE16-NEXT: ; %bb.2:
+; GFX1164DAGISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, s2
+; GFX1164DAGISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1164DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-FAKE16-LABEL: divergent_value_half:
+; GFX1164GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1164GISEL-FAKE16-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-FAKE16-NEXT: s_mov_b32 s2, 0x7fc00000
+; GFX1164GISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-FAKE16-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-FAKE16-NEXT: v_readlane_b32 s4, v2, s3
+; GFX1164GISEL-FAKE16-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX1164GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-FAKE16-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164GISEL-FAKE16-NEXT: v_max_f32_e64 v3, s2, s4
+; GFX1164GISEL-FAKE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164GISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164GISEL-FAKE16-NEXT: ; %bb.2:
+; GFX1164GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, s2
+; GFX1164GISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1164GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-FAKE16-LABEL: divergent_value_half:
+; GFX1132DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 s1, 0x7fc00000
+; GFX1132DAGISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-FAKE16-NEXT: s_ctz_i32_b32 s2, s0
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_readlane_b32 s3, v2, s2
+; GFX1132DAGISEL-FAKE16-NEXT: s_bitset0_b32 s0, s2
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132DAGISEL-FAKE16-NEXT: v_max_f32_e64 v3, s1, s3
+; GFX1132DAGISEL-FAKE16-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1132DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132DAGISEL-FAKE16-NEXT: ; %bb.2:
+; GFX1132DAGISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, s1
+; GFX1132DAGISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1132DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-FAKE16-LABEL: divergent_value_half:
+; GFX1132GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1132GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1132GISEL-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132GISEL-FAKE16-NEXT: s_mov_b32 s1, 0x7fc00000
+; GFX1132GISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-FAKE16-NEXT: s_ctz_i32_b32 s2, s0
+; GFX1132GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-FAKE16-NEXT: v_readlane_b32 s3, v2, s2
+; GFX1132GISEL-FAKE16-NEXT: s_bitset0_b32 s0, s2
+; GFX1132GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-FAKE16-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132GISEL-FAKE16-NEXT: v_max_f32_e64 v3, s1, s3
+; GFX1132GISEL-FAKE16-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1132GISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132GISEL-FAKE16-NEXT: ; %bb.2:
+; GFX1132GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, s1
+; GFX1132GISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1132GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-TRUE16-LABEL: divergent_value_half:
+; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b32 s2, 0x7fc00000
+; GFX1164DAGISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-TRUE16-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s4, v2, s3
+; GFX1164DAGISEL-TRUE16-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164DAGISEL-TRUE16-NEXT: v_max_f32_e64 v3, s2, s4
+; GFX1164DAGISEL-TRUE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164DAGISEL-TRUE16-NEXT: ; %bb.2:
+; GFX1164DAGISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, s2
+; GFX1164DAGISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1164DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-TRUE16-LABEL: divergent_value_half:
+; GFX1164GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX1164GISEL-TRUE16-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-TRUE16-NEXT: s_mov_b32 s2, 0x7fc00000
+; GFX1164GISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-TRUE16-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-TRUE16-NEXT: v_readlane_b32 s4, v2, s3
+; GFX1164GISEL-TRUE16-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX1164GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-TRUE16-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164GISEL-TRUE16-NEXT: v_max_f32_e64 v3, s2, s4
+; GFX1164GISEL-TRUE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164GISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164GISEL-TRUE16-NEXT: ; %bb.2:
+; GFX1164GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, s2
+; GFX1164GISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1164GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-TRUE16-LABEL: divergent_value_half:
+; GFX1132DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 s1, 0x7fc00000
+; GFX1132DAGISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-TRUE16-NEXT: s_ctz_i32_b32 s2, s0
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_readlane_b32 s3, v2, s2
+; GFX1132DAGISEL-TRUE16-NEXT: s_bitset0_b32 s0, s2
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132DAGISEL-TRUE16-NEXT: v_max_f32_e64 v3, s1, s3
+; GFX1132DAGISEL-TRUE16-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1132DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132DAGISEL-TRUE16-NEXT: ; %bb.2:
+; GFX1132DAGISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, s1
+; GFX1132DAGISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1132DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-TRUE16-LABEL: divergent_value_half:
+; GFX1132GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1132GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX1132GISEL-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132GISEL-TRUE16-NEXT: s_mov_b32 s1, 0x7fc00000
+; GFX1132GISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-TRUE16-NEXT: s_ctz_i32_b32 s2, s0
+; GFX1132GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-TRUE16-NEXT: v_readlane_b32 s3, v2, s2
+; GFX1132GISEL-TRUE16-NEXT: s_bitset0_b32 s0, s2
+; GFX1132GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-TRUE16-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132GISEL-TRUE16-NEXT: v_max_f32_e64 v3, s1, s3
+; GFX1132GISEL-TRUE16-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1132GISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132GISEL-TRUE16-NEXT: ; %bb.2:
+; GFX1132GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, s1
+; GFX1132GISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1132GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12DAGISEL-LABEL: divergent_value_half:
+; GFX12DAGISEL: ; %bb.0: ; %entry
+; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_expcnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12DAGISEL-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX12DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0x7fc00000
+; GFX12DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s2, s0
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: v_readlane_b32 s3, v2, s2
+; GFX12DAGISEL-NEXT: s_bitset0_b32 s0, s2
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GFX12DAGISEL-NEXT: v_max_num_f32_e64 v3, s1, s3
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v3
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX12DAGISEL-NEXT: ; %bb.2:
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12DAGISEL-NEXT: v_cvt_f16_f32_e32 v2.l, s1
+; GFX12DAGISEL-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12DAGISEL-NEXT: s_setpc_b64 s[30:31]
+ entry:
+ %result = call half @llvm.amdgcn.wave.reduce.fmax(half %in, i32 1)
+ store half %result, ptr addrspace(1) %out
+ ret void
+}
+
+
+define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in) #0 {
+; GFX8GISEL-LABEL: uniform_value_float:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
; GFX9GISEL-LABEL: uniform_value_float:
; GFX9GISEL: ; %bb.0: ; %entry
; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
@@ -57,16 +713,16 @@ define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in)
; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9GISEL-NEXT: s_endpgm
;
-; GFX10DAGISEL-LABEL: uniform_value_float:
-; GFX10DAGISEL: ; %bb.0: ; %entry
-; GFX10DAGISEL-NEXT: s_clause 0x1
-; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX10DAGISEL-NEXT: s_endpgm
+; GFX1064DAGISEL-LABEL: uniform_value_float:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_clause 0x1
+; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
;
; GFX10GISEL-LABEL: uniform_value_float:
; GFX10GISEL: ; %bb.0: ; %entry
@@ -79,6 +739,17 @@ define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in)
; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10GISEL-NEXT: s_endpgm
;
+; GFX1032DAGISEL-LABEL: uniform_value_float:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_clause 0x1
+; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
; GFX1164DAGISEL-LABEL: uniform_value_float:
; GFX1164DAGISEL: ; %bb.0: ; %entry
; GFX1164DAGISEL-NEXT: s_clause 0x1
@@ -125,7 +800,7 @@ define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in)
; GFX12DAGISEL: ; %bb.0: ; %entry
; GFX12DAGISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12DAGISEL-NEXT: s_endpgm
entry:
@@ -140,7 +819,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000
-; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8
@@ -148,7 +827,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX8DAGISEL-NEXT: v_max_f32_e32 v3, s6, v3
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8DAGISEL-NEXT: ; %bb.2:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
@@ -160,7 +839,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX8GISEL-NEXT: s_mov_b32 s6, 0x7fc00000
-; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8
@@ -168,7 +847,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX8GISEL-NEXT: v_max_f32_e32 v3, s6, v3
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8GISEL-NEXT: ; %bb.2:
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
@@ -180,7 +859,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000
-; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8
@@ -188,7 +867,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX9DAGISEL-NEXT: v_max_f32_e32 v3, s6, v3
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9DAGISEL-NEXT: ; %bb.2:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX9DAGISEL-NEXT: global_store_dword v[0:1], v2, off
@@ -200,7 +879,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX9GISEL-NEXT: s_mov_b32 s6, 0x7fc00000
-; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8
@@ -208,7 +887,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX9GISEL-NEXT: v_max_f32_e32 v3, s6, v3
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9GISEL-NEXT: ; %bb.2:
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX9GISEL-NEXT: global_store_dword v[0:1], v2, off
@@ -220,14 +899,14 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000
-; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX1064DAGISEL-NEXT: v_max_f32_e64 v3, s6, s8
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064DAGISEL-NEXT: ; %bb.2:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX1064DAGISEL-NEXT: global_store_dword v[0:1], v2, off
@@ -238,14 +917,14 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX1064GISEL-NEXT: s_mov_b32 s6, 0x7fc00000
-; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX1064GISEL-NEXT: v_max_f32_e64 v3, s6, s8
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064GISEL-NEXT: ; %bb.2:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX1064GISEL-NEXT: global_store_dword v[0:1], v2, off
@@ -256,14 +935,14 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
; GFX1032DAGISEL-NEXT: s_mov_b32 s5, 0x7fc00000
-; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4
; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, v2, s6
; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s6
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX1032DAGISEL-NEXT: v_max_f32_e64 v3, s5, s7
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032DAGISEL-NEXT: ; %bb.2:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5
; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v2, off
@@ -274,14 +953,14 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo
; GFX1032GISEL-NEXT: s_mov_b32 s5, 0x7fc00000
-; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4
; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6
; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX1032GISEL-NEXT: v_max_f32_e64 v3, s5, s7
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032GISEL-NEXT: ; %bb.2:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s5
; GFX1032GISEL-NEXT: global_store_dword v[0:1], v2, off
@@ -292,7 +971,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1164DAGISEL-NEXT: s_mov_b32 s2, 0x7fc00000
-; GFX1164DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -301,7 +980,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX1164DAGISEL-NEXT: v_max_f32_e64 v3, s2, s4
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164DAGISEL-NEXT: ; %bb.2:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX1164DAGISEL-NEXT: global_store_b32 v[0:1], v2, off
@@ -312,7 +991,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1164GISEL-NEXT: s_mov_b32 s2, 0x7fc00000
-; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -321,7 +1000,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX1164GISEL-NEXT: v_max_f32_e64 v3, s2, s4
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164GISEL-NEXT: ; %bb.2:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX1164GISEL-NEXT: global_store_b32 v[0:1], v2, off
@@ -332,7 +1011,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0x7fc00000
-; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s2, s0
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2
@@ -341,7 +1020,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX1132DAGISEL-NEXT: v_max_f32_e64 v3, s1, s3
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v3
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132DAGISEL-NEXT: ; %bb.2:
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1
; GFX1132DAGISEL-NEXT: global_store_b32 v[0:1], v2, off
@@ -352,7 +1031,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1132GISEL-NEXT: s_mov_b32 s1, 0x7fc00000
-; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s0
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2
@@ -361,7 +1040,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX1132GISEL-NEXT: v_max_f32_e64 v3, s1, s3
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v3
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132GISEL-NEXT: ; %bb.2:
; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s1
; GFX1132GISEL-NEXT: global_store_b32 v[0:1], v2, off
@@ -376,7 +1055,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
; GFX12DAGISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0x7fc00000
-; GFX12DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s2, s0
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -387,7 +1066,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX12DAGISEL-NEXT: v_max_num_f32_e64 v3, s1, s3
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v3
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX12DAGISEL-NEXT: ; %bb.2:
; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v2, s1
@@ -420,7 +1099,7 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) #0 {
; GFX8DAGISEL-NEXT: s_nop 1
; GFX8DAGISEL-NEXT: v_max_f32_dpp v3, v3, v3 row_bcast:15 row_mask:0xf bank_mask:0xf
; GFX8DAGISEL-NEXT: s_nop 1
-; GFX8DAGISEL-NEXT: v_max_f32_dpp v3, v3, v3 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_max_f32_dpp v3, v3, v3 row_bcast:31 row_mask:0xf bank_mask:0xf
; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v3, 63
; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6
@@ -482,7 +1163,7 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) #0 {
; GFX9DAGISEL-NEXT: s_nop 1
; GFX9DAGISEL-NEXT: v_max_f32_dpp v3, v3, v3 row_bcast:15 row_mask:0xf bank_mask:0xf
; GFX9DAGISEL-NEXT: s_nop 1
-; GFX9DAGISEL-NEXT: v_max_f32_dpp v3, v3, v3 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_max_f32_dpp v3, v3, v3 row_bcast:31 row_mask:0xf bank_mask:0xf
; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v3, 63
; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
@@ -542,7 +1225,7 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) #0 {
; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v5, 32, v5
; GFX1064DAGISEL-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v5, 4, v5
-; GFX1064DAGISEL-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064DAGISEL-NEXT: v_max_f32_e32 v3, v3, v4
@@ -622,7 +1307,7 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) #0 {
; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v3, 31
; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s4
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5
-; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v2, off
+; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v2, off
; GFX1032DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1
; GFX1032DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload
; GFX1032DAGISEL-NEXT: buffer_load_dword v3, off, s[0:3], s32
@@ -662,49 +1349,49 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) #0 {
; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1164DAGISEL-LABEL: divergent_value_float_dpp:
-; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX1164DAGISEL-NEXT: s_clause 0x2 ; 12-byte Folded Spill
-; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v3, s32
-; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:4
-; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:8
-; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v3, 0x7fc00000, v2, s[0:1]
-; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v5, -1, 0
-; GFX1164DAGISEL-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v5, -1, v5
-; GFX1164DAGISEL-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v5, 32, v5
-; GFX1164DAGISEL-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v5, 4, v5
-; GFX1164DAGISEL-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: v_max_f32_e32 v3, v3, v4
-; GFX1164DAGISEL-NEXT: ds_permute_b32 v4, v5, v3
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: v_max_f32_e32 v3, v3, v4
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_readlane_b32 s2, v3, 63
-; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164DAGISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
-; GFX1164DAGISEL-NEXT: s_clause 0x2 ; 12-byte Folded Reload
-; GFX1164DAGISEL-NEXT: scratch_load_b32 v3, off, s32
-; GFX1164DAGISEL-NEXT: scratch_load_b32 v4, off, s32 offset:4
-; GFX1164DAGISEL-NEXT: scratch_load_b32 v5, off, s32 offset:8
-; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1164DAGISEL-FAKE16-LABEL: divergent_value_float_dpp:
+; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-FAKE16-NEXT: s_clause 0x2 ; 12-byte Folded Spill
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_store_b32 off, v3, s32
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_store_b32 off, v4, s32 offset:4
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_store_b32 off, v5, s32 offset:8
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0x7fc00000, v2, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v5, -1, 0
+; GFX1164DAGISEL-FAKE16-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v5, -1, v5
+; GFX1164DAGISEL-FAKE16-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_add_nc_u32_e32 v5, 32, v5
+; GFX1164DAGISEL-FAKE16-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mul_lo_u32 v5, 4, v5
+; GFX1164DAGISEL-FAKE16-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX1164DAGISEL-FAKE16-NEXT: ds_permute_b32 v4, v5, v3
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s2, v3, 63
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-FAKE16-NEXT: global_store_b32 v[0:1], v2, off
+; GFX1164DAGISEL-FAKE16-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-FAKE16-NEXT: s_clause 0x2 ; 12-byte Folded Reload
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_load_b32 v3, off, s32
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_load_b32 v4, off, s32 offset:4
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_load_b32 v5, off, s32 offset:8
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1164GISEL-LABEL: divergent_value_float_dpp:
; GFX1164GISEL: ; %bb.0: ; %entry
@@ -750,37 +1439,37 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) #0 {
; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1132DAGISEL-LABEL: divergent_value_float_dpp:
-; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX1132DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill
-; GFX1132DAGISEL-NEXT: scratch_store_b32 off, v3, s32
-; GFX1132DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:4
-; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v3, 0x7fc00000, v2, s0
-; GFX1132DAGISEL-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
-; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: v_max_f32_e32 v3, v3, v4
-; GFX1132DAGISEL-NEXT: v_readlane_b32 s1, v3, 31
-; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1
-; GFX1132DAGISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX1132DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload
-; GFX1132DAGISEL-NEXT: scratch_load_b32 v3, off, s32
-; GFX1132DAGISEL-NEXT: scratch_load_b32 v4, off, s32 offset:4
-; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1132DAGISEL-FAKE16-LABEL: divergent_value_float_dpp:
+; GFX1132DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-FAKE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_store_b32 off, v3, s32
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_store_b32 off, v4, s32 offset:4
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-FAKE16-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0x7fc00000, v2, s0
+; GFX1132DAGISEL-FAKE16-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX1132DAGISEL-FAKE16-NEXT: v_readlane_b32 s1, v3, 31
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s1
+; GFX1132DAGISEL-FAKE16-NEXT: global_store_b32 v[0:1], v2, off
+; GFX1132DAGISEL-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-FAKE16-NEXT: s_clause 0x1 ; 8-byte Folded Reload
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_load_b32 v3, off, s32
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_load_b32 v4, off, s32 offset:4
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1132GISEL-LABEL: divergent_value_float_dpp:
; GFX1132GISEL: ; %bb.0: ; %entry
@@ -814,6 +1505,86 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) #0 {
; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31]
;
+; GFX1164DAGISEL-TRUE16-LABEL: divergent_value_float_dpp:
+; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-TRUE16-NEXT: s_clause 0x2 ; 12-byte Folded Spill
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_store_b32 off, v3, s32
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_store_b32 off, v4, s32 offset:4
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_store_b32 off, v5, s32 offset:8
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0x7fc00000, v2, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v5, -1, 0
+; GFX1164DAGISEL-TRUE16-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v5, -1, v5
+; GFX1164DAGISEL-TRUE16-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_add_nc_u32_e32 v5, 32, v5
+; GFX1164DAGISEL-TRUE16-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mul_lo_u32 v5, 4, v5
+; GFX1164DAGISEL-TRUE16-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX1164DAGISEL-TRUE16-NEXT: ds_permute_b32 v4, v5, v3
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s2, v3, 63
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-TRUE16-NEXT: global_store_b32 v[0:1], v2, off
+; GFX1164DAGISEL-TRUE16-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-TRUE16-NEXT: s_clause 0x2 ; 12-byte Folded Reload
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_load_b32 v3, off, s32
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_load_b32 v4, off, s32 offset:4
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_load_b32 v5, off, s32 offset:8
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-TRUE16-LABEL: divergent_value_float_dpp:
+; GFX1132DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_store_b32 off, v3, s32
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_store_b32 off, v4, s32 offset:4
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-TRUE16-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0x7fc00000, v2, s0
+; GFX1132DAGISEL-TRUE16-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_max_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: v_max_f32_e32 v3, v3, v4
+; GFX1132DAGISEL-TRUE16-NEXT: v_readlane_b32 s1, v3, 31
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v2, s1
+; GFX1132DAGISEL-TRUE16-NEXT: global_store_b32 v[0:1], v2, off
+; GFX1132DAGISEL-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Reload
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_load_b32 v3, off, s32
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_load_b32 v4, off, s32 offset:4
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX12DAGISEL-LABEL: divergent_value_float_dpp:
; GFX12DAGISEL: ; %bb.0: ; %entry
; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -833,7 +1604,9 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) #0 {
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_max_num_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX12DAGISEL-NEXT: v_max_num_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_max_num_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX12DAGISEL-NEXT: v_max_num_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX12DAGISEL-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
@@ -923,7 +1696,16 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX8DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX8DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX8DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX8DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX8DAGISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX8DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX8DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
@@ -1075,7 +1857,16 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX9DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX9DAGISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
@@ -1219,7 +2010,16 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s7
; GFX1064DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX1064DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX1064DAGISEL-NEXT: s_clause 0x7 ; 32-byte Folded Reload
+; GFX1064DAGISEL-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; GFX1064DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
; GFX1064DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
; GFX1064DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
@@ -1411,81 +2211,86 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1164DAGISEL-LABEL: divergent_value_double_dpp:
-; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX1164DAGISEL-NEXT: s_clause 0x3 ; 28-byte Folded Spill
-; GFX1164DAGISEL-NEXT: scratch_store_b64 off, v[4:5], s32
-; GFX1164DAGISEL-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
-; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v6, s32 offset:16
-; GFX1164DAGISEL-NEXT: scratch_store_b64 off, v[7:8], s32 offset:20
-; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[0:1]
-; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v3, s[0:1]
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
-; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
-; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
-; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v6, -1, 0
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v6, -1, v6
-; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v6, 32, v6
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v6, 4, v6
-; GFX1164DAGISEL-NEXT: ds_permute_b32 v7, v6, v4
-; GFX1164DAGISEL-NEXT: ds_permute_b32 v8, v6, v5
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[7:8]
-; GFX1164DAGISEL-NEXT: v_readlane_b32 s2, v4, 63
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_readlane_b32 s3, v5, 63
-; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
-; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
-; GFX1164DAGISEL-NEXT: s_clause 0x3 ; 28-byte Folded Reload
-; GFX1164DAGISEL-NEXT: scratch_load_b64 v[4:5], off, s32
-; GFX1164DAGISEL-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
-; GFX1164DAGISEL-NEXT: scratch_load_b32 v6, off, s32 offset:16
-; GFX1164DAGISEL-NEXT: scratch_load_b64 v[7:8], off, s32 offset:20
-; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1164DAGISEL-FAKE16-LABEL: divergent_value_double_dpp:
+; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-FAKE16-NEXT: s_clause 0x3 ; 28-byte Folded Spill
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_store_b32 off, v6, s32 offset:16
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_store_b64 off, v[7:8], s32 offset:20
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v3, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-FAKE16-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-FAKE16-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v6, -1, 0
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v6, -1, v6
+; GFX1164DAGISEL-FAKE16-NEXT: v_add_nc_u32_e32 v6, 32, v6
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mul_lo_u32 v6, 4, v6
+; GFX1164DAGISEL-FAKE16-NEXT: ds_permute_b32 v7, v6, v4
+; GFX1164DAGISEL-FAKE16-NEXT: ds_permute_b32 v8, v6, v5
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: v_max_f64 v[4:5], v[4:5], v[7:8]
+; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s2, v4, 63
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s3, v5, 63
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164DAGISEL-FAKE16-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-FAKE16-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-FAKE16-NEXT: s_clause 0x3 ; 28-byte Folded Reload
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:16
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_load_b64 v[7:8], off, s32 offset:20
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1164GISEL-LABEL: divergent_value_double_dpp:
; GFX1164GISEL: ; %bb.0: ; %entry
@@ -1563,59 +2368,68 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1132DAGISEL-LABEL: divergent_value_double_dpp:
-; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX1132DAGISEL-NEXT: s_clause 0x1 ; 16-byte Folded Spill
-; GFX1132DAGISEL-NEXT: scratch_store_b64 off, v[4:5], s32
-; GFX1132DAGISEL-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
-; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s2
-; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v3, s2
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
-; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
-; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132DAGISEL-NEXT: v_readlane_b32 s0, v4, 31
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132DAGISEL-NEXT: v_readlane_b32 s1, v5, 31
-; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
-; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX1132DAGISEL-NEXT: s_clause 0x1 ; 16-byte Folded Reload
-; GFX1132DAGISEL-NEXT: scratch_load_b64 v[4:5], off, s32
-; GFX1132DAGISEL-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
-; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1132DAGISEL-FAKE16-LABEL: divergent_value_double_dpp:
+; GFX1132DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-FAKE16-NEXT: s_clause 0x1 ; 16-byte Folded Spill
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-FAKE16-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX1132DAGISEL-FAKE16-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v3, s2
+; GFX1132DAGISEL-FAKE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-FAKE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-FAKE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-FAKE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-FAKE16-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-FAKE16-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-FAKE16-NEXT: v_readlane_b32 s0, v4, 31
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132DAGISEL-FAKE16-NEXT: v_readlane_b32 s1, v5, 31
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-FAKE16-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-FAKE16-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-FAKE16-NEXT: s_clause 0x1 ; 16-byte Folded Reload
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1132GISEL-LABEL: divergent_value_double_dpp:
; GFX1132GISEL: ; %bb.0: ; %entry
@@ -1671,6 +2485,150 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31]
;
+; GFX1164DAGISEL-TRUE16-LABEL: divergent_value_double_dpp:
+; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-TRUE16-NEXT: s_clause 0x3 ; 28-byte Folded Spill
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_store_b32 off, v6, s32 offset:16
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_store_b64 off, v[7:8], s32 offset:20
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v3, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-TRUE16-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-TRUE16-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v6, -1, 0
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v6, -1, v6
+; GFX1164DAGISEL-TRUE16-NEXT: v_add_nc_u32_e32 v6, 32, v6
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mul_lo_u32 v6, 4, v6
+; GFX1164DAGISEL-TRUE16-NEXT: ds_permute_b32 v7, v6, v4
+; GFX1164DAGISEL-TRUE16-NEXT: ds_permute_b32 v8, v6, v5
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: v_max_f64 v[4:5], v[4:5], v[7:8]
+; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s2, v4, 63
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s3, v5, 63
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164DAGISEL-TRUE16-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-TRUE16-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-TRUE16-NEXT: s_clause 0x3 ; 28-byte Folded Reload
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_load_b32 v6, off, s32 offset:16
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_load_b64 v[7:8], off, s32 offset:20
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-TRUE16-LABEL: divergent_value_double_dpp:
+; GFX1132DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-TRUE16-NEXT: s_clause 0x1 ; 16-byte Folded Spill
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-TRUE16-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX1132DAGISEL-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v3, s2
+; GFX1132DAGISEL-TRUE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-TRUE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-TRUE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-TRUE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-TRUE16-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-TRUE16-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-TRUE16-NEXT: v_readlane_b32 s0, v4, 31
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132DAGISEL-TRUE16-NEXT: v_readlane_b32 s1, v5, Lowering intrinsic: 3713
+; GFX1132DAGISEL-TRUE16-NEXT: Lowering intrinsic: 3656
+; GFX1132DAGISEL-TRUE16-NEXT: Lowering intrinsic: 3656
+; GFX1132DAGISEL-TRUE16-NEXT: Lowering intrinsic: 3656
+; GFX1132DAGISEL-TRUE16-NEXT: Lowering intrinsic: 3656
+; GFX1132DAGISEL-TRUE16-NEXT: Lowering intrinsic: 3656
+; GFX1132DAGISEL-TRUE16-NEXT: Lowering intrinsic: 3298
+; GFX1132DAGISEL-TRUE16-NEXT: Lowering intrinsic: 3656
+; GFX1132DAGISEL-TRUE16-NEXT: Lowering intrinsic: 3656
+; GFX1132DAGISEL-TRUE16-NEXT: 31
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-TRUE16-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-TRUE16-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-TRUE16-NEXT: s_clause 0x1 ; 16-byte Folded Reload
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX12DAGISEL-LABEL: divergent_value_double_dpp:
; GFX12DAGISEL: ; %bb.0: ; %entry
; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1720,7 +2678,16 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX12DAGISEL-NEXT: v_readlane_b32 s0, v4, 31
; GFX12DAGISEL-NEXT: v_readlane_b32 s1, v5, 31
; GFX12DAGISEL-NEXT: s_mov_b32 exec_lo, s2
-; GFX12DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12DAGISEL-NEXT: s_clause 0x1 ; 16-byte Folded Reload
@@ -1745,11 +2712,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr4
; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000
-; GFX8DAGISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
; GFX8DAGISEL-NEXT: v_readlane_b32 s10, v2, s9
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s10
@@ -1757,17 +2724,17 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX8DAGISEL-NEXT: v_max_f32_e32 v3, s8, v3
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s8, v3
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB6_2
; GFX8DAGISEL-NEXT: ; %bb.3:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s8
; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr3
-; GFX8DAGISEL-NEXT: .LBB4_4: ; %Flow
+; GFX8DAGISEL-NEXT: .LBB6_4: ; %Flow
; GFX8DAGISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_8
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB6_8
; GFX8DAGISEL-NEXT: ; %bb.5: ; %if
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000
-; GFX8DAGISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
; GFX8DAGISEL-NEXT: v_readlane_b32 s10, v3, s9
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s10
@@ -1775,10 +2742,10 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX8DAGISEL-NEXT: v_max_f32_e32 v2, s8, v2
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s8, v2
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_6
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB6_6
; GFX8DAGISEL-NEXT: ; %bb.7:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s8
-; GFX8DAGISEL-NEXT: .LBB4_8: ; %endif
+; GFX8DAGISEL-NEXT: .LBB6_8: ; %endif
; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v4
; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1792,11 +2759,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX8GISEL-NEXT: ; implicit-def: $vgpr4
; GFX8GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX8GISEL-NEXT: ; %bb.1: ; %else
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8GISEL-NEXT: s_mov_b32 s8, 0x7fc00000
-; GFX8GISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
; GFX8GISEL-NEXT: v_readlane_b32 s10, v2, s9
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s10
@@ -1804,17 +2771,17 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX8GISEL-NEXT: v_max_f32_e32 v3, s8, v3
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX8GISEL-NEXT: v_readfirstlane_b32 s8, v3
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB6_2
; GFX8GISEL-NEXT: ; %bb.3:
; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s8
; GFX8GISEL-NEXT: ; implicit-def: $vgpr3
-; GFX8GISEL-NEXT: .LBB4_4: ; %Flow
+; GFX8GISEL-NEXT: .LBB6_4: ; %Flow
; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_8
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_8
; GFX8GISEL-NEXT: ; %bb.5: ; %if
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8GISEL-NEXT: s_mov_b32 s8, 0x7fc00000
-; GFX8GISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
; GFX8GISEL-NEXT: v_readlane_b32 s10, v3, s9
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s10
@@ -1822,10 +2789,10 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX8GISEL-NEXT: v_max_f32_e32 v2, s8, v2
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX8GISEL-NEXT: v_readfirstlane_b32 s8, v2
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_6
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB6_6
; GFX8GISEL-NEXT: ; %bb.7:
; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s8
-; GFX8GISEL-NEXT: .LBB4_8: ; %endif
+; GFX8GISEL-NEXT: .LBB6_8: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v4
; GFX8GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1839,11 +2806,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr4
; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX9DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000
-; GFX9DAGISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
; GFX9DAGISEL-NEXT: v_readlane_b32 s10, v2, s9
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s10
@@ -1851,17 +2818,17 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX9DAGISEL-NEXT: v_max_f32_e32 v3, s8, v3
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s8, v3
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB6_2
; GFX9DAGISEL-NEXT: ; %bb.3:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s8
; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr3
-; GFX9DAGISEL-NEXT: .LBB4_4: ; %Flow
+; GFX9DAGISEL-NEXT: .LBB6_4: ; %Flow
; GFX9DAGISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_8
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB6_8
; GFX9DAGISEL-NEXT: ; %bb.5: ; %if
; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX9DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000
-; GFX9DAGISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
; GFX9DAGISEL-NEXT: v_readlane_b32 s10, v3, s9
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s10
@@ -1869,10 +2836,10 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX9DAGISEL-NEXT: v_max_f32_e32 v2, s8, v2
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s8, v2
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_6
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB6_6
; GFX9DAGISEL-NEXT: ; %bb.7:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s8
-; GFX9DAGISEL-NEXT: .LBB4_8: ; %endif
+; GFX9DAGISEL-NEXT: .LBB6_8: ; %endif
; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9DAGISEL-NEXT: global_store_dword v[0:1], v4, off
; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1886,11 +2853,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX9GISEL-NEXT: ; implicit-def: $vgpr4
; GFX9GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX9GISEL-NEXT: ; %bb.1: ; %else
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX9GISEL-NEXT: s_mov_b32 s8, 0x7fc00000
-; GFX9GISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
; GFX9GISEL-NEXT: v_readlane_b32 s10, v2, s9
; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s10
@@ -1898,17 +2865,17 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX9GISEL-NEXT: v_max_f32_e32 v3, s8, v3
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9GISEL-NEXT: v_readfirstlane_b32 s8, v3
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB6_2
; GFX9GISEL-NEXT: ; %bb.3:
; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s8
; GFX9GISEL-NEXT: ; implicit-def: $vgpr3
-; GFX9GISEL-NEXT: .LBB4_4: ; %Flow
+; GFX9GISEL-NEXT: .LBB6_4: ; %Flow
; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_8
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_8
; GFX9GISEL-NEXT: ; %bb.5: ; %if
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX9GISEL-NEXT: s_mov_b32 s8, 0x7fc00000
-; GFX9GISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
; GFX9GISEL-NEXT: v_readlane_b32 s10, v3, s9
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s10
@@ -1916,10 +2883,10 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX9GISEL-NEXT: v_max_f32_e32 v2, s8, v2
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9GISEL-NEXT: v_readfirstlane_b32 s8, v2
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_6
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB6_6
; GFX9GISEL-NEXT: ; %bb.7:
; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s8
-; GFX9GISEL-NEXT: .LBB4_8: ; %endif
+; GFX9GISEL-NEXT: .LBB6_8: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9GISEL-NEXT: global_store_dword v[0:1], v4, off
; GFX9GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1933,38 +2900,38 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr4
; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1064DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000
-; GFX1064DAGISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s10, v2, s9
; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX1064DAGISEL-NEXT: v_max_f32_e64 v3, s8, s10
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s8, v3
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB6_2
; GFX1064DAGISEL-NEXT: ; %bb.3:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s8
; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr3
-; GFX1064DAGISEL-NEXT: .LBB4_4: ; %Flow
+; GFX1064DAGISEL-NEXT: .LBB6_4: ; %Flow
; GFX1064DAGISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_8
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB6_8
; GFX1064DAGISEL-NEXT: ; %bb.5: ; %if
; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1064DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000
-; GFX1064DAGISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s10, v3, s9
; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX1064DAGISEL-NEXT: v_max_f32_e64 v2, s8, s10
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s8, v2
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_6
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB6_6
; GFX1064DAGISEL-NEXT: ; %bb.7:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s8
-; GFX1064DAGISEL-NEXT: .LBB4_8: ; %endif
+; GFX1064DAGISEL-NEXT: .LBB6_8: ; %endif
; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064DAGISEL-NEXT: global_store_dword v[0:1], v4, off
; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1977,38 +2944,38 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1064GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1064GISEL-NEXT: ; %bb.1: ; %else
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1064GISEL-NEXT: s_mov_b32 s8, 0x7fc00000
-; GFX1064GISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
; GFX1064GISEL-NEXT: v_readlane_b32 s10, v2, s9
; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s9
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX1064GISEL-NEXT: v_max_f32_e64 v3, s8, s10
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s8, v3
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB6_2
; GFX1064GISEL-NEXT: ; %bb.3:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, s8
; GFX1064GISEL-NEXT: ; implicit-def: $vgpr3
-; GFX1064GISEL-NEXT: .LBB4_4: ; %Flow
+; GFX1064GISEL-NEXT: .LBB6_4: ; %Flow
; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_8
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_8
; GFX1064GISEL-NEXT: ; %bb.5: ; %if
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1064GISEL-NEXT: s_mov_b32 s8, 0x7fc00000
-; GFX1064GISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
; GFX1064GISEL-NEXT: v_readlane_b32 s10, v3, s9
; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s9
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX1064GISEL-NEXT: v_max_f32_e64 v2, s8, s10
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s8, v2
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_6
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB6_6
; GFX1064GISEL-NEXT: ; %bb.7:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, s8
-; GFX1064GISEL-NEXT: .LBB4_8: ; %endif
+; GFX1064GISEL-NEXT: .LBB6_8: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064GISEL-NEXT: global_store_dword v[0:1], v4, off
; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2021,38 +2988,38 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr4
; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032DAGISEL-NEXT: s_xor_b32 s4, exec_lo, s4
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1032DAGISEL-NEXT: s_mov_b32 s5, exec_lo
; GFX1032DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000
-; GFX1032DAGISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s5
; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1032DAGISEL-NEXT: s_bitset0_b32 s5, s7
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s5, 0
; GFX1032DAGISEL-NEXT: v_max_f32_e64 v3, s6, s8
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB6_2
; GFX1032DAGISEL-NEXT: ; %bb.3:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v4, s6
; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr3
-; GFX1032DAGISEL-NEXT: .LBB4_4: ; %Flow
+; GFX1032DAGISEL-NEXT: .LBB6_4: ; %Flow
; GFX1032DAGISEL-NEXT: s_andn2_saveexec_b32 s4, s4
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_8
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB6_8
; GFX1032DAGISEL-NEXT: ; %bb.5: ; %if
; GFX1032DAGISEL-NEXT: s_mov_b32 s5, exec_lo
; GFX1032DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000
-; GFX1032DAGISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s5
; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v3, s7
; GFX1032DAGISEL-NEXT: s_bitset0_b32 s5, s7
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s5, 0
; GFX1032DAGISEL-NEXT: v_max_f32_e64 v2, s6, s8
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s6, v2
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_6
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB6_6
; GFX1032DAGISEL-NEXT: ; %bb.7:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v4, s6
-; GFX1032DAGISEL-NEXT: .LBB4_8: ; %endif
+; GFX1032DAGISEL-NEXT: .LBB6_8: ; %endif
; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v4, off
; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2065,90 +3032,96 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1032GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1032GISEL-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032GISEL-NEXT: s_xor_b32 s4, exec_lo, s4
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1032GISEL-NEXT: ; %bb.1: ; %else
; GFX1032GISEL-NEXT: s_mov_b32 s5, exec_lo
; GFX1032GISEL-NEXT: s_mov_b32 s6, 0x7fc00000
-; GFX1032GISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s5
; GFX1032GISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1032GISEL-NEXT: s_bitset0_b32 s5, s7
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GFX1032GISEL-NEXT: v_max_f32_e64 v3, s6, s8
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB6_2
; GFX1032GISEL-NEXT: ; %bb.3:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v4, s6
; GFX1032GISEL-NEXT: ; implicit-def: $vgpr3
-; GFX1032GISEL-NEXT: .LBB4_4: ; %Flow
+; GFX1032GISEL-NEXT: .LBB6_4: ; %Flow
; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s4, s4
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_8
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_8
; GFX1032GISEL-NEXT: ; %bb.5: ; %if
; GFX1032GISEL-NEXT: s_mov_b32 s5, exec_lo
; GFX1032GISEL-NEXT: s_mov_b32 s6, 0x7fc00000
-; GFX1032GISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s5
; GFX1032GISEL-NEXT: v_readlane_b32 s8, v3, s7
; GFX1032GISEL-NEXT: s_bitset0_b32 s5, s7
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GFX1032GISEL-NEXT: v_max_f32_e64 v2, s6, s8
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s6, v2
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_6
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB6_6
; GFX1032GISEL-NEXT: ; %bb.7:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v4, s6
-; GFX1032GISEL-NEXT: .LBB4_8: ; %endif
+; GFX1032GISEL-NEXT: .LBB6_8: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032GISEL-NEXT: global_store_dword v[0:1], v4, off
; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1164DAGISEL-LABEL: divergent_cfg_float:
-; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v4
-; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr4
-; GFX1164DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_4
-; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0x7fc00000
-; GFX1164DAGISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
-; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v2, s5
-; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: v_max_f32_e64 v3, s4, s6
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s4, v3
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_2
-; GFX1164DAGISEL-NEXT: ; %bb.3:
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4
-; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr3
-; GFX1164DAGISEL-NEXT: .LBB4_4: ; %Flow
-; GFX1164DAGISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_8
-; GFX1164DAGISEL-NEXT: ; %bb.5: ; %if
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0x7fc00000
-; GFX1164DAGISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
-; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v3, s5
-; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: v_max_f32_e64 v2, s4, s6
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s4, v2
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_6
-; GFX1164DAGISEL-NEXT: ; %bb.7:
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4
-; GFX1164DAGISEL-NEXT: .LBB4_8: ; %endif
-; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT: global_store_b32 v[0:1], v4, off
-; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1164DAGISEL-FAKE16-LABEL: divergent_cfg_float:
+; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: v_and_b32_e32 v4, 0x3ff, v31
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_cmp_lt_u32_e32 vcc, 15, v4
+; GFX1164DAGISEL-FAKE16-NEXT: ; implicit-def: $vgpr4
+; GFX1164DAGISEL-FAKE16-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1164DAGISEL-FAKE16-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_cbranch_execz .LBB6_4
+; GFX1164DAGISEL-FAKE16-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b32 s4, 0x7fc00000
+; GFX1164DAGISEL-FAKE16-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-FAKE16-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s6, v2, s5
+; GFX1164DAGISEL-FAKE16-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-FAKE16-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-FAKE16-NEXT: v_max_f32_e64 v3, s4, s6
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_readfirstlane_b32 s4, v3
+; GFX1164DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB6_2
+; GFX1164DAGISEL-FAKE16-NEXT: ; %bb.3:
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX1164DAGISEL-FAKE16-NEXT: ; implicit-def: $vgpr3
+; GFX1164DAGISEL-FAKE16-NEXT: .LBB6_4: ; %Flow
+; GFX1164DAGISEL-FAKE16-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_cbranch_execz .LBB6_8
+; GFX1164DAGISEL-FAKE16-NEXT: ; %bb.5: ; %if
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b32 s4, 0x7fc00000
+; GFX1164DAGISEL-FAKE16-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-FAKE16-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s6, v3, s5
+; GFX1164DAGISEL-FAKE16-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-FAKE16-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-FAKE16-NEXT: v_max_f32_e64 v2, s4, s6
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1164DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB6_6
+; GFX1164DAGISEL-FAKE16-NEXT: ; %bb.7:
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX1164DAGISEL-FAKE16-NEXT: .LBB6_8: ; %endif
+; GFX1164DAGISEL-FAKE16-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: global_store_b32 v[0:1], v4, off
+; GFX1164DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1164GISEL-LABEL: divergent_cfg_float:
; GFX1164GISEL: ; %bb.0: ; %entry
@@ -2159,11 +3132,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1164GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1164GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1164GISEL-NEXT: ; %bb.1: ; %else
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1164GISEL-NEXT: s_mov_b32 s4, 0x7fc00000
-; GFX1164GISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v2, s5
@@ -2172,17 +3145,17 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1164GISEL-NEXT: v_max_f32_e64 v3, s4, s6
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s4, v3
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB6_2
; GFX1164GISEL-NEXT: ; %bb.3:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX1164GISEL-NEXT: ; implicit-def: $vgpr3
-; GFX1164GISEL-NEXT: .LBB4_4: ; %Flow
+; GFX1164GISEL-NEXT: .LBB6_4: ; %Flow
; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_8
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_8
; GFX1164GISEL-NEXT: ; %bb.5: ; %if
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1164GISEL-NEXT: s_mov_b32 s4, 0x7fc00000
-; GFX1164GISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v3, s5
@@ -2191,10 +3164,10 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1164GISEL-NEXT: v_max_f32_e64 v2, s4, s6
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_6
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB6_6
; GFX1164GISEL-NEXT: ; %bb.7:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, s4
-; GFX1164GISEL-NEXT: .LBB4_8: ; %endif
+; GFX1164GISEL-NEXT: .LBB6_8: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164GISEL-NEXT: global_store_b32 v[0:1], v4, off
; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2208,11 +3181,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr4
; GFX1132DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, exec_lo
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0x7fc00000
-; GFX1132DAGISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -2221,17 +3194,17 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1132DAGISEL-NEXT: v_max_f32_e64 v3, s2, s4
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB6_2
; GFX1132DAGISEL-NEXT: ; %bb.3:
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v4, s2
; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr3
-; GFX1132DAGISEL-NEXT: .LBB4_4: ; %Flow
+; GFX1132DAGISEL-NEXT: .LBB6_4: ; %Flow
; GFX1132DAGISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_8
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB6_8
; GFX1132DAGISEL-NEXT: ; %bb.5: ; %if
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, exec_lo
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0x7fc00000
-; GFX1132DAGISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v3, s3
@@ -2240,10 +3213,10 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1132DAGISEL-NEXT: v_max_f32_e64 v2, s2, s4
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s2, v2
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_6
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB6_6
; GFX1132DAGISEL-NEXT: ; %bb.7:
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v4, s2
-; GFX1132DAGISEL-NEXT: .LBB4_8: ; %endif
+; GFX1132DAGISEL-NEXT: .LBB6_8: ; %endif
; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132DAGISEL-NEXT: global_store_b32 v[0:1], v4, off
; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2257,11 +3230,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1132GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1132GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1132GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1132GISEL-NEXT: ; %bb.1: ; %else
; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
; GFX1132GISEL-NEXT: s_mov_b32 s2, 0x7fc00000
-; GFX1132GISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -2270,17 +3243,17 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1132GISEL-NEXT: v_max_f32_e64 v3, s2, s4
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB6_2
; GFX1132GISEL-NEXT: ; %bb.3:
; GFX1132GISEL-NEXT: v_mov_b32_e32 v4, s2
; GFX1132GISEL-NEXT: ; implicit-def: $vgpr3
-; GFX1132GISEL-NEXT: .LBB4_4: ; %Flow
+; GFX1132GISEL-NEXT: .LBB6_4: ; %Flow
; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_8
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_8
; GFX1132GISEL-NEXT: ; %bb.5: ; %if
; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
; GFX1132GISEL-NEXT: s_mov_b32 s2, 0x7fc00000
-; GFX1132GISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s4, v3, s3
@@ -2289,14 +3262,69 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1132GISEL-NEXT: v_max_f32_e64 v2, s2, s4
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v2
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_6
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB6_6
; GFX1132GISEL-NEXT: ; %bb.7:
; GFX1132GISEL-NEXT: v_mov_b32_e32 v4, s2
-; GFX1132GISEL-NEXT: .LBB4_8: ; %endif
+; GFX1132GISEL-NEXT: .LBB6_8: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132GISEL-NEXT: global_store_b32 v[0:1], v4, off
; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31]
;
+; GFX1164DAGISEL-TRUE16-LABEL: divergent_cfg_float:
+; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: v_and_b32_e32 v4, 0x3ff, v31
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_cmp_lt_u32_e32 vcc, 15, v4
+; GFX1164DAGISEL-TRUE16-NEXT: ; implicit-def: $vgpr4
+; GFX1164DAGISEL-TRUE16-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1164DAGISEL-TRUE16-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_cbranch_execz .LBB6_4
+; GFX1164DAGISEL-TRUE16-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b32 s4, 0x7fc00000
+; GFX1164DAGISEL-TRUE16-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-TRUE16-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s6, v2, s5
+; GFX1164DAGISEL-TRUE16-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-TRUE16-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-TRUE16-NEXT: v_max_f32_e64 v3, s4, s6
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_readfirstlane_b32 s4, v3
+; GFX1164DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB6_2
+; GFX1164DAGISEL-TRUE16-NEXT: ; %bb.3:
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX1164DAGISEL-TRUE16-NEXT: ; implicit-def: $vgpr3
+; GFX1164DAGISEL-TRUE16-NEXT: .LBB6_4: ; %Flow
+; GFX1164DAGISEL-TRUE16-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_cbranch_execz .LBB6_8
+; GFX1164DAGISEL-TRUE16-NEXT: ; %bb.5: ; %if
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b32 s4, 0x7fc00000
+; GFX1164DAGISEL-TRUE16-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-TRUE16-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s6, v3, s5
+; GFX1164DAGISEL-TRUE16-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-TRUE16-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-TRUE16-NEXT: v_max_f32_e64 v2, s4, s6
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1164DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB6_6
+; GFX1164DAGISEL-TRUE16-NEXT: ; %bb.7:
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX1164DAGISEL-TRUE16-NEXT: .LBB6_8: ; %endif
+; GFX1164DAGISEL-TRUE16-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: global_store_b32 v[0:1], v4, off
+; GFX1164DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX12DAGISEL-LABEL: divergent_cfg_float:
; GFX12DAGISEL: ; %bb.0: ; %entry
; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2311,11 +3339,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX12DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX12DAGISEL-NEXT: ; %bb.1: ; %else
; GFX12DAGISEL-NEXT: s_mov_b32 s1, exec_lo
; GFX12DAGISEL-NEXT: s_mov_b32 s2, 0x7fc00000
-; GFX12DAGISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s3, s1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -2326,19 +3354,19 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX12DAGISEL-NEXT: v_max_num_f32_e64 v3, s2, s4
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s2, v3
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB6_2
; GFX12DAGISEL-NEXT: ; %bb.3:
; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v4, s2
; GFX12DAGISEL-NEXT: ; implicit-def: $vgpr3
-; GFX12DAGISEL-NEXT: .LBB4_4: ; %Flow
+; GFX12DAGISEL-NEXT: .LBB6_4: ; %Flow
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB4_8
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB6_8
; GFX12DAGISEL-NEXT: ; %bb.5: ; %if
; GFX12DAGISEL-NEXT: s_mov_b32 s1, exec_lo
; GFX12DAGISEL-NEXT: s_mov_b32 s2, 0x7fc00000
-; GFX12DAGISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s3, s1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -2349,11 +3377,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX12DAGISEL-NEXT: v_max_num_f32_e64 v2, s2, s4
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s2, v2
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB4_6
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB6_6
; GFX12DAGISEL-NEXT: ; %bb.7:
; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v4, s2
-; GFX12DAGISEL-NEXT: .LBB4_8: ; %endif
+; GFX12DAGISEL-NEXT: .LBB6_8: ; %endif
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12DAGISEL-NEXT: global_store_b32 v[0:1], v4, off
@@ -2378,17 +3406,6 @@ endif:
}
define amdgpu_kernel void @uniform_value_double(ptr addrspace(1) %out, double %in) #0 {
-; GFX8DAGISEL-LABEL: uniform_value_double:
-; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s3
-; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
-; GFX8DAGISEL-NEXT: s_endpgm
-;
; GFX8GISEL-LABEL: uniform_value_double:
; GFX8GISEL: ; %bb.0: ; %entry
; GFX8GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2400,16 +3417,6 @@ define amdgpu_kernel void @uniform_value_double(ptr addrspace(1) %out, double %i
; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8GISEL-NEXT: s_endpgm
;
-; GFX9DAGISEL-LABEL: uniform_value_double:
-; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0
-; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX9DAGISEL-NEXT: s_endpgm
-;
; GFX9GISEL-LABEL: uniform_value_double:
; GFX9GISEL: ; %bb.0: ; %entry
; GFX9GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2420,16 +3427,6 @@ define amdgpu_kernel void @uniform_value_double(ptr addrspace(1) %out, double %i
; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9GISEL-NEXT: s_endpgm
;
-; GFX10DAGISEL-LABEL: uniform_value_double:
-; GFX10DAGISEL: ; %bb.0: ; %entry
-; GFX10DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10DAGISEL-NEXT: v_mov_b32_e32 v2, 0
-; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s3
-; GFX10DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX10DAGISEL-NEXT: s_endpgm
-;
; GFX10GISEL-LABEL: uniform_value_double:
; GFX10GISEL: ; %bb.0: ; %entry
; GFX10GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2440,6 +3437,16 @@ define amdgpu_kernel void @uniform_value_double(ptr addrspace(1) %out, double %i
; GFX10GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10GISEL-NEXT: s_endpgm
;
+; GFX1032DAGISEL-LABEL: uniform_value_double:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
; GFX1164DAGISEL-LABEL: uniform_value_double:
; GFX1164DAGISEL: ; %bb.0: ; %entry
; GFX1164DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2499,7 +3506,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
; GFX8DAGISEL-NEXT: s_mov_b32 s7, 0x7ff80000
; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX8DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s10, s[4:5]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s6
; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s10
@@ -2510,7 +3517,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v4
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s7, v5
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8DAGISEL-NEXT: ; %bb.2:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s7
@@ -2524,7 +3531,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX8GISEL-NEXT: s_mov_b32 s6, 0
; GFX8GISEL-NEXT: s_mov_b32 s7, 0x7ff80000
; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX8GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s10, s[4:5]
; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s6
; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s10
@@ -2535,7 +3542,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v4
; GFX8GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8GISEL-NEXT: ; %bb.2:
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s7
@@ -2549,7 +3556,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
; GFX9DAGISEL-NEXT: s_mov_b32 s7, 0x7ff80000
; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX9DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s10, s[4:5]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s6
; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s10
@@ -2560,7 +3567,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v4
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s7, v5
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9DAGISEL-NEXT: ; %bb.2:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s7
@@ -2574,7 +3581,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX9GISEL-NEXT: s_mov_b32 s6, 0
; GFX9GISEL-NEXT: s_mov_b32 s7, 0x7ff80000
; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX9GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s10, s[4:5]
; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s6
; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s10
@@ -2585,7 +3592,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v4
; GFX9GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9GISEL-NEXT: ; %bb.2:
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s7
@@ -2599,7 +3606,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
; GFX1064DAGISEL-NEXT: s_mov_b32 s7, 0x7ff80000
; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX1064DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s10, s[4:5]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s10
; GFX1064DAGISEL-NEXT: v_readlane_b32 s9, v3, s10
@@ -2608,7 +3615,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1064DAGISEL-NEXT: v_max_f64 v[4:5], s[8:9], s[6:7]
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v4
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s7, v5
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064DAGISEL-NEXT: ; %bb.2:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s7
@@ -2621,7 +3628,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
; GFX1064GISEL-NEXT: s_mov_b32 s7, 0x7ff80000
; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX1064GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s10, s[4:5]
; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s10
; GFX1064GISEL-NEXT: v_readlane_b32 s9, v3, s10
@@ -2630,7 +3637,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1064GISEL-NEXT: v_max_f64 v[4:5], s[8:9], s[6:7]
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v4
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064GISEL-NEXT: ; %bb.2:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s7
@@ -2643,7 +3650,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1032DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX1032DAGISEL-NEXT: s_mov_b32 s5, 0x7ff80000
; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6
; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1032DAGISEL-NEXT: v_readlane_b32 s9, v3, s7
@@ -2652,7 +3659,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1032DAGISEL-NEXT: v_max_f64 v[4:5], s[8:9], s[4:5]
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s4, v4
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v5
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032DAGISEL-NEXT: ; %bb.2:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2665,7 +3672,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1032GISEL-NEXT: s_mov_b32 s4, 0
; GFX1032GISEL-NEXT: s_mov_b32 s5, 0x7ff80000
; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6
; GFX1032GISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1032GISEL-NEXT: v_readlane_b32 s9, v3, s7
@@ -2674,7 +3681,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1032GISEL-NEXT: v_max_f64 v[4:5], s[8:9], s[4:5]
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s4, v4
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v5
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032GISEL-NEXT: ; %bb.2:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2687,7 +3694,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1164DAGISEL-NEXT: s_mov_b32 s2, 0
; GFX1164DAGISEL-NEXT: s_mov_b32 s3, 0x7ff80000
; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s6, s[0:1]
; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s6
@@ -2699,7 +3706,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v4
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s3, v5
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164DAGISEL-NEXT: ; %bb.2:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s3
@@ -2712,7 +3719,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1164GISEL-NEXT: s_mov_b32 s2, 0
; GFX1164GISEL-NEXT: s_mov_b32 s3, 0x7ff80000
; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s6, s[0:1]
; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s6
@@ -2724,7 +3731,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v4
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s3, v5
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164GISEL-NEXT: ; %bb.2:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s3
@@ -2737,7 +3744,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1132DAGISEL-NEXT: s_mov_b32 s0, 0
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0x7ff80000
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -2749,7 +3756,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s0, v4
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v5
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132DAGISEL-NEXT: ; %bb.2:
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -2761,7 +3768,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
; GFX1132GISEL-NEXT: s_mov_b32 s1, 0x7ff80000
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132GISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -2773,7 +3780,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s0, v4
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v5
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132GISEL-NEXT: ; %bb.2:
; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -2789,7 +3796,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX12DAGISEL-NEXT: s_mov_b32 s0, 0
; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0x7ff80000
; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX12DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -2802,7 +3809,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s0, v4
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v5
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX12DAGISEL-NEXT: ; %bb.2:
; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
@@ -2823,12 +3830,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
; GFX8DAGISEL-NEXT: s_mov_b32 s8, 0
; GFX8DAGISEL-NEXT: s_mov_b32 s9, 0x7ff80000
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s8
; GFX8DAGISEL-NEXT: v_readlane_b32 s10, v2, s12
@@ -2839,20 +3846,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s8, v4
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s9, v5
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX8DAGISEL-NEXT: ; %bb.3:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, s8
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, s9
; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr4
; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr5
-; GFX8DAGISEL-NEXT: .LBB7_4: ; %Flow
+; GFX8DAGISEL-NEXT: .LBB9_4: ; %Flow
; GFX8DAGISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX8DAGISEL-NEXT: ; %bb.5: ; %if
; GFX8DAGISEL-NEXT: s_mov_b32 s8, 0
; GFX8DAGISEL-NEXT: s_mov_b32 s9, 0x7ff80000
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s8
; GFX8DAGISEL-NEXT: v_readlane_b32 s10, v4, s12
@@ -2863,11 +3870,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s8, v2
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s9, v3
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX8DAGISEL-NEXT: ; %bb.7:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, s8
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, s9
-; GFX8DAGISEL-NEXT: .LBB7_8: ; %endif
+; GFX8DAGISEL-NEXT: .LBB9_8: ; %endif
; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
@@ -2881,12 +3888,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX8GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX8GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX8GISEL-NEXT: ; %bb.1: ; %else
; GFX8GISEL-NEXT: s_mov_b32 s8, 0
; GFX8GISEL-NEXT: s_mov_b32 s9, 0x7ff80000
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s8
; GFX8GISEL-NEXT: v_readlane_b32 s10, v2, s12
@@ -2897,20 +3904,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX8GISEL-NEXT: v_readfirstlane_b32 s8, v4
; GFX8GISEL-NEXT: v_readfirstlane_b32 s9, v5
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX8GISEL-NEXT: ; %bb.3:
; GFX8GISEL-NEXT: v_mov_b32_e32 v6, s8
; GFX8GISEL-NEXT: v_mov_b32_e32 v7, s9
; GFX8GISEL-NEXT: ; implicit-def: $vgpr4
; GFX8GISEL-NEXT: ; implicit-def: $vgpr5
-; GFX8GISEL-NEXT: .LBB7_4: ; %Flow
+; GFX8GISEL-NEXT: .LBB9_4: ; %Flow
; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX8GISEL-NEXT: ; %bb.5: ; %if
; GFX8GISEL-NEXT: s_mov_b32 s8, 0
; GFX8GISEL-NEXT: s_mov_b32 s9, 0x7ff80000
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s8
; GFX8GISEL-NEXT: v_readlane_b32 s10, v4, s12
@@ -2921,11 +3928,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX8GISEL-NEXT: v_readfirstlane_b32 s8, v2
; GFX8GISEL-NEXT: v_readfirstlane_b32 s9, v3
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX8GISEL-NEXT: ; %bb.7:
; GFX8GISEL-NEXT: v_mov_b32_e32 v6, s8
; GFX8GISEL-NEXT: v_mov_b32_e32 v7, s9
-; GFX8GISEL-NEXT: .LBB7_8: ; %endif
+; GFX8GISEL-NEXT: .LBB9_8: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8GISEL-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
; GFX8GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -2939,12 +3946,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
; GFX9DAGISEL-NEXT: s_mov_b32 s8, 0
; GFX9DAGISEL-NEXT: s_mov_b32 s9, 0x7ff80000
; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s8
; GFX9DAGISEL-NEXT: v_readlane_b32 s10, v2, s12
@@ -2955,20 +3962,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s8, v4
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s9, v5
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX9DAGISEL-NEXT: ; %bb.3:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, s8
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, s9
; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr4
; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr5
-; GFX9DAGISEL-NEXT: .LBB7_4: ; %Flow
+; GFX9DAGISEL-NEXT: .LBB9_4: ; %Flow
; GFX9DAGISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX9DAGISEL-NEXT: ; %bb.5: ; %if
; GFX9DAGISEL-NEXT: s_mov_b32 s8, 0
; GFX9DAGISEL-NEXT: s_mov_b32 s9, 0x7ff80000
; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s8
; GFX9DAGISEL-NEXT: v_readlane_b32 s10, v4, s12
@@ -2979,11 +3986,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s8, v2
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s9, v3
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX9DAGISEL-NEXT: ; %bb.7:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, s8
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, s9
-; GFX9DAGISEL-NEXT: .LBB7_8: ; %endif
+; GFX9DAGISEL-NEXT: .LBB9_8: ; %endif
; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off
; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0)
@@ -2997,12 +4004,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX9GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX9GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX9GISEL-NEXT: ; %bb.1: ; %else
; GFX9GISEL-NEXT: s_mov_b32 s8, 0
; GFX9GISEL-NEXT: s_mov_b32 s9, 0x7ff80000
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s8
; GFX9GISEL-NEXT: v_readlane_b32 s10, v2, s12
@@ -3013,20 +4020,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9GISEL-NEXT: v_readfirstlane_b32 s8, v4
; GFX9GISEL-NEXT: v_readfirstlane_b32 s9, v5
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX9GISEL-NEXT: ; %bb.3:
; GFX9GISEL-NEXT: v_mov_b32_e32 v6, s8
; GFX9GISEL-NEXT: v_mov_b32_e32 v7, s9
; GFX9GISEL-NEXT: ; implicit-def: $vgpr4
; GFX9GISEL-NEXT: ; implicit-def: $vgpr5
-; GFX9GISEL-NEXT: .LBB7_4: ; %Flow
+; GFX9GISEL-NEXT: .LBB9_4: ; %Flow
; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX9GISEL-NEXT: ; %bb.5: ; %if
; GFX9GISEL-NEXT: s_mov_b32 s8, 0
; GFX9GISEL-NEXT: s_mov_b32 s9, 0x7ff80000
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s8
; GFX9GISEL-NEXT: v_readlane_b32 s10, v4, s12
@@ -3037,11 +4044,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9GISEL-NEXT: v_readfirstlane_b32 s8, v2
; GFX9GISEL-NEXT: v_readfirstlane_b32 s9, v3
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX9GISEL-NEXT: ; %bb.7:
; GFX9GISEL-NEXT: v_mov_b32_e32 v6, s8
; GFX9GISEL-NEXT: v_mov_b32_e32 v7, s9
-; GFX9GISEL-NEXT: .LBB7_8: ; %endif
+; GFX9GISEL-NEXT: .LBB9_8: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off
; GFX9GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -3055,12 +4062,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1064DAGISEL-NEXT: s_mov_b32 s8, 0
; GFX1064DAGISEL-NEXT: s_mov_b32 s9, 0x7ff80000
; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s10, v2, s12
; GFX1064DAGISEL-NEXT: v_readlane_b32 s11, v3, s12
@@ -3069,20 +4076,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1064DAGISEL-NEXT: v_max_f64 v[4:5], s[10:11], s[8:9]
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s8, v4
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s9, v5
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX1064DAGISEL-NEXT: ; %bb.3:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, s8
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, s9
; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr4
; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr5
-; GFX1064DAGISEL-NEXT: .LBB7_4: ; %Flow
+; GFX1064DAGISEL-NEXT: .LBB9_4: ; %Flow
; GFX1064DAGISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX1064DAGISEL-NEXT: ; %bb.5: ; %if
; GFX1064DAGISEL-NEXT: s_mov_b32 s8, 0
; GFX1064DAGISEL-NEXT: s_mov_b32 s9, 0x7ff80000
; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s10, v4, s12
; GFX1064DAGISEL-NEXT: v_readlane_b32 s11, v5, s12
@@ -3091,11 +4098,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1064DAGISEL-NEXT: v_max_f64 v[2:3], s[10:11], s[8:9]
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s8, v2
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s9, v3
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX1064DAGISEL-NEXT: ; %bb.7:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, s8
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, s9
-; GFX1064DAGISEL-NEXT: .LBB7_8: ; %endif
+; GFX1064DAGISEL-NEXT: .LBB9_8: ; %endif
; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off
; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3108,12 +4115,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1064GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1064GISEL-NEXT: ; %bb.1: ; %else
; GFX1064GISEL-NEXT: s_mov_b32 s8, 0
; GFX1064GISEL-NEXT: s_mov_b32 s9, 0x7ff80000
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX1064GISEL-NEXT: v_readlane_b32 s10, v2, s12
; GFX1064GISEL-NEXT: v_readlane_b32 s11, v3, s12
@@ -3122,20 +4129,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1064GISEL-NEXT: v_max_f64 v[4:5], s[10:11], s[8:9]
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s8, v4
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s9, v5
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX1064GISEL-NEXT: ; %bb.3:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, s8
; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, s9
; GFX1064GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1064GISEL-NEXT: ; implicit-def: $vgpr5
-; GFX1064GISEL-NEXT: .LBB7_4: ; %Flow
+; GFX1064GISEL-NEXT: .LBB9_4: ; %Flow
; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX1064GISEL-NEXT: ; %bb.5: ; %if
; GFX1064GISEL-NEXT: s_mov_b32 s8, 0
; GFX1064GISEL-NEXT: s_mov_b32 s9, 0x7ff80000
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX1064GISEL-NEXT: v_readlane_b32 s10, v4, s12
; GFX1064GISEL-NEXT: v_readlane_b32 s11, v5, s12
@@ -3144,11 +4151,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1064GISEL-NEXT: v_max_f64 v[2:3], s[10:11], s[8:9]
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s8, v2
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s9, v3
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX1064GISEL-NEXT: ; %bb.7:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, s8
; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, s9
-; GFX1064GISEL-NEXT: .LBB7_8: ; %endif
+; GFX1064GISEL-NEXT: .LBB9_8: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off
; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3161,12 +4168,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032DAGISEL-NEXT: s_xor_b32 s6, exec_lo, s4
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1032DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX1032DAGISEL-NEXT: s_mov_b32 s5, 0x7ff80000
; GFX1032DAGISEL-NEXT: s_mov_b32 s7, exec_lo
-; GFX1032DAGISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s10, s7
; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v2, s10
; GFX1032DAGISEL-NEXT: v_readlane_b32 s9, v3, s10
@@ -3175,20 +4182,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1032DAGISEL-NEXT: v_max_f64 v[4:5], s[8:9], s[4:5]
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s4, v4
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v5
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX1032DAGISEL-NEXT: ; %bb.3:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, s5
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, s4
; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr4
; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr5
-; GFX1032DAGISEL-NEXT: .LBB7_4: ; %Flow
+; GFX1032DAGISEL-NEXT: .LBB9_4: ; %Flow
; GFX1032DAGISEL-NEXT: s_andn2_saveexec_b32 s6, s6
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX1032DAGISEL-NEXT: ; %bb.5: ; %if
; GFX1032DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX1032DAGISEL-NEXT: s_mov_b32 s5, 0x7ff80000
; GFX1032DAGISEL-NEXT: s_mov_b32 s7, exec_lo
-; GFX1032DAGISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s10, s7
; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v4, s10
; GFX1032DAGISEL-NEXT: v_readlane_b32 s9, v5, s10
@@ -3197,11 +4204,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1032DAGISEL-NEXT: v_max_f64 v[2:3], s[8:9], s[4:5]
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s4, v2
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX1032DAGISEL-NEXT: ; %bb.7:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, s5
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, s4
-; GFX1032DAGISEL-NEXT: .LBB7_8: ; %endif
+; GFX1032DAGISEL-NEXT: .LBB9_8: ; %endif
; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s6
; GFX1032DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off
; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3214,12 +4221,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1032GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1032GISEL-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032GISEL-NEXT: s_xor_b32 s6, exec_lo, s4
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1032GISEL-NEXT: ; %bb.1: ; %else
; GFX1032GISEL-NEXT: s_mov_b32 s4, 0
; GFX1032GISEL-NEXT: s_mov_b32 s5, 0x7ff80000
; GFX1032GISEL-NEXT: s_mov_b32 s7, exec_lo
-; GFX1032GISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s10, s7
; GFX1032GISEL-NEXT: v_readlane_b32 s8, v2, s10
; GFX1032GISEL-NEXT: v_readlane_b32 s9, v3, s10
@@ -3228,20 +4235,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1032GISEL-NEXT: v_max_f64 v[4:5], s[8:9], s[4:5]
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s4, v4
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v5
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX1032GISEL-NEXT: ; %bb.3:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, s5
; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, s4
; GFX1032GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1032GISEL-NEXT: ; implicit-def: $vgpr5
-; GFX1032GISEL-NEXT: .LBB7_4: ; %Flow
+; GFX1032GISEL-NEXT: .LBB9_4: ; %Flow
; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s6, s6
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX1032GISEL-NEXT: ; %bb.5: ; %if
; GFX1032GISEL-NEXT: s_mov_b32 s4, 0
; GFX1032GISEL-NEXT: s_mov_b32 s5, 0x7ff80000
; GFX1032GISEL-NEXT: s_mov_b32 s7, exec_lo
-; GFX1032GISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s10, s7
; GFX1032GISEL-NEXT: v_readlane_b32 s8, v4, s10
; GFX1032GISEL-NEXT: v_readlane_b32 s9, v5, s10
@@ -3250,11 +4257,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1032GISEL-NEXT: v_max_f64 v[2:3], s[8:9], s[4:5]
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s4, v2
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX1032GISEL-NEXT: ; %bb.7:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, s5
; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, s4
-; GFX1032GISEL-NEXT: .LBB7_8: ; %endif
+; GFX1032GISEL-NEXT: .LBB9_8: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s6
; GFX1032GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off
; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3268,12 +4275,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1164DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX1164DAGISEL-NEXT: s_mov_b32 s5, 0x7ff80000
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v2, s8
@@ -3285,20 +4292,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s4, v4
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s5, v5
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX1164DAGISEL-NEXT: ; %bb.3:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, s5
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, s4
; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr4
; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr5
-; GFX1164DAGISEL-NEXT: .LBB7_4: ; %Flow
+; GFX1164DAGISEL-NEXT: .LBB9_4: ; %Flow
; GFX1164DAGISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX1164DAGISEL-NEXT: ; %bb.5: ; %if
; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX1164DAGISEL-NEXT: s_mov_b32 s5, 0x7ff80000
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v4, s8
@@ -3310,11 +4317,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s4, v2
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s5, v3
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX1164DAGISEL-NEXT: ; %bb.7:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, s5
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, s4
-; GFX1164DAGISEL-NEXT: .LBB7_8: ; %endif
+; GFX1164DAGISEL-NEXT: .LBB9_8: ; %endif
; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164DAGISEL-NEXT: global_store_b64 v[0:1], v[6:7], off
; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3328,12 +4335,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1164GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1164GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1164GISEL-NEXT: ; %bb.1: ; %else
; GFX1164GISEL-NEXT: s_mov_b32 s4, 0
; GFX1164GISEL-NEXT: s_mov_b32 s5, 0x7ff80000
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v2, s8
@@ -3345,20 +4352,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s4, v4
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s5, v5
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX1164GISEL-NEXT: ; %bb.3:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, s5
; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, s4
; GFX1164GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1164GISEL-NEXT: ; implicit-def: $vgpr5
-; GFX1164GISEL-NEXT: .LBB7_4: ; %Flow
+; GFX1164GISEL-NEXT: .LBB9_4: ; %Flow
; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX1164GISEL-NEXT: ; %bb.5: ; %if
; GFX1164GISEL-NEXT: s_mov_b32 s4, 0
; GFX1164GISEL-NEXT: s_mov_b32 s5, 0x7ff80000
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v4, s8
@@ -3370,11 +4377,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s4, v2
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX1164GISEL-NEXT: ; %bb.7:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, s5
; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, s4
-; GFX1164GISEL-NEXT: .LBB7_8: ; %endif
+; GFX1164GISEL-NEXT: .LBB9_8: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off
; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3388,12 +4395,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1132DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1132DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1132DAGISEL-NEXT: s_mov_b32 s0, 0
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0x7ff80000
; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132DAGISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s6, s3
; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v2, s6
@@ -3405,19 +4412,19 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s0, v4
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v5
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX1132DAGISEL-NEXT: ; %bb.3:
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr4
; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr5
-; GFX1132DAGISEL-NEXT: .LBB7_4: ; %Flow
+; GFX1132DAGISEL-NEXT: .LBB9_4: ; %Flow
; GFX1132DAGISEL-NEXT: s_and_not1_saveexec_b32 s2, s2
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX1132DAGISEL-NEXT: ; %bb.5: ; %if
; GFX1132DAGISEL-NEXT: s_mov_b32 s0, 0
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0x7ff80000
; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132DAGISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s6, s3
; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v4, s6
@@ -3429,10 +4436,10 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s0, v2
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v3
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX1132DAGISEL-NEXT: ; %bb.7:
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
-; GFX1132DAGISEL-NEXT: .LBB7_8: ; %endif
+; GFX1132DAGISEL-NEXT: .LBB9_8: ; %endif
; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[6:7], off
; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3446,12 +4453,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1132GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1132GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1132GISEL-NEXT: s_xor_b32 s2, exec_lo, s0
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1132GISEL-NEXT: ; %bb.1: ; %else
; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
; GFX1132GISEL-NEXT: s_mov_b32 s1, 0x7ff80000
; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132GISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s6, s3
; GFX1132GISEL-NEXT: v_readlane_b32 s4, v2, s6
@@ -3463,19 +4470,19 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s0, v4
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v5
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX1132GISEL-NEXT: ; %bb.3:
; GFX1132GISEL-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX1132GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1132GISEL-NEXT: ; implicit-def: $vgpr5
-; GFX1132GISEL-NEXT: .LBB7_4: ; %Flow
+; GFX1132GISEL-NEXT: .LBB9_4: ; %Flow
; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s2, s2
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX1132GISEL-NEXT: ; %bb.5: ; %if
; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
; GFX1132GISEL-NEXT: s_mov_b32 s1, 0x7ff80000
; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132GISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s6, s3
; GFX1132GISEL-NEXT: v_readlane_b32 s4, v4, s6
@@ -3487,10 +4494,10 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s0, v2
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v3
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX1132GISEL-NEXT: ; %bb.7:
; GFX1132GISEL-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
-; GFX1132GISEL-NEXT: .LBB7_8: ; %endif
+; GFX1132GISEL-NEXT: .LBB9_8: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off
; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3509,12 +4516,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX12DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s0
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX12DAGISEL-NEXT: ; %bb.1: ; %else
; GFX12DAGISEL-NEXT: s_mov_b32 s0, 0
; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0x7ff80000
; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX12DAGISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s6, s3
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -3527,21 +4534,21 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s0, v4
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v5
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX12DAGISEL-NEXT: ; %bb.3:
; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX12DAGISEL-NEXT: ; implicit-def: $vgpr4
; GFX12DAGISEL-NEXT: ; implicit-def: $vgpr5
-; GFX12DAGISEL-NEXT: .LBB7_4: ; %Flow
+; GFX12DAGISEL-NEXT: .LBB9_4: ; %Flow
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_and_not1_saveexec_b32 s2, s2
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX12DAGISEL-NEXT: ; %bb.5: ; %if
; GFX12DAGISEL-NEXT: s_mov_b32 s0, 0
; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0x7ff80000
; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX12DAGISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s6, s3
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -3554,11 +4561,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s0, v2
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v3
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX12DAGISEL-NEXT: ; %bb.7:
; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
-; GFX12DAGISEL-NEXT: .LBB7_8: ; %endif
+; GFX12DAGISEL-NEXT: .LBB9_8: ; %endif
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX12DAGISEL-NEXT: global_store_b64 v[0:1], v[6:7], off
@@ -3583,3 +4590,5 @@ endif:
}
attributes #0 = { nounwind }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10DAGISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmin.ll
index bbe672f428741..34645be5cc801 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fmin.ll
@@ -7,46 +7,702 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL,GFX1164DAGISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL,GFX1164GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL,GFX1132DAGISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL,GFX1132GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL,GFX1164DAGISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL,GFX1164GISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL,GFX1132DAGISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL,GFX1132GISEL-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GFX12DAGISEL %s
-
-define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in) #0 {
-; GFX8DAGISEL-LABEL: uniform_value_float:
+define amdgpu_kernel void @uniform_value_half(ptr addrspace(1) %out, half %in) {
+; GFX8DAGISEL-LABEL: uniform_value_half:
; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0x7fc00000
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: v_min_f32_e32 v1, s4, v1
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: v_cvt_f16_f32_e32 v2, s4
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8DAGISEL-NEXT: flat_store_short v[0:1], v2
; GFX8DAGISEL-NEXT: s_endpgm
;
-; GFX8GISEL-LABEL: uniform_value_float:
+; GFX8GISEL-LABEL: uniform_value_half:
; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_cvt_f32_f16_e32 v0, s0
; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8GISEL-NEXT: v_cvt_f16_f32_e32 v2, s2
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: flat_store_short v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
-; GFX9DAGISEL-LABEL: uniform_value_float:
+; GFX9DAGISEL-LABEL: uniform_value_half:
; GFX9DAGISEL: ; %bb.0: ; %entry
; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0x7fc00000
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v1, s5
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: v_min_f32_e32 v2, s4, v2
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_cvt_f16_f32_e32 v1, s4
+; GFX9DAGISEL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9DAGISEL-NEXT: s_endpgm
;
+; GFX9GISEL-LABEL: uniform_value_half:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: global_store_short v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value_half:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_clause 0x1
+; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0x7fc00000
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v1, s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: v_min_f32_e64 v2, s4, s6
+; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_cvt_f16_f32_e32 v1, s4
+; GFX1064DAGISEL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value_half:
+; GFX10GISEL: ; %bb.0: ; %entry
+; GFX10GISEL-NEXT: s_clause 0x1
+; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX10GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GFX10GISEL-NEXT: global_store_short v1, v0, s[0:1]
+; GFX10GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value_half:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_clause 0x1
+; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, 0x7fc00000
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s2
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v1, s4
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s4
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032DAGISEL-NEXT: v_min_f32_e64 v2, s3, s5
+; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s3, v2
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_cvt_f16_f32_e32 v1, s3
+; GFX1032DAGISEL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-FAKE16-LABEL: uniform_value_half:
+; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-FAKE16-NEXT: s_clause 0x1
+; GFX1164DAGISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b32 s4, 0x7fc00000
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-FAKE16-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-FAKE16-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s6, v1, s5
+; GFX1164DAGISEL-FAKE16-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-FAKE16-NEXT: v_min_f32_e64 v2, s4, s6
+; GFX1164DAGISEL-FAKE16-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1164DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX1164DAGISEL-FAKE16-NEXT: ; %bb.2:
+; GFX1164DAGISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, s4
+; GFX1164DAGISEL-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_endpgm
+;
+; GFX1164GISEL-FAKE16-LABEL: uniform_value_half:
+; GFX1164GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164GISEL-FAKE16-NEXT: s_clause 0x1
+; GFX1164GISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX1164GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GFX1164GISEL-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX1164GISEL-FAKE16-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-FAKE16-LABEL: uniform_value_half:
+; GFX1132DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-FAKE16-NEXT: s_clause 0x1
+; GFX1132DAGISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 s3, 0x7fc00000
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-FAKE16-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-FAKE16-NEXT: s_ctz_i32_b32 s4, s2
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_readlane_b32 s5, v1, s4
+; GFX1132DAGISEL-FAKE16-NEXT: s_bitset0_b32 s2, s4
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-FAKE16-NEXT: v_min_f32_e64 v2, s3, s5
+; GFX1132DAGISEL-FAKE16-NEXT: v_readfirstlane_b32 s3, v2
+; GFX1132DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX1132DAGISEL-FAKE16-NEXT: ; %bb.2:
+; GFX1132DAGISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, s3
+; GFX1132DAGISEL-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1132DAGISEL-FAKE16-NEXT: s_endpgm
+;
+; GFX1132GISEL-FAKE16-LABEL: uniform_value_half:
+; GFX1132GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1132GISEL-FAKE16-NEXT: s_clause 0x1
+; GFX1132GISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX1132GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GFX1132GISEL-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX1132GISEL-FAKE16-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-TRUE16-LABEL: uniform_value_half:
+; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-TRUE16-NEXT: s_clause 0x1
+; GFX1164DAGISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b32 s4, 0x7fc00000
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-TRUE16-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-TRUE16-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s6, v1, s5
+; GFX1164DAGISEL-TRUE16-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-TRUE16-NEXT: v_min_f32_e64 v2, s4, s6
+; GFX1164DAGISEL-TRUE16-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1164DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX1164DAGISEL-TRUE16-NEXT: ; %bb.2:
+; GFX1164DAGISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, s4
+; GFX1164DAGISEL-TRUE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX1164GISEL-TRUE16-LABEL: uniform_value_half:
+; GFX1164GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164GISEL-TRUE16-NEXT: s_clause 0x1
+; GFX1164GISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX1164GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s2
+; GFX1164GISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX1164GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-TRUE16-LABEL: uniform_value_half:
+; GFX1132DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-TRUE16-NEXT: s_clause 0x1
+; GFX1132DAGISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 s3, 0x7fc00000
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-TRUE16-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-TRUE16-NEXT: s_ctz_i32_b32 s4, s2
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_readlane_b32 s5, v1, s4
+; GFX1132DAGISEL-TRUE16-NEXT: s_bitset0_b32 s2, s4
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-TRUE16-NEXT: v_min_f32_e64 v2, s3, s5
+; GFX1132DAGISEL-TRUE16-NEXT: v_readfirstlane_b32 s3, v2
+; GFX1132DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX1132DAGISEL-TRUE16-NEXT: ; %bb.2:
+; GFX1132DAGISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, s3
+; GFX1132DAGISEL-TRUE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1132DAGISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX1132GISEL-TRUE16-LABEL: uniform_value_half:
+; GFX1132GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1132GISEL-TRUE16-NEXT: s_clause 0x1
+; GFX1132GISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX1132GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s2
+; GFX1132GISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX1132GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX12DAGISEL-LABEL: uniform_value_half:
+; GFX12DAGISEL: ; %bb.0: ; %entry
+; GFX12DAGISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12DAGISEL-NEXT: s_cvt_f32_f16 s2, s2
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX12DAGISEL-NEXT: s_cvt_f16_f32 s2, s2
+; GFX12DAGISEL-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX12DAGISEL-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX12DAGISEL-NEXT: s_endpgm
+ entry:
+ %result = call half @llvm.amdgcn.wave.reduce.fmin(half %in, i32 1)
+ store half %result, ptr addrspace(1) %out
+ ret void
+}
+
+define void @divergent_value_half(ptr addrspace(1) %out, half %in) {
+; GFX8DAGISEL-LABEL: divergent_value_half:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000
+; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8DAGISEL-NEXT: v_min_f32_e32 v3, s6, v3
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: v_cvt_f16_f32_e32 v2, s6
+; GFX8DAGISEL-NEXT: flat_store_short v[0:1], v2
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_half:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT: s_mov_b32 s6, 0x7fc00000
+; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8
+; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8GISEL-NEXT: v_min_f32_e32 v3, s6, v3
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: v_cvt_f16_f32_e32 v2, s6
+; GFX8GISEL-NEXT: flat_store_short v[0:1], v2
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_half:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000
+; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9DAGISEL-NEXT: v_min_f32_e32 v3, s6, v3
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_cvt_f16_f32_e32 v2, s6
+; GFX9DAGISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_half:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT: s_mov_b32 s6, 0x7fc00000
+; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8
+; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9GISEL-NEXT: v_min_f32_e32 v3, s6, v3
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_cvt_f16_f32_e32 v2, s6
+; GFX9GISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_half:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000
+; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064DAGISEL-NEXT: v_min_f32_e64 v3, s6, s8
+; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_cvt_f16_f32_e32 v2, s6
+; GFX1064DAGISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_half:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s6, 0x7fc00000
+; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064GISEL-NEXT: v_min_f32_e64 v3, s6, s8
+; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_cvt_f16_f32_e32 v2, s6
+; GFX1064GISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_half:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s5, 0x7fc00000
+; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, v2, s6
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s6
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032DAGISEL-NEXT: v_min_f32_e64 v3, s5, s7
+; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_cvt_f16_f32_e32 v2, s5
+; GFX1032DAGISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_half:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s5, 0x7fc00000
+; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4
+; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6
+; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032GISEL-NEXT: v_min_f32_e64 v3, s5, s7
+; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_cvt_f16_f32_e32 v2, s5
+; GFX1032GISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-FAKE16-LABEL: divergent_value_half:
+; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b32 s2, 0x7fc00000
+; GFX1164DAGISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-FAKE16-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s4, v2, s3
+; GFX1164DAGISEL-FAKE16-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164DAGISEL-FAKE16-NEXT: v_min_f32_e64 v3, s2, s4
+; GFX1164DAGISEL-FAKE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164DAGISEL-FAKE16-NEXT: ; %bb.2:
+; GFX1164DAGISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, s2
+; GFX1164DAGISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1164DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-FAKE16-LABEL: divergent_value_half:
+; GFX1164GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1164GISEL-FAKE16-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-FAKE16-NEXT: s_mov_b32 s2, 0x7fc00000
+; GFX1164GISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-FAKE16-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-FAKE16-NEXT: v_readlane_b32 s4, v2, s3
+; GFX1164GISEL-FAKE16-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX1164GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-FAKE16-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164GISEL-FAKE16-NEXT: v_min_f32_e64 v3, s2, s4
+; GFX1164GISEL-FAKE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164GISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164GISEL-FAKE16-NEXT: ; %bb.2:
+; GFX1164GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, s2
+; GFX1164GISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1164GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-FAKE16-LABEL: divergent_value_half:
+; GFX1132DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 s1, 0x7fc00000
+; GFX1132DAGISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-FAKE16-NEXT: s_ctz_i32_b32 s2, s0
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_readlane_b32 s3, v2, s2
+; GFX1132DAGISEL-FAKE16-NEXT: s_bitset0_b32 s0, s2
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132DAGISEL-FAKE16-NEXT: v_min_f32_e64 v3, s1, s3
+; GFX1132DAGISEL-FAKE16-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1132DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132DAGISEL-FAKE16-NEXT: ; %bb.2:
+; GFX1132DAGISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, s1
+; GFX1132DAGISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1132DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-FAKE16-LABEL: divergent_value_half:
+; GFX1132GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1132GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1132GISEL-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132GISEL-FAKE16-NEXT: s_mov_b32 s1, 0x7fc00000
+; GFX1132GISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-FAKE16-NEXT: s_ctz_i32_b32 s2, s0
+; GFX1132GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-FAKE16-NEXT: v_readlane_b32 s3, v2, s2
+; GFX1132GISEL-FAKE16-NEXT: s_bitset0_b32 s0, s2
+; GFX1132GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-FAKE16-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132GISEL-FAKE16-NEXT: v_min_f32_e64 v3, s1, s3
+; GFX1132GISEL-FAKE16-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1132GISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132GISEL-FAKE16-NEXT: ; %bb.2:
+; GFX1132GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, s1
+; GFX1132GISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1132GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-TRUE16-LABEL: divergent_value_half:
+; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b32 s2, 0x7fc00000
+; GFX1164DAGISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-TRUE16-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s4, v2, s3
+; GFX1164DAGISEL-TRUE16-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164DAGISEL-TRUE16-NEXT: v_min_f32_e64 v3, s2, s4
+; GFX1164DAGISEL-TRUE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164DAGISEL-TRUE16-NEXT: ; %bb.2:
+; GFX1164DAGISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, s2
+; GFX1164DAGISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1164DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-TRUE16-LABEL: divergent_value_half:
+; GFX1164GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX1164GISEL-TRUE16-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-TRUE16-NEXT: s_mov_b32 s2, 0x7fc00000
+; GFX1164GISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-TRUE16-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-TRUE16-NEXT: v_readlane_b32 s4, v2, s3
+; GFX1164GISEL-TRUE16-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX1164GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-TRUE16-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164GISEL-TRUE16-NEXT: v_min_f32_e64 v3, s2, s4
+; GFX1164GISEL-TRUE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164GISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164GISEL-TRUE16-NEXT: ; %bb.2:
+; GFX1164GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, s2
+; GFX1164GISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1164GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-TRUE16-LABEL: divergent_value_half:
+; GFX1132DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 s1, 0x7fc00000
+; GFX1132DAGISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-TRUE16-NEXT: s_ctz_i32_b32 s2, s0
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_readlane_b32 s3, v2, s2
+; GFX1132DAGISEL-TRUE16-NEXT: s_bitset0_b32 s0, s2
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132DAGISEL-TRUE16-NEXT: v_min_f32_e64 v3, s1, s3
+; GFX1132DAGISEL-TRUE16-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1132DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132DAGISEL-TRUE16-NEXT: ; %bb.2:
+; GFX1132DAGISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, s1
+; GFX1132DAGISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1132DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-TRUE16-LABEL: divergent_value_half:
+; GFX1132GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1132GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX1132GISEL-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132GISEL-TRUE16-NEXT: s_mov_b32 s1, 0x7fc00000
+; GFX1132GISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-TRUE16-NEXT: s_ctz_i32_b32 s2, s0
+; GFX1132GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-TRUE16-NEXT: v_readlane_b32 s3, v2, s2
+; GFX1132GISEL-TRUE16-NEXT: s_bitset0_b32 s0, s2
+; GFX1132GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-TRUE16-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132GISEL-TRUE16-NEXT: v_min_f32_e64 v3, s1, s3
+; GFX1132GISEL-TRUE16-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1132GISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132GISEL-TRUE16-NEXT: ; %bb.2:
+; GFX1132GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, s1
+; GFX1132GISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1132GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12DAGISEL-LABEL: divergent_value_half:
+; GFX12DAGISEL: ; %bb.0: ; %entry
+; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_expcnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12DAGISEL-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX12DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0x7fc00000
+; GFX12DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s2, s0
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: v_readlane_b32 s3, v2, s2
+; GFX12DAGISEL-NEXT: s_bitset0_b32 s0, s2
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GFX12DAGISEL-NEXT: v_min_num_f32_e64 v3, s1, s3
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v3
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX12DAGISEL-NEXT: ; %bb.2:
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12DAGISEL-NEXT: v_cvt_f16_f32_e32 v2.l, s1
+; GFX12DAGISEL-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12DAGISEL-NEXT: s_setpc_b64 s[30:31]
+ entry:
+ %result = call half @llvm.amdgcn.wave.reduce.fmin(half %in, i32 1)
+ store half %result, ptr addrspace(1) %out
+ ret void
+}
+
+
+define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in) #0 {
+; GFX8GISEL-LABEL: uniform_value_float:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
; GFX9GISEL-LABEL: uniform_value_float:
; GFX9GISEL: ; %bb.0: ; %entry
; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
@@ -57,16 +713,20 @@ define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in)
; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9GISEL-NEXT: s_endpgm
;
-; GFX10DAGISEL-LABEL: uniform_value_float:
-; GFX10DAGISEL: ; %bb.0: ; %entry
-; GFX10DAGISEL-NEXT: s_clause 0x1
-; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX10DAGISEL-NEXT: s_endpgm
+; GFX1064DAGISEL-LABEL: uniform_value_float:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_clause 0x1
+; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
;
; GFX10GISEL-LABEL: uniform_value_float:
; GFX10GISEL: ; %bb.0: ; %entry
@@ -79,6 +739,21 @@ define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in)
; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10GISEL-NEXT: s_endpgm
;
+; GFX1032DAGISEL-LABEL: uniform_value_float:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_clause 0x1
+; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
; GFX1164DAGISEL-LABEL: uniform_value_float:
; GFX1164DAGISEL: ; %bb.0: ; %entry
; GFX1164DAGISEL-NEXT: s_clause 0x1
@@ -125,7 +800,11 @@ define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in)
; GFX12DAGISEL: ; %bb.0: ; %entry
; GFX12DAGISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12DAGISEL-NEXT: s_endpgm
entry:
@@ -140,7 +819,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000
-; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8
@@ -148,7 +827,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX8DAGISEL-NEXT: v_min_f32_e32 v3, s6, v3
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8DAGISEL-NEXT: ; %bb.2:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
@@ -160,7 +839,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX8GISEL-NEXT: s_mov_b32 s6, 0x7fc00000
-; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8
@@ -168,7 +847,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX8GISEL-NEXT: v_min_f32_e32 v3, s6, v3
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8GISEL-NEXT: ; %bb.2:
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
@@ -180,7 +859,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000
-; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8
@@ -188,7 +867,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX9DAGISEL-NEXT: v_min_f32_e32 v3, s6, v3
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9DAGISEL-NEXT: ; %bb.2:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX9DAGISEL-NEXT: global_store_dword v[0:1], v2, off
@@ -200,7 +879,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX9GISEL-NEXT: s_mov_b32 s6, 0x7fc00000
-; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8
@@ -208,7 +887,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX9GISEL-NEXT: v_min_f32_e32 v3, s6, v3
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9GISEL-NEXT: ; %bb.2:
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX9GISEL-NEXT: global_store_dword v[0:1], v2, off
@@ -220,14 +899,14 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000
-; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX1064DAGISEL-NEXT: v_min_f32_e64 v3, s6, s8
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064DAGISEL-NEXT: ; %bb.2:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX1064DAGISEL-NEXT: global_store_dword v[0:1], v2, off
@@ -238,14 +917,14 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX1064GISEL-NEXT: s_mov_b32 s6, 0x7fc00000
-; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX1064GISEL-NEXT: v_min_f32_e64 v3, s6, s8
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064GISEL-NEXT: ; %bb.2:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX1064GISEL-NEXT: global_store_dword v[0:1], v2, off
@@ -256,14 +935,14 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
; GFX1032DAGISEL-NEXT: s_mov_b32 s5, 0x7fc00000
-; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4
; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, v2, s6
; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s6
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX1032DAGISEL-NEXT: v_min_f32_e64 v3, s5, s7
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032DAGISEL-NEXT: ; %bb.2:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5
; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v2, off
@@ -274,14 +953,14 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo
; GFX1032GISEL-NEXT: s_mov_b32 s5, 0x7fc00000
-; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4
; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6
; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX1032GISEL-NEXT: v_min_f32_e64 v3, s5, s7
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032GISEL-NEXT: ; %bb.2:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s5
; GFX1032GISEL-NEXT: global_store_dword v[0:1], v2, off
@@ -292,7 +971,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1164DAGISEL-NEXT: s_mov_b32 s2, 0x7fc00000
-; GFX1164DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -301,7 +980,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX1164DAGISEL-NEXT: v_min_f32_e64 v3, s2, s4
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164DAGISEL-NEXT: ; %bb.2:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX1164DAGISEL-NEXT: global_store_b32 v[0:1], v2, off
@@ -312,7 +991,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1164GISEL-NEXT: s_mov_b32 s2, 0x7fc00000
-; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -321,7 +1000,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX1164GISEL-NEXT: v_min_f32_e64 v3, s2, s4
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164GISEL-NEXT: ; %bb.2:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX1164GISEL-NEXT: global_store_b32 v[0:1], v2, off
@@ -332,7 +1011,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0x7fc00000
-; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s2, s0
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2
@@ -341,7 +1020,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX1132DAGISEL-NEXT: v_min_f32_e64 v3, s1, s3
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v3
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132DAGISEL-NEXT: ; %bb.2:
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1
; GFX1132DAGISEL-NEXT: global_store_b32 v[0:1], v2, off
@@ -352,7 +1031,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1132GISEL-NEXT: s_mov_b32 s1, 0x7fc00000
-; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s0
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2
@@ -361,7 +1040,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX1132GISEL-NEXT: v_min_f32_e64 v3, s1, s3
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v3
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132GISEL-NEXT: ; %bb.2:
; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s1
; GFX1132GISEL-NEXT: global_store_b32 v[0:1], v2, off
@@ -376,7 +1055,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
; GFX12DAGISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0x7fc00000
-; GFX12DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s2, s0
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -387,7 +1066,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX12DAGISEL-NEXT: v_min_num_f32_e64 v3, s1, s3
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v3
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX12DAGISEL-NEXT: ; %bb.2:
; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v2, s1
@@ -420,7 +1099,9 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) #0 {
; GFX8DAGISEL-NEXT: s_nop 1
; GFX8DAGISEL-NEXT: v_min_f32_dpp v3, v3, v3 row_bcast:15 row_mask:0xf bank_mask:0xf
; GFX8DAGISEL-NEXT: s_nop 1
-; GFX8DAGISEL-NEXT: v_min_f32_dpp v3, v3, v3 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_min_f32_dpp v3, v3, v3 row_bcast:31 row_mask:0xf bank_mask:0xf
; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v3, 63
; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6
@@ -482,7 +1163,9 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) #0 {
; GFX9DAGISEL-NEXT: s_nop 1
; GFX9DAGISEL-NEXT: v_min_f32_dpp v3, v3, v3 row_bcast:15 row_mask:0xf bank_mask:0xf
; GFX9DAGISEL-NEXT: s_nop 1
-; GFX9DAGISEL-NEXT: v_min_f32_dpp v3, v3, v3 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_min_f32_dpp v3, v3, v3 row_bcast:31 row_mask:0xf bank_mask:0xf
; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v3, 63
; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
@@ -542,7 +1225,9 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) #0 {
; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v5, 32, v5
; GFX1064DAGISEL-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v5, 4, v5
-; GFX1064DAGISEL-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064DAGISEL-NEXT: v_min_f32_e32 v3, v3, v4
@@ -622,7 +1307,9 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) #0 {
; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v3, 31
; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s4
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5
-; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v2, off
+; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v2, off
; GFX1032DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1
; GFX1032DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload
; GFX1032DAGISEL-NEXT: buffer_load_dword v3, off, s[0:3], s32
@@ -662,49 +1349,51 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) #0 {
; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1164DAGISEL-LABEL: divergent_value_float_dpp:
-; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX1164DAGISEL-NEXT: s_clause 0x2 ; 12-byte Folded Spill
-; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v3, s32
-; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:4
-; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:8
-; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v3, 0x7fc00000, v2, s[0:1]
-; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v5, -1, 0
-; GFX1164DAGISEL-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v5, -1, v5
-; GFX1164DAGISEL-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v5, 32, v5
-; GFX1164DAGISEL-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v5, 4, v5
-; GFX1164DAGISEL-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: v_min_f32_e32 v3, v3, v4
-; GFX1164DAGISEL-NEXT: ds_permute_b32 v4, v5, v3
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: v_min_f32_e32 v3, v3, v4
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_readlane_b32 s2, v3, 63
-; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164DAGISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
-; GFX1164DAGISEL-NEXT: s_clause 0x2 ; 12-byte Folded Reload
-; GFX1164DAGISEL-NEXT: scratch_load_b32 v3, off, s32
-; GFX1164DAGISEL-NEXT: scratch_load_b32 v4, off, s32 offset:4
-; GFX1164DAGISEL-NEXT: scratch_load_b32 v5, off, s32 offset:8
-; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1164DAGISEL-FAKE16-LABEL: divergent_value_float_dpp:
+; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-FAKE16-NEXT: s_clause 0x2 ; 12-byte Folded Spill
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_store_b32 off, v3, s32
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_store_b32 off, v4, s32 offset:4
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_store_b32 off, v5, s32 offset:8
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0x7fc00000, v2, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v5, -1, 0
+; GFX1164DAGISEL-FAKE16-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v5, -1, v5
+; GFX1164DAGISEL-FAKE16-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_add_nc_u32_e32 v5, 32, v5
+; GFX1164DAGISEL-FAKE16-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mul_lo_u32 v5, 4, v5
+; GFX1164DAGISEL-FAKE16-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX1164DAGISEL-FAKE16-NEXT: ds_permute_b32 v4, v5, v3
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s2, v3, 63
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-FAKE16-NEXT: global_store_b32 v[0:1], v2, off
+; GFX1164DAGISEL-FAKE16-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-FAKE16-NEXT: s_clause 0x2 ; 12-byte Folded Reload
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_load_b32 v3, off, s32
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_load_b32 v4, off, s32 offset:4
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_load_b32 v5, off, s32 offset:8
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1164GISEL-LABEL: divergent_value_float_dpp:
; GFX1164GISEL: ; %bb.0: ; %entry
@@ -750,37 +1439,39 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) #0 {
; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1132DAGISEL-LABEL: divergent_value_float_dpp:
-; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX1132DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill
-; GFX1132DAGISEL-NEXT: scratch_store_b32 off, v3, s32
-; GFX1132DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:4
-; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v3, 0x7fc00000, v2, s0
-; GFX1132DAGISEL-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
-; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: v_min_f32_e32 v3, v3, v4
-; GFX1132DAGISEL-NEXT: v_readlane_b32 s1, v3, 31
-; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1
-; GFX1132DAGISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX1132DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload
-; GFX1132DAGISEL-NEXT: scratch_load_b32 v3, off, s32
-; GFX1132DAGISEL-NEXT: scratch_load_b32 v4, off, s32 offset:4
-; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1132DAGISEL-FAKE16-LABEL: divergent_value_float_dpp:
+; GFX1132DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-FAKE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_store_b32 off, v3, s32
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_store_b32 off, v4, s32 offset:4
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-FAKE16-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0x7fc00000, v2, s0
+; GFX1132DAGISEL-FAKE16-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX1132DAGISEL-FAKE16-NEXT: v_readlane_b32 s1, v3, 31
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s1
+; GFX1132DAGISEL-FAKE16-NEXT: global_store_b32 v[0:1], v2, off
+; GFX1132DAGISEL-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-FAKE16-NEXT: s_clause 0x1 ; 8-byte Folded Reload
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_load_b32 v3, off, s32
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_load_b32 v4, off, s32 offset:4
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1132GISEL-LABEL: divergent_value_float_dpp:
; GFX1132GISEL: ; %bb.0: ; %entry
@@ -814,6 +1505,86 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) #0 {
; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31]
;
+; GFX1164DAGISEL-TRUE16-LABEL: divergent_value_float_dpp:
+; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-TRUE16-NEXT: s_clause 0x2 ; 12-byte Folded Spill
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_store_b32 off, v3, s32
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_store_b32 off, v4, s32 offset:4
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_store_b32 off, v5, s32 offset:8
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0x7fc00000, v2, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v5, -1, 0
+; GFX1164DAGISEL-TRUE16-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v5, -1, v5
+; GFX1164DAGISEL-TRUE16-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_add_nc_u32_e32 v5, 32, v5
+; GFX1164DAGISEL-TRUE16-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mul_lo_u32 v5, 4, v5
+; GFX1164DAGISEL-TRUE16-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX1164DAGISEL-TRUE16-NEXT: ds_permute_b32 v4, v5, v3
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s2, v3, 63
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-TRUE16-NEXT: global_store_b32 v[0:1], v2, off
+; GFX1164DAGISEL-TRUE16-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-TRUE16-NEXT: s_clause 0x2 ; 12-byte Folded Reload
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_load_b32 v3, off, s32
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_load_b32 v4, off, s32 offset:4
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_load_b32 v5, off, s32 offset:8
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-TRUE16-LABEL: divergent_value_float_dpp:
+; GFX1132DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_store_b32 off, v3, s32
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_store_b32 off, v4, s32 offset:4
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-TRUE16-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0x7fc00000, v2, s0
+; GFX1132DAGISEL-TRUE16-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_min_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: v_min_f32_e32 v3, v3, v4
+; GFX1132DAGISEL-TRUE16-NEXT: v_readlane_b32 s1, v3, 31
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v2, s1
+; GFX1132DAGISEL-TRUE16-NEXT: global_store_b32 v[0:1], v2, off
+; GFX1132DAGISEL-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Reload
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_load_b32 v3, off, s32
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_load_b32 v4, off, s32 offset:4
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX12DAGISEL-LABEL: divergent_value_float_dpp:
; GFX12DAGISEL: ; %bb.0: ; %entry
; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -833,7 +1604,9 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %in) #0 {
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_min_num_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX12DAGISEL-NEXT: v_min_num_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_min_num_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX12DAGISEL-NEXT: v_min_num_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX12DAGISEL-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
@@ -923,7 +1696,16 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX8DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX8DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX8DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX8DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX8DAGISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX8DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX8DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
@@ -1075,7 +1857,16 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX9DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX9DAGISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
@@ -1219,7 +2010,16 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s7
; GFX1064DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX1064DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX1064DAGISEL-NEXT: s_clause 0x7 ; 32-byte Folded Reload
+; GFX1064DAGISEL-NEXT: s_clause 0x7 ; 32-byte Folded Reload
; GFX1064DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32
; GFX1064DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
; GFX1064DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8
@@ -1411,81 +2211,86 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1164DAGISEL-LABEL: divergent_value_double_dpp:
-; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX1164DAGISEL-NEXT: s_clause 0x3 ; 28-byte Folded Spill
-; GFX1164DAGISEL-NEXT: scratch_store_b64 off, v[4:5], s32
-; GFX1164DAGISEL-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
-; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v6, s32 offset:16
-; GFX1164DAGISEL-NEXT: scratch_store_b64 off, v[7:8], s32 offset:20
-; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[0:1]
-; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v3, s[0:1]
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
-; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
-; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
-; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v6, -1, 0
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v6, -1, v6
-; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v6, 32, v6
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v6, 4, v6
-; GFX1164DAGISEL-NEXT: ds_permute_b32 v7, v6, v4
-; GFX1164DAGISEL-NEXT: ds_permute_b32 v8, v6, v5
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: v_min_f64 v[4:5], v[4:5], v[7:8]
-; GFX1164DAGISEL-NEXT: v_readlane_b32 s2, v4, 63
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_readlane_b32 s3, v5, 63
-; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
-; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
-; GFX1164DAGISEL-NEXT: s_clause 0x3 ; 28-byte Folded Reload
-; GFX1164DAGISEL-NEXT: scratch_load_b64 v[4:5], off, s32
-; GFX1164DAGISEL-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
-; GFX1164DAGISEL-NEXT: scratch_load_b32 v6, off, s32 offset:16
-; GFX1164DAGISEL-NEXT: scratch_load_b64 v[7:8], off, s32 offset:20
-; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1164DAGISEL-FAKE16-LABEL: divergent_value_double_dpp:
+; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-FAKE16-NEXT: s_clause 0x3 ; 28-byte Folded Spill
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_store_b32 off, v6, s32 offset:16
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_store_b64 off, v[7:8], s32 offset:20
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v3, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-FAKE16-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-FAKE16-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v6, -1, 0
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v6, -1, v6
+; GFX1164DAGISEL-FAKE16-NEXT: v_add_nc_u32_e32 v6, 32, v6
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mul_lo_u32 v6, 4, v6
+; GFX1164DAGISEL-FAKE16-NEXT: ds_permute_b32 v7, v6, v4
+; GFX1164DAGISEL-FAKE16-NEXT: ds_permute_b32 v8, v6, v5
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: v_min_f64 v[4:5], v[4:5], v[7:8]
+; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s2, v4, 63
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s3, v5, 63
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164DAGISEL-FAKE16-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-FAKE16-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-FAKE16-NEXT: s_clause 0x3 ; 28-byte Folded Reload
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:16
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_load_b64 v[7:8], off, s32 offset:20
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1164GISEL-LABEL: divergent_value_double_dpp:
; GFX1164GISEL: ; %bb.0: ; %entry
@@ -1563,59 +2368,68 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1132DAGISEL-LABEL: divergent_value_double_dpp:
-; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX1132DAGISEL-NEXT: s_clause 0x1 ; 16-byte Folded Spill
-; GFX1132DAGISEL-NEXT: scratch_store_b64 off, v[4:5], s32
-; GFX1132DAGISEL-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
-; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s2
-; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v3, s2
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
-; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
-; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132DAGISEL-NEXT: v_readlane_b32 s0, v4, 31
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1132DAGISEL-NEXT: v_readlane_b32 s1, v5, 31
-; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
-; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX1132DAGISEL-NEXT: s_clause 0x1 ; 16-byte Folded Reload
-; GFX1132DAGISEL-NEXT: scratch_load_b64 v[4:5], off, s32
-; GFX1132DAGISEL-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
-; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1132DAGISEL-FAKE16-LABEL: divergent_value_double_dpp:
+; GFX1132DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-FAKE16-NEXT: s_clause 0x1 ; 16-byte Folded Spill
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-FAKE16-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX1132DAGISEL-FAKE16-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v3, s2
+; GFX1132DAGISEL-FAKE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-FAKE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-FAKE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-FAKE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-FAKE16-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-FAKE16-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-FAKE16-NEXT: v_readlane_b32 s0, v4, 31
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132DAGISEL-FAKE16-NEXT: v_readlane_b32 s1, v5, 31
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-FAKE16-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-FAKE16-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-FAKE16-NEXT: s_clause 0x1 ; 16-byte Folded Reload
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1132GISEL-LABEL: divergent_value_double_dpp:
; GFX1132GISEL: ; %bb.0: ; %entry
@@ -1671,6 +2485,150 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31]
;
+; GFX1164DAGISEL-TRUE16-LABEL: divergent_value_double_dpp:
+; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-TRUE16-NEXT: s_clause 0x3 ; 28-byte Folded Spill
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_store_b32 off, v6, s32 offset:16
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_store_b64 off, v[7:8], s32 offset:20
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v3, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-TRUE16-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-TRUE16-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v6, -1, 0
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v6, -1, v6
+; GFX1164DAGISEL-TRUE16-NEXT: v_add_nc_u32_e32 v6, 32, v6
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mul_lo_u32 v6, 4, v6
+; GFX1164DAGISEL-TRUE16-NEXT: ds_permute_b32 v7, v6, v4
+; GFX1164DAGISEL-TRUE16-NEXT: ds_permute_b32 v8, v6, v5
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: v_min_f64 v[4:5], v[4:5], v[7:8]
+; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s2, v4, 63
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s3, v5, 63
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164DAGISEL-TRUE16-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-TRUE16-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-TRUE16-NEXT: s_clause 0x3 ; 28-byte Folded Reload
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_load_b32 v6, off, s32 offset:16
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_load_b64 v[7:8], off, s32 offset:20
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-TRUE16-LABEL: divergent_value_double_dpp:
+; GFX1132DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-TRUE16-NEXT: s_clause 0x1 ; 16-byte Folded Spill
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-TRUE16-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX1132DAGISEL-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v3, s2
+; GFX1132DAGISEL-TRUE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-TRUE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-TRUE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-TRUE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-TRUE16-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-TRUE16-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-TRUE16-NEXT: v_readlane_b32 s0, v4, 31
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132DAGISEL-TRUE16-NEXT: v_readlane_b32 s1, v5, 31
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-TRUE16-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-TRUE16-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-TRUE16-NEXT: s_clause 0x1 ; 16-byte Folded Reload
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX12DAGISEL-LABEL: divergent_value_double_dpp:
; GFX12DAGISEL: ; %bb.0: ; %entry
; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1720,7 +2678,16 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX12DAGISEL-NEXT: v_readlane_b32 s0, v4, 31
; GFX12DAGISEL-NEXT: v_readlane_b32 s1, v5, 31
; GFX12DAGISEL-NEXT: s_mov_b32 exec_lo, s2
-; GFX12DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
; GFX12DAGISEL-NEXT: s_clause 0x1 ; 16-byte Folded Reload
@@ -1745,11 +2712,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr4
; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000
-; GFX8DAGISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
; GFX8DAGISEL-NEXT: v_readlane_b32 s10, v2, s9
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s10
@@ -1757,17 +2724,17 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX8DAGISEL-NEXT: v_min_f32_e32 v3, s8, v3
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s8, v3
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB6_2
; GFX8DAGISEL-NEXT: ; %bb.3:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s8
; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr3
-; GFX8DAGISEL-NEXT: .LBB4_4: ; %Flow
+; GFX8DAGISEL-NEXT: .LBB6_4: ; %Flow
; GFX8DAGISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_8
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB6_8
; GFX8DAGISEL-NEXT: ; %bb.5: ; %if
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000
-; GFX8DAGISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
; GFX8DAGISEL-NEXT: v_readlane_b32 s10, v3, s9
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s10
@@ -1775,10 +2742,10 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX8DAGISEL-NEXT: v_min_f32_e32 v2, s8, v2
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s8, v2
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_6
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB6_6
; GFX8DAGISEL-NEXT: ; %bb.7:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s8
-; GFX8DAGISEL-NEXT: .LBB4_8: ; %endif
+; GFX8DAGISEL-NEXT: .LBB6_8: ; %endif
; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v4
; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1792,11 +2759,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX8GISEL-NEXT: ; implicit-def: $vgpr4
; GFX8GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX8GISEL-NEXT: ; %bb.1: ; %else
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8GISEL-NEXT: s_mov_b32 s8, 0x7fc00000
-; GFX8GISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
; GFX8GISEL-NEXT: v_readlane_b32 s10, v2, s9
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s10
@@ -1804,17 +2771,17 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX8GISEL-NEXT: v_min_f32_e32 v3, s8, v3
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX8GISEL-NEXT: v_readfirstlane_b32 s8, v3
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB6_2
; GFX8GISEL-NEXT: ; %bb.3:
; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s8
; GFX8GISEL-NEXT: ; implicit-def: $vgpr3
-; GFX8GISEL-NEXT: .LBB4_4: ; %Flow
+; GFX8GISEL-NEXT: .LBB6_4: ; %Flow
; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_8
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_8
; GFX8GISEL-NEXT: ; %bb.5: ; %if
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8GISEL-NEXT: s_mov_b32 s8, 0x7fc00000
-; GFX8GISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
; GFX8GISEL-NEXT: v_readlane_b32 s10, v3, s9
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s10
@@ -1822,10 +2789,10 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX8GISEL-NEXT: v_min_f32_e32 v2, s8, v2
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX8GISEL-NEXT: v_readfirstlane_b32 s8, v2
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_6
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB6_6
; GFX8GISEL-NEXT: ; %bb.7:
; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s8
-; GFX8GISEL-NEXT: .LBB4_8: ; %endif
+; GFX8GISEL-NEXT: .LBB6_8: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v4
; GFX8GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1839,11 +2806,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr4
; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX9DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000
-; GFX9DAGISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
; GFX9DAGISEL-NEXT: v_readlane_b32 s10, v2, s9
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s10
@@ -1851,17 +2818,17 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX9DAGISEL-NEXT: v_min_f32_e32 v3, s8, v3
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s8, v3
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB6_2
; GFX9DAGISEL-NEXT: ; %bb.3:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s8
; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr3
-; GFX9DAGISEL-NEXT: .LBB4_4: ; %Flow
+; GFX9DAGISEL-NEXT: .LBB6_4: ; %Flow
; GFX9DAGISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_8
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB6_8
; GFX9DAGISEL-NEXT: ; %bb.5: ; %if
; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX9DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000
-; GFX9DAGISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
; GFX9DAGISEL-NEXT: v_readlane_b32 s10, v3, s9
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s10
@@ -1869,10 +2836,10 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX9DAGISEL-NEXT: v_min_f32_e32 v2, s8, v2
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s8, v2
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_6
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB6_6
; GFX9DAGISEL-NEXT: ; %bb.7:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s8
-; GFX9DAGISEL-NEXT: .LBB4_8: ; %endif
+; GFX9DAGISEL-NEXT: .LBB6_8: ; %endif
; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9DAGISEL-NEXT: global_store_dword v[0:1], v4, off
; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1886,11 +2853,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX9GISEL-NEXT: ; implicit-def: $vgpr4
; GFX9GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX9GISEL-NEXT: ; %bb.1: ; %else
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX9GISEL-NEXT: s_mov_b32 s8, 0x7fc00000
-; GFX9GISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
; GFX9GISEL-NEXT: v_readlane_b32 s10, v2, s9
; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s10
@@ -1898,17 +2865,17 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX9GISEL-NEXT: v_min_f32_e32 v3, s8, v3
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9GISEL-NEXT: v_readfirstlane_b32 s8, v3
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB6_2
; GFX9GISEL-NEXT: ; %bb.3:
; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s8
; GFX9GISEL-NEXT: ; implicit-def: $vgpr3
-; GFX9GISEL-NEXT: .LBB4_4: ; %Flow
+; GFX9GISEL-NEXT: .LBB6_4: ; %Flow
; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_8
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_8
; GFX9GISEL-NEXT: ; %bb.5: ; %if
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX9GISEL-NEXT: s_mov_b32 s8, 0x7fc00000
-; GFX9GISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
; GFX9GISEL-NEXT: v_readlane_b32 s10, v3, s9
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s10
@@ -1916,10 +2883,10 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX9GISEL-NEXT: v_min_f32_e32 v2, s8, v2
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9GISEL-NEXT: v_readfirstlane_b32 s8, v2
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_6
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB6_6
; GFX9GISEL-NEXT: ; %bb.7:
; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s8
-; GFX9GISEL-NEXT: .LBB4_8: ; %endif
+; GFX9GISEL-NEXT: .LBB6_8: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9GISEL-NEXT: global_store_dword v[0:1], v4, off
; GFX9GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1933,38 +2900,38 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr4
; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1064DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000
-; GFX1064DAGISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s10, v2, s9
; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX1064DAGISEL-NEXT: v_min_f32_e64 v3, s8, s10
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s8, v3
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB6_2
; GFX1064DAGISEL-NEXT: ; %bb.3:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s8
; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr3
-; GFX1064DAGISEL-NEXT: .LBB4_4: ; %Flow
+; GFX1064DAGISEL-NEXT: .LBB6_4: ; %Flow
; GFX1064DAGISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_8
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB6_8
; GFX1064DAGISEL-NEXT: ; %bb.5: ; %if
; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1064DAGISEL-NEXT: s_mov_b32 s8, 0x7fc00000
-; GFX1064DAGISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s10, v3, s9
; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s9
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX1064DAGISEL-NEXT: v_min_f32_e64 v2, s8, s10
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s8, v2
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_6
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB6_6
; GFX1064DAGISEL-NEXT: ; %bb.7:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s8
-; GFX1064DAGISEL-NEXT: .LBB4_8: ; %endif
+; GFX1064DAGISEL-NEXT: .LBB6_8: ; %endif
; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064DAGISEL-NEXT: global_store_dword v[0:1], v4, off
; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1977,38 +2944,38 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1064GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1064GISEL-NEXT: ; %bb.1: ; %else
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1064GISEL-NEXT: s_mov_b32 s8, 0x7fc00000
-; GFX1064GISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
; GFX1064GISEL-NEXT: v_readlane_b32 s10, v2, s9
; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s9
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX1064GISEL-NEXT: v_min_f32_e64 v3, s8, s10
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s8, v3
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB6_2
; GFX1064GISEL-NEXT: ; %bb.3:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, s8
; GFX1064GISEL-NEXT: ; implicit-def: $vgpr3
-; GFX1064GISEL-NEXT: .LBB4_4: ; %Flow
+; GFX1064GISEL-NEXT: .LBB6_4: ; %Flow
; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_8
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_8
; GFX1064GISEL-NEXT: ; %bb.5: ; %if
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1064GISEL-NEXT: s_mov_b32 s8, 0x7fc00000
-; GFX1064GISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7]
; GFX1064GISEL-NEXT: v_readlane_b32 s10, v3, s9
; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s9
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX1064GISEL-NEXT: v_min_f32_e64 v2, s8, s10
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s8, v2
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_6
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB6_6
; GFX1064GISEL-NEXT: ; %bb.7:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, s8
-; GFX1064GISEL-NEXT: .LBB4_8: ; %endif
+; GFX1064GISEL-NEXT: .LBB6_8: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064GISEL-NEXT: global_store_dword v[0:1], v4, off
; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2021,38 +2988,38 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr4
; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032DAGISEL-NEXT: s_xor_b32 s4, exec_lo, s4
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1032DAGISEL-NEXT: s_mov_b32 s5, exec_lo
; GFX1032DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000
-; GFX1032DAGISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s5
; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1032DAGISEL-NEXT: s_bitset0_b32 s5, s7
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s5, 0
; GFX1032DAGISEL-NEXT: v_min_f32_e64 v3, s6, s8
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB6_2
; GFX1032DAGISEL-NEXT: ; %bb.3:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v4, s6
; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr3
-; GFX1032DAGISEL-NEXT: .LBB4_4: ; %Flow
+; GFX1032DAGISEL-NEXT: .LBB6_4: ; %Flow
; GFX1032DAGISEL-NEXT: s_andn2_saveexec_b32 s4, s4
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_8
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB6_8
; GFX1032DAGISEL-NEXT: ; %bb.5: ; %if
; GFX1032DAGISEL-NEXT: s_mov_b32 s5, exec_lo
; GFX1032DAGISEL-NEXT: s_mov_b32 s6, 0x7fc00000
-; GFX1032DAGISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s5
; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v3, s7
; GFX1032DAGISEL-NEXT: s_bitset0_b32 s5, s7
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s5, 0
; GFX1032DAGISEL-NEXT: v_min_f32_e64 v2, s6, s8
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s6, v2
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_6
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB6_6
; GFX1032DAGISEL-NEXT: ; %bb.7:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v4, s6
-; GFX1032DAGISEL-NEXT: .LBB4_8: ; %endif
+; GFX1032DAGISEL-NEXT: .LBB6_8: ; %endif
; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v4, off
; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2065,90 +3032,96 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1032GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1032GISEL-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032GISEL-NEXT: s_xor_b32 s4, exec_lo, s4
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1032GISEL-NEXT: ; %bb.1: ; %else
; GFX1032GISEL-NEXT: s_mov_b32 s5, exec_lo
; GFX1032GISEL-NEXT: s_mov_b32 s6, 0x7fc00000
-; GFX1032GISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s5
; GFX1032GISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1032GISEL-NEXT: s_bitset0_b32 s5, s7
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GFX1032GISEL-NEXT: v_min_f32_e64 v3, s6, s8
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB6_2
; GFX1032GISEL-NEXT: ; %bb.3:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v4, s6
; GFX1032GISEL-NEXT: ; implicit-def: $vgpr3
-; GFX1032GISEL-NEXT: .LBB4_4: ; %Flow
+; GFX1032GISEL-NEXT: .LBB6_4: ; %Flow
; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s4, s4
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_8
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_8
; GFX1032GISEL-NEXT: ; %bb.5: ; %if
; GFX1032GISEL-NEXT: s_mov_b32 s5, exec_lo
; GFX1032GISEL-NEXT: s_mov_b32 s6, 0x7fc00000
-; GFX1032GISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s5
; GFX1032GISEL-NEXT: v_readlane_b32 s8, v3, s7
; GFX1032GISEL-NEXT: s_bitset0_b32 s5, s7
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GFX1032GISEL-NEXT: v_min_f32_e64 v2, s6, s8
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s6, v2
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_6
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB6_6
; GFX1032GISEL-NEXT: ; %bb.7:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v4, s6
-; GFX1032GISEL-NEXT: .LBB4_8: ; %endif
+; GFX1032GISEL-NEXT: .LBB6_8: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032GISEL-NEXT: global_store_dword v[0:1], v4, off
; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1164DAGISEL-LABEL: divergent_cfg_float:
-; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v31
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v4
-; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr4
-; GFX1164DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_4
-; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0x7fc00000
-; GFX1164DAGISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
-; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v2, s5
-; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: v_min_f32_e64 v3, s4, s6
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s4, v3
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_2
-; GFX1164DAGISEL-NEXT: ; %bb.3:
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4
-; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr3
-; GFX1164DAGISEL-NEXT: .LBB4_4: ; %Flow
-; GFX1164DAGISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_8
-; GFX1164DAGISEL-NEXT: ; %bb.5: ; %if
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0x7fc00000
-; GFX1164DAGISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
-; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v3, s5
-; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
-; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT: v_min_f32_e64 v2, s4, s6
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s4, v2
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_6
-; GFX1164DAGISEL-NEXT: ; %bb.7:
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4
-; GFX1164DAGISEL-NEXT: .LBB4_8: ; %endif
-; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT: global_store_b32 v[0:1], v4, off
-; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1164DAGISEL-FAKE16-LABEL: divergent_cfg_float:
+; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: v_and_b32_e32 v4, 0x3ff, v31
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_cmp_lt_u32_e32 vcc, 15, v4
+; GFX1164DAGISEL-FAKE16-NEXT: ; implicit-def: $vgpr4
+; GFX1164DAGISEL-FAKE16-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1164DAGISEL-FAKE16-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_cbranch_execz .LBB6_4
+; GFX1164DAGISEL-FAKE16-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b32 s4, 0x7fc00000
+; GFX1164DAGISEL-FAKE16-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-FAKE16-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s6, v2, s5
+; GFX1164DAGISEL-FAKE16-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-FAKE16-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-FAKE16-NEXT: v_min_f32_e64 v3, s4, s6
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_readfirstlane_b32 s4, v3
+; GFX1164DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB6_2
+; GFX1164DAGISEL-FAKE16-NEXT: ; %bb.3:
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX1164DAGISEL-FAKE16-NEXT: ; implicit-def: $vgpr3
+; GFX1164DAGISEL-FAKE16-NEXT: .LBB6_4: ; %Flow
+; GFX1164DAGISEL-FAKE16-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_cbranch_execz .LBB6_8
+; GFX1164DAGISEL-FAKE16-NEXT: ; %bb.5: ; %if
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b32 s4, 0x7fc00000
+; GFX1164DAGISEL-FAKE16-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-FAKE16-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s6, v3, s5
+; GFX1164DAGISEL-FAKE16-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-FAKE16-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-FAKE16-NEXT: v_min_f32_e64 v2, s4, s6
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1164DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB6_6
+; GFX1164DAGISEL-FAKE16-NEXT: ; %bb.7:
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX1164DAGISEL-FAKE16-NEXT: .LBB6_8: ; %endif
+; GFX1164DAGISEL-FAKE16-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: global_store_b32 v[0:1], v4, off
+; GFX1164DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1164GISEL-LABEL: divergent_cfg_float:
; GFX1164GISEL: ; %bb.0: ; %entry
@@ -2159,11 +3132,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1164GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1164GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1164GISEL-NEXT: ; %bb.1: ; %else
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1164GISEL-NEXT: s_mov_b32 s4, 0x7fc00000
-; GFX1164GISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v2, s5
@@ -2172,17 +3145,17 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1164GISEL-NEXT: v_min_f32_e64 v3, s4, s6
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s4, v3
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB6_2
; GFX1164GISEL-NEXT: ; %bb.3:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, s4
; GFX1164GISEL-NEXT: ; implicit-def: $vgpr3
-; GFX1164GISEL-NEXT: .LBB4_4: ; %Flow
+; GFX1164GISEL-NEXT: .LBB6_4: ; %Flow
; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_8
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_8
; GFX1164GISEL-NEXT: ; %bb.5: ; %if
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
; GFX1164GISEL-NEXT: s_mov_b32 s4, 0x7fc00000
-; GFX1164GISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v3, s5
@@ -2191,10 +3164,10 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1164GISEL-NEXT: v_min_f32_e64 v2, s4, s6
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s4, v2
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_6
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB6_6
; GFX1164GISEL-NEXT: ; %bb.7:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, s4
-; GFX1164GISEL-NEXT: .LBB4_8: ; %endif
+; GFX1164GISEL-NEXT: .LBB6_8: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164GISEL-NEXT: global_store_b32 v[0:1], v4, off
; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2208,11 +3181,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr4
; GFX1132DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, exec_lo
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0x7fc00000
-; GFX1132DAGISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -2221,17 +3194,17 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1132DAGISEL-NEXT: v_min_f32_e64 v3, s2, s4
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB6_2
; GFX1132DAGISEL-NEXT: ; %bb.3:
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v4, s2
; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr3
-; GFX1132DAGISEL-NEXT: .LBB4_4: ; %Flow
+; GFX1132DAGISEL-NEXT: .LBB6_4: ; %Flow
; GFX1132DAGISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_8
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB6_8
; GFX1132DAGISEL-NEXT: ; %bb.5: ; %if
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, exec_lo
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0x7fc00000
-; GFX1132DAGISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v3, s3
@@ -2240,10 +3213,10 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1132DAGISEL-NEXT: v_min_f32_e64 v2, s2, s4
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s2, v2
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_6
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB6_6
; GFX1132DAGISEL-NEXT: ; %bb.7:
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v4, s2
-; GFX1132DAGISEL-NEXT: .LBB4_8: ; %endif
+; GFX1132DAGISEL-NEXT: .LBB6_8: ; %endif
; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132DAGISEL-NEXT: global_store_b32 v[0:1], v4, off
; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2257,11 +3230,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1132GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1132GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1132GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1132GISEL-NEXT: ; %bb.1: ; %else
; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
; GFX1132GISEL-NEXT: s_mov_b32 s2, 0x7fc00000
-; GFX1132GISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -2270,17 +3243,17 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1132GISEL-NEXT: v_min_f32_e64 v3, s2, s4
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB6_2
; GFX1132GISEL-NEXT: ; %bb.3:
; GFX1132GISEL-NEXT: v_mov_b32_e32 v4, s2
; GFX1132GISEL-NEXT: ; implicit-def: $vgpr3
-; GFX1132GISEL-NEXT: .LBB4_4: ; %Flow
+; GFX1132GISEL-NEXT: .LBB6_4: ; %Flow
; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_8
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_8
; GFX1132GISEL-NEXT: ; %bb.5: ; %if
; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
; GFX1132GISEL-NEXT: s_mov_b32 s2, 0x7fc00000
-; GFX1132GISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s4, v3, s3
@@ -2289,14 +3262,69 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX1132GISEL-NEXT: v_min_f32_e64 v2, s2, s4
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v2
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_6
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB6_6
; GFX1132GISEL-NEXT: ; %bb.7:
; GFX1132GISEL-NEXT: v_mov_b32_e32 v4, s2
-; GFX1132GISEL-NEXT: .LBB4_8: ; %endif
+; GFX1132GISEL-NEXT: .LBB6_8: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132GISEL-NEXT: global_store_b32 v[0:1], v4, off
; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31]
;
+; GFX1164DAGISEL-TRUE16-LABEL: divergent_cfg_float:
+; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: v_and_b32_e32 v4, 0x3ff, v31
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_cmp_lt_u32_e32 vcc, 15, v4
+; GFX1164DAGISEL-TRUE16-NEXT: ; implicit-def: $vgpr4
+; GFX1164DAGISEL-TRUE16-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1164DAGISEL-TRUE16-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_cbranch_execz .LBB6_4
+; GFX1164DAGISEL-TRUE16-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b32 s4, 0x7fc00000
+; GFX1164DAGISEL-TRUE16-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-TRUE16-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s6, v2, s5
+; GFX1164DAGISEL-TRUE16-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-TRUE16-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-TRUE16-NEXT: v_min_f32_e64 v3, s4, s6
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_readfirstlane_b32 s4, v3
+; GFX1164DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB6_2
+; GFX1164DAGISEL-TRUE16-NEXT: ; %bb.3:
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX1164DAGISEL-TRUE16-NEXT: ; implicit-def: $vgpr3
+; GFX1164DAGISEL-TRUE16-NEXT: .LBB6_4: ; %Flow
+; GFX1164DAGISEL-TRUE16-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_cbranch_execz .LBB6_8
+; GFX1164DAGISEL-TRUE16-NEXT: ; %bb.5: ; %if
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b32 s4, 0x7fc00000
+; GFX1164DAGISEL-TRUE16-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-TRUE16-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s6, v3, s5
+; GFX1164DAGISEL-TRUE16-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-TRUE16-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-TRUE16-NEXT: v_min_f32_e64 v2, s4, s6
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1164DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB6_6
+; GFX1164DAGISEL-TRUE16-NEXT: ; %bb.7:
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v4, s4
+; GFX1164DAGISEL-TRUE16-NEXT: .LBB6_8: ; %endif
+; GFX1164DAGISEL-TRUE16-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: global_store_b32 v[0:1], v4, off
+; GFX1164DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX12DAGISEL-LABEL: divergent_cfg_float:
; GFX12DAGISEL: ; %bb.0: ; %entry
; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2311,11 +3339,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX12DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX12DAGISEL-NEXT: ; %bb.1: ; %else
; GFX12DAGISEL-NEXT: s_mov_b32 s1, exec_lo
; GFX12DAGISEL-NEXT: s_mov_b32 s2, 0x7fc00000
-; GFX12DAGISEL-NEXT: .LBB4_2: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s3, s1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -2326,19 +3354,19 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX12DAGISEL-NEXT: v_min_num_f32_e64 v3, s2, s4
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s2, v3
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB6_2
; GFX12DAGISEL-NEXT: ; %bb.3:
; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v4, s2
; GFX12DAGISEL-NEXT: ; implicit-def: $vgpr3
-; GFX12DAGISEL-NEXT: .LBB4_4: ; %Flow
+; GFX12DAGISEL-NEXT: .LBB6_4: ; %Flow
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB4_8
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB6_8
; GFX12DAGISEL-NEXT: ; %bb.5: ; %if
; GFX12DAGISEL-NEXT: s_mov_b32 s1, exec_lo
; GFX12DAGISEL-NEXT: s_mov_b32 s2, 0x7fc00000
-; GFX12DAGISEL-NEXT: .LBB4_6: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB6_6: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s3, s1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -2349,11 +3377,11 @@ define void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) #
; GFX12DAGISEL-NEXT: v_min_num_f32_e64 v2, s2, s4
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s2, v2
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB4_6
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB6_6
; GFX12DAGISEL-NEXT: ; %bb.7:
; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v4, s2
-; GFX12DAGISEL-NEXT: .LBB4_8: ; %endif
+; GFX12DAGISEL-NEXT: .LBB6_8: ; %endif
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12DAGISEL-NEXT: global_store_b32 v[0:1], v4, off
@@ -2378,17 +3406,6 @@ endif:
}
define amdgpu_kernel void @uniform_value_double(ptr addrspace(1) %out, double %in) #0 {
-; GFX8DAGISEL-LABEL: uniform_value_double:
-; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s3
-; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
-; GFX8DAGISEL-NEXT: s_endpgm
-;
; GFX8GISEL-LABEL: uniform_value_double:
; GFX8GISEL: ; %bb.0: ; %entry
; GFX8GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2400,16 +3417,6 @@ define amdgpu_kernel void @uniform_value_double(ptr addrspace(1) %out, double %i
; GFX8GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8GISEL-NEXT: s_endpgm
;
-; GFX9DAGISEL-LABEL: uniform_value_double:
-; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0
-; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s3
-; GFX9DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX9DAGISEL-NEXT: s_endpgm
-;
; GFX9GISEL-LABEL: uniform_value_double:
; GFX9GISEL: ; %bb.0: ; %entry
; GFX9GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2420,16 +3427,6 @@ define amdgpu_kernel void @uniform_value_double(ptr addrspace(1) %out, double %i
; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9GISEL-NEXT: s_endpgm
;
-; GFX10DAGISEL-LABEL: uniform_value_double:
-; GFX10DAGISEL: ; %bb.0: ; %entry
-; GFX10DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX10DAGISEL-NEXT: v_mov_b32_e32 v2, 0
-; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s3
-; GFX10DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX10DAGISEL-NEXT: s_endpgm
-;
; GFX10GISEL-LABEL: uniform_value_double:
; GFX10GISEL: ; %bb.0: ; %entry
; GFX10GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2440,6 +3437,16 @@ define amdgpu_kernel void @uniform_value_double(ptr addrspace(1) %out, double %i
; GFX10GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10GISEL-NEXT: s_endpgm
;
+; GFX1032DAGISEL-LABEL: uniform_value_double:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
; GFX1164DAGISEL-LABEL: uniform_value_double:
; GFX1164DAGISEL: ; %bb.0: ; %entry
; GFX1164DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2499,7 +3506,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
; GFX8DAGISEL-NEXT: s_mov_b32 s7, 0x7ff80000
; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX8DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s10, s[4:5]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s6
; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s10
@@ -2510,7 +3517,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v4
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s7, v5
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8DAGISEL-NEXT: ; %bb.2:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s7
@@ -2524,7 +3531,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX8GISEL-NEXT: s_mov_b32 s6, 0
; GFX8GISEL-NEXT: s_mov_b32 s7, 0x7ff80000
; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX8GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s10, s[4:5]
; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s6
; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s10
@@ -2535,7 +3542,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v4
; GFX8GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8GISEL-NEXT: ; %bb.2:
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s7
@@ -2549,7 +3556,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
; GFX9DAGISEL-NEXT: s_mov_b32 s7, 0x7ff80000
; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX9DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s10, s[4:5]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s6
; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s10
@@ -2560,7 +3567,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v4
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s7, v5
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9DAGISEL-NEXT: ; %bb.2:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s7
@@ -2574,7 +3581,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX9GISEL-NEXT: s_mov_b32 s6, 0
; GFX9GISEL-NEXT: s_mov_b32 s7, 0x7ff80000
; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX9GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s10, s[4:5]
; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s6
; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s10
@@ -2585,7 +3592,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v4
; GFX9GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9GISEL-NEXT: ; %bb.2:
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s7
@@ -2599,7 +3606,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
; GFX1064DAGISEL-NEXT: s_mov_b32 s7, 0x7ff80000
; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX1064DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s10, s[4:5]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s10
; GFX1064DAGISEL-NEXT: v_readlane_b32 s9, v3, s10
@@ -2608,7 +3615,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1064DAGISEL-NEXT: v_min_f64 v[4:5], s[8:9], s[6:7]
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v4
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s7, v5
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064DAGISEL-NEXT: ; %bb.2:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s7
@@ -2621,7 +3628,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
; GFX1064GISEL-NEXT: s_mov_b32 s7, 0x7ff80000
; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX1064GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s10, s[4:5]
; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s10
; GFX1064GISEL-NEXT: v_readlane_b32 s9, v3, s10
@@ -2630,7 +3637,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1064GISEL-NEXT: v_min_f64 v[4:5], s[8:9], s[6:7]
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v4
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064GISEL-NEXT: ; %bb.2:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s7
@@ -2643,7 +3650,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1032DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX1032DAGISEL-NEXT: s_mov_b32 s5, 0x7ff80000
; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6
; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1032DAGISEL-NEXT: v_readlane_b32 s9, v3, s7
@@ -2652,7 +3659,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1032DAGISEL-NEXT: v_min_f64 v[4:5], s[8:9], s[4:5]
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s4, v4
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v5
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032DAGISEL-NEXT: ; %bb.2:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2665,7 +3672,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1032GISEL-NEXT: s_mov_b32 s4, 0
; GFX1032GISEL-NEXT: s_mov_b32 s5, 0x7ff80000
; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6
; GFX1032GISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1032GISEL-NEXT: v_readlane_b32 s9, v3, s7
@@ -2674,7 +3681,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1032GISEL-NEXT: v_min_f64 v[4:5], s[8:9], s[4:5]
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s4, v4
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v5
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032GISEL-NEXT: ; %bb.2:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2687,7 +3694,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1164DAGISEL-NEXT: s_mov_b32 s2, 0
; GFX1164DAGISEL-NEXT: s_mov_b32 s3, 0x7ff80000
; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s6, s[0:1]
; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s6
@@ -2699,7 +3706,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v4
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s3, v5
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164DAGISEL-NEXT: ; %bb.2:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s3
@@ -2712,7 +3719,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1164GISEL-NEXT: s_mov_b32 s2, 0
; GFX1164GISEL-NEXT: s_mov_b32 s3, 0x7ff80000
; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s6, s[0:1]
; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s6
@@ -2724,7 +3731,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v4
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s3, v5
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164GISEL-NEXT: ; %bb.2:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s3
@@ -2737,7 +3744,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1132DAGISEL-NEXT: s_mov_b32 s0, 0
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0x7ff80000
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -2749,7 +3756,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s0, v4
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v5
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132DAGISEL-NEXT: ; %bb.2:
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -2761,7 +3768,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
; GFX1132GISEL-NEXT: s_mov_b32 s1, 0x7ff80000
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132GISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -2773,7 +3780,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s0, v4
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v5
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132GISEL-NEXT: ; %bb.2:
; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -2789,7 +3796,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX12DAGISEL-NEXT: s_mov_b32 s0, 0
; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0x7ff80000
; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX12DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -2802,7 +3809,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %in) #0 {
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s0, v4
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v5
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX12DAGISEL-NEXT: ; %bb.2:
; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
@@ -2823,12 +3830,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
; GFX8DAGISEL-NEXT: s_mov_b32 s8, 0
; GFX8DAGISEL-NEXT: s_mov_b32 s9, 0x7ff80000
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s8
; GFX8DAGISEL-NEXT: v_readlane_b32 s10, v2, s12
@@ -2839,20 +3846,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s8, v4
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s9, v5
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX8DAGISEL-NEXT: ; %bb.3:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, s8
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, s9
; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr4
; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr5
-; GFX8DAGISEL-NEXT: .LBB7_4: ; %Flow
+; GFX8DAGISEL-NEXT: .LBB9_4: ; %Flow
; GFX8DAGISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX8DAGISEL-NEXT: ; %bb.5: ; %if
; GFX8DAGISEL-NEXT: s_mov_b32 s8, 0
; GFX8DAGISEL-NEXT: s_mov_b32 s9, 0x7ff80000
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s8
; GFX8DAGISEL-NEXT: v_readlane_b32 s10, v4, s12
@@ -2863,11 +3870,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s8, v2
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s9, v3
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX8DAGISEL-NEXT: ; %bb.7:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v6, s8
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v7, s9
-; GFX8DAGISEL-NEXT: .LBB7_8: ; %endif
+; GFX8DAGISEL-NEXT: .LBB9_8: ; %endif
; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
@@ -2881,12 +3888,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX8GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX8GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX8GISEL-NEXT: ; %bb.1: ; %else
; GFX8GISEL-NEXT: s_mov_b32 s8, 0
; GFX8GISEL-NEXT: s_mov_b32 s9, 0x7ff80000
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s8
; GFX8GISEL-NEXT: v_readlane_b32 s10, v2, s12
@@ -2897,20 +3904,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX8GISEL-NEXT: v_readfirstlane_b32 s8, v4
; GFX8GISEL-NEXT: v_readfirstlane_b32 s9, v5
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX8GISEL-NEXT: ; %bb.3:
; GFX8GISEL-NEXT: v_mov_b32_e32 v6, s8
; GFX8GISEL-NEXT: v_mov_b32_e32 v7, s9
; GFX8GISEL-NEXT: ; implicit-def: $vgpr4
; GFX8GISEL-NEXT: ; implicit-def: $vgpr5
-; GFX8GISEL-NEXT: .LBB7_4: ; %Flow
+; GFX8GISEL-NEXT: .LBB9_4: ; %Flow
; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX8GISEL-NEXT: ; %bb.5: ; %if
; GFX8GISEL-NEXT: s_mov_b32 s8, 0
; GFX8GISEL-NEXT: s_mov_b32 s9, 0x7ff80000
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s8
; GFX8GISEL-NEXT: v_readlane_b32 s10, v4, s12
@@ -2921,11 +3928,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX8GISEL-NEXT: v_readfirstlane_b32 s8, v2
; GFX8GISEL-NEXT: v_readfirstlane_b32 s9, v3
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX8GISEL-NEXT: ; %bb.7:
; GFX8GISEL-NEXT: v_mov_b32_e32 v6, s8
; GFX8GISEL-NEXT: v_mov_b32_e32 v7, s9
-; GFX8GISEL-NEXT: .LBB7_8: ; %endif
+; GFX8GISEL-NEXT: .LBB9_8: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8GISEL-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
; GFX8GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -2939,12 +3946,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
; GFX9DAGISEL-NEXT: s_mov_b32 s8, 0
; GFX9DAGISEL-NEXT: s_mov_b32 s9, 0x7ff80000
; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s8
; GFX9DAGISEL-NEXT: v_readlane_b32 s10, v2, s12
@@ -2955,20 +3962,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s8, v4
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s9, v5
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX9DAGISEL-NEXT: ; %bb.3:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, s8
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, s9
; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr4
; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr5
-; GFX9DAGISEL-NEXT: .LBB7_4: ; %Flow
+; GFX9DAGISEL-NEXT: .LBB9_4: ; %Flow
; GFX9DAGISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX9DAGISEL-NEXT: ; %bb.5: ; %if
; GFX9DAGISEL-NEXT: s_mov_b32 s8, 0
; GFX9DAGISEL-NEXT: s_mov_b32 s9, 0x7ff80000
; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s8
; GFX9DAGISEL-NEXT: v_readlane_b32 s10, v4, s12
@@ -2979,11 +3986,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s8, v2
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s9, v3
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX9DAGISEL-NEXT: ; %bb.7:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v6, s8
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v7, s9
-; GFX9DAGISEL-NEXT: .LBB7_8: ; %endif
+; GFX9DAGISEL-NEXT: .LBB9_8: ; %endif
; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off
; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0)
@@ -2997,12 +4004,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX9GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX9GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX9GISEL-NEXT: ; %bb.1: ; %else
; GFX9GISEL-NEXT: s_mov_b32 s8, 0
; GFX9GISEL-NEXT: s_mov_b32 s9, 0x7ff80000
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s8
; GFX9GISEL-NEXT: v_readlane_b32 s10, v2, s12
@@ -3013,20 +4020,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9GISEL-NEXT: v_readfirstlane_b32 s8, v4
; GFX9GISEL-NEXT: v_readfirstlane_b32 s9, v5
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX9GISEL-NEXT: ; %bb.3:
; GFX9GISEL-NEXT: v_mov_b32_e32 v6, s8
; GFX9GISEL-NEXT: v_mov_b32_e32 v7, s9
; GFX9GISEL-NEXT: ; implicit-def: $vgpr4
; GFX9GISEL-NEXT: ; implicit-def: $vgpr5
-; GFX9GISEL-NEXT: .LBB7_4: ; %Flow
+; GFX9GISEL-NEXT: .LBB9_4: ; %Flow
; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX9GISEL-NEXT: ; %bb.5: ; %if
; GFX9GISEL-NEXT: s_mov_b32 s8, 0
; GFX9GISEL-NEXT: s_mov_b32 s9, 0x7ff80000
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s8
; GFX9GISEL-NEXT: v_readlane_b32 s10, v4, s12
@@ -3037,11 +4044,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0
; GFX9GISEL-NEXT: v_readfirstlane_b32 s8, v2
; GFX9GISEL-NEXT: v_readfirstlane_b32 s9, v3
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX9GISEL-NEXT: ; %bb.7:
; GFX9GISEL-NEXT: v_mov_b32_e32 v6, s8
; GFX9GISEL-NEXT: v_mov_b32_e32 v7, s9
-; GFX9GISEL-NEXT: .LBB7_8: ; %endif
+; GFX9GISEL-NEXT: .LBB9_8: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off
; GFX9GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -3055,12 +4062,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1064DAGISEL-NEXT: s_mov_b32 s8, 0
; GFX1064DAGISEL-NEXT: s_mov_b32 s9, 0x7ff80000
; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s10, v2, s12
; GFX1064DAGISEL-NEXT: v_readlane_b32 s11, v3, s12
@@ -3069,20 +4076,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1064DAGISEL-NEXT: v_min_f64 v[4:5], s[10:11], s[8:9]
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s8, v4
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s9, v5
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX1064DAGISEL-NEXT: ; %bb.3:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, s8
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, s9
; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr4
; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr5
-; GFX1064DAGISEL-NEXT: .LBB7_4: ; %Flow
+; GFX1064DAGISEL-NEXT: .LBB9_4: ; %Flow
; GFX1064DAGISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX1064DAGISEL-NEXT: ; %bb.5: ; %if
; GFX1064DAGISEL-NEXT: s_mov_b32 s8, 0
; GFX1064DAGISEL-NEXT: s_mov_b32 s9, 0x7ff80000
; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s10, v4, s12
; GFX1064DAGISEL-NEXT: v_readlane_b32 s11, v5, s12
@@ -3091,11 +4098,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1064DAGISEL-NEXT: v_min_f64 v[2:3], s[10:11], s[8:9]
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s8, v2
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s9, v3
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX1064DAGISEL-NEXT: ; %bb.7:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v6, s8
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v7, s9
-; GFX1064DAGISEL-NEXT: .LBB7_8: ; %endif
+; GFX1064DAGISEL-NEXT: .LBB9_8: ; %endif
; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off
; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3108,12 +4115,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1064GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1064GISEL-NEXT: ; %bb.1: ; %else
; GFX1064GISEL-NEXT: s_mov_b32 s8, 0
; GFX1064GISEL-NEXT: s_mov_b32 s9, 0x7ff80000
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX1064GISEL-NEXT: v_readlane_b32 s10, v2, s12
; GFX1064GISEL-NEXT: v_readlane_b32 s11, v3, s12
@@ -3122,20 +4129,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1064GISEL-NEXT: v_min_f64 v[4:5], s[10:11], s[8:9]
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s8, v4
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s9, v5
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX1064GISEL-NEXT: ; %bb.3:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, s8
; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, s9
; GFX1064GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1064GISEL-NEXT: ; implicit-def: $vgpr5
-; GFX1064GISEL-NEXT: .LBB7_4: ; %Flow
+; GFX1064GISEL-NEXT: .LBB9_4: ; %Flow
; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX1064GISEL-NEXT: ; %bb.5: ; %if
; GFX1064GISEL-NEXT: s_mov_b32 s8, 0
; GFX1064GISEL-NEXT: s_mov_b32 s9, 0x7ff80000
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7]
; GFX1064GISEL-NEXT: v_readlane_b32 s10, v4, s12
; GFX1064GISEL-NEXT: v_readlane_b32 s11, v5, s12
@@ -3144,11 +4151,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1064GISEL-NEXT: v_min_f64 v[2:3], s[10:11], s[8:9]
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s8, v2
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s9, v3
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX1064GISEL-NEXT: ; %bb.7:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v6, s8
; GFX1064GISEL-NEXT: v_mov_b32_e32 v7, s9
-; GFX1064GISEL-NEXT: .LBB7_8: ; %endif
+; GFX1064GISEL-NEXT: .LBB9_8: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off
; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3161,12 +4168,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032DAGISEL-NEXT: s_xor_b32 s6, exec_lo, s4
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1032DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX1032DAGISEL-NEXT: s_mov_b32 s5, 0x7ff80000
; GFX1032DAGISEL-NEXT: s_mov_b32 s7, exec_lo
-; GFX1032DAGISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s10, s7
; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v2, s10
; GFX1032DAGISEL-NEXT: v_readlane_b32 s9, v3, s10
@@ -3175,20 +4182,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1032DAGISEL-NEXT: v_min_f64 v[4:5], s[8:9], s[4:5]
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s4, v4
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v5
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX1032DAGISEL-NEXT: ; %bb.3:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, s5
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, s4
; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr4
; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr5
-; GFX1032DAGISEL-NEXT: .LBB7_4: ; %Flow
+; GFX1032DAGISEL-NEXT: .LBB9_4: ; %Flow
; GFX1032DAGISEL-NEXT: s_andn2_saveexec_b32 s6, s6
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX1032DAGISEL-NEXT: ; %bb.5: ; %if
; GFX1032DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX1032DAGISEL-NEXT: s_mov_b32 s5, 0x7ff80000
; GFX1032DAGISEL-NEXT: s_mov_b32 s7, exec_lo
-; GFX1032DAGISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s10, s7
; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v4, s10
; GFX1032DAGISEL-NEXT: v_readlane_b32 s9, v5, s10
@@ -3197,11 +4204,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1032DAGISEL-NEXT: v_min_f64 v[2:3], s[8:9], s[4:5]
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s4, v2
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX1032DAGISEL-NEXT: ; %bb.7:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v7, s5
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v6, s4
-; GFX1032DAGISEL-NEXT: .LBB7_8: ; %endif
+; GFX1032DAGISEL-NEXT: .LBB9_8: ; %endif
; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s6
; GFX1032DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off
; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3214,12 +4221,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1032GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1032GISEL-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032GISEL-NEXT: s_xor_b32 s6, exec_lo, s4
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1032GISEL-NEXT: ; %bb.1: ; %else
; GFX1032GISEL-NEXT: s_mov_b32 s4, 0
; GFX1032GISEL-NEXT: s_mov_b32 s5, 0x7ff80000
; GFX1032GISEL-NEXT: s_mov_b32 s7, exec_lo
-; GFX1032GISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s10, s7
; GFX1032GISEL-NEXT: v_readlane_b32 s8, v2, s10
; GFX1032GISEL-NEXT: v_readlane_b32 s9, v3, s10
@@ -3228,20 +4235,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1032GISEL-NEXT: v_min_f64 v[4:5], s[8:9], s[4:5]
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s4, v4
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v5
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX1032GISEL-NEXT: ; %bb.3:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, s5
; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, s4
; GFX1032GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1032GISEL-NEXT: ; implicit-def: $vgpr5
-; GFX1032GISEL-NEXT: .LBB7_4: ; %Flow
+; GFX1032GISEL-NEXT: .LBB9_4: ; %Flow
; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s6, s6
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX1032GISEL-NEXT: ; %bb.5: ; %if
; GFX1032GISEL-NEXT: s_mov_b32 s4, 0
; GFX1032GISEL-NEXT: s_mov_b32 s5, 0x7ff80000
; GFX1032GISEL-NEXT: s_mov_b32 s7, exec_lo
-; GFX1032GISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s10, s7
; GFX1032GISEL-NEXT: v_readlane_b32 s8, v4, s10
; GFX1032GISEL-NEXT: v_readlane_b32 s9, v5, s10
@@ -3250,11 +4257,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1032GISEL-NEXT: v_min_f64 v[2:3], s[8:9], s[4:5]
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s4, v2
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX1032GISEL-NEXT: ; %bb.7:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v7, s5
; GFX1032GISEL-NEXT: v_mov_b32_e32 v6, s4
-; GFX1032GISEL-NEXT: .LBB7_8: ; %endif
+; GFX1032GISEL-NEXT: .LBB9_8: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s6
; GFX1032GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off
; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3268,12 +4275,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1164DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX1164DAGISEL-NEXT: s_mov_b32 s5, 0x7ff80000
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v2, s8
@@ -3285,20 +4292,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s4, v4
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s5, v5
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX1164DAGISEL-NEXT: ; %bb.3:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, s5
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, s4
; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr4
; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr5
-; GFX1164DAGISEL-NEXT: .LBB7_4: ; %Flow
+; GFX1164DAGISEL-NEXT: .LBB9_4: ; %Flow
; GFX1164DAGISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX1164DAGISEL-NEXT: ; %bb.5: ; %if
; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX1164DAGISEL-NEXT: s_mov_b32 s5, 0x7ff80000
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v4, s8
@@ -3310,11 +4317,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s4, v2
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s5, v3
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX1164DAGISEL-NEXT: ; %bb.7:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, s5
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, s4
-; GFX1164DAGISEL-NEXT: .LBB7_8: ; %endif
+; GFX1164DAGISEL-NEXT: .LBB9_8: ; %endif
; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164DAGISEL-NEXT: global_store_b64 v[0:1], v[6:7], off
; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3328,12 +4335,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1164GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1164GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1164GISEL-NEXT: ; %bb.1: ; %else
; GFX1164GISEL-NEXT: s_mov_b32 s4, 0
; GFX1164GISEL-NEXT: s_mov_b32 s5, 0x7ff80000
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v2, s8
@@ -3345,20 +4352,20 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s4, v4
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s5, v5
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX1164GISEL-NEXT: ; %bb.3:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, s5
; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, s4
; GFX1164GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1164GISEL-NEXT: ; implicit-def: $vgpr5
-; GFX1164GISEL-NEXT: .LBB7_4: ; %Flow
+; GFX1164GISEL-NEXT: .LBB9_4: ; %Flow
; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX1164GISEL-NEXT: ; %bb.5: ; %if
; GFX1164GISEL-NEXT: s_mov_b32 s4, 0
; GFX1164GISEL-NEXT: s_mov_b32 s5, 0x7ff80000
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v4, s8
@@ -3370,11 +4377,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s4, v2
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX1164GISEL-NEXT: ; %bb.7:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v7, s5
; GFX1164GISEL-NEXT: v_mov_b32_e32 v6, s4
-; GFX1164GISEL-NEXT: .LBB7_8: ; %endif
+; GFX1164GISEL-NEXT: .LBB9_8: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off
; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3388,12 +4395,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1132DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1132DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1132DAGISEL-NEXT: s_mov_b32 s0, 0
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0x7ff80000
; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132DAGISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s6, s3
; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v2, s6
@@ -3405,19 +4412,19 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s0, v4
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v5
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX1132DAGISEL-NEXT: ; %bb.3:
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr4
; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr5
-; GFX1132DAGISEL-NEXT: .LBB7_4: ; %Flow
+; GFX1132DAGISEL-NEXT: .LBB9_4: ; %Flow
; GFX1132DAGISEL-NEXT: s_and_not1_saveexec_b32 s2, s2
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX1132DAGISEL-NEXT: ; %bb.5: ; %if
; GFX1132DAGISEL-NEXT: s_mov_b32 s0, 0
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0x7ff80000
; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132DAGISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s6, s3
; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v4, s6
@@ -3429,10 +4436,10 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s0, v2
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v3
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX1132DAGISEL-NEXT: ; %bb.7:
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
-; GFX1132DAGISEL-NEXT: .LBB7_8: ; %endif
+; GFX1132DAGISEL-NEXT: .LBB9_8: ; %endif
; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[6:7], off
; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3446,12 +4453,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1132GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GFX1132GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1132GISEL-NEXT: s_xor_b32 s2, exec_lo, s0
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1132GISEL-NEXT: ; %bb.1: ; %else
; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
; GFX1132GISEL-NEXT: s_mov_b32 s1, 0x7ff80000
; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132GISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s6, s3
; GFX1132GISEL-NEXT: v_readlane_b32 s4, v2, s6
@@ -3463,19 +4470,19 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s0, v4
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v5
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX1132GISEL-NEXT: ; %bb.3:
; GFX1132GISEL-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX1132GISEL-NEXT: ; implicit-def: $vgpr4
; GFX1132GISEL-NEXT: ; implicit-def: $vgpr5
-; GFX1132GISEL-NEXT: .LBB7_4: ; %Flow
+; GFX1132GISEL-NEXT: .LBB9_4: ; %Flow
; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s2, s2
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX1132GISEL-NEXT: ; %bb.5: ; %if
; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
; GFX1132GISEL-NEXT: s_mov_b32 s1, 0x7ff80000
; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132GISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s6, s3
; GFX1132GISEL-NEXT: v_readlane_b32 s4, v4, s6
@@ -3487,10 +4494,10 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s0, v2
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v3
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX1132GISEL-NEXT: ; %bb.7:
; GFX1132GISEL-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
-; GFX1132GISEL-NEXT: .LBB7_8: ; %endif
+; GFX1132GISEL-NEXT: .LBB9_8: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off
; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3509,12 +4516,12 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX12DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s0
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX12DAGISEL-NEXT: ; %bb.1: ; %else
; GFX12DAGISEL-NEXT: s_mov_b32 s0, 0
; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0x7ff80000
; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX12DAGISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB9_2: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s6, s3
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -3527,21 +4534,21 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s0, v4
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v5
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB7_2
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB9_2
; GFX12DAGISEL-NEXT: ; %bb.3:
; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
; GFX12DAGISEL-NEXT: ; implicit-def: $vgpr4
; GFX12DAGISEL-NEXT: ; implicit-def: $vgpr5
-; GFX12DAGISEL-NEXT: .LBB7_4: ; %Flow
+; GFX12DAGISEL-NEXT: .LBB9_4: ; %Flow
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_and_not1_saveexec_b32 s2, s2
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB7_8
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB9_8
; GFX12DAGISEL-NEXT: ; %bb.5: ; %if
; GFX12DAGISEL-NEXT: s_mov_b32 s0, 0
; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0x7ff80000
; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX12DAGISEL-NEXT: .LBB7_6: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB9_6: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s6, s3
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -3554,11 +4561,11 @@ define void @divergent_cfg_double(ptr addrspace(1) %out, double %in, double %in2
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s0, v2
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v3
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB7_6
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB9_6
; GFX12DAGISEL-NEXT: ; %bb.7:
; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
-; GFX12DAGISEL-NEXT: .LBB7_8: ; %endif
+; GFX12DAGISEL-NEXT: .LBB9_8: ; %endif
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX12DAGISEL-NEXT: global_store_b64 v[0:1], v[6:7], off
@@ -3583,3 +4590,5 @@ endif:
}
attributes #0 = { nounwind }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10DAGISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll
index 22ff4f6063177..1423ae8f017eb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.fsub.ll
@@ -7,16 +7,762 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1064GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1032DAGISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1032GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL,GFX1164DAGISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL,GFX1164GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL,GFX1132DAGISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL,GFX1132GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL,GFX1164DAGISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL,GFX1164GISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL,GFX1132DAGISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL,GFX1132GISEL-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GFX12DAGISEL %s
+define amdgpu_kernel void @uniform_value_half(ptr addrspace(1) %out, half %in) {
+; GFX8DAGISEL-LABEL: uniform_value_half:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: v_sub_f32_e32 v1, s4, v1
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: v_cvt_f16_f32_e32 v2, s4
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: flat_store_short v[0:1], v2
+; GFX8DAGISEL-NEXT: s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_half:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v1, s0
+; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX8GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8GISEL-NEXT: v_mul_f32_e64 v0, -s2, v1
+; GFX8GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8GISEL-NEXT: v_cvt_f16_f32_e32 v2, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: flat_store_short v[0:1], v2
+; GFX8GISEL-NEXT: s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_half:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v1, s5
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: v_sub_f32_e32 v2, s4, v2
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_cvt_f16_f32_e32 v1, s4
+; GFX9DAGISEL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_half:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v1, s0
+; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX9GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9GISEL-NEXT: v_mul_f32_e64 v0, -s2, v1
+; GFX9GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT: global_store_short v1, v0, s[0:1]
+; GFX9GISEL-NEXT: s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value_half:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_clause 0x1
+; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0
+; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v1, s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: v_sub_f32_e64 v2, s4, s6
+; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_cvt_f16_f32_e32 v1, s4
+; GFX1064DAGISEL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value_half:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1064GISEL-NEXT: s_mov_b64 s[0:1], exec
+; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v1, s0
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX1064GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064GISEL-NEXT: v_mul_f32_e64 v0, -s2, v1
+; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GFX1064GISEL-NEXT: global_store_short v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value_half:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_clause 0x1
+; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s3, 0
+; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032DAGISEL-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s2
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v1, s4
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s4
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032DAGISEL-NEXT: v_sub_f32_e64 v2, s3, s5
+; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s3, v2
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_cvt_f16_f32_e32 v1, s3
+; GFX1032DAGISEL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value_half:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX1032GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s0, s1
+; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v1, s0
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032GISEL-NEXT: v_mul_f32_e64 v0, -s2, v1
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT: global_store_short v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-FAKE16-LABEL: uniform_value_half:
+; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-FAKE16-NEXT: s_clause 0x1
+; GFX1164DAGISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b32 s4, 0
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-FAKE16-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-FAKE16-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s6, v1, s5
+; GFX1164DAGISEL-FAKE16-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-FAKE16-NEXT: v_sub_f32_e64 v2, s4, s6
+; GFX1164DAGISEL-FAKE16-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1164DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX1164DAGISEL-FAKE16-NEXT: ; %bb.2:
+; GFX1164DAGISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, s4
+; GFX1164DAGISEL-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_endpgm
+;
+; GFX1164GISEL-FAKE16-LABEL: uniform_value_half:
+; GFX1164GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164GISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-FAKE16-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-FAKE16-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX1164GISEL-FAKE16-NEXT: v_cvt_f32_i32_e32 v1, s0
+; GFX1164GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX1164GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164GISEL-FAKE16-NEXT: v_mul_f32_e64 v0, -s2, v1
+; GFX1164GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GFX1164GISEL-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX1164GISEL-FAKE16-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-FAKE16-LABEL: uniform_value_half:
+; GFX1132DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-FAKE16-NEXT: s_clause 0x1
+; GFX1132DAGISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 s3, 0
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-FAKE16-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-FAKE16-NEXT: s_ctz_i32_b32 s4, s2
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_readlane_b32 s5, v1, s4
+; GFX1132DAGISEL-FAKE16-NEXT: s_bitset0_b32 s2, s4
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-FAKE16-NEXT: v_sub_f32_e64 v2, s3, s5
+; GFX1132DAGISEL-FAKE16-NEXT: v_readfirstlane_b32 s3, v2
+; GFX1132DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX1132DAGISEL-FAKE16-NEXT: ; %bb.2:
+; GFX1132DAGISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, s3
+; GFX1132DAGISEL-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1132DAGISEL-FAKE16-NEXT: s_endpgm
+;
+; GFX1132GISEL-FAKE16-LABEL: uniform_value_half:
+; GFX1132GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1132GISEL-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x2c
+; GFX1132GISEL-FAKE16-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX1132GISEL-FAKE16-NEXT: s_bcnt1_i32_b32 s0, s1
+; GFX1132GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132GISEL-FAKE16-NEXT: v_cvt_f32_i32_e32 v1, s0
+; GFX1132GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132GISEL-FAKE16-NEXT: v_mul_f32_e64 v0, -s2, v1
+; GFX1132GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s2
+; GFX1132GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX1132GISEL-FAKE16-NEXT: s_endpgm
+;
+; GFX1164DAGISEL-TRUE16-LABEL: uniform_value_half:
+; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-TRUE16-NEXT: s_clause 0x1
+; GFX1164DAGISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-TRUE16-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-TRUE16-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s6, v1, s5
+; GFX1164DAGISEL-TRUE16-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-TRUE16-NEXT: v_sub_f32_e64 v2, s4, s6
+; GFX1164DAGISEL-TRUE16-NEXT: v_readfirstlane_b32 s4, v2
+; GFX1164DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX1164DAGISEL-TRUE16-NEXT: ; %bb.2:
+; GFX1164DAGISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, s4
+; GFX1164DAGISEL-TRUE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX1164GISEL-TRUE16-LABEL: uniform_value_half:
+; GFX1164GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164GISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-TRUE16-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-TRUE16-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX1164GISEL-TRUE16-NEXT: v_cvt_f32_i32_e32 v1, s0
+; GFX1164GISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX1164GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164GISEL-TRUE16-NEXT: v_mul_f32_e64 v0, -s2, v1
+; GFX1164GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164GISEL-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s2
+; GFX1164GISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX1164GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX1132DAGISEL-TRUE16-LABEL: uniform_value_half:
+; GFX1132DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-TRUE16-NEXT: s_clause 0x1
+; GFX1132DAGISEL-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 s3, 0
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132DAGISEL-TRUE16-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-TRUE16-NEXT: s_ctz_i32_b32 s4, s2
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_readlane_b32 s5, v1, s4
+; GFX1132DAGISEL-TRUE16-NEXT: s_bitset0_b32 s2, s4
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132DAGISEL-TRUE16-NEXT: v_sub_f32_e64 v2, s3, s5
+; GFX1132DAGISEL-TRUE16-NEXT: v_readfirstlane_b32 s3, v2
+; GFX1132DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX1132DAGISEL-TRUE16-NEXT: ; %bb.2:
+; GFX1132DAGISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, s3
+; GFX1132DAGISEL-TRUE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1132DAGISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX1132GISEL-TRUE16-LABEL: uniform_value_half:
+; GFX1132GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1132GISEL-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x2c
+; GFX1132GISEL-TRUE16-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX1132GISEL-TRUE16-NEXT: s_bcnt1_i32_b32 s0, s1
+; GFX1132GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132GISEL-TRUE16-NEXT: v_cvt_f32_i32_e32 v1, s0
+; GFX1132GISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132GISEL-TRUE16-NEXT: v_mul_f32_e64 v0, -s2, v1
+; GFX1132GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX1132GISEL-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s2
+; GFX1132GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX1132GISEL-TRUE16-NEXT: s_endpgm
+;
+; GFX12DAGISEL-LABEL: uniform_value_half:
+; GFX12DAGISEL: ; %bb.0: ; %entry
+; GFX12DAGISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo
+; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3
+; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12DAGISEL-NEXT: s_cvt_f32_f16 s2, s2
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_2)
+; GFX12DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_2)
+; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12DAGISEL-NEXT: s_cvt_f16_f32 s2, s2
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX12DAGISEL-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX12DAGISEL-NEXT: s_endpgm
+ entry:
+ %result = call half @llvm.amdgcn.wave.reduce.fsub(half %in, i32 1)
+ store half %result, ptr addrspace(1) %out
+ ret void
+}
+
+define void @divergent_value_half(ptr addrspace(1) %out, half %in) {
+; GFX8DAGISEL-LABEL: divergent_value_half:
+; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8DAGISEL-NEXT: v_sub_f32_e32 v3, s6, v3
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX8DAGISEL-NEXT: ; %bb.2:
+; GFX8DAGISEL-NEXT: v_cvt_f16_f32_e32 v2, s6
+; GFX8DAGISEL-NEXT: flat_store_short v[0:1], v2
+; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_half:
+; GFX8GISEL: ; %bb.0: ; %entry
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT: s_mov_b32 s6, 0
+; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8
+; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX8GISEL-NEXT: v_sub_f32_e32 v3, s6, v3
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX8GISEL-NEXT: ; %bb.2:
+; GFX8GISEL-NEXT: v_cvt_f16_f32_e32 v2, s6
+; GFX8GISEL-NEXT: flat_store_short v[0:1], v2
+; GFX8GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_half:
+; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9DAGISEL-NEXT: v_sub_f32_e32 v3, s6, v3
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9DAGISEL-NEXT: ; %bb.2:
+; GFX9DAGISEL-NEXT: v_cvt_f16_f32_e32 v2, s6
+; GFX9DAGISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_half:
+; GFX9GISEL: ; %bb.0: ; %entry
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT: s_mov_b32 s6, 0
+; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8
+; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX9GISEL-NEXT: v_sub_f32_e32 v3, s6, v3
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9GISEL-NEXT: ; %bb.2:
+; GFX9GISEL-NEXT: v_cvt_f16_f32_e32 v2, s6
+; GFX9GISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX9GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_half:
+; GFX1064DAGISEL: ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064DAGISEL-NEXT: v_sub_f32_e64 v3, s6, s8
+; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064DAGISEL-NEXT: ; %bb.2:
+; GFX1064DAGISEL-NEXT: v_cvt_f16_f32_e32 v2, s6
+; GFX1064DAGISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_half:
+; GFX1064GISEL: ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
+; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
+; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX1064GISEL-NEXT: v_sub_f32_e64 v3, s6, s8
+; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064GISEL-NEXT: ; %bb.2:
+; GFX1064GISEL-NEXT: v_cvt_f16_f32_e32 v2, s6
+; GFX1064GISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_half:
+; GFX1032DAGISEL: ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s5, 0
+; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4
+; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, v2, s6
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s6
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032DAGISEL-NEXT: v_sub_f32_e64 v3, s5, s7
+; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032DAGISEL-NEXT: ; %bb.2:
+; GFX1032DAGISEL-NEXT: v_cvt_f16_f32_e32 v2, s5
+; GFX1032DAGISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_half:
+; GFX1032GISEL: ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s5, 0
+; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4
+; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6
+; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1032GISEL-NEXT: v_sub_f32_e64 v3, s5, s7
+; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032GISEL-NEXT: ; %bb.2:
+; GFX1032GISEL-NEXT: v_cvt_f16_f32_e32 v2, s5
+; GFX1032GISEL-NEXT: global_store_short v[0:1], v2, off
+; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-FAKE16-LABEL: divergent_value_half:
+; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b32 s2, 0
+; GFX1164DAGISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-FAKE16-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s4, v2, s3
+; GFX1164DAGISEL-FAKE16-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164DAGISEL-FAKE16-NEXT: v_sub_f32_e64 v3, s2, s4
+; GFX1164DAGISEL-FAKE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164DAGISEL-FAKE16-NEXT: ; %bb.2:
+; GFX1164DAGISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, s2
+; GFX1164DAGISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1164DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-FAKE16-LABEL: divergent_value_half:
+; GFX1164GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1164GISEL-FAKE16-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-FAKE16-NEXT: s_mov_b32 s2, 0
+; GFX1164GISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-FAKE16-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-FAKE16-NEXT: v_readlane_b32 s4, v2, s3
+; GFX1164GISEL-FAKE16-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX1164GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-FAKE16-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164GISEL-FAKE16-NEXT: v_sub_f32_e64 v3, s2, s4
+; GFX1164GISEL-FAKE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164GISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164GISEL-FAKE16-NEXT: ; %bb.2:
+; GFX1164GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, s2
+; GFX1164GISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1164GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-FAKE16-LABEL: divergent_value_half:
+; GFX1132DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX1132DAGISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-FAKE16-NEXT: s_ctz_i32_b32 s2, s0
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_readlane_b32 s3, v2, s2
+; GFX1132DAGISEL-FAKE16-NEXT: s_bitset0_b32 s0, s2
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132DAGISEL-FAKE16-NEXT: v_sub_f32_e64 v3, s1, s3
+; GFX1132DAGISEL-FAKE16-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1132DAGISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132DAGISEL-FAKE16-NEXT: ; %bb.2:
+; GFX1132DAGISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, s1
+; GFX1132DAGISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1132DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-FAKE16-LABEL: divergent_value_half:
+; GFX1132GISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1132GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1132GISEL-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132GISEL-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX1132GISEL-FAKE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-FAKE16-NEXT: s_ctz_i32_b32 s2, s0
+; GFX1132GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-FAKE16-NEXT: v_readlane_b32 s3, v2, s2
+; GFX1132GISEL-FAKE16-NEXT: s_bitset0_b32 s0, s2
+; GFX1132GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-FAKE16-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132GISEL-FAKE16-NEXT: v_sub_f32_e64 v3, s1, s3
+; GFX1132GISEL-FAKE16-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1132GISEL-FAKE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132GISEL-FAKE16-NEXT: ; %bb.2:
+; GFX1132GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, s1
+; GFX1132GISEL-FAKE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1132GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-TRUE16-LABEL: divergent_value_half:
+; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b32 s2, 0
+; GFX1164DAGISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-TRUE16-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s4, v2, s3
+; GFX1164DAGISEL-TRUE16-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164DAGISEL-TRUE16-NEXT: v_sub_f32_e64 v3, s2, s4
+; GFX1164DAGISEL-TRUE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164DAGISEL-TRUE16-NEXT: ; %bb.2:
+; GFX1164DAGISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, s2
+; GFX1164DAGISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1164DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-TRUE16-LABEL: divergent_value_half:
+; GFX1164GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX1164GISEL-TRUE16-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164GISEL-TRUE16-NEXT: s_mov_b32 s2, 0
+; GFX1164GISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-TRUE16-NEXT: s_ctz_i32_b64 s3, s[0:1]
+; GFX1164GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-TRUE16-NEXT: v_readlane_b32 s4, v2, s3
+; GFX1164GISEL-TRUE16-NEXT: s_bitset0_b64 s[0:1], s3
+; GFX1164GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1164GISEL-TRUE16-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164GISEL-TRUE16-NEXT: v_sub_f32_e64 v3, s2, s4
+; GFX1164GISEL-TRUE16-NEXT: v_readfirstlane_b32 s2, v3
+; GFX1164GISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164GISEL-TRUE16-NEXT: ; %bb.2:
+; GFX1164GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, s2
+; GFX1164GISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1164GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-TRUE16-LABEL: divergent_value_half:
+; GFX1132DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX1132DAGISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-TRUE16-NEXT: s_ctz_i32_b32 s2, s0
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_readlane_b32 s3, v2, s2
+; GFX1132DAGISEL-TRUE16-NEXT: s_bitset0_b32 s0, s2
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132DAGISEL-TRUE16-NEXT: v_sub_f32_e64 v3, s1, s3
+; GFX1132DAGISEL-TRUE16-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1132DAGISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132DAGISEL-TRUE16-NEXT: ; %bb.2:
+; GFX1132DAGISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, s1
+; GFX1132DAGISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1132DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-TRUE16-LABEL: divergent_value_half:
+; GFX1132GISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1132GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX1132GISEL-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132GISEL-TRUE16-NEXT: s_mov_b32 s1, 0
+; GFX1132GISEL-TRUE16-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-TRUE16-NEXT: s_ctz_i32_b32 s2, s0
+; GFX1132GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-TRUE16-NEXT: v_readlane_b32 s3, v2, s2
+; GFX1132GISEL-TRUE16-NEXT: s_bitset0_b32 s0, s2
+; GFX1132GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-TRUE16-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132GISEL-TRUE16-NEXT: v_sub_f32_e64 v3, s1, s3
+; GFX1132GISEL-TRUE16-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1132GISEL-TRUE16-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132GISEL-TRUE16-NEXT: ; %bb.2:
+; GFX1132GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, s1
+; GFX1132GISEL-TRUE16-NEXT: global_store_b16 v[0:1], v2, off
+; GFX1132GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12DAGISEL-LABEL: divergent_value_half:
+; GFX12DAGISEL: ; %bb.0: ; %entry
+; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_expcnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12DAGISEL-NEXT: v_cvt_f32_f16_e32 v2, v2.l
+; GFX12DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0
+; GFX12DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s2, s0
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: v_readlane_b32 s3, v2, s2
+; GFX12DAGISEL-NEXT: s_bitset0_b32 s0, s2
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GFX12DAGISEL-NEXT: v_sub_f32_e64 v3, s1, s3
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v3
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX12DAGISEL-NEXT: ; %bb.2:
+; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12DAGISEL-NEXT: v_cvt_f16_f32_e32 v2.l, s1
+; GFX12DAGISEL-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12DAGISEL-NEXT: s_setpc_b64 s[30:31]
+ entry:
+ %result = call half @llvm.amdgcn.wave.reduce.fsub(half %in, i32 1)
+ store half %result, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in) #0 {
; GFX8DAGISEL-LABEL: uniform_value_float:
-; GFX8DAGISEL: ; %bb.0: ; %entry
+; GFX8DAGISEL: ; %bb.0: ; %entry
; GFX8DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
@@ -48,7 +794,11 @@ define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in)
; GFX8GISEL-NEXT: s_endpgm
;
; GFX9DAGISEL-LABEL: uniform_value_float:
-; GFX9DAGISEL: ; %bb.0: ; %entry
+; GFX9DAGISEL: ; %bb.0: ; %entry
; GFX9DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
@@ -79,7 +829,11 @@ define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in)
;
; GFX1064DAGISEL-LABEL: uniform_value_float:
; GFX1064DAGISEL: ; %bb.0: ; %entry
-; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX1064DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[0:1]
; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -110,7 +864,11 @@ define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in)
; GFX1032DAGISEL-LABEL: uniform_value_float:
; GFX1032DAGISEL: ; %bb.0: ; %entry
; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
-; GFX1032DAGISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s0
; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3
@@ -232,7 +990,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX8DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8
@@ -240,7 +998,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX8DAGISEL-NEXT: v_sub_f32_e32 v3, s6, v3
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8DAGISEL-NEXT: ; %bb.2:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
@@ -252,7 +1010,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX8GISEL-NEXT: s_mov_b32 s6, 0
-; GFX8GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8
@@ -260,7 +1018,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX8GISEL-NEXT: v_sub_f32_e32 v3, s6, v3
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8GISEL-NEXT: ; %bb.2:
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
@@ -272,7 +1030,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX9DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8
@@ -280,7 +1038,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX9DAGISEL-NEXT: v_sub_f32_e32 v3, s6, v3
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9DAGISEL-NEXT: ; %bb.2:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX9DAGISEL-NEXT: global_store_dword v[0:1], v2, off
@@ -292,7 +1050,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX9GISEL-NEXT: s_mov_b32 s6, 0
-; GFX9GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8
@@ -300,7 +1058,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX9GISEL-NEXT: v_sub_f32_e32 v3, s6, v3
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9GISEL-NEXT: ; %bb.2:
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX9GISEL-NEXT: global_store_dword v[0:1], v2, off
@@ -312,14 +1070,14 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0
-; GFX1064DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7
; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX1064DAGISEL-NEXT: v_sub_f32_e64 v3, s6, s8
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064DAGISEL-NEXT: ; %bb.2:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX1064DAGISEL-NEXT: global_store_dword v[0:1], v2, off
@@ -330,14 +1088,14 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX1064GISEL-NEXT: s_mov_b32 s6, 0
-; GFX1064GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7
; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX1064GISEL-NEXT: v_sub_f32_e64 v3, s6, s8
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064GISEL-NEXT: ; %bb.2:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX1064GISEL-NEXT: global_store_dword v[0:1], v2, off
@@ -348,14 +1106,14 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
; GFX1032DAGISEL-NEXT: s_mov_b32 s5, 0
-; GFX1032DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4
; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, v2, s6
; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s6
; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX1032DAGISEL-NEXT: v_sub_f32_e64 v3, s5, s7
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032DAGISEL-NEXT: ; %bb.2:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5
; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v2, off
@@ -366,14 +1124,14 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo
; GFX1032GISEL-NEXT: s_mov_b32 s5, 0
-; GFX1032GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4
; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6
; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6
; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX1032GISEL-NEXT: v_sub_f32_e64 v3, s5, s7
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032GISEL-NEXT: ; %bb.2:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s5
; GFX1032GISEL-NEXT: global_store_dword v[0:1], v2, off
@@ -384,7 +1142,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1164DAGISEL-NEXT: s_mov_b32 s2, 0
-; GFX1164DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -393,7 +1151,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1164DAGISEL-NEXT: v_sub_f32_e64 v3, s2, s4
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164DAGISEL-NEXT: ; %bb.2:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX1164DAGISEL-NEXT: global_store_b32 v[0:1], v2, off
@@ -404,7 +1162,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1164GISEL-NEXT: s_mov_b32 s2, 0
-; GFX1164GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -413,7 +1171,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1164GISEL-NEXT: v_sub_f32_e64 v3, s2, s4
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164GISEL-NEXT: ; %bb.2:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX1164GISEL-NEXT: global_store_b32 v[0:1], v2, off
@@ -424,7 +1182,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0
-; GFX1132DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s2, s0
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2
@@ -433,7 +1191,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1132DAGISEL-NEXT: v_sub_f32_e64 v3, s1, s3
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v3
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132DAGISEL-NEXT: ; %bb.2:
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1
; GFX1132DAGISEL-NEXT: global_store_b32 v[0:1], v2, off
@@ -444,7 +1202,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1132GISEL-NEXT: s_mov_b32 s1, 0
-; GFX1132GISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s0
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2
@@ -453,7 +1211,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1132GISEL-NEXT: v_sub_f32_e64 v3, s1, s3
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v3
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132GISEL-NEXT: ; %bb.2:
; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s1
; GFX1132GISEL-NEXT: global_store_b32 v[0:1], v2, off
@@ -468,7 +1226,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
; GFX12DAGISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0
-; GFX12DAGISEL-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s2, s0
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -479,7 +1237,7 @@ define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) #0 {
; GFX12DAGISEL-NEXT: v_sub_f32_e64 v3, s1, s3
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v3
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX12DAGISEL-NEXT: ; %bb.2:
; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v2, s1
@@ -509,7 +1267,9 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %id.x) #0 {
; GFX8DAGISEL-NEXT: s_nop 1
; GFX8DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX8DAGISEL-NEXT: s_nop 1
-; GFX8DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:15 row_mask:0xf bank_mask:0xf
; GFX8DAGISEL-NEXT: s_nop 1
; GFX8DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:31 row_mask:0xf bank_mask:0xf
; GFX8DAGISEL-NEXT: v_sub_f32_e32 v3, 0, v3
@@ -571,7 +1331,9 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %id.x) #0 {
; GFX9DAGISEL-NEXT: s_nop 1
; GFX9DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9DAGISEL-NEXT: s_nop 1
-; GFX9DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:15 row_mask:0xf bank_mask:0xf
; GFX9DAGISEL-NEXT: s_nop 1
; GFX9DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_bcast:31 row_mask:0xf bank_mask:0xf
; GFX9DAGISEL-NEXT: v_sub_f32_e32 v3, 0, v3
@@ -632,7 +1394,9 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v5, -1, v5
; GFX1064DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v5, 32, v5
-; GFX1064DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v5, 4, v5
; GFX1064DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
@@ -713,7 +1477,9 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032DAGISEL-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX1032DAGISEL-NEXT: v_sub_f32_e32 v3, 0, v3
+; GFX1032DAGISEL-NEXT: v_sub_f32_e32 v3, 0, v3
; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v3, 31
; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s4
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5
@@ -762,7 +1528,9 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1164DAGISEL: ; %bb.0: ; %entry
; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX1164DAGISEL-NEXT: s_clause 0x2 ; 12-byte Folded Spill
+; GFX1164DAGISEL-NEXT: s_clause 0x2 ; 12-byte Folded Spill
; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v3, s32
; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:4
; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:8
@@ -848,39 +1616,41 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1132DAGISEL-LABEL: divergent_value_float_dpp:
-; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX1132DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill
-; GFX1132DAGISEL-NEXT: scratch_store_b32 off, v3, s32
-; GFX1132DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:4
-; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v3, 0, v2, s0
-; GFX1132DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
-; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: v_sub_f32_e32 v3, v3, v4
-; GFX1132DAGISEL-NEXT: v_sub_f32_e32 v3, 0, v3
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_readlane_b32 s1, v3, 31
-; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1
-; GFX1132DAGISEL-NEXT: global_store_b32 v[0:1], v2, off
-; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX1132DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload
-; GFX1132DAGISEL-NEXT: scratch_load_b32 v3, off, s32
-; GFX1132DAGISEL-NEXT: scratch_load_b32 v4, off, s32 offset:4
-; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1132DAGISEL-FAKE16-LABEL: divergent_value_float_dpp:
+; GFX1132DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-FAKE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_store_b32 off, v3, s32
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_store_b32 off, v4, s32 offset:4
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-FAKE16-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, v2, s0
+; GFX1132DAGISEL-FAKE16-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX1132DAGISEL-FAKE16-NEXT: v_sub_f32_e32 v3, 0, v3
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_readlane_b32 s1, v3, 31
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s1
+; GFX1132DAGISEL-FAKE16-NEXT: global_store_b32 v[0:1], v2, off
+; GFX1132DAGISEL-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-FAKE16-NEXT: s_clause 0x1 ; 8-byte Folded Reload
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_load_b32 v3, off, s32
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_load_b32 v4, off, s32 offset:4
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1132GISEL-LABEL: divergent_value_float_dpp:
; GFX1132GISEL: ; %bb.0: ; %entry
@@ -916,11 +1686,49 @@ define void @divergent_value_float_dpp(ptr addrspace(1) %out, float %id.x) #0 {
; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31]
;
+; GFX1132DAGISEL-TRUE16-LABEL: divergent_value_float_dpp:
+; GFX1132DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Spill
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_store_b32 off, v3, s32
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_store_b32 off, v4, s32 offset:4
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-TRUE16-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, v2, s0
+; GFX1132DAGISEL-TRUE16-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: ds_swizzle_b32 v4, v3 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX1132DAGISEL-TRUE16-NEXT: v_sub_f32_e32 v3, 0, v3
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_readlane_b32 s1, v3, 31
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v2, s1
+; GFX1132DAGISEL-TRUE16-NEXT: global_store_b32 v[0:1], v2, off
+; GFX1132DAGISEL-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-TRUE16-NEXT: s_clause 0x1 ; 8-byte Folded Reload
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_load_b32 v3, off, s32
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_load_b32 v4, off, s32 offset:4
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX12DAGISEL-LABEL: divergent_value_float_dpp:
; GFX12DAGISEL: ; %bb.0: ; %entry
; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12DAGISEL-NEXT: s_wait_expcnt 0x0
-; GFX12DAGISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12DAGISEL-NEXT: s_wait_samplecnt 0x0
; GFX12DAGISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
; GFX12DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
@@ -1027,7 +1835,17 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX8DAGISEL-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX8DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX8DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX8DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX8DAGISEL-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
@@ -1184,7 +2002,17 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s7
; GFX9DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9DAGISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; GFX9DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9DAGISEL-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GFX9DAGISEL-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX9DAGISEL-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
@@ -1330,7 +2158,13 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX1064DAGISEL-NEXT: v_add_f64 v[4:5], 0x80000000, -v[4:5]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v4, 63
; GFX1064DAGISEL-NEXT: v_readlane_b32 s7, v5, 63
-; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s7
; GFX1064DAGISEL-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
@@ -1530,82 +2364,88 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1164DAGISEL-LABEL: divergent_value_double_dpp:
-; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX1164DAGISEL-NEXT: s_clause 0x3 ; 28-byte Folded Spill
-; GFX1164DAGISEL-NEXT: scratch_store_b64 off, v[4:5], s32
-; GFX1164DAGISEL-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
-; GFX1164DAGISEL-NEXT: scratch_store_b32 off, v6, s32 offset:16
-; GFX1164DAGISEL-NEXT: scratch_store_b64 off, v[7:8], s32 offset:20
-; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[0:1]
-; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v3, s[0:1]
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v6, v4
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v7, v5
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
-; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
-; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
-; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v6, -1, 0
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v6, -1, v6
-; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v6, 32, v6
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v6, 4, v6
-; GFX1164DAGISEL-NEXT: ds_permute_b32 v7, v6, v4
-; GFX1164DAGISEL-NEXT: ds_permute_b32 v8, v6, v5
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: v_add_f64 v[4:5], v[4:5], v[7:8]
-; GFX1164DAGISEL-NEXT: v_add_f64 v[4:5], 0x80000000, -v[4:5]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164DAGISEL-NEXT: v_readlane_b32 s2, v4, 63
-; GFX1164DAGISEL-NEXT: v_readlane_b32 s3, v5, 63
-; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
-; GFX1164DAGISEL-NEXT: s_xor_saveexec_b64 s[0:1], -1
-; GFX1164DAGISEL-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
-; GFX1164DAGISEL-NEXT: s_clause 0x3 ; 28-byte Folded Reload
-; GFX1164DAGISEL-NEXT: scratch_load_b64 v[4:5], off, s32
-; GFX1164DAGISEL-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
-; GFX1164DAGISEL-NEXT: scratch_load_b32 v6, off, s32 offset:16
-; GFX1164DAGISEL-NEXT: scratch_load_b64 v[7:8], off, s32 offset:20
-; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1164DAGISEL-FAKE16-LABEL: divergent_value_double_dpp:
+; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-FAKE16-NEXT: s_clause 0x3 ; 28-byte Folded Spill
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_store_b32 off, v6, s32 offset:16
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_store_b64 off, v[7:8], s32 offset:20
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v3, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-FAKE16-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-FAKE16-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v6, -1, 0
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v6, -1, v6
+; GFX1164DAGISEL-FAKE16-NEXT: v_add_nc_u32_e32 v6, 32, v6
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mul_lo_u32 v6, 4, v6
+; GFX1164DAGISEL-FAKE16-NEXT: ds_permute_b32 v7, v6, v4
+; GFX1164DAGISEL-FAKE16-NEXT: ds_permute_b32 v8, v6, v5
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], v[7:8]
+; GFX1164DAGISEL-FAKE16-NEXT: v_add_f64 v[4:5], 0x80000000, -v[4:5]
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s2, v4, 63
+; GFX1164DAGISEL-FAKE16-NEXT: v_readlane_b32 s3, v5, 63
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164DAGISEL-FAKE16-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-FAKE16-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-FAKE16-NEXT: s_clause 0x3 ; 28-byte Folded Reload
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_load_b32 v6, off, s32 offset:16
+; GFX1164DAGISEL-FAKE16-NEXT: scratch_load_b64 v[7:8], off, s32 offset:20
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1164GISEL-LABEL: divergent_value_double_dpp:
; GFX1164GISEL: ; %bb.0: ; %entry
@@ -1684,60 +2524,66 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX1132DAGISEL-LABEL: divergent_value_double_dpp:
-; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX1132DAGISEL-NEXT: s_clause 0x1 ; 16-byte Folded Spill
-; GFX1132DAGISEL-NEXT: scratch_store_b64 off, v[4:5], s32
-; GFX1132DAGISEL-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
-; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v4, 0, v2, s2
-; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v3, s2
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX1132DAGISEL-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
-; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
-; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132DAGISEL-NEXT: v_add_f64 v[4:5], 0x80000000, -v[4:5]
-; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132DAGISEL-NEXT: v_readlane_b32 s0, v4, 31
-; GFX1132DAGISEL-NEXT: v_readlane_b32 s1, v5, 31
-; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
-; GFX1132DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
-; GFX1132DAGISEL-NEXT: s_clause 0x1 ; 16-byte Folded Reload
-; GFX1132DAGISEL-NEXT: scratch_load_b64 v[4:5], off, s32
-; GFX1132DAGISEL-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
-; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX1132DAGISEL-FAKE16-LABEL: divergent_value_double_dpp:
+; GFX1132DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-FAKE16-NEXT: s_clause 0x1 ; 16-byte Folded Spill
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-FAKE16-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX1132DAGISEL-FAKE16-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v3, s2
+; GFX1132DAGISEL-FAKE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-FAKE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-FAKE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-FAKE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-FAKE16-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-FAKE16-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-FAKE16-NEXT: v_add_f64 v[4:5], 0x80000000, -v[4:5]
+; GFX1132DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-FAKE16-NEXT: v_readlane_b32 s0, v4, 31
+; GFX1132DAGISEL-FAKE16-NEXT: v_readlane_b32 s1, v5, 31
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-FAKE16-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-FAKE16-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-FAKE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-FAKE16-NEXT: s_clause 0x1 ; 16-byte Folded Reload
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1132DAGISEL-FAKE16-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1132DAGISEL-FAKE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX1132GISEL-LABEL: divergent_value_double_dpp:
; GFX1132GISEL: ; %bb.0: ; %entry
@@ -1794,6 +2640,150 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31]
;
+; GFX1164DAGISEL-TRUE16-LABEL: divergent_value_double_dpp:
+; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-TRUE16-NEXT: s_clause 0x3 ; 28-byte Folded Spill
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_store_b32 off, v6, s32 offset:16
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_store_b64 off, v[7:8], s32 offset:20
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v3, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-TRUE16-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-TRUE16-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1164DAGISEL-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v6, -1, 0
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v6, -1, v6
+; GFX1164DAGISEL-TRUE16-NEXT: v_add_nc_u32_e32 v6, 32, v6
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mul_lo_u32 v6, 4, v6
+; GFX1164DAGISEL-TRUE16-NEXT: ds_permute_b32 v7, v6, v4
+; GFX1164DAGISEL-TRUE16-NEXT: ds_permute_b32 v8, v6, v5
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], v[7:8]
+; GFX1164DAGISEL-TRUE16-NEXT: v_add_f64 v[4:5], 0x80000000, -v[4:5]
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s2, v4, 63
+; GFX1164DAGISEL-TRUE16-NEXT: v_readlane_b32 s3, v5, 63
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164DAGISEL-TRUE16-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-TRUE16-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-TRUE16-NEXT: s_clause 0x3 ; 28-byte Folded Reload
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_load_b32 v6, off, s32 offset:16
+; GFX1164DAGISEL-TRUE16-NEXT: scratch_load_b64 v[7:8], off, s32 offset:20
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-TRUE16-LABEL: divergent_value_double_dpp:
+; GFX1132DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-TRUE16-NEXT: s_clause 0x1 ; 16-byte Folded Spill
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_store_b64 off, v[4:5], s32
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_store_b64 off, v[6:7], s32 offset:8
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-TRUE16-NEXT: s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX1132DAGISEL-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v3, s2
+; GFX1132DAGISEL-TRUE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-TRUE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-TRUE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-TRUE16-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-TRUE16-NEXT: ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-TRUE16-NEXT: ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132DAGISEL-TRUE16-NEXT: v_add_f64 v[4:5], 0x80000000, -v[4:5]
+; GFX1132DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-TRUE16-NEXT: v_readlane_b32 s0, v4, 31
+; GFX1132DAGISEL-TRUE16-NEXT: v_readlane_b32 s1, v5, 31
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-TRUE16-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-TRUE16-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-TRUE16-NEXT: s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-TRUE16-NEXT: s_clause 0x1 ; 16-byte Folded Reload
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_load_b64 v[4:5], off, s32
+; GFX1132DAGISEL-TRUE16-NEXT: scratch_load_b64 v[6:7], off, s32 offset:8
+; GFX1132DAGISEL-TRUE16-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX12DAGISEL-LABEL: divergent_value_double_dpp:
; GFX12DAGISEL: ; %bb.0: ; %entry
; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1829,7 +2819,13 @@ define void @divergent_value_double_dpp(ptr addrspace(1) %out, double %in) #0 {
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX12DAGISEL-NEXT: v_add_f64_e32 v[4:5], v[4:5], v[6:7]
-; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12DAGISEL-NEXT: Lowering intrinsic: 3713
+; GFX12DAGISEL-NEXT: Lowering intrinsic: 3298
+; GFX12DAGISEL-NEXT: Lowering intrinsic: 3658
+; GFX12DAGISEL-NEXT: Lowering intrinsic: 3658
+; GFX12DAGISEL-NEXT: Lowering intrinsic: 3658
+; GFX12DAGISEL-NEXT: Lowering intrinsic: 3658
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
; GFX12DAGISEL-NEXT: v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1869,7 +2865,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr6
; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -1877,11 +2873,11 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v0
-; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -1890,7 +2886,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX8DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s0, v0
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: .LBB4_4: ; %endif
+; GFX8DAGISEL-NEXT: .LBB6_4: ; %endif
; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -1907,7 +2903,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX8GISEL-NEXT: ; implicit-def: $sgpr6
; GFX8GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX8GISEL-NEXT: ; %bb.1: ; %else
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -1915,11 +2911,11 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0
; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v0
-; GFX8GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX8GISEL-NEXT: .LBB6_2: ; %Flow
; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3]
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX8GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX8GISEL-NEXT: ; %bb.3: ; %if
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -1928,7 +2924,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX8GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0
; GFX8GISEL-NEXT: v_readfirstlane_b32 s0, v0
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: .LBB4_4: ; %endif
+; GFX8GISEL-NEXT: .LBB6_4: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -1945,7 +2941,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr6
; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -1953,11 +2949,11 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v0
-; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -1966,7 +2962,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX9DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s0, v0
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9DAGISEL-NEXT: .LBB4_4: ; %endif
+; GFX9DAGISEL-NEXT: .LBB6_4: ; %endif
; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -1982,7 +2978,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX9GISEL-NEXT: ; implicit-def: $sgpr6
; GFX9GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX9GISEL-NEXT: ; %bb.1: ; %else
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -1990,11 +2986,11 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0
; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v0
-; GFX9GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX9GISEL-NEXT: .LBB6_2: ; %Flow
; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3]
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX9GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX9GISEL-NEXT: ; %bb.3: ; %if
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -2003,7 +2999,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX9GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0
; GFX9GISEL-NEXT: v_readfirstlane_b32 s0, v0
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX9GISEL-NEXT: .LBB4_4: ; %endif
+; GFX9GISEL-NEXT: .LBB6_4: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -2019,7 +3015,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr6
; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -2027,11 +3023,11 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v0
-; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3]
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -2040,7 +3036,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1064DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX1064DAGISEL-NEXT: .LBB4_4: ; %endif
+; GFX1064DAGISEL-NEXT: .LBB6_4: ; %endif
; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -2056,7 +3052,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6
; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1064GISEL-NEXT: ; %bb.1: ; %else
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -2064,11 +3060,11 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v0
-; GFX1064GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1064GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3]
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1064GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1064GISEL-NEXT: ; %bb.3: ; %if
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -2077,7 +3073,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1064GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX1064GISEL-NEXT: .LBB4_4: ; %endif
+; GFX1064GISEL-NEXT: .LBB6_4: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -2093,7 +3089,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr3
; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
@@ -2101,12 +3097,12 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s3, v0
-; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s2
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s3
; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
@@ -2114,7 +3110,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1032DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s1, v0
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s1
-; GFX1032DAGISEL-NEXT: .LBB4_4: ; %endif
+; GFX1032DAGISEL-NEXT: .LBB6_4: ; %endif
; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
@@ -2129,7 +3125,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1032GISEL-NEXT: ; implicit-def: $sgpr3
; GFX1032GISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032GISEL-NEXT: s_xor_b32 s2, exec_lo, s2
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1032GISEL-NEXT: ; %bb.1: ; %else
; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s3, s3
@@ -2137,12 +3133,12 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s3, v0
-; GFX1032GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1032GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, s2
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s3
; GFX1032GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1032GISEL-NEXT: ; %bb.3: ; %if
; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2
@@ -2150,7 +3146,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1032GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s1, v0
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s1
-; GFX1032GISEL-NEXT: .LBB4_4: ; %endif
+; GFX1032GISEL-NEXT: .LBB6_4: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
@@ -2158,49 +3154,55 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX1032GISEL-NEXT: s_endpgm
;
-; GFX1164DAGISEL-LABEL: divergent_cfg_float:
-; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
-; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr6
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
-; GFX1164DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2
-; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
-; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0
-; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s6, v0
-; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow
-; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s6
-; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_4
-; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0
-; GFX1164DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0
-; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX1164DAGISEL-NEXT: .LBB4_4: ; %endif
-; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX1164DAGISEL-NEXT: s_endpgm
+; GFX1164DAGISEL-FAKE16-LABEL: divergent_cfg_float:
+; GFX1164DAGISEL-FAKE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX1164DAGISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-FAKE16-NEXT: ; implicit-def: $sgpr6
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-FAKE16-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164DAGISEL-FAKE16-NEXT: s_cbranch_execz .LBB6_2
+; GFX1164DAGISEL-FAKE16-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-FAKE16-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164DAGISEL-FAKE16-NEXT: v_cvt_f32_i32_e32 v0, s6
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mul_f32_e64 v0, -s0, v0
+; GFX1164DAGISEL-FAKE16-NEXT: v_readfirstlane_b32 s6, v0
+; GFX1164DAGISEL-FAKE16-NEXT: .LBB6_2: ; %Flow
+; GFX1164DAGISEL-FAKE16-NEXT: s_or_saveexec_b64 s[2:3], s[2:3]
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164DAGISEL-FAKE16-NEXT: s_xor_b64 exec, exec, s[2:3]
+; GFX1164DAGISEL-FAKE16-NEXT: s_cbranch_execz .LBB6_4
+; GFX1164DAGISEL-FAKE16-NEXT: ; %bb.3: ; %if
+; GFX1164DAGISEL-FAKE16-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: Lowering intrinsic: 3658
+; GFX1164DAGISEL-FAKE16-NEXT: Lowering intrinsic: 3298
+; GFX1164DAGISEL-FAKE16-NEXT: Lowering intrinsic: 3658
+; GFX1164DAGISEL-FAKE16-NEXT: Lowering intrinsic: 3658
+; GFX1164DAGISEL-FAKE16-NEXT: Lowering intrinsic: 3658
+; GFX1164DAGISEL-FAKE16-NEXT: Lowering intrinsic: 3658
+; GFX1164DAGISEL-FAKE16-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX1164DAGISEL-FAKE16-NEXT: v_mul_f32_e64 v0, -s1, v0
+; GFX1164DAGISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX1164DAGISEL-FAKE16-NEXT: .LBB6_4: ; %endif
+; GFX1164DAGISEL-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT: s_endpgm
;
; GFX1164GISEL-LABEL: divergent_cfg_float:
; GFX1164GISEL: ; %bb.0: ; %entry
@@ -2211,7 +3213,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1164GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1164GISEL-NEXT: ; %bb.1: ; %else
; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2221,12 +3223,12 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s6, v0
-; GFX1164GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1164GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3]
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1164GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1164GISEL-NEXT: ; %bb.3: ; %if
; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -2237,7 +3239,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s0, v0
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX1164GISEL-NEXT: .LBB4_4: ; %endif
+; GFX1164GISEL-NEXT: .LBB6_4: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
@@ -2255,7 +3257,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX1132DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2265,13 +3267,13 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s3, v0
-; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s2
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s3
; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2282,7 +3284,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v0
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s1
-; GFX1132DAGISEL-NEXT: .LBB4_4: ; %endif
+; GFX1132DAGISEL-NEXT: .LBB6_4: ; %endif
; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, 0
@@ -2299,7 +3301,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1132GISEL-NEXT: s_xor_b32 s2, exec_lo, s2
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX1132GISEL-NEXT: ; %bb.1: ; %else
; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2309,13 +3311,13 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s3, v0
-; GFX1132GISEL-NEXT: .LBB4_2: ; %Flow
+; GFX1132GISEL-NEXT: .LBB6_2: ; %Flow
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, s2
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s3
; GFX1132GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX1132GISEL-NEXT: ; %bb.3: ; %if
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2326,7 +3328,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v0
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s1
-; GFX1132GISEL-NEXT: .LBB4_4: ; %endif
+; GFX1132GISEL-NEXT: .LBB6_4: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0
@@ -2334,6 +3336,56 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1132GISEL-NEXT: s_endpgm
;
+; GFX1164DAGISEL-TRUE16-LABEL: divergent_cfg_float:
+; GFX1164DAGISEL-TRUE16: ; %bb.0: ; %entry
+; GFX1164DAGISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
+; GFX1164DAGISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-TRUE16-NEXT: ; implicit-def: $sgpr6
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_cmpx_lt_u32_e32 15, v0
+; GFX1164DAGISEL-TRUE16-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164DAGISEL-TRUE16-NEXT: s_cbranch_execz .LBB6_2
+; GFX1164DAGISEL-TRUE16-NEXT: ; %bb.1: ; %else
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-TRUE16-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164DAGISEL-TRUE16-NEXT: v_cvt_f32_i32_e32 v0, s6
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mul_f32_e64 v0, -s0, v0
+; GFX1164DAGISEL-TRUE16-NEXT: v_readfirstlane_b32 s6, v0
+; GFX1164DAGISEL-TRUE16-NEXT: .LBB6_2: ; %Flow
+; GFX1164DAGISEL-TRUE16-NEXT: s_or_saveexec_b64 s[2:3], s[2:3]
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164DAGISEL-TRUE16-NEXT: s_xor_b64 exec, exec, s[2:3]
+; GFX1164DAGISEL-TRUE16-NEXT: s_cbranch_execz .LBB6_4
+; GFX1164DAGISEL-TRUE16-NEXT: ; %bb.3: ; %if
+; GFX1164DAGISEL-TRUE16-NEXT: s_mov_b64 s[6:7], exec
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: Lowering intrinsic: 3658
+; GFX1164DAGISEL-TRUE16-NEXT: Lowering intrinsic: 3298
+; GFX1164DAGISEL-TRUE16-NEXT: Lowering intrinsic: 3658
+; GFX1164DAGISEL-TRUE16-NEXT: Lowering intrinsic: 3658
+; GFX1164DAGISEL-TRUE16-NEXT: Lowering intrinsic: 3658
+; GFX1164DAGISEL-TRUE16-NEXT: Lowering intrinsic: 3658
+; GFX1164DAGISEL-TRUE16-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX1164DAGISEL-TRUE16-NEXT: v_mul_f32_e64 v0, -s1, v0
+; GFX1164DAGISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164DAGISEL-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX1164DAGISEL-TRUE16-NEXT: .LBB6_4: ; %endif
+; GFX1164DAGISEL-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164DAGISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT: s_endpgm
+;
; GFX12DAGISEL-LABEL: divergent_cfg_float:
; GFX12DAGISEL: ; %bb.0: ; %entry
; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
@@ -2343,7 +3395,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX12DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB4_2
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB6_2
; GFX12DAGISEL-NEXT: ; %bb.1: ; %else
; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2353,14 +3405,14 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s3, v0
-; GFX12DAGISEL-NEXT: .LBB4_2: ; %Flow
+; GFX12DAGISEL-NEXT: .LBB6_2: ; %Flow
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s0, s2
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s3
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB4_4
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB6_4
; GFX12DAGISEL-NEXT: ; %bb.3: ; %if
; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -2373,7 +3425,7 @@ define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in,
; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s1
-; GFX12DAGISEL-NEXT: .LBB4_4: ; %endif
+; GFX12DAGISEL-NEXT: .LBB6_4: ; %endif
; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, 0
@@ -2634,7 +3686,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], 0
; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX8DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s10, s[4:5]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s6
; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s10
@@ -2645,7 +3697,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v4
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s7, v5
-; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8DAGISEL-NEXT: ; %bb.2:
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s7
@@ -2658,7 +3710,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], 0
; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX8GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX8GISEL-NEXT: s_ff1_i32_b64 s10, s[4:5]
; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s6
; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s10
@@ -2669,7 +3721,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v4
; GFX8GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8GISEL-NEXT: ; %bb.2:
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s7
@@ -2682,7 +3734,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], 0
; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX9DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s10, s[4:5]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s6
; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s10
@@ -2693,7 +3745,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v4
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s7, v5
-; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9DAGISEL-NEXT: ; %bb.2:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s7
@@ -2706,7 +3758,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], 0
; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX9GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX9GISEL-NEXT: s_ff1_i32_b64 s10, s[4:5]
; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s6
; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s10
@@ -2717,7 +3769,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v4
; GFX9GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9GISEL-NEXT: ; %bb.2:
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s7
@@ -2730,7 +3782,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], 0
; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX1064DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s10, s[4:5]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s10
; GFX1064DAGISEL-NEXT: v_readlane_b32 s9, v3, s10
@@ -2739,7 +3791,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1064DAGISEL-NEXT: v_add_f64 v[4:5], -s[8:9], s[6:7]
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v4
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s7, v5
-; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064DAGISEL-NEXT: ; %bb.2:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s7
@@ -2751,7 +3803,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], 0
; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec
-; GFX1064GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1064GISEL-NEXT: s_ff1_i32_b64 s10, s[4:5]
; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s10
; GFX1064GISEL-NEXT: v_readlane_b32 s9, v3, s10
@@ -2760,7 +3812,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1064GISEL-NEXT: v_add_f64 v[4:5], -s[8:9], s[6:7]
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v4
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s7, v5
-; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064GISEL-NEXT: ; %bb.2:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s7
@@ -2772,7 +3824,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1032DAGISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6
; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1032DAGISEL-NEXT: v_readlane_b32 s9, v3, s7
@@ -2781,7 +3833,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1032DAGISEL-NEXT: v_add_f64 v[4:5], -s[8:9], s[4:5]
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s4, v4
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v5
-; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032DAGISEL-NEXT: ; %bb.2:
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2793,7 +3845,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1032GISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6
; GFX1032GISEL-NEXT: v_readlane_b32 s8, v2, s7
; GFX1032GISEL-NEXT: v_readlane_b32 s9, v3, s7
@@ -2802,7 +3854,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1032GISEL-NEXT: v_add_f64 v[4:5], -s[8:9], s[4:5]
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s4, v4
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v5
-; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032GISEL-NEXT: ; %bb.2:
; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5
@@ -2810,11 +3862,11 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1164DAGISEL-LABEL: divergent_value_double:
; GFX1164DAGISEL: ; %bb.0: ; %entry
; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], 0
; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s6, s[0:1]
; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s6
@@ -2826,7 +3886,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v4
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s3, v5
-; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164DAGISEL-NEXT: ; %bb.2:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s3
@@ -2838,7 +3898,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], 0
; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164GISEL-NEXT: s_ctz_i32_b64 s6, s[0:1]
; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s6
@@ -2850,7 +3910,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v4
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s3, v5
-; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164GISEL-NEXT: ; %bb.2:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s3
@@ -2862,7 +3922,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1132DAGISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -2874,7 +3934,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s0, v4
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v5
-; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132DAGISEL-NEXT: ; %bb.2:
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -2885,7 +3945,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1132GISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132GISEL-NEXT: v_readlane_b32 s4, v2, s3
@@ -2897,7 +3957,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s0, v4
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v5
-; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132GISEL-NEXT: ; %bb.2:
; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -2912,7 +3972,7 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
; GFX12DAGISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo
-; GFX12DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
+; GFX12DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -2925,12 +3985,12 @@ define void @divergent_value_double(ptr addrspace(1) %out, double %id.x) #0 {
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s0, v4
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v5
-; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
; GFX12DAGISEL-NEXT: ; %bb.2:
; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12DAGISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%result = call double @llvm.amdgcn.wave.reduce.fsub(double %id.x, i32 1)
store double %result, ptr addrspace(1) %out
@@ -2946,7 +4014,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX8DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX8DAGISEL-NEXT: ; %bb.1: ; %else
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -2955,13 +4023,13 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX8DAGISEL-NEXT: v_mul_f64 v[0:1], -s[2:3], v[0:1]
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v0
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s7, v1
-; GFX8DAGISEL-NEXT: .LBB7_2: ; %Flow
+; GFX8DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s7
; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX8DAGISEL-NEXT: ; %bb.3: ; %if
; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -2971,7 +4039,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s5, v1
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX8DAGISEL-NEXT: .LBB7_4: ; %endif
+; GFX8DAGISEL-NEXT: .LBB9_4: ; %endif
; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
@@ -2985,7 +4053,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX8GISEL-NEXT: ; %bb.1: ; %else
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -2994,13 +4062,13 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX8GISEL-NEXT: v_mul_f64 v[0:1], -s[2:3], v[0:1]
; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v0
; GFX8GISEL-NEXT: v_readfirstlane_b32 s7, v1
-; GFX8GISEL-NEXT: .LBB7_2: ; %Flow
+; GFX8GISEL-NEXT: .LBB9_2: ; %Flow
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7
; GFX8GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX8GISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX8GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX8GISEL-NEXT: ; %bb.3: ; %if
; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
@@ -3012,7 +4080,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX8GISEL-NEXT: v_readfirstlane_b32 s5, v1
; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX8GISEL-NEXT: .LBB7_4: ; %endif
+; GFX8GISEL-NEXT: .LBB9_4: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -3027,7 +4095,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX9DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX9DAGISEL-NEXT: ; %bb.1: ; %else
; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -3036,13 +4104,13 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX9DAGISEL-NEXT: v_mul_f64 v[0:1], -s[2:3], v[0:1]
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s4, v0
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s5, v1
-; GFX9DAGISEL-NEXT: .LBB7_2: ; %Flow
+; GFX9DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s5
; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX9DAGISEL-NEXT: ; %bb.3: ; %if
; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -3052,7 +4120,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s5, v1
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX9DAGISEL-NEXT: .LBB7_4: ; %endif
+; GFX9DAGISEL-NEXT: .LBB9_4: ; %endif
; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX9DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3065,7 +4133,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX9GISEL-NEXT: ; %bb.1: ; %else
; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -3074,13 +4142,13 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX9GISEL-NEXT: v_mul_f64 v[0:1], -s[2:3], v[0:1]
; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v0
; GFX9GISEL-NEXT: v_readfirstlane_b32 s7, v1
-; GFX9GISEL-NEXT: .LBB7_2: ; %Flow
+; GFX9GISEL-NEXT: .LBB9_2: ; %Flow
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9]
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7
; GFX9GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX9GISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX9GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX9GISEL-NEXT: ; %bb.3: ; %if
; GFX9GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec
@@ -3092,7 +4160,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX9GISEL-NEXT: v_readfirstlane_b32 s5, v1
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX9GISEL-NEXT: .LBB7_4: ; %endif
+; GFX9GISEL-NEXT: .LBB9_4: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3107,7 +4175,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr8_sgpr9
; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1064DAGISEL-NEXT: s_mov_b64 s[8:9], exec
; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s8, s[8:9]
@@ -3116,13 +4184,13 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1064DAGISEL-NEXT: v_mul_f64 v[0:1], -s[2:3], v[0:1]
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s8, v0
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s9, v1
-; GFX1064DAGISEL-NEXT: .LBB7_2: ; %Flow
+; GFX1064DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[4:5]
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s8
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s9
; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec
; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
@@ -3132,7 +4200,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s5, v1
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX1064DAGISEL-NEXT: .LBB7_4: ; %endif
+; GFX1064DAGISEL-NEXT: .LBB9_4: ; %endif
; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1064DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3145,7 +4213,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1064GISEL-NEXT: ; implicit-def: $sgpr8_sgpr9
; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX1064GISEL-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1064GISEL-NEXT: ; %bb.1: ; %else
; GFX1064GISEL-NEXT: s_mov_b64 s[8:9], exec
; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s8, s[8:9]
@@ -3154,13 +4222,13 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1064GISEL-NEXT: v_mul_f64 v[0:1], -s[2:3], v[0:1]
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s8, v0
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s9, v1
-; GFX1064GISEL-NEXT: .LBB7_2: ; %Flow
+; GFX1064GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[6:7]
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s8
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s9
; GFX1064GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX1064GISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1064GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1064GISEL-NEXT: ; %bb.3: ; %if
; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
@@ -3172,7 +4240,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1064GISEL-NEXT: v_readfirstlane_b32 s5, v1
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX1064GISEL-NEXT: .LBB7_4: ; %endif
+; GFX1064GISEL-NEXT: .LBB9_4: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1064GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3187,7 +4255,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo
; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4
@@ -3196,13 +4264,13 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1032DAGISEL-NEXT: v_mul_f64 v[0:1], -s[2:3], v[0:1]
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s4, v0
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v1
-; GFX1032DAGISEL-NEXT: .LBB7_2: ; %Flow
+; GFX1032DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s5
; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2
-; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
@@ -3212,7 +4280,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v1
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX1032DAGISEL-NEXT: .LBB7_4: ; %endif
+; GFX1032DAGISEL-NEXT: .LBB9_4: ; %endif
; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1032DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3225,7 +4293,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo
; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1032GISEL-NEXT: ; %bb.1: ; %else
; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo
; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s6, s6
@@ -3234,13 +4302,13 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1032GISEL-NEXT: v_mul_f64 v[0:1], -s[2:3], v[0:1]
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s6, v0
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s7, v1
-; GFX1032GISEL-NEXT: .LBB7_2: ; %Flow
+; GFX1032GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7
; GFX1032GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2
-; GFX1032GISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1032GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1032GISEL-NEXT: ; %bb.3: ; %if
; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
@@ -3252,7 +4320,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v1
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX1032GISEL-NEXT: .LBB7_4: ; %endif
+; GFX1032GISEL-NEXT: .LBB9_4: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3269,7 +4337,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX1164DAGISEL-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1164DAGISEL-NEXT: s_mov_b64 s[8:9], exec
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3281,14 +4349,14 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s8, v0
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s9, v1
-; GFX1164DAGISEL-NEXT: .LBB7_2: ; %Flow
+; GFX1164DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[6:7]
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s8
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s9
; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3302,7 +4370,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX1164DAGISEL-NEXT: .LBB7_4: ; %endif
+; GFX1164DAGISEL-NEXT: .LBB9_4: ; %endif
; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1164DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -3317,7 +4385,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1164GISEL-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1164GISEL-NEXT: ; %bb.1: ; %else
; GFX1164GISEL-NEXT: s_mov_b64 s[8:9], exec
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3329,14 +4397,14 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s8, v0
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164GISEL-NEXT: v_readfirstlane_b32 s9, v1
-; GFX1164GISEL-NEXT: .LBB7_2: ; %Flow
+; GFX1164GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[6:7]
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s8
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s9
; GFX1164GISEL-NEXT: s_xor_b64 exec, exec, s[2:3]
-; GFX1164GISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1164GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1164GISEL-NEXT: ; %bb.3: ; %if
; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec
; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
@@ -3351,7 +4419,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX1164GISEL-NEXT: .LBB7_4: ; %endif
+; GFX1164GISEL-NEXT: .LBB9_4: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1164GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -3368,7 +4436,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX1132DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else
; GFX1132DAGISEL-NEXT: s_mov_b32 s6, exec_lo
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3380,13 +4448,13 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s6, v0
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s7, v1
-; GFX1132DAGISEL-NEXT: .LBB7_2: ; %Flow
+; GFX1132DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2
-; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if
; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3398,7 +4466,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s5, v1
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1132DAGISEL-NEXT: .LBB7_4: ; %endif
+; GFX1132DAGISEL-NEXT: .LBB9_4: ; %endif
; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1132DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -3413,7 +4481,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0
; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX1132GISEL-NEXT: ; %bb.1: ; %else
; GFX1132GISEL-NEXT: s_mov_b32 s6, exec_lo
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3425,13 +4493,13 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s6, v0
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s7, v1
-; GFX1132GISEL-NEXT: .LBB7_2: ; %Flow
+; GFX1132GISEL-NEXT: .LBB9_2: ; %Flow
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX1132GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2
-; GFX1132GISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX1132GISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX1132GISEL-NEXT: ; %bb.3: ; %if
; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
@@ -3445,7 +4513,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX1132GISEL-NEXT: v_readfirstlane_b32 s5, v1
; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1132GISEL-NEXT: .LBB7_4: ; %endif
+; GFX1132GISEL-NEXT: .LBB9_4: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -3462,7 +4530,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
; GFX12DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB7_2
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB9_2
; GFX12DAGISEL-NEXT: ; %bb.1: ; %else
; GFX12DAGISEL-NEXT: s_mov_b32 s6, exec_lo
; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3474,7 +4542,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s6, v0
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s7, v1
-; GFX12DAGISEL-NEXT: .LBB7_2: ; %Flow
+; GFX12DAGISEL-NEXT: .LBB9_2: ; %Flow
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s2, s8
; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
@@ -3482,7 +4550,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2
-; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB7_4
+; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB9_4
; GFX12DAGISEL-NEXT: ; %bb.3: ; %if
; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo
; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
@@ -3496,7 +4564,7 @@ define amdgpu_kernel void @divergent_cfg_double(ptr addrspace(1) %out, double %i
; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s5, v1
; GFX12DAGISEL-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12DAGISEL-NEXT: .LBB7_4: ; %endif
+; GFX12DAGISEL-NEXT: .LBB9_4: ; %endif
; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX12DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
More information about the cfe-commits
mailing list