[llvm] [AMDGPU] Swap select operands to allow later v_cndmask shrinking into vop2 (PR #142140)
Ana Mihajlovic via llvm-commits
llvm-commits at lists.llvm.org
Fri May 30 06:31:14 PDT 2025
https://github.com/mihajlovicana updated https://github.com/llvm/llvm-project/pull/142140
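Summary: this change makes AMDGPUCodeGenPrepare invert a compare and swap
the operands of its select users when that moves constants into the false
operand of each select. With the constant in the false position, the later
shrinking of VOP3 v_cndmask_b32_e64 into the VOP2 form v_cndmask_b32_e32
becomes possible, and on gfx11/gfx12 the VOP2 form can additionally pair
into v_dual_cndmask_b32.

A minimal sketch of the rewrite, mirroring the test_i32_sgt case from the
new shrink-cndmask.ll test below:

  ; Before: both selects keep the constant in the true operand.
  %cc   = icmp sgt i32 2, %a
  %val1 = select i1 %cc, i32 0, i32 %p
  %val2 = select i1 %cc, i32 0, i32 %q

  ; After: the predicate is inverted and the select operands are
  ; swapped, so each constant lands in the false operand.
  %cc   = icmp sle i32 2, %a
  %val1 = select i1 %cc, i32 %p, i32 0
  %val2 = select i1 %cc, i32 %q, i32 0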
>From 25b2dd8526b52f407bbc51431289768564c11363 Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Fri, 30 May 2025 11:43:34 +0200
Subject: [PATCH 1/3] test precommit
---
llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll | 764 +++++++++++++++++++++
1 file changed, 764 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll b/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll
new file mode 100644
index 0000000000000..12ccdfff07c6f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll
@@ -0,0 +1,764 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GCN
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GCN
+
+; Tests for signed i32
+define amdgpu_cs void @test_i32_sge(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_i32_sge:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, 1, v0
+; GCN-NEXT: v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
+; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = icmp sge i32 %a, 2
+ %val1 = select i1 %vcc, i32 %p, i32 0
+ %val2 = select i1 %vcc, i32 %q, i32 0
+ %ret0 = insertelement <2 x i32> poison, i32 %val1, i32 0
+ %ret1 = insertelement <2 x i32> %ret0, i32 %val2, i32 1
+ store <2 x i32> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_i32_sle(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_i32_sle:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_gt_i32_e32 vcc_lo, 3, v0
+; GCN-NEXT: v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
+; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = icmp sle i32 %a, 2
+ %val1 = select i1 %vcc, i32 %p, i32 0
+ %val2 = select i1 %vcc, i32 %q, i32 0
+ %ret0 = insertelement <2 x i32> poison, i32 %val1, i32 0
+ %ret1 = insertelement <2 x i32> %ret0, i32 %val2, i32 1
+ store <2 x i32> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_i32_sgt(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_i32_sgt:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_gt_i32_e32 vcc_lo, 2, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = icmp sgt i32 2, %a
+ %val1 = select i1 %vcc, i32 0, i32 %p
+ %val2 = select i1 %vcc, i32 0, i32 %q
+ %ret0 = insertelement <2 x i32> poison, i32 %val1, i32 0
+ %ret1 = insertelement <2 x i32> %ret0, i32 %val2, i32 1
+ store <2 x i32> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_i32_slt(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_i32_slt:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, 2, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = icmp slt i32 2, %a
+ %val1 = select i1 %vcc, i32 0, i32 %p
+ %val2 = select i1 %vcc, i32 0, i32 %q
+ %ret0 = insertelement <2 x i32> poison, i32 %val1, i32 0
+ %ret1 = insertelement <2 x i32> %ret0, i32 %val2, i32 1
+ store <2 x i32> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+; Tests for signed i64
+define amdgpu_cs void @test_i64_sge(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_i64_sge:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_lt_i64_e32 vcc_lo, 1, v[0:1]
+; GCN-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
+; GCN-NEXT: global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = icmp sge i64 %a, 2
+ %val1 = select i1 %vcc, i64 %p, i64 0
+ %val2 = select i1 %vcc, i64 %q, i64 0
+ %ret0 = insertelement <2 x i64> poison, i64 %val1, i64 0
+ %ret1 = insertelement <2 x i64> %ret0, i64 %val2, i64 1
+ store <2 x i64> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_i64_sle(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_i64_sle:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_gt_i64_e32 vcc_lo, 3, v[0:1]
+; GCN-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
+; GCN-NEXT: global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = icmp sle i64 %a, 2
+ %val1 = select i1 %vcc, i64 %p, i64 0
+ %val2 = select i1 %vcc, i64 %q, i64 0
+ %ret0 = insertelement <2 x i64> poison, i64 %val1, i64 0
+ %ret1 = insertelement <2 x i64> %ret0, i64 %val2, i64 1
+ store <2 x i64> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_i64_sgt(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_i64_sgt:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_gt_i64_e32 vcc_lo, 2, v[0:1]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT: global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = icmp sgt i64 2, %a
+ %val1 = select i1 %vcc, i64 0, i64 %p
+ %val2 = select i1 %vcc, i64 0, i64 %q
+ %ret0 = insertelement <2 x i64> poison, i64 %val1, i64 0
+ %ret1 = insertelement <2 x i64> %ret0, i64 %val2, i64 1
+ store <2 x i64> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_i64_slt(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_i64_slt:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_lt_i64_e32 vcc_lo, 2, v[0:1]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT: global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = icmp slt i64 2, %a
+ %val1 = select i1 %vcc, i64 0, i64 %p
+ %val2 = select i1 %vcc, i64 0, i64 %q
+ %ret0 = insertelement <2 x i64> poison, i64 %val1, i64 0
+ %ret1 = insertelement <2 x i64> %ret0, i64 %val2, i64 1
+ store <2 x i64> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+; Tests for unsigned i32
+define amdgpu_cs void @test_u32_eq(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_u32_eq:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = icmp eq i32 1, %a
+ %val1 = select i1 %vcc, i32 0, i32 %p
+ %val2 = select i1 %vcc, i32 0, i32 %q
+ %ret0 = insertelement <2 x i32> poison, i32 %val1, i32 0
+ %ret1 = insertelement <2 x i32> %ret0, i32 %val2, i32 1
+ store <2 x i32> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_negative_case(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_negative_case:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, -1, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = icmp eq i32 %a, -1
+ %val1 = select i1 %vcc, i32 %p, i32 0
+ %val2 = select i1 %vcc, i32 0, i32 %q
+ %ret0 = insertelement <2 x i32> poison, i32 %val1, i32 0
+ %ret1 = insertelement <2 x i32> %ret0, i32 %val2, i32 1
+ store <2 x i32> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_mixed(i32 %a, i32 %p, i32 %q, i32 %r, i32 %s, ptr addrspace(1) %out) {
+; GCN-LABEL: test_mixed:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, -1, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v2, v3, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v3, v4, 0, vcc_lo
+; GCN-NEXT: global_store_b128 v[5:6], v[0:3], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = icmp eq i32 -1, %a
+ %val1 = select i1 %vcc, i32 0, i32 %p
+ %val2 = select i1 %vcc, i32 %q, i32 0
+ %val3 = select i1 %vcc, i32 0, i32 %r
+ %val4 = select i1 %vcc, i32 0, i32 %s
+ %ret0 = insertelement <4 x i32> poison, i32 %val1, i32 0
+ %ret1 = insertelement <4 x i32> %ret0, i32 %val2, i32 1
+ %ret2 = insertelement <4 x i32> %ret1, i32 %val3, i32 2
+ %ret3 = insertelement <4 x i32> %ret2, i32 %val4, i32 3
+ store <4 x i32> %ret3, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_sgpr(i32 %a, i32 %p, i32 inreg %q, i32 inreg %r, ptr addrspace(1) %out) {
+; GCN-LABEL: test_sgpr:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, -1, v0
+; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v5, s0, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v6, s1, 0, vcc_lo
+; GCN-NEXT: global_store_b96 v[2:3], v[4:6], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = icmp eq i32 %a, -1
+ %val1 = select i1 %vcc, i32 %p, i32 0
+ %val2 = select i1 %vcc, i32 0, i32 %q
+ %val3 = select i1 %vcc, i32 0, i32 %r
+ %ret0 = insertelement <3 x i32> poison, i32 %val1, i32 0
+ %ret1 = insertelement <3 x i32> %ret0, i32 %val2, i32 1
+ %ret2 = insertelement <3 x i32> %ret1, i32 %val3, i32 2
+ store <3 x i32> %ret2, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_u32_ne(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_u32_ne:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = icmp ne i32 1, %a
+ %val1 = select i1 %vcc, i32 0, i32 %p
+ %val2 = select i1 %vcc, i32 0, i32 %q
+ %ret0 = insertelement <2 x i32> poison, i32 %val1, i32 0
+ %ret1 = insertelement <2 x i32> %ret0, i32 %val2, i32 1
+ store <2 x i32> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_u32_uge(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_u32_uge:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, 1, v0
+; GCN-NEXT: v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
+; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = icmp uge i32 %a, 2
+ %val1 = select i1 %vcc, i32 %p, i32 0
+ %val2 = select i1 %vcc, i32 %q, i32 0
+ %ret0 = insertelement <2 x i32> poison, i32 %val1, i32 0
+ %ret1 = insertelement <2 x i32> %ret0, i32 %val2, i32 1
+ store <2 x i32> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_u32_ule(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_u32_ule:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_gt_u32_e32 vcc_lo, 3, v0
+; GCN-NEXT: v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
+; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = icmp ule i32 %a, 2
+ %val1 = select i1 %vcc, i32 %p, i32 0
+ %val2 = select i1 %vcc, i32 %q, i32 0
+ %ret0 = insertelement <2 x i32> poison, i32 %val1, i32 0
+ %ret1 = insertelement <2 x i32> %ret0, i32 %val2, i32 1
+ store <2 x i32> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_u32_ugt(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_u32_ugt:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = icmp ugt i32 2, %a
+ %val1 = select i1 %vcc, i32 0, i32 %p
+ %val2 = select i1 %vcc, i32 0, i32 %q
+ %ret0 = insertelement <2 x i32> poison, i32 %val1, i32 0
+ %ret1 = insertelement <2 x i32> %ret0, i32 %val2, i32 1
+ store <2 x i32> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_u32_ult(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_u32_ult:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, 2, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = icmp ult i32 2, %a
+ %val1 = select i1 %vcc, i32 0, i32 %p
+ %val2 = select i1 %vcc, i32 0, i32 %q
+ %ret0 = insertelement <2 x i32> poison, i32 %val1, i32 0
+ %ret1 = insertelement <2 x i32> %ret0, i32 %val2, i32 1
+ store <2 x i32> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+; Tests for unsigned i64
+define amdgpu_cs void @test_u64_eq(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_u64_eq:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, 1, v[0:1]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT: global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = icmp eq i64 1, %a
+ %val1 = select i1 %vcc, i64 0, i64 %p
+ %val2 = select i1 %vcc, i64 0, i64 %q
+ %ret0 = insertelement <2 x i64> poison, i64 %val1, i64 0
+ %ret1 = insertelement <2 x i64> %ret0, i64 %val2, i64 1
+ store <2 x i64> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_u64_ne(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_u64_ne:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_ne_u64_e32 vcc_lo, 1, v[0:1]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT: global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = icmp ne i64 1, %a
+ %val1 = select i1 %vcc, i64 0, i64 %p
+ %val2 = select i1 %vcc, i64 0, i64 %q
+ %ret0 = insertelement <2 x i64> poison, i64 %val1, i64 0
+ %ret1 = insertelement <2 x i64> %ret0, i64 %val2, i64 1
+ store <2 x i64> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_u64_uge(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_u64_uge:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_lt_u64_e32 vcc_lo, 1, v[0:1]
+; GCN-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
+; GCN-NEXT: global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = icmp uge i64 %a, 2
+ %val1 = select i1 %vcc, i64 %p, i64 0
+ %val2 = select i1 %vcc, i64 %q, i64 0
+ %ret0 = insertelement <2 x i64> poison, i64 %val1, i64 0
+ %ret1 = insertelement <2 x i64> %ret0, i64 %val2, i64 1
+ store <2 x i64> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_u64_ule(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_u64_ule:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_gt_u64_e32 vcc_lo, 3, v[0:1]
+; GCN-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
+; GCN-NEXT: global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = icmp ule i64 %a, 2
+ %val1 = select i1 %vcc, i64 %p, i64 0
+ %val2 = select i1 %vcc, i64 %q, i64 0
+ %ret0 = insertelement <2 x i64> poison, i64 %val1, i64 0
+ %ret1 = insertelement <2 x i64> %ret0, i64 %val2, i64 1
+ store <2 x i64> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_u64_ugt(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_u64_ugt:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_gt_u64_e32 vcc_lo, 2, v[0:1]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT: global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = icmp ugt i64 2, %a
+ %val1 = select i1 %vcc, i64 0, i64 %p
+ %val2 = select i1 %vcc, i64 0, i64 %q
+ %ret0 = insertelement <2 x i64> poison, i64 %val1, i64 0
+ %ret1 = insertelement <2 x i64> %ret0, i64 %val2, i64 1
+ store <2 x i64> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_u64_ult(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_u64_ult:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_lt_u64_e32 vcc_lo, 2, v[0:1]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT: global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = icmp ult i64 2, %a
+ %val1 = select i1 %vcc, i64 0, i64 %p
+ %val2 = select i1 %vcc, i64 0, i64 %q
+ %ret0 = insertelement <2 x i64> poison, i64 %val1, i64 0
+ %ret1 = insertelement <2 x i64> %ret0, i64 %val2, i64 1
+ store <2 x i64> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+; Tests for f32
+define amdgpu_cs void @test_f32_oeq(float %a, float %p, float %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f32_oeq:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_eq_f32_e32 vcc_lo, 2.0, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = fcmp oeq float %a, 2.0
+ %val1 = select i1 %vcc, float 0.0, float %p
+ %val2 = select i1 %vcc, float 0.0, float %q
+ %ret0 = insertelement <2 x float> poison, float %val1, i32 0
+ %ret1 = insertelement <2 x float> %ret0, float %val2, i32 1
+ store <2 x float> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_f32_negative_modifiers(float %a, float %p, float %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f32_negative_modifiers:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_eq_f32_e32 vcc_lo, 2.0, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, -v1, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v1, -v2, 0, vcc_lo
+; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %r = fneg float %p
+ %s = fneg float %q
+ %vcc = fcmp oeq float 2.0, %a
+ %val1 = select i1 %vcc, float 0.0, float %r
+ %val2 = select i1 %vcc, float 0.0, float %s
+ %ret0 = insertelement <2 x float> poison, float %val1, i32 0
+ %ret1 = insertelement <2 x float> %ret0, float %val2, i32 1
+ store <2 x float> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_f32_one(float %a, float %p, float %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f32_one:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_lg_f32_e32 vcc_lo, 2.0, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = fcmp one float %a, 2.0
+ %val1 = select i1 %vcc, float 0.0, float %p
+ %val2 = select i1 %vcc, float 0.0, float %q
+ %ret0 = insertelement <2 x float> poison, float %val1, i32 0
+ %ret1 = insertelement <2 x float> %ret0, float %val2, i32 1
+ store <2 x float> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_f32_ord(float %a, float %p, float %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f32_ord:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = fcmp ord float %a, 2.0
+ %val1 = select i1 %vcc, float 0.0, float %p
+ %val2 = select i1 %vcc, float 0.0, float %q
+ %ret0 = insertelement <2 x float> poison, float %val1, i32 0
+ %ret1 = insertelement <2 x float> %ret0, float %val2, i32 1
+ store <2 x float> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_f32_uno(float %a, float %p, float %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f32_uno:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = fcmp uno float %a, 2.0
+ %val1 = select i1 %vcc, float 0.0, float %p
+ %val2 = select i1 %vcc, float 0.0, float %q
+ %ret0 = insertelement <2 x float> poison, float %val1, i32 0
+ %ret1 = insertelement <2 x float> %ret0, float %val2, i32 1
+ store <2 x float> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_f32_oge(float %a, float %p, float %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f32_oge:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_ge_f32_e32 vcc_lo, 2.0, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = fcmp oge float 2.0, %a
+ %val1 = select i1 %vcc, float 0.0, float %p
+ %val2 = select i1 %vcc, float 0.0, float %q
+ %ret0 = insertelement <2 x float> poison, float %val1, i32 0
+ %ret1 = insertelement <2 x float> %ret0, float %val2, i32 1
+ store <2 x float> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_f32_ole(float %a, float %p, float %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f32_ole:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_le_f32_e32 vcc_lo, 2.0, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = fcmp ole float 2.0, %a
+ %val1 = select i1 %vcc, float 0.0, float %p
+ %val2 = select i1 %vcc, float 0.0, float %q
+ %ret0 = insertelement <2 x float> poison, float %val1, i32 0
+ %ret1 = insertelement <2 x float> %ret0, float %val2, i32 1
+ store <2 x float> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_f32_ogt(float %a, float %p, float %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f32_ogt:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_gt_f32_e32 vcc_lo, 2.0, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = fcmp ogt float 2.0, %a
+ %val1 = select i1 %vcc, float 0.0, float %p
+ %val2 = select i1 %vcc, float 0.0, float %q
+ %ret0 = insertelement <2 x float> poison, float %val1, i32 0
+ %ret1 = insertelement <2 x float> %ret0, float %val2, i32 1
+ store <2 x float> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_f32_olt(float %a, float %p, float %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f32_olt:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_lt_f32_e32 vcc_lo, 2.0, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = fcmp olt float 2.0, %a
+ %val1 = select i1 %vcc, float 0.0, float %p
+ %val2 = select i1 %vcc, float 0.0, float %q
+ %ret0 = insertelement <2 x float> poison, float %val1, i32 0
+ %ret1 = insertelement <2 x float> %ret0, float %val2, i32 1
+ store <2 x float> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+; Tests for f64
+define amdgpu_cs void @test_f64_oeq(double %a, double %p, double %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f64_oeq:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_eq_f64_e32 vcc_lo, 2.0, v[0:1]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT: global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = fcmp oeq double 2.0, %a
+ %val1 = select i1 %vcc, double 0.0, double %p
+ %val2 = select i1 %vcc, double 0.0, double %q
+ %ret0 = insertelement <2 x double> poison, double %val1, i32 0
+ %ret1 = insertelement <2 x double> %ret0, double %val2, i32 1
+ store <2 x double> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_f64_one(double %a, double %p, double %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f64_one:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_lg_f64_e32 vcc_lo, 2.0, v[0:1]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT: global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = fcmp one double 2.0, %a
+ %val1 = select i1 %vcc, double 0.0, double %p
+ %val2 = select i1 %vcc, double 0.0, double %q
+ %ret0 = insertelement <2 x double> poison, double %val1, i32 0
+ %ret1 = insertelement <2 x double> %ret0, double %val2, i32 1
+ store <2 x double> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_f64_oge(double %a, double %p, double %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f64_oge:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_ge_f64_e32 vcc_lo, 2.0, v[0:1]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT: global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = fcmp oge double 2.0, %a
+ %val1 = select i1 %vcc, double 0.0, double %p
+ %val2 = select i1 %vcc, double 0.0, double %q
+ %ret0 = insertelement <2 x double> poison, double %val1, i32 0
+ %ret1 = insertelement <2 x double> %ret0, double %val2, i32 1
+ store <2 x double> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_f64_ole(double %a, double %p, double %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f64_ole:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_le_f64_e32 vcc_lo, 2.0, v[0:1]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT: global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = fcmp ole double 2.0, %a
+ %val1 = select i1 %vcc, double 0.0, double %p
+ %val2 = select i1 %vcc, double 0.0, double %q
+ %ret0 = insertelement <2 x double> poison, double %val1, i32 0
+ %ret1 = insertelement <2 x double> %ret0, double %val2, i32 1
+ store <2 x double> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_f64_ogt(double %a, double %p, double %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f64_ogt:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_gt_f64_e32 vcc_lo, 2.0, v[0:1]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT: global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = fcmp ogt double 2.0, %a
+ %val1 = select i1 %vcc, double 0.0, double %p
+ %val2 = select i1 %vcc, double 0.0, double %q
+ %ret0 = insertelement <2 x double> poison, double %val1, i32 0
+ %ret1 = insertelement <2 x double> %ret0, double %val2, i32 1
+ store <2 x double> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_f64_olt(double %a, double %p, double %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f64_olt:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_lt_f64_e32 vcc_lo, 2.0, v[0:1]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT: global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = fcmp olt double 2.0, %a
+ %val1 = select i1 %vcc, double 0.0, double %p
+ %val2 = select i1 %vcc, double 0.0, double %q
+ %ret0 = insertelement <2 x double> poison, double %val1, i32 0
+ %ret1 = insertelement <2 x double> %ret0, double %val2, i32 1
+ store <2 x double> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_f64_ord(double %a, double %p, double %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f64_ord:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_o_f64_e32 vcc_lo, v[0:1], v[0:1]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT: global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = fcmp ord double 2.0, %a
+ %val1 = select i1 %vcc, double 0.0, double %p
+ %val2 = select i1 %vcc, double 0.0, double %q
+ %ret0 = insertelement <2 x double> poison, double %val1, i32 0
+ %ret1 = insertelement <2 x double> %ret0, double %val2, i32 1
+ store <2 x double> %ret1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_cs void @test_f64_uno(double %a, double %p, double %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f64_uno:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT: global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT: s_endpgm
+.entry:
+ %vcc = fcmp uno double 2.0, %a
+ %val1 = select i1 %vcc, double 0.0, double %p
+ %val2 = select i1 %vcc, double 0.0, double %q
+ %ret0 = insertelement <2 x double> poison, double %val1, i32 0
+ %ret1 = insertelement <2 x double> %ret0, double %val2, i32 1
+ store <2 x double> %ret1, ptr addrspace(1) %out
+ ret void
+}
>From 91f5ad7bd6b815e163815946172e5b51dd7b65c3 Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Fri, 30 May 2025 14:47:02 +0200
Subject: [PATCH 2/3] [AMDGPU] Swap select operands to allow later v_cndmask
shrinking into vop2
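
For a compare whose users are all selects, count how many of those selects
hold a constant in the true operand (these benefit from the swap) against
how many hold one in the false operand (these do not). If the former
strictly outnumber the latter, invert the compare predicate once and swap
the operands of every select user. A sketch of the vote, on IR shaped like
the test_mixed case from the new test file:

  %cc = icmp eq i32 %a, -1
  %v1 = select i1 %cc, i32 0, i32 %p   ; constant in true operand:  +1
  %v2 = select i1 %cc, i32 %q, i32 0   ; constant in false operand: -1
  %v3 = select i1 %cc, i32 0, i32 %r   ; +1, net +1, so swap

The compare becomes icmp ne i32 %a, -1 and all three selects have their
operands exchanged, leaving two of the three constants in the false
position instead of one.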
---
.../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 34 +
.../GlobalISel/divergence-structurizer.ll | 10 +-
llvm/test/CodeGen/AMDGPU/ctlz.ll | 20 +-
llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 16 +-
llvm/test/CodeGen/AMDGPU/cttz.ll | 20 +-
llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 12 +-
llvm/test/CodeGen/AMDGPU/div_i128.ll | 182 ++---
llvm/test/CodeGen/AMDGPU/div_v2i128.ll | 754 +++++++++---------
.../issue130120-eliminate-frame-index.ll | 34 +-
llvm/test/CodeGen/AMDGPU/rem_i128.ll | 124 ++-
.../AMDGPU/select-fabs-fneg-extract.v2f16.ll | 180 ++---
llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll | 95 +--
12 files changed, 718 insertions(+), 763 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 52177a2523bcb..7f17132be12aa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -1775,6 +1775,40 @@ bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
UA.isUniform(&I))
Changed |= promoteUniformOpToI32(I);
+  // Check whether the operands of the select users should be swapped
+  // so that the resulting v_cndmask instructions can later be shrunk
+  // into VOP2 form.
+ int ShouldSwap = 0;
+  for (User *U : I.users()) {
+    auto *SelectI = dyn_cast<SelectInst>(U);
+    if (!SelectI)
+      return Changed;
+
+    if (isa<Constant>(SelectI->getOperand(1)) &&
+        !isa<Constant>(SelectI->getOperand(2)))
+      ShouldSwap++;
+    else if (!isa<Constant>(SelectI->getOperand(1)) &&
+             isa<Constant>(SelectI->getOperand(2)))
+      ShouldSwap--;
+  }
+
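+  // Only rewrite when strictly more select users benefit from the
+  // swap than are hurt by it.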
+ if (ShouldSwap <= 0)
+ return Changed;
+
+ I.setPredicate(I.getInverseCmpPredicate());
+
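+  // The compare predicate was inverted above, so swap the true and
+  // false operands of every select user to preserve semantics.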
+  for (User *U : I.users()) {
+    auto *SelectI = cast<SelectInst>(U);
+    Value *TrueVal = SelectI->getOperand(1);
+    SelectI->setOperand(1, SelectI->getOperand(2));
+    SelectI->setOperand(2, TrueVal);
+  }
+
return Changed;
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
index e31077dd1986f..71f4bfaab77c6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
@@ -547,14 +547,16 @@ define amdgpu_ps i32 @irreducible_cfg(i32 %x, i32 %y, i32 %a0, i32 %a1, i32 %a2,
; GFX10-NEXT: s_xor_b32 s5, exec_lo, s5
; GFX10-NEXT: ; %bb.3: ; %.loopexit
; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
-; GFX10-NEXT: v_cmp_gt_i32_e64 s0, v5, v0
+; GFX10-NEXT: v_cmp_le_i32_e64 s0, v5, v0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_mov_b32 s7, exec_lo
; GFX10-NEXT: s_xor_b32 s6, vcc_lo, s6
+; GFX10-NEXT: s_mov_b32 s8, exec_lo
+; GFX10-NEXT: s_xor_b32 s7, s0, s7
; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo
-; GFX10-NEXT: s_or_b32 s6, s0, s6
+; GFX10-NEXT: s_or_b32 s6, s7, s6
; GFX10-NEXT: s_and_b32 s0, exec_lo, s0
-; GFX10-NEXT: s_xor_b32 s6, s6, s7
+; GFX10-NEXT: s_xor_b32 s6, s6, s8
; GFX10-NEXT: s_andn2_b32 s4, s4, exec_lo
; GFX10-NEXT: s_and_b32 s6, exec_lo, s6
; GFX10-NEXT: s_or_b32 s3, s3, s0
@@ -588,7 +590,7 @@ define amdgpu_ps i32 @irreducible_cfg(i32 %x, i32 %y, i32 %a0, i32 %a1, i32 %a2,
; GFX10-NEXT: s_branch .LBB6_1
; GFX10-NEXT: .LBB6_8: ; %.exit
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v3, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v2, s2
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
.entry:
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 52c90817dddd1..e3cc8ee340f0c 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -1100,9 +1100,9 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
-; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo
+; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
@@ -1328,8 +1328,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0
-; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 32, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
+; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
;
@@ -1565,10 +1565,10 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
-; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
; GFX10-GISEL-NEXT: v_add_nc_u16 v1, 0xffe8, v1
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0xffff, vcc_lo
+; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, 0xffff, v1, vcc_lo
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
@@ -1676,10 +1676,10 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_ffbh_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-GISEL-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1
; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2
; GFX10-GISEL-NEXT: v_add_nc_u16 v2, v2, -16
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
+; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
;
@@ -1790,10 +1790,10 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
-; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
; GFX10-GISEL-NEXT: v_add_nc_u16 v1, 0xffe7, v1
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
+; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, 0x7f, v1, vcc_lo
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 9503ffbdb4104..5e24af3e71c1d 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -1543,8 +1543,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no
; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
-; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
+; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
@@ -1713,8 +1713,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v0
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v3, v3
-; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa vcc, v0, v1 src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-GISEL-NEXT: v_cmp_ne_u32_sdwa vcc, v0, v1 src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1898,8 +1898,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali
; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
-; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
+; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
@@ -2067,8 +2067,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1
; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
-; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
+; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index 7f83fc571bf29..d997904d81d54 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -951,9 +951,9 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v0
-; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo
+; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
@@ -1153,8 +1153,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0
; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0
-; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 32, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
+; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1357,9 +1357,9 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x100, v0
-; GFX10-GISEL-NEXT: v_cmp_eq_u32_sdwa s2, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-GISEL-NEXT: v_cmp_ne_u32_sdwa vcc_lo, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0xffff, s2
+; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, 0xffff, v1, vcc_lo
; GFX10-GISEL-NEXT: global_store_byte v2, v0, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1455,9 +1455,9 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1
-; GFX10-GISEL-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1
; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
+; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
%val = load i16, ptr addrspace(1) %valptr
@@ -1558,8 +1558,8 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x80, v0
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
-; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
+; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, 0x7f, v1, vcc_lo
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 97bcd8b5ee68a..c1abc3002a990 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -1168,8 +1168,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
; GFX9-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1
; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1
; GFX9-GISEL-NEXT: v_min_u32_e32 v2, 32, v2
-; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
+; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc
; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %arrayidx, align 1
@@ -1502,8 +1502,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_or_b32_e32 v3, 0x100, v1
; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3
-; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-GISEL-NEXT: v_cmp_ne_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%val = load i8, ptr addrspace(1) %arrayidx, align 1
@@ -1598,8 +1598,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
; GFX9-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1
; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v2
-; GFX9-GISEL-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-GISEL-NEXT: v_cmp_ne_u16_e32 vcc, 0, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%val = load i16, ptr addrspace(1) %arrayidx, align 1
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 06c0417211809..7c525c0a66070 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -24,17 +24,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, 0, v6, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, 0, v7, vcc
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7]
-; GFX9-NEXT: v_ashrrev_i32_e32 v17, 31, v7
+; GFX9-NEXT: v_ffbh_u32_e32 v13, v9
; GFX9-NEXT: v_cndmask_b32_e32 v20, v5, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v21, v4, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
; GFX9-NEXT: v_or_b32_e32 v3, v20, v1
; GFX9-NEXT: v_or_b32_e32 v2, v21, v0
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; GFX9-NEXT: v_or_b32_e32 v3, v9, v11
-; GFX9-NEXT: v_or_b32_e32 v2, v8, v10
-; GFX9-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
+; GFX9-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[2:3]
; GFX9-NEXT: v_ffbh_u32_e32 v2, v0
; GFX9-NEXT: v_add_u32_e32 v2, 32, v2
; GFX9-NEXT: v_ffbh_u32_e32 v3, v1
@@ -43,52 +40,49 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_add_u32_e32 v3, 32, v3
; GFX9-NEXT: v_ffbh_u32_e32 v4, v20
; GFX9-NEXT: v_min_u32_e32 v3, v3, v4
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 64, v3
; GFX9-NEXT: v_addc_co_u32_e64 v4, s[6:7], 0, 0, vcc
; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX9-NEXT: v_ffbh_u32_e32 v6, v11
+; GFX9-NEXT: v_ffbh_u32_e32 v5, v11
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX9-NEXT: v_ffbh_u32_e32 v3, v10
; GFX9-NEXT: v_add_u32_e32 v3, 32, v3
-; GFX9-NEXT: v_min_u32_e32 v3, v3, v6
-; GFX9-NEXT: v_ffbh_u32_e32 v6, v8
-; GFX9-NEXT: v_add_u32_e32 v6, 32, v6
-; GFX9-NEXT: v_ffbh_u32_e32 v7, v9
-; GFX9-NEXT: v_min_u32_e32 v6, v6, v7
+; GFX9-NEXT: v_min_u32_e32 v3, v3, v5
+; GFX9-NEXT: v_ffbh_u32_e32 v5, v8
+; GFX9-NEXT: v_add_u32_e32 v5, 32, v5
+; GFX9-NEXT: v_min_u32_e32 v5, v5, v13
; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 64, v6
-; GFX9-NEXT: v_addc_co_u32_e64 v7, s[6:7], 0, 0, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 64, v5
+; GFX9-NEXT: v_addc_co_u32_e64 v13, s[6:7], 0, 0, vcc
; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, 0, vcc
+; GFX9-NEXT: v_mov_b32_e32 v12, 0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, 0, vcc
; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3
-; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v7, vcc
-; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v18, v16
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v19, v17
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
-; GFX9-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v13, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, 0, v12, vcc
+; GFX9-NEXT: v_ashrrev_i32_e32 v17, 31, v7
+; GFX9-NEXT: v_or_b32_e32 v7, v9, v11
+; GFX9-NEXT: v_or_b32_e32 v6, v8, v10
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, 0, v12, vcc
+; GFX9-NEXT: s_mov_b64 s[6:7], 0x80
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
+; GFX9-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[4:5]
+; GFX9-NEXT: v_cmp_ne_u64_e64 s[8:9], 0, v[6:7]
; GFX9-NEXT: v_xor_b32_e32 v6, 0x7f, v2
; GFX9-NEXT: v_or_b32_e32 v7, v3, v5
; GFX9-NEXT: v_or_b32_e32 v6, v6, v4
-; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], vcc
+; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v18, v16
+; GFX9-NEXT: v_mov_b32_e32 v19, v17
; GFX9-NEXT: v_cndmask_b32_e64 v13, v11, 0, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v12, v10, 0, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, 0, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, 0, s[4:5]
-; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc
+; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB0_6
; GFX9-NEXT: ; %bb.1: ; %udiv-bb1
@@ -1241,10 +1235,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v17, vcc
; GFX9-G-NEXT: v_or_b32_e32 v0, v18, v4
; GFX9-G-NEXT: v_or_b32_e32 v1, v19, v5
-; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX9-G-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX9-G-NEXT: v_or_b32_e32 v0, v8, v10
; GFX9-G-NEXT: v_or_b32_e32 v1, v9, v11
-; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
+; GFX9-G-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[0:1]
; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v18
; GFX9-G-NEXT: v_ffbh_u32_e32 v0, v19
; GFX9-G-NEXT: v_add_u32_e32 v1, 32, v1
@@ -1273,10 +1267,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-NEXT: v_subb_co_u32_e64 v2, s[6:7], 0, 0, s[6:7]
; GFX9-G-NEXT: v_mov_b32_e32 v7, 0
; GFX9-G-NEXT: v_subb_co_u32_e64 v3, s[6:7], 0, 0, s[6:7]
-; GFX9-G-NEXT: v_cmp_gt_u64_e64 s[6:7], v[0:1], v[6:7]
-; GFX9-G-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-G-NEXT: v_cmp_le_u64_e64 s[6:7], v[0:1], v[6:7]
+; GFX9-G-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX9-G-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7]
-; GFX9-G-NEXT: v_cmp_lt_u64_e64 s[6:7], 0, v[2:3]
+; GFX9-G-NEXT: v_cmp_ge_u64_e64 s[6:7], 0, v[2:3]
; GFX9-G-NEXT: v_or_b32_e32 v15, v1, v3
; GFX9-G-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[6:7]
; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3]
@@ -1292,7 +1286,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-NEXT: v_cndmask_b32_e64 v7, v9, 0, vcc
; GFX9-G-NEXT: v_cndmask_b32_e64 v12, v10, 0, vcc
; GFX9-G-NEXT: v_cndmask_b32_e64 v13, v11, 0, vcc
-; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
+; GFX9-G-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; GFX9-G-NEXT: v_xor_b32_e32 v20, 1, v20
; GFX9-G-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GFX9-G-NEXT: v_or_b32_e32 v14, v20, v14
; GFX9-G-NEXT: v_and_b32_e32 v14, 1, v14
@@ -2307,64 +2302,58 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-LABEL: v_udiv_i128_vv:
; GFX9: ; %bb.0: ; %_udiv-special-cases
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_or_b32_e32 v9, v5, v7
-; GFX9-NEXT: v_or_b32_e32 v8, v4, v6
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; GFX9-NEXT: v_or_b32_e32 v9, v1, v3
-; GFX9-NEXT: v_or_b32_e32 v8, v0, v2
-; GFX9-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9]
-; GFX9-NEXT: v_ffbh_u32_e32 v8, v6
-; GFX9-NEXT: v_add_u32_e32 v8, 32, v8
-; GFX9-NEXT: v_ffbh_u32_e32 v9, v7
-; GFX9-NEXT: v_min_u32_e32 v8, v8, v9
-; GFX9-NEXT: v_ffbh_u32_e32 v9, v4
-; GFX9-NEXT: v_add_u32_e32 v9, 32, v9
-; GFX9-NEXT: v_ffbh_u32_e32 v10, v5
-; GFX9-NEXT: v_min_u32_e32 v9, v9, v10
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 64, v9
-; GFX9-NEXT: v_addc_co_u32_e64 v10, s[6:7], 0, 0, vcc
-; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; GFX9-NEXT: v_ffbh_u32_e32 v11, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
-; GFX9-NEXT: v_ffbh_u32_e32 v9, v2
-; GFX9-NEXT: v_add_u32_e32 v9, 32, v9
-; GFX9-NEXT: v_min_u32_e32 v9, v9, v11
-; GFX9-NEXT: v_ffbh_u32_e32 v11, v0
+; GFX9-NEXT: v_ffbh_u32_e32 v10, v6
+; GFX9-NEXT: v_add_u32_e32 v10, 32, v10
+; GFX9-NEXT: v_ffbh_u32_e32 v11, v7
+; GFX9-NEXT: v_min_u32_e32 v10, v10, v11
+; GFX9-NEXT: v_ffbh_u32_e32 v11, v4
; GFX9-NEXT: v_add_u32_e32 v11, 32, v11
-; GFX9-NEXT: v_ffbh_u32_e32 v12, v1
+; GFX9-NEXT: v_ffbh_u32_e32 v12, v5
; GFX9-NEXT: v_min_u32_e32 v11, v11, v12
-; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc
; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, 64, v11
; GFX9-NEXT: v_addc_co_u32_e64 v12, s[6:7], 0, 0, vcc
-; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc
+; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX9-NEXT: v_ffbh_u32_e32 v14, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc
+; GFX9-NEXT: v_ffbh_u32_e32 v11, v2
; GFX9-NEXT: v_cndmask_b32_e64 v13, v12, 0, vcc
-; GFX9-NEXT: v_sub_co_u32_e32 v12, vcc, v8, v9
-; GFX9-NEXT: v_subb_co_u32_e32 v13, vcc, v10, v13, vcc
-; GFX9-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, 0, v8, vcc
-; GFX9-NEXT: v_subb_co_u32_e32 v15, vcc, 0, v8, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[12:13]
-; GFX9-NEXT: v_or_b32_e32 v10, v13, v15
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15]
-; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
-; GFX9-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX9-NEXT: v_xor_b32_e32 v9, 0x7f, v12
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
-; GFX9-NEXT: v_or_b32_e32 v9, v9, v14
-; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10]
-; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; GFX9-NEXT: v_add_u32_e32 v11, 32, v11
+; GFX9-NEXT: v_ffbh_u32_e32 v12, v3
+; GFX9-NEXT: v_min_u32_e32 v11, v11, v12
+; GFX9-NEXT: v_ffbh_u32_e32 v12, v0
+; GFX9-NEXT: v_add_u32_e32 v12, 32, v12
+; GFX9-NEXT: v_min_u32_e32 v12, v12, v14
+; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 64, v12
+; GFX9-NEXT: v_addc_co_u32_e64 v14, s[6:7], 0, 0, vcc
+; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX9-NEXT: v_or_b32_e32 v9, v5, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, 0, vcc
+; GFX9-NEXT: v_sub_co_u32_e32 v12, vcc, v10, v11
+; GFX9-NEXT: v_subb_co_u32_e32 v13, vcc, v13, v14, vcc
+; GFX9-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-NEXT: v_or_b32_e32 v8, v4, v6
+; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, 0, v10, vcc
+; GFX9-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[8:9]
+; GFX9-NEXT: v_or_b32_e32 v9, v1, v3
+; GFX9-NEXT: v_or_b32_e32 v8, v0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v15, vcc, 0, v10, vcc
+; GFX9-NEXT: s_mov_b64 s[6:7], 0x80
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[12:13]
+; GFX9-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
+; GFX9-NEXT: v_cmp_ne_u64_e64 s[8:9], 0, v[8:9]
+; GFX9-NEXT: v_xor_b32_e32 v8, 0x7f, v12
+; GFX9-NEXT: v_or_b32_e32 v9, v13, v15
+; GFX9-NEXT: v_or_b32_e32 v8, v8, v14
+; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], vcc
+; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, 0, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, 0, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v10, v1, 0, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v11, v0, 0, s[4:5]
-; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc
+; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB1_6
; GFX9-NEXT: ; %bb.1: ; %udiv-bb1
@@ -3368,10 +3357,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-G-NEXT: v_or_b32_e32 v8, v4, v6
; GFX9-G-NEXT: v_or_b32_e32 v9, v5, v7
-; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX9-G-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
; GFX9-G-NEXT: v_or_b32_e32 v8, v0, v2
; GFX9-G-NEXT: v_or_b32_e32 v9, v1, v3
-; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9]
+; GFX9-G-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[8:9]
; GFX9-G-NEXT: v_ffbh_u32_e32 v9, v4
; GFX9-G-NEXT: v_ffbh_u32_e32 v8, v5
; GFX9-G-NEXT: v_add_u32_e32 v9, 32, v9
@@ -3400,10 +3389,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-NEXT: v_subb_co_u32_e64 v14, s[6:7], 0, 0, s[6:7]
; GFX9-G-NEXT: v_mov_b32_e32 v9, 0
; GFX9-G-NEXT: v_subb_co_u32_e64 v15, s[6:7], 0, 0, s[6:7]
-; GFX9-G-NEXT: v_cmp_gt_u64_e64 s[6:7], v[12:13], v[8:9]
-; GFX9-G-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-G-NEXT: v_cmp_le_u64_e64 s[6:7], v[12:13], v[8:9]
+; GFX9-G-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX9-G-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[6:7]
-; GFX9-G-NEXT: v_cmp_lt_u64_e64 s[6:7], 0, v[14:15]
+; GFX9-G-NEXT: v_cmp_ge_u64_e64 s[6:7], 0, v[14:15]
; GFX9-G-NEXT: v_or_b32_e32 v17, v13, v15
; GFX9-G-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[6:7]
; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
@@ -3419,7 +3408,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-NEXT: v_cndmask_b32_e64 v11, v1, 0, vcc
; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc
; GFX9-G-NEXT: v_cndmask_b32_e64 v9, v3, 0, vcc
-; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; GFX9-G-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; GFX9-G-NEXT: v_xor_b32_e32 v18, 1, v18
; GFX9-G-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; GFX9-G-NEXT: v_or_b32_e32 v16, v18, v16
; GFX9-G-NEXT: v_and_b32_e32 v16, 1, v16
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 77b78f1f8a333..7e373f08f7c85 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -7,100 +7,94 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0
-; SDAG-NEXT: v_mov_b32_e32 v18, 0
+; SDAG-NEXT: v_mov_b32_e32 v17, 0
; SDAG-NEXT: v_ashrrev_i32_e32 v24, 31, v3
; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v11
-; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
-; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc
+; SDAG-NEXT: s_mov_b64 s[10:11], 0x80
+; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v1, vcc
; SDAG-NEXT: v_mov_b32_e32 v26, v24
; SDAG-NEXT: v_mov_b32_e32 v27, v25
; SDAG-NEXT: v_subb_u32_e32 v19, vcc, 0, v2, vcc
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e64 v21, v1, v17, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, v1, v18, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v20, v0, v16, s[4:5]
; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v3, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v16, v2, v19, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v2, v19, s[4:5]
; SDAG-NEXT: v_ffbh_u32_e32 v1, v20
; SDAG-NEXT: v_ffbh_u32_e32 v2, v21
-; SDAG-NEXT: v_cndmask_b32_e64 v17, v3, v0, s[4:5]
-; SDAG-NEXT: v_or_b32_e32 v0, v20, v16
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v3, v0, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v0, v20, v18
; SDAG-NEXT: v_sub_i32_e32 v3, vcc, 0, v8
-; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v1
-; SDAG-NEXT: v_ffbh_u32_e32 v22, v16
-; SDAG-NEXT: v_or_b32_e32 v1, v21, v17
+; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], 32, v1
+; SDAG-NEXT: v_ffbh_u32_e32 v22, v18
+; SDAG-NEXT: v_or_b32_e32 v1, v21, v19
; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc
-; SDAG-NEXT: v_min_u32_e32 v2, v19, v2
-; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v22
-; SDAG-NEXT: v_ffbh_u32_e32 v22, v17
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
+; SDAG-NEXT: v_min_u32_e32 v2, v16, v2
+; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], 32, v22
+; SDAG-NEXT: v_ffbh_u32_e32 v22, v19
+; SDAG-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[0:1]
; SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11]
; SDAG-NEXT: v_cndmask_b32_e64 v28, v9, v23, s[6:7]
; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v10, vcc
; SDAG-NEXT: v_cndmask_b32_e64 v29, v8, v3, s[6:7]
-; SDAG-NEXT: v_min_u32_e32 v1, v19, v22
+; SDAG-NEXT: v_min_u32_e32 v1, v16, v22
; SDAG-NEXT: v_add_i32_e64 v2, s[8:9], 64, v2
; SDAG-NEXT: v_addc_u32_e64 v3, s[8:9], 0, 0, s[8:9]
; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v11, vcc
; SDAG-NEXT: v_cndmask_b32_e64 v0, v10, v0, s[6:7]
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
; SDAG-NEXT: v_cndmask_b32_e64 v9, v3, 0, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v10, v2, v1, vcc
; SDAG-NEXT: v_ffbh_u32_e32 v3, v29
-; SDAG-NEXT: v_ffbh_u32_e32 v19, v28
+; SDAG-NEXT: v_ffbh_u32_e32 v16, v28
; SDAG-NEXT: v_cndmask_b32_e64 v1, v11, v8, s[6:7]
; SDAG-NEXT: v_or_b32_e32 v2, v29, v0
; SDAG-NEXT: v_add_i32_e32 v8, vcc, 32, v3
; SDAG-NEXT: v_ffbh_u32_e32 v11, v0
; SDAG-NEXT: v_or_b32_e32 v3, v28, v1
-; SDAG-NEXT: v_min_u32_e32 v8, v8, v19
+; SDAG-NEXT: v_min_u32_e32 v8, v8, v16
; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v11
-; SDAG-NEXT: v_ffbh_u32_e32 v19, v1
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; SDAG-NEXT: v_min_u32_e32 v2, v11, v19
+; SDAG-NEXT: v_ffbh_u32_e32 v16, v1
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT: v_min_u32_e32 v2, v11, v16
; SDAG-NEXT: v_add_i32_e64 v3, s[6:7], 64, v8
; SDAG-NEXT: v_addc_u32_e64 v8, s[6:7], 0, 0, s[6:7]
; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1]
; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, 0, s[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[6:7]
-; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
+; SDAG-NEXT: s_and_b64 s[8:9], vcc, s[4:5]
; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v8, v9, vcc
; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v2
-; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v18, vcc
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v18, vcc
+; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v17, vcc
+; SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], s[10:11], v[2:3]
+; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v17, vcc
; SDAG-NEXT: v_or_b32_e32 v8, v8, v10
; SDAG-NEXT: v_or_b32_e32 v9, v3, v11
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v18, v19, s[4:5]
-; SDAG-NEXT: v_and_b32_e32 v8, 1, v8
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8
-; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, v17, 0, s[4:5]
-; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; SDAG-NEXT: v_cndmask_b32_e64 v22, v16, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v19, v21, 0, s[4:5]
-; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[8:9]
+; SDAG-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v19, 0, s[4:5]
+; SDAG-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v22, v18, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v23, v20, 0, s[4:5]
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9]
; SDAG-NEXT: s_cbranch_execz .LBB0_6
; SDAG-NEXT: ; %bb.1: ; %udiv-bb15
; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v2
-; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v2
+; SDAG-NEXT: v_sub_i32_e64 v16, s[4:5], 63, v2
; SDAG-NEXT: v_mov_b32_e32 v8, 0
; SDAG-NEXT: v_mov_b32_e32 v9, 0
; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v3, vcc
-; SDAG-NEXT: v_lshl_b64 v[18:19], v[20:21], v18
+; SDAG-NEXT: v_lshl_b64 v[16:17], v[20:21], v16
; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v10, vcc
; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc
; SDAG-NEXT: v_or_b32_e32 v10, v30, v32
; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v2
; SDAG-NEXT: v_or_b32_e32 v11, v31, v33
-; SDAG-NEXT: v_lshl_b64 v[2:3], v[16:17], v34
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[18:19], v34
; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34
; SDAG-NEXT: v_lshl_b64 v[22:23], v[20:21], v34
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
@@ -108,13 +102,13 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v3, v3, v11
; SDAG-NEXT: v_or_b32_e32 v2, v2, v10
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34
-; SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, v23, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, v22, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v16, v2, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v17, 0, v23, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, v22, s[4:5]
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34
-; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v17, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v16, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v10, 0
; SDAG-NEXT: v_mov_b32_e32 v11, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -123,18 +117,18 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4
; SDAG-NEXT: v_lshr_b64 v[8:9], v[20:21], v30
; SDAG-NEXT: v_sub_i32_e32 v10, vcc, 64, v30
-; SDAG-NEXT: v_lshl_b64 v[10:11], v[16:17], v10
+; SDAG-NEXT: v_lshl_b64 v[10:11], v[18:19], v10
; SDAG-NEXT: v_or_b32_e32 v11, v9, v11
; SDAG-NEXT: v_or_b32_e32 v10, v8, v10
; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30
; SDAG-NEXT: v_subrev_i32_e64 v8, s[4:5], 64, v30
-; SDAG-NEXT: v_lshr_b64 v[8:9], v[16:17], v8
+; SDAG-NEXT: v_lshr_b64 v[8:9], v[18:19], v8
; SDAG-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30
; SDAG-NEXT: v_cndmask_b32_e64 v21, v9, v21, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
; SDAG-NEXT: v_cndmask_b32_e64 v20, v8, v20, s[4:5]
-; SDAG-NEXT: v_lshr_b64 v[8:9], v[16:17], v30
+; SDAG-NEXT: v_lshr_b64 v[8:9], v[18:19], v30
; SDAG-NEXT: v_cndmask_b32_e32 v23, 0, v9, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v22, 0, v8, vcc
; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v29
@@ -143,27 +137,27 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc
; SDAG-NEXT: v_mov_b32_e32 v10, 0
; SDAG-NEXT: v_mov_b32_e32 v11, 0
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: v_mov_b32_e32 v17, 0
+; SDAG-NEXT: v_mov_b32_e32 v18, 0
+; SDAG-NEXT: v_mov_b32_e32 v19, 0
; SDAG-NEXT: s_mov_b64 s[4:5], 0
; SDAG-NEXT: v_mov_b32_e32 v9, 0
; SDAG-NEXT: .LBB0_3: ; %udiv-do-while3
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
-; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v19
-; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v17
+; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v21
; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v3
; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; SDAG-NEXT: v_or_b32_e32 v19, v17, v19
-; SDAG-NEXT: v_or_b32_e32 v18, v16, v18
-; SDAG-NEXT: v_or_b32_e32 v16, v22, v38
-; SDAG-NEXT: v_or_b32_e32 v17, v20, v39
+; SDAG-NEXT: v_or_b32_e32 v17, v19, v17
+; SDAG-NEXT: v_or_b32_e32 v16, v18, v16
+; SDAG-NEXT: v_or_b32_e32 v18, v22, v38
+; SDAG-NEXT: v_or_b32_e32 v19, v20, v39
; SDAG-NEXT: v_or_b32_e32 v2, v2, v8
-; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v34, v17
+; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v34, v19
; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v35, v21, vcc
-; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v36, v16, vcc
+; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v36, v18, vcc
; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v23, vcc
; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8
; SDAG-NEXT: v_and_b32_e32 v20, v8, v29
@@ -171,22 +165,22 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_and_b32_e32 v38, v8, v0
; SDAG-NEXT: v_and_b32_e32 v39, v8, v1
; SDAG-NEXT: v_and_b32_e32 v8, 1, v8
-; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v17, v20
+; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v19, v20
; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v21, v22, vcc
-; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v16, v38, vcc
+; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v18, v38, vcc
; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v23, v39, vcc
; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30
; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc
; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc
; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc
-; SDAG-NEXT: v_or_b32_e32 v16, v30, v32
-; SDAG-NEXT: v_or_b32_e32 v17, v31, v33
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT: v_or_b32_e32 v18, v30, v32
+; SDAG-NEXT: v_or_b32_e32 v19, v31, v33
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19]
; SDAG-NEXT: v_or_b32_e32 v3, v11, v3
; SDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; SDAG-NEXT: v_or_b32_e32 v2, v10, v2
-; SDAG-NEXT: v_mov_b32_e32 v17, v9
-; SDAG-NEXT: v_mov_b32_e32 v16, v8
+; SDAG-NEXT: v_mov_b32_e32 v19, v9
+; SDAG-NEXT: v_mov_b32_e32 v18, v8
; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5]
; SDAG-NEXT: s_cbranch_execnz .LBB0_3
; SDAG-NEXT: ; %bb.4: ; %Flow13
@@ -194,184 +188,178 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: .LBB0_5: ; %Flow14
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v19
-; SDAG-NEXT: v_lshl_b64 v[2:3], v[18:19], 1
-; SDAG-NEXT: v_or_b32_e32 v0, v0, v16
-; SDAG-NEXT: v_or_b32_e32 v18, v11, v1
-; SDAG-NEXT: v_or_b32_e32 v19, v9, v3
+; SDAG-NEXT: v_lshrrev_b32_e32 v18, 31, v17
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[16:17], 1
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v18
+; SDAG-NEXT: v_or_b32_e32 v16, v11, v1
+; SDAG-NEXT: v_or_b32_e32 v17, v9, v3
; SDAG-NEXT: v_or_b32_e32 v22, v10, v0
; SDAG-NEXT: v_or_b32_e32 v23, v8, v2
; SDAG-NEXT: .LBB0_6: ; %Flow16
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
-; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v7
-; SDAG-NEXT: v_ashrrev_i32_e32 v17, 31, v15
+; SDAG-NEXT: v_ashrrev_i32_e32 v18, 31, v7
+; SDAG-NEXT: v_ashrrev_i32_e32 v19, 31, v15
; SDAG-NEXT: v_sub_i32_e32 v0, vcc, 0, v4
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
-; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
-; SDAG-NEXT: v_mov_b32_e32 v20, v16
-; SDAG-NEXT: v_mov_b32_e32 v21, v17
+; SDAG-NEXT: v_mov_b32_e32 v9, 0
+; SDAG-NEXT: s_mov_b64 s[10:11], 0x80
+; SDAG-NEXT: v_mov_b32_e32 v20, v18
+; SDAG-NEXT: v_mov_b32_e32 v21, v19
; SDAG-NEXT: v_subb_u32_e32 v1, vcc, 0, v5, vcc
-; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v6, vcc
+; SDAG-NEXT: v_subb_u32_e32 v2, vcc, 0, v6, vcc
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
-; SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v1, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v4, v4, v0, s[4:5]
; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v7, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5]
-; SDAG-NEXT: v_ffbh_u32_e32 v1, v2
-; SDAG-NEXT: v_ffbh_u32_e32 v4, v3
-; SDAG-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[4:5]
-; SDAG-NEXT: v_sub_i32_e32 v5, vcc, 0, v12
-; SDAG-NEXT: v_or_b32_e32 v0, v2, v6
-; SDAG-NEXT: v_ffbh_u32_e32 v9, v6
-; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], 32, v1
-; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v13, vcc
-; SDAG-NEXT: v_or_b32_e32 v1, v3, v7
-; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 32, v9
-; SDAG-NEXT: v_ffbh_u32_e32 v30, v7
-; SDAG-NEXT: v_min_u32_e32 v4, v10, v4
-; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v14, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v6, v2, s[4:5]
+; SDAG-NEXT: v_ffbh_u32_e32 v1, v4
+; SDAG-NEXT: v_ffbh_u32_e32 v2, v5
+; SDAG-NEXT: v_cndmask_b32_e64 v11, v7, v0, s[4:5]
+; SDAG-NEXT: v_sub_i32_e32 v3, vcc, 0, v12
+; SDAG-NEXT: v_or_b32_e32 v0, v4, v10
+; SDAG-NEXT: v_ffbh_u32_e32 v6, v10
+; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], 32, v1
+; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v13, vcc
+; SDAG-NEXT: v_or_b32_e32 v1, v5, v11
+; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], 32, v6
+; SDAG-NEXT: v_ffbh_u32_e32 v30, v11
+; SDAG-NEXT: v_min_u32_e32 v2, v7, v2
+; SDAG-NEXT: v_subb_u32_e32 v7, vcc, 0, v14, vcc
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15]
-; SDAG-NEXT: v_cndmask_b32_e64 v28, v13, v11, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v29, v12, v5, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[0:1]
-; SDAG-NEXT: v_min_u32_e32 v1, v9, v30
-; SDAG-NEXT: v_add_i32_e64 v4, s[8:9], 64, v4
-; SDAG-NEXT: v_addc_u32_e64 v5, s[8:9], 0, 0, s[8:9]
-; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v15, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v0, v14, v10, s[4:5]
-; SDAG-NEXT: v_ffbh_u32_e32 v10, v29
-; SDAG-NEXT: v_ffbh_u32_e32 v11, v28
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; SDAG-NEXT: v_cndmask_b32_e64 v12, v5, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v13, v4, v1, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v1, v15, v9, s[4:5]
-; SDAG-NEXT: v_or_b32_e32 v4, v29, v0
-; SDAG-NEXT: v_ffbh_u32_e32 v9, v0
-; SDAG-NEXT: v_add_i32_e32 v10, vcc, 32, v10
-; SDAG-NEXT: v_or_b32_e32 v5, v28, v1
-; SDAG-NEXT: v_add_i32_e32 v9, vcc, 32, v9
-; SDAG-NEXT: v_ffbh_u32_e32 v14, v1
-; SDAG-NEXT: v_min_u32_e32 v10, v10, v11
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5]
-; SDAG-NEXT: v_min_u32_e32 v4, v9, v14
-; SDAG-NEXT: v_add_i32_e64 v5, s[4:5], 64, v10
-; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
-; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v13
-; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v9, v12, vcc
-; SDAG-NEXT: v_xor_b32_e32 v9, 0x7f, v4
-; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v8, vcc
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc
-; SDAG-NEXT: v_or_b32_e32 v8, v9, v10
+; SDAG-NEXT: v_cndmask_b32_e64 v28, v13, v8, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v29, v12, v3, s[4:5]
+; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1]
+; SDAG-NEXT: v_min_u32_e32 v1, v6, v30
+; SDAG-NEXT: v_add_i32_e64 v2, s[8:9], 64, v2
+; SDAG-NEXT: v_addc_u32_e64 v3, s[8:9], 0, 0, s[8:9]
+; SDAG-NEXT: v_subb_u32_e32 v6, vcc, 0, v15, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v14, v7, s[4:5]
+; SDAG-NEXT: v_ffbh_u32_e32 v7, v29
+; SDAG-NEXT: v_ffbh_u32_e32 v8, v28
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; SDAG-NEXT: v_or_b32_e32 v9, v5, v11
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
-; SDAG-NEXT: v_and_b32_e32 v8, 1, v12
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8
+; SDAG-NEXT: v_cndmask_b32_e64 v12, v3, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v13, v2, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v15, v6, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v2, v29, v0
+; SDAG-NEXT: v_ffbh_u32_e32 v6, v0
+; SDAG-NEXT: v_add_i32_e32 v7, vcc, 32, v7
+; SDAG-NEXT: v_or_b32_e32 v3, v28, v1
+; SDAG-NEXT: v_add_i32_e32 v6, vcc, 32, v6
+; SDAG-NEXT: v_ffbh_u32_e32 v14, v1
+; SDAG-NEXT: v_min_u32_e32 v7, v7, v8
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT: v_min_u32_e32 v2, v6, v14
+; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], 64, v7
+; SDAG-NEXT: v_addc_u32_e64 v6, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[4:5]
+; SDAG-NEXT: s_and_b64 s[6:7], vcc, s[6:7]
+; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v13
+; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v6, v12, vcc
+; SDAG-NEXT: v_xor_b32_e32 v6, 0x7f, v2
+; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v9, vcc
+; SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], s[10:11], v[2:3]
+; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc
+; SDAG-NEXT: v_or_b32_e32 v6, v6, v8
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT: v_or_b32_e32 v7, v3, v9
+; SDAG-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v13, v7, 0, s[4:5]
-; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v6, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v14, v3, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v2, 0, s[4:5]
-; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v13, v11, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v7, v10, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v14, v5, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v6, v4, 0, s[4:5]
+; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB0_12
; SDAG-NEXT: ; %bb.7: ; %udiv-bb1
-; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v4
-; SDAG-NEXT: v_sub_i32_e64 v12, s[4:5], 63, v4
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
-; SDAG-NEXT: v_mov_b32_e32 v9, 0
-; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v5, vcc
-; SDAG-NEXT: v_lshl_b64 v[12:13], v[2:3], v12
-; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v10, vcc
-; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc
-; SDAG-NEXT: v_or_b32_e32 v10, v30, v32
-; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v4
-; SDAG-NEXT: v_or_b32_e32 v11, v31, v33
-; SDAG-NEXT: v_lshl_b64 v[4:5], v[6:7], v34
+; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v2
+; SDAG-NEXT: v_sub_i32_e64 v12, s[4:5], 63, v2
+; SDAG-NEXT: v_mov_b32_e32 v6, 0
+; SDAG-NEXT: v_mov_b32_e32 v7, 0
+; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v3, vcc
+; SDAG-NEXT: v_lshl_b64 v[12:13], v[4:5], v12
+; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v8, vcc
+; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v9, vcc
+; SDAG-NEXT: v_or_b32_e32 v8, v30, v32
+; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v2
+; SDAG-NEXT: v_or_b32_e32 v9, v31, v33
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[10:11], v34
; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34
-; SDAG-NEXT: v_lshl_b64 v[14:15], v[2:3], v34
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT: v_lshr_b64 v[10:11], v[2:3], v35
-; SDAG-NEXT: v_or_b32_e32 v5, v5, v11
-; SDAG-NEXT: v_or_b32_e32 v4, v4, v10
+; SDAG-NEXT: v_lshl_b64 v[14:15], v[4:5], v34
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT: v_lshr_b64 v[8:9], v[4:5], v35
+; SDAG-NEXT: v_or_b32_e32 v3, v3, v9
+; SDAG-NEXT: v_or_b32_e32 v2, v2, v8
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34
-; SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v15, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v14, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v13, v3, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v12, v2, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, v15, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v14, s[4:5]
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34
-; SDAG-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: v_mov_b32_e32 v13, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB0_11
; SDAG-NEXT: ; %bb.8: ; %udiv-preheader
-; SDAG-NEXT: v_lshr_b64 v[8:9], v[2:3], v30
+; SDAG-NEXT: v_lshr_b64 v[6:7], v[4:5], v30
; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v30
; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v30
-; SDAG-NEXT: v_lshr_b64 v[37:38], v[6:7], v30
+; SDAG-NEXT: v_lshr_b64 v[37:38], v[10:11], v30
; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v29
; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: v_mov_b32_e32 v13, 0
; SDAG-NEXT: v_mov_b32_e32 v14, 0
; SDAG-NEXT: v_mov_b32_e32 v15, 0
; SDAG-NEXT: s_mov_b64 s[10:11], 0
-; SDAG-NEXT: v_lshl_b64 v[48:49], v[6:7], v35
-; SDAG-NEXT: v_lshr_b64 v[6:7], v[6:7], v36
+; SDAG-NEXT: v_lshl_b64 v[48:49], v[10:11], v35
+; SDAG-NEXT: v_lshr_b64 v[10:11], v[10:11], v36
; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v28, vcc
-; SDAG-NEXT: v_or_b32_e32 v9, v9, v49
-; SDAG-NEXT: v_or_b32_e32 v8, v8, v48
+; SDAG-NEXT: v_or_b32_e32 v7, v7, v49
+; SDAG-NEXT: v_or_b32_e32 v6, v6, v48
; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v6, v8, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v38, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v37, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v7, v11, v7, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v38, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v37, s[4:5]
; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc
; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30
-; SDAG-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
-; SDAG-NEXT: v_mov_b32_e32 v9, 0
+; SDAG-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; SDAG-NEXT: v_mov_b32_e32 v7, 0
; SDAG-NEXT: .LBB0_9: ; %udiv-do-while
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
-; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v3
-; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v5
-; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v11
; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
-; SDAG-NEXT: v_or_b32_e32 v6, v6, v8
-; SDAG-NEXT: v_or_b32_e32 v2, v2, v38
-; SDAG-NEXT: v_or_b32_e32 v4, v4, v39
-; SDAG-NEXT: v_or_b32_e32 v5, v13, v5
-; SDAG-NEXT: v_or_b32_e32 v11, v15, v11
-; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v34, v2
-; SDAG-NEXT: v_or_b32_e32 v4, v12, v4
-; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v35, v3, vcc
-; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v36, v6, vcc
-; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v7, vcc
-; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8
-; SDAG-NEXT: v_and_b32_e32 v15, v8, v29
-; SDAG-NEXT: v_and_b32_e32 v38, v8, v28
-; SDAG-NEXT: v_and_b32_e32 v39, v8, v0
-; SDAG-NEXT: v_and_b32_e32 v48, v8, v1
-; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v15
-; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v38, vcc
-; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v39, vcc
-; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v48, vcc
+; SDAG-NEXT: v_lshrrev_b32_e32 v6, 31, v5
+; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v3
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v9
+; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; SDAG-NEXT: v_or_b32_e32 v6, v10, v6
+; SDAG-NEXT: v_or_b32_e32 v4, v4, v38
+; SDAG-NEXT: v_or_b32_e32 v2, v2, v39
+; SDAG-NEXT: v_or_b32_e32 v3, v13, v3
+; SDAG-NEXT: v_or_b32_e32 v9, v15, v9
+; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v34, v4
+; SDAG-NEXT: v_or_b32_e32 v2, v12, v2
+; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v35, v5, vcc
+; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v36, v6, vcc
+; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v37, v11, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v15, 31, v10
+; SDAG-NEXT: v_and_b32_e32 v10, v15, v29
+; SDAG-NEXT: v_and_b32_e32 v38, v15, v28
+; SDAG-NEXT: v_and_b32_e32 v39, v15, v0
+; SDAG-NEXT: v_and_b32_e32 v48, v15, v1
+; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v10
+; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v38, vcc
+; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v6, v39, vcc
+; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v48, vcc
; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30
; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc
; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc
@@ -379,47 +367,47 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v38, v30, v32
; SDAG-NEXT: v_or_b32_e32 v39, v31, v33
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39]
-; SDAG-NEXT: v_and_b32_e32 v8, 1, v8
+; SDAG-NEXT: v_and_b32_e32 v6, 1, v15
; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; SDAG-NEXT: v_or_b32_e32 v10, v14, v10
-; SDAG-NEXT: v_mov_b32_e32 v15, v9
-; SDAG-NEXT: v_mov_b32_e32 v14, v8
+; SDAG-NEXT: v_or_b32_e32 v8, v14, v8
+; SDAG-NEXT: v_mov_b32_e32 v15, v7
+; SDAG-NEXT: v_mov_b32_e32 v14, v6
; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
; SDAG-NEXT: s_cbranch_execnz .LBB0_9
; SDAG-NEXT: ; %bb.10: ; %Flow
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
; SDAG-NEXT: .LBB0_11: ; %Flow11
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
-; SDAG-NEXT: v_lshl_b64 v[0:1], v[4:5], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v11
-; SDAG-NEXT: v_lshl_b64 v[2:3], v[10:11], 1
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v9
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[8:9], 1
; SDAG-NEXT: v_or_b32_e32 v0, v0, v4
; SDAG-NEXT: v_or_b32_e32 v13, v13, v1
-; SDAG-NEXT: v_or_b32_e32 v14, v9, v3
-; SDAG-NEXT: v_or_b32_e32 v9, v12, v0
-; SDAG-NEXT: v_or_b32_e32 v8, v8, v2
+; SDAG-NEXT: v_or_b32_e32 v14, v7, v3
+; SDAG-NEXT: v_or_b32_e32 v7, v12, v0
+; SDAG-NEXT: v_or_b32_e32 v6, v6, v2
; SDAG-NEXT: .LBB0_12: ; %Flow12
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_xor_b32_e32 v3, v27, v26
; SDAG-NEXT: v_xor_b32_e32 v2, v25, v24
-; SDAG-NEXT: v_xor_b32_e32 v7, v21, v20
-; SDAG-NEXT: v_xor_b32_e32 v6, v17, v16
-; SDAG-NEXT: v_xor_b32_e32 v4, v18, v3
+; SDAG-NEXT: v_xor_b32_e32 v8, v21, v20
+; SDAG-NEXT: v_xor_b32_e32 v9, v19, v18
+; SDAG-NEXT: v_xor_b32_e32 v4, v16, v3
; SDAG-NEXT: v_xor_b32_e32 v5, v22, v2
-; SDAG-NEXT: v_xor_b32_e32 v1, v19, v3
+; SDAG-NEXT: v_xor_b32_e32 v1, v17, v3
; SDAG-NEXT: v_xor_b32_e32 v0, v23, v2
-; SDAG-NEXT: v_xor_b32_e32 v10, v13, v7
-; SDAG-NEXT: v_xor_b32_e32 v9, v9, v6
-; SDAG-NEXT: v_xor_b32_e32 v11, v14, v7
+; SDAG-NEXT: v_xor_b32_e32 v10, v13, v8
+; SDAG-NEXT: v_xor_b32_e32 v7, v7, v9
+; SDAG-NEXT: v_xor_b32_e32 v11, v14, v8
; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v5, v2, vcc
; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc
-; SDAG-NEXT: v_xor_b32_e32 v4, v8, v6
-; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v6
-; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v11, v7, vcc
-; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v9, v6, vcc
-; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v10, v7, vcc
+; SDAG-NEXT: v_xor_b32_e32 v4, v6, v9
+; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v9
+; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v11, v8, vcc
+; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v7, v9, vcc
+; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v10, v8, vcc
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: v_sdiv_v2i128_vv:
@@ -460,8 +448,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_ffbh_u32_e32 v29, v11
; GISEL-NEXT: v_ffbh_u32_e32 v30, v20
; GISEL-NEXT: v_ffbh_u32_e32 v31, v21
-; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[2:3]
; GISEL-NEXT: v_min_u32_e32 v0, v8, v9
; GISEL-NEXT: v_min_u32_e32 v1, v23, v22
; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 32, v28
@@ -470,7 +458,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_min_u32_e32 v3, v31, v3
; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0
; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 64, v1
-; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
@@ -481,25 +469,26 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5]
; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
; GISEL-NEXT: v_xor_b32_e32 v8, 0x7f, v2
-; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[16:17]
+; GISEL-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[16:17]
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; GISEL-NEXT: v_or_b32_e32 v8, v8, v0
; GISEL-NEXT: v_or_b32_e32 v9, v3, v1
-; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, 0, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc
-; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_or_b32_e32 v9, v22, v16
+; GISEL-NEXT: v_and_b32_e32 v16, 1, v9
+; GISEL-NEXT: v_xor_b32_e32 v9, 1, v9
; GISEL-NEXT: v_or_b32_e32 v8, v9, v8
-; GISEL-NEXT: v_and_b32_e32 v9, 1, v9
-; GISEL-NEXT: v_and_b32_e32 v8, 1, v8
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; GISEL-NEXT: v_cndmask_b32_e64 v22, v18, 0, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8
+; GISEL-NEXT: v_and_b32_e32 v16, 1, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, v20, 0, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v9, v21, 0, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v16
; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GISEL-NEXT: v_cndmask_b32_e64 v23, v19, 0, vcc
; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
@@ -651,8 +640,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_add_i32_e32 v17, vcc, 32, v17
; GISEL-NEXT: v_ffbh_u32_e32 v28, v13
; GISEL-NEXT: v_ffbh_u32_e32 v29, v12
-; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[2:3]
; GISEL-NEXT: v_min_u32_e32 v0, v14, v15
; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v27
; GISEL-NEXT: v_min_u32_e32 v2, v16, v17
@@ -661,7 +650,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_min_u32_e32 v1, v26, v1
; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 64, v2
; GISEL-NEXT: v_min_u32_e32 v3, v28, v3
-; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5]
; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
@@ -671,26 +660,27 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc
; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5]
; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[10:11]
+; GISEL-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[10:11]
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_xor_b32_e32 v10, 0x7f, v2
-; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, 0, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; GISEL-NEXT: v_or_b32_e32 v10, v10, v0
; GISEL-NEXT: v_or_b32_e32 v11, v3, v1
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc
-; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_or_b32_e32 v11, v14, v15
; GISEL-NEXT: v_and_b32_e32 v14, 1, v11
-; GISEL-NEXT: v_or_b32_e32 v10, v11, v10
+; GISEL-NEXT: v_xor_b32_e32 v11, 1, v11
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, v6, 0, vcc
-; GISEL-NEXT: v_and_b32_e32 v16, 1, v10
+; GISEL-NEXT: v_or_b32_e32 v16, v11, v10
; GISEL-NEXT: v_cndmask_b32_e64 v15, v7, 0, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v10, v12, 0, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v11, v13, 0, vcc
+; GISEL-NEXT: v_and_b32_e32 v16, 1, v16
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
@@ -847,9 +837,9 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_ffbh_u32_e32 v26, v0
; SDAG-NEXT: v_ffbh_u32_e32 v27, v1
; SDAG-NEXT: v_mov_b32_e32 v28, 0
-; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
+; SDAG-NEXT: s_mov_b64 s[8:9], 0x80
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[18:19]
; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20
; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v22
; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v24
@@ -858,7 +848,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_min_u32_e32 v17, v17, v23
; SDAG-NEXT: v_min_u32_e32 v18, v18, v25
; SDAG-NEXT: v_min_u32_e32 v19, v19, v27
-; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
+; SDAG-NEXT: s_and_b64 s[10:11], vcc, s[4:5]
; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17
; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, vcc
; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19
@@ -873,24 +863,18 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v20, v17, vcc
; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v22
; SDAG-NEXT: v_subb_u32_e32 v24, vcc, 0, v28, vcc
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[22:23]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], s[8:9], v[22:23]
; SDAG-NEXT: v_subb_u32_e32 v25, vcc, 0, v28, vcc
; SDAG-NEXT: v_or_b32_e32 v16, v16, v24
; SDAG-NEXT: v_or_b32_e32 v17, v23, v25
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[24:25]
-; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[24:25]
-; SDAG-NEXT: v_cndmask_b32_e64 v16, v19, v18, s[4:5]
-; SDAG-NEXT: v_and_b32_e32 v16, 1, v16
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16
-; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[24:25]
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[16:17]
+; SDAG-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; SDAG-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v16, v3, 0, s[4:5]
-; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v17, v2, 0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v18, v1, 0, s[4:5]
-; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc
; SDAG-NEXT: v_cndmask_b32_e64 v19, v0, 0, s[4:5]
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9]
; SDAG-NEXT: s_cbranch_execz .LBB1_6
@@ -1022,18 +1006,18 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_ffbh_u32_e32 v22, v4
; SDAG-NEXT: v_ffbh_u32_e32 v23, v5
; SDAG-NEXT: v_mov_b32_e32 v24, 0
-; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
+; SDAG-NEXT: s_mov_b64 s[8:9], 0x80
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[2:3]
; SDAG-NEXT: v_add_i32_e64 v0, s[6:7], 32, v8
; SDAG-NEXT: v_add_i32_e64 v1, s[6:7], 32, v10
; SDAG-NEXT: v_add_i32_e64 v2, s[6:7], 32, v20
; SDAG-NEXT: v_add_i32_e64 v3, s[6:7], 32, v22
-; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
; SDAG-NEXT: v_min_u32_e32 v0, v0, v9
; SDAG-NEXT: v_min_u32_e32 v1, v1, v11
; SDAG-NEXT: v_min_u32_e32 v2, v2, v21
; SDAG-NEXT: v_min_u32_e32 v3, v3, v23
+; SDAG-NEXT: s_and_b64 s[6:7], vcc, s[4:5]
; SDAG-NEXT: v_add_i32_e32 v1, vcc, 64, v1
; SDAG-NEXT: v_addc_u32_e64 v8, s[4:5], 0, 0, vcc
; SDAG-NEXT: v_add_i32_e32 v3, vcc, 64, v3
@@ -1048,25 +1032,19 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v8, v1, vcc
; SDAG-NEXT: v_xor_b32_e32 v2, 0x7f, v0
; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v24, vcc
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[0:1]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], s[8:9], v[0:1]
; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v24, vcc
; SDAG-NEXT: v_or_b32_e32 v2, v2, v20
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21]
-; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; SDAG-NEXT: v_or_b32_e32 v3, v1, v21
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21]
-; SDAG-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; SDAG-NEXT: v_and_b32_e32 v2, 1, v8
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2
+; SDAG-NEXT: v_or_b32_e32 v3, v1, v21
+; SDAG-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v8, v7, 0, s[4:5]
-; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
; SDAG-NEXT: v_cndmask_b32_e64 v9, v6, 0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v10, v5, 0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v11, v4, 0, s[4:5]
-; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB1_12
; SDAG-NEXT: ; %bb.7: ; %udiv-bb1
@@ -1214,8 +1192,8 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v24, 0x7f
; GISEL-NEXT: v_mov_b32_e32 v25, 0
; GISEL-NEXT: s_mov_b64 s[8:9], 0
-; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[18:19]
; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 32, v21
; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v23
; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], 32, v27
@@ -1224,7 +1202,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_min_u32_e32 v3, v22, v3
; GISEL-NEXT: v_min_u32_e32 v18, v26, v18
; GISEL-NEXT: v_min_u32_e32 v19, v29, v19
-; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GISEL-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[4:5]
; GISEL-NEXT: v_add_i32_e32 v2, vcc, 64, v2
; GISEL-NEXT: v_add_i32_e32 v18, vcc, 64, v18
@@ -1237,25 +1215,26 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], 0, 0, s[4:5]
; GISEL-NEXT: v_subb_u32_e64 v21, s[4:5], 0, 0, s[4:5]
; GISEL-NEXT: v_xor_b32_e32 v2, 0x7f, v22
-; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[22:23], v[24:25]
+; GISEL-NEXT: v_cmp_le_u64_e32 vcc, v[22:23], v[24:25]
; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GISEL-NEXT: v_or_b32_e32 v2, v2, v20
; GISEL-NEXT: v_or_b32_e32 v3, v23, v21
-; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[20:21]
+; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, 0, v[20:21]
; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21]
; GISEL-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc
-; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_or_b32_e32 v3, v26, v18
+; GISEL-NEXT: v_and_b32_e32 v18, 1, v3
+; GISEL-NEXT: v_xor_b32_e32 v3, 1, v3
; GISEL-NEXT: v_or_b32_e32 v2, v3, v2
-; GISEL-NEXT: v_and_b32_e32 v3, 1, v3
-; GISEL-NEXT: v_and_b32_e32 v2, 1, v2
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
; GISEL-NEXT: v_cndmask_b32_e64 v18, v0, 0, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v2
+; GISEL-NEXT: v_and_b32_e32 v19, 1, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, v16, 0, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v3, v17, 0, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v19
; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GISEL-NEXT: v_cndmask_b32_e64 v19, v1, 0, vcc
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
@@ -1387,8 +1366,8 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_ffbh_u32_e32 v25, v6
; GISEL-NEXT: v_mov_b32_e32 v10, 0x7f
; GISEL-NEXT: v_mov_b32_e32 v11, 0
-; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9]
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[8:9]
; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 32, v17
; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v21
; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], 32, v23
@@ -1397,7 +1376,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_min_u32_e32 v1, v20, v1
; GISEL-NEXT: v_min_u32_e32 v8, v22, v8
; GISEL-NEXT: v_min_u32_e32 v9, v24, v9
-; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
; GISEL-NEXT: v_add_i32_e32 v0, vcc, 64, v0
; GISEL-NEXT: v_add_i32_e32 v8, vcc, 64, v8
@@ -1409,26 +1388,27 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, vcc
; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5]
; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[16:17], v[10:11]
+; GISEL-NEXT: v_cmp_le_u64_e32 vcc, v[16:17], v[10:11]
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_xor_b32_e32 v8, 0x7f, v16
-; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, 0, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_or_b32_e32 v8, v8, v0
; GISEL-NEXT: v_or_b32_e32 v9, v17, v1
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc
-; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_or_b32_e32 v9, v20, v10
; GISEL-NEXT: v_and_b32_e32 v10, 1, v9
-; GISEL-NEXT: v_or_b32_e32 v8, v9, v8
+; GISEL-NEXT: v_xor_b32_e32 v9, 1, v9
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, v4, 0, vcc
-; GISEL-NEXT: v_and_b32_e32 v20, 1, v8
+; GISEL-NEXT: v_or_b32_e32 v20, v9, v8
; GISEL-NEXT: v_cndmask_b32_e64 v11, v5, 0, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v8, v6, 0, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v9, v7, 0, vcc
+; GISEL-NEXT: v_and_b32_e32 v20, 1, v20
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
@@ -1564,7 +1544,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0
; SDAG-NEXT: v_mov_b32_e32 v19, 0
; SDAG-NEXT: v_ashrrev_i32_e32 v28, 31, v3
-; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
+; SDAG-NEXT: s_mov_b64 s[10:11], 0x80
; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc
; SDAG-NEXT: v_mov_b32_e32 v29, v28
; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v2, vcc
@@ -1589,7 +1569,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_cndmask_b32_e64 v30, v9, v23, s[4:5]
; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v10, vcc
; SDAG-NEXT: v_cndmask_b32_e64 v31, v8, v21, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3]
+; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[2:3]
; SDAG-NEXT: v_min_u32_e32 v3, v20, v22
; SDAG-NEXT: v_add_i32_e64 v8, s[8:9], 64, v18
; SDAG-NEXT: v_addc_u32_e64 v18, s[8:9], 0, 0, s[8:9]
@@ -1608,36 +1588,30 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_min_u32_e32 v11, v11, v21
; SDAG-NEXT: v_add_i32_e32 v20, vcc, 32, v20
; SDAG-NEXT: v_ffbh_u32_e32 v21, v3
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
; SDAG-NEXT: v_min_u32_e32 v8, v20, v21
; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 64, v11
; SDAG-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, s[4:5]
; SDAG-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[2:3]
; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, 0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[4:5]
-; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; SDAG-NEXT: s_and_b64 s[8:9], vcc, s[6:7]
; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v8, v10
; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v18, vcc
; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v10
; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v19, vcc
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], s[10:11], v[10:11]
; SDAG-NEXT: v_subb_u32_e32 v19, vcc, 0, v19, vcc
; SDAG-NEXT: v_or_b32_e32 v8, v8, v18
; SDAG-NEXT: v_or_b32_e32 v9, v11, v19
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
-; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v21, v20, s[4:5]
-; SDAG-NEXT: v_and_b32_e32 v8, 1, v8
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8
-; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[8:9]
+; SDAG-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v34, v1, 0, s[4:5]
-; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v27, v17, 0, s[4:5]
-; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc
; SDAG-NEXT: v_cndmask_b32_e64 v33, v16, 0, s[4:5]
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9]
; SDAG-NEXT: s_cbranch_execz .LBB2_6
@@ -1759,7 +1733,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_ashrrev_i32_e32 v26, 31, v7
; SDAG-NEXT: v_sub_i32_e32 v8, vcc, 0, v4
; SDAG-NEXT: v_mov_b32_e32 v18, 0
-; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
+; SDAG-NEXT: s_mov_b64 s[10:11], 0x80
; SDAG-NEXT: v_mov_b32_e32 v35, v26
; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v5, vcc
; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v6, vcc
@@ -1784,7 +1758,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15]
; SDAG-NEXT: v_cndmask_b32_e64 v36, v13, v21, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v37, v12, v19, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[6:7]
+; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[6:7]
; SDAG-NEXT: v_min_u32_e32 v7, v20, v22
; SDAG-NEXT: v_add_i32_e64 v10, s[8:9], 64, v10
; SDAG-NEXT: v_addc_u32_e64 v12, s[8:9], 0, 0, s[8:9]
@@ -1803,37 +1777,31 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_add_i32_e32 v13, vcc, 32, v13
; SDAG-NEXT: v_ffbh_u32_e32 v20, v7
; SDAG-NEXT: v_min_u32_e32 v14, v15, v14
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
; SDAG-NEXT: v_min_u32_e32 v10, v13, v20
; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], 64, v14
; SDAG-NEXT: v_addc_u32_e64 v13, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; SDAG-NEXT: v_cndmask_b32_e64 v13, v13, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v13, v13, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v11, v10, s[4:5]
+; SDAG-NEXT: s_and_b64 s[6:7], vcc, s[6:7]
; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v10, v19
; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v13, v12, vcc
; SDAG-NEXT: v_xor_b32_e32 v14, 0x7f, v10
; SDAG-NEXT: v_subb_u32_e32 v12, vcc, 0, v18, vcc
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], s[10:11], v[10:11]
; SDAG-NEXT: v_subb_u32_e32 v13, vcc, 0, v18, vcc
; SDAG-NEXT: v_or_b32_e32 v14, v14, v12
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; SDAG-NEXT: v_or_b32_e32 v15, v11, v13
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13]
-; SDAG-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15]
-; SDAG-NEXT: v_and_b32_e32 v14, 1, v18
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v14
+; SDAG-NEXT: v_or_b32_e32 v15, v11, v13
+; SDAG-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v19, v5, 0, s[4:5]
-; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
; SDAG-NEXT: v_cndmask_b32_e64 v18, v4, 0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v15, v9, 0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v14, v8, 0, s[4:5]
-; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB2_12
; SDAG-NEXT: ; %bb.7: ; %udiv-bb1
@@ -2058,8 +2026,8 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_ffbh_u32_e32 v25, v11
; GISEL-NEXT: v_ffbh_u32_e32 v26, v8
; GISEL-NEXT: v_ffbh_u32_e32 v27, v9
-; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[2:3]
; GISEL-NEXT: v_min_u32_e32 v0, v18, v21
; GISEL-NEXT: v_min_u32_e32 v1, v22, v23
; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 32, v24
@@ -2068,7 +2036,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_min_u32_e32 v3, v27, v3
; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0
; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 64, v1
-; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
@@ -2079,25 +2047,26 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5]
; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
; GISEL-NEXT: v_xor_b32_e32 v18, 0x7f, v2
-; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[19:20]
+; GISEL-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[19:20]
; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GISEL-NEXT: v_or_b32_e32 v18, v18, v0
; GISEL-NEXT: v_or_b32_e32 v19, v3, v1
-; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, 0, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v20, v22, v20, vcc
-; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GISEL-NEXT: v_or_b32_e32 v19, v21, v20
+; GISEL-NEXT: v_and_b32_e32 v20, 1, v19
+; GISEL-NEXT: v_xor_b32_e32 v19, 1, v19
; GISEL-NEXT: v_or_b32_e32 v18, v19, v18
-; GISEL-NEXT: v_and_b32_e32 v19, 1, v19
-; GISEL-NEXT: v_and_b32_e32 v18, 1, v18
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
; GISEL-NEXT: v_cndmask_b32_e64 v31, v16, 0, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v18
+; GISEL-NEXT: v_and_b32_e32 v20, 1, v18
; GISEL-NEXT: v_cndmask_b32_e64 v18, v8, 0, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v19, v9, 0, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v20
; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GISEL-NEXT: v_cndmask_b32_e64 v32, v17, 0, vcc
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
@@ -2249,8 +2218,8 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_add_i32_e32 v23, vcc, 32, v23
; GISEL-NEXT: v_ffbh_u32_e32 v26, v7
; GISEL-NEXT: v_ffbh_u32_e32 v27, v6
-; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[14:15]
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[14:15]
; GISEL-NEXT: v_min_u32_e32 v0, v20, v21
; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v25
; GISEL-NEXT: v_min_u32_e32 v14, v22, v23
@@ -2259,7 +2228,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_min_u32_e32 v1, v24, v1
; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], 64, v14
; GISEL-NEXT: v_min_u32_e32 v15, v26, v15
-; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5]
; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
@@ -2269,26 +2238,27 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, 0, vcc
; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5]
; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[14:15], v[2:3]
+; GISEL-NEXT: v_cmp_le_u64_e32 vcc, v[14:15], v[2:3]
; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GISEL-NEXT: v_xor_b32_e32 v2, 0x7f, v14
-; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, 0, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GISEL-NEXT: v_or_b32_e32 v2, v2, v0
; GISEL-NEXT: v_or_b32_e32 v3, v15, v1
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc
-; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_or_b32_e32 v3, v20, v21
; GISEL-NEXT: v_and_b32_e32 v20, 1, v3
-; GISEL-NEXT: v_or_b32_e32 v2, v3, v2
+; GISEL-NEXT: v_xor_b32_e32 v3, 1, v3
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
; GISEL-NEXT: v_cndmask_b32_e64 v20, v12, 0, vcc
-; GISEL-NEXT: v_and_b32_e32 v22, 1, v2
+; GISEL-NEXT: v_or_b32_e32 v22, v3, v2
; GISEL-NEXT: v_cndmask_b32_e64 v21, v13, 0, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc
+; GISEL-NEXT: v_and_b32_e32 v22, 1, v22
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22
; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
@@ -2477,9 +2447,9 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_ffbh_u32_e32 v26, v0
; SDAG-NEXT: v_ffbh_u32_e32 v27, v1
; SDAG-NEXT: v_mov_b32_e32 v28, 0
-; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
+; SDAG-NEXT: s_mov_b64 s[8:9], 0x80
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[18:19]
; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20
; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v22
; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v24
@@ -2488,7 +2458,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_min_u32_e32 v17, v17, v23
; SDAG-NEXT: v_min_u32_e32 v18, v18, v25
; SDAG-NEXT: v_min_u32_e32 v19, v19, v27
-; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
+; SDAG-NEXT: s_and_b64 s[10:11], vcc, s[4:5]
; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17
; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, vcc
; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19
@@ -2503,24 +2473,18 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v20, v17, vcc
; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v18
; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v28, vcc
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[18:19]
-; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
+; SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], s[8:9], v[18:19]
; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v28, vcc
; SDAG-NEXT: v_or_b32_e32 v16, v16, v20
; SDAG-NEXT: v_or_b32_e32 v17, v19, v21
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21]
-; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[20:21]
-; SDAG-NEXT: v_cndmask_b32_e64 v16, v23, v22, s[4:5]
-; SDAG-NEXT: v_and_b32_e32 v16, 1, v16
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16
-; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21]
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[16:17]
+; SDAG-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; SDAG-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v33, v3, 0, s[4:5]
-; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
; SDAG-NEXT: v_cndmask_b32_e64 v31, v2, 0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v30, v1, 0, s[4:5]
-; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc
; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5]
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9]
; SDAG-NEXT: s_cbranch_execz .LBB3_6
@@ -2652,18 +2616,18 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_ffbh_u32_e32 v26, v4
; SDAG-NEXT: v_ffbh_u32_e32 v27, v5
; SDAG-NEXT: v_mov_b32_e32 v28, 0
-; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
+; SDAG-NEXT: s_mov_b64 s[8:9], 0x80
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[18:19]
; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20
; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v22
; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v24
; SDAG-NEXT: v_add_i32_e64 v19, s[6:7], 32, v26
-; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
; SDAG-NEXT: v_min_u32_e32 v16, v16, v21
; SDAG-NEXT: v_min_u32_e32 v17, v17, v23
; SDAG-NEXT: v_min_u32_e32 v18, v18, v25
; SDAG-NEXT: v_min_u32_e32 v19, v19, v27
+; SDAG-NEXT: s_and_b64 s[6:7], vcc, s[4:5]
; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17
; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, vcc
; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19
@@ -2678,25 +2642,19 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v20, v17, vcc
; SDAG-NEXT: v_xor_b32_e32 v18, 0x7f, v16
; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v28, vcc
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17]
-; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
+; SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], s[8:9], v[16:17]
; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v28, vcc
; SDAG-NEXT: v_or_b32_e32 v18, v18, v20
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21]
-; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
-; SDAG-NEXT: v_or_b32_e32 v19, v17, v21
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21]
-; SDAG-NEXT: v_cndmask_b32_e32 v22, v23, v22, vcc
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
-; SDAG-NEXT: v_and_b32_e32 v18, 1, v22
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18
+; SDAG-NEXT: v_or_b32_e32 v19, v17, v21
+; SDAG-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19]
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v23, v7, 0, s[4:5]
-; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
; SDAG-NEXT: v_cndmask_b32_e64 v22, v6, 0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v19, v5, 0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v18, v4, 0, s[4:5]
-; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB3_12
; SDAG-NEXT: ; %bb.7: ; %udiv-bb1
@@ -2883,8 +2841,8 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_mov_b32_e32 v20, 0x7f
; GISEL-NEXT: v_mov_b32_e32 v21, 0
; GISEL-NEXT: s_mov_b64 s[8:9], 0
-; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
-; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[18:19]
; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], 32, v23
; GISEL-NEXT: v_add_i32_e64 v17, s[6:7], 32, v25
; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], 32, v27
@@ -2893,7 +2851,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_min_u32_e32 v17, v24, v17
; GISEL-NEXT: v_min_u32_e32 v18, v26, v18
; GISEL-NEXT: v_min_u32_e32 v19, v29, v19
-; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
; GISEL-NEXT: v_add_i32_e32 v16, vcc, 64, v16
; GISEL-NEXT: v_add_i32_e32 v18, vcc, 64, v18
@@ -2906,25 +2864,26 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5]
; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, s[4:5]
; GISEL-NEXT: v_xor_b32_e32 v23, 0x7f, v18
-; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[18:19], v[20:21]
+; GISEL-NEXT: v_cmp_le_u64_e32 vcc, v[18:19], v[20:21]
; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GISEL-NEXT: v_or_b32_e32 v20, v23, v16
; GISEL-NEXT: v_or_b32_e32 v21, v19, v17
-; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, 0, v[16:17]
; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
; GISEL-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc
-; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21]
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21]
; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GISEL-NEXT: v_or_b32_e32 v21, v22, v23
+; GISEL-NEXT: v_and_b32_e32 v22, 1, v21
+; GISEL-NEXT: v_xor_b32_e32 v21, 1, v21
; GISEL-NEXT: v_or_b32_e32 v20, v21, v20
-; GISEL-NEXT: v_and_b32_e32 v21, 1, v21
-; GISEL-NEXT: v_and_b32_e32 v20, 1, v20
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22
; GISEL-NEXT: v_cndmask_b32_e64 v32, v0, 0, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v20
+; GISEL-NEXT: v_and_b32_e32 v22, 1, v20
; GISEL-NEXT: v_cndmask_b32_e64 v20, v2, 0, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v21, v3, 0, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v22
; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GISEL-NEXT: v_cndmask_b32_e64 v33, v1, 0, vcc
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
@@ -3056,8 +3015,8 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_ffbh_u32_e32 v31, v6
; GISEL-NEXT: v_mov_b32_e32 v24, 0x7f
; GISEL-NEXT: v_mov_b32_e32 v25, 0
-; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
-; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[18:19]
; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], 32, v23
; GISEL-NEXT: v_add_i32_e64 v17, s[6:7], 32, v27
; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], 32, v29
@@ -3066,7 +3025,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_min_u32_e32 v17, v26, v17
; GISEL-NEXT: v_min_u32_e32 v18, v28, v18
; GISEL-NEXT: v_min_u32_e32 v19, v30, v19
-; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GISEL-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[4:5]
; GISEL-NEXT: v_add_i32_e32 v16, vcc, 64, v16
; GISEL-NEXT: v_add_i32_e32 v18, vcc, 64, v18
@@ -3078,26 +3037,27 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], 0, 0, vcc
; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5]
; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[22:23], v[24:25]
+; GISEL-NEXT: v_cmp_le_u64_e32 vcc, v[22:23], v[24:25]
; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GISEL-NEXT: v_xor_b32_e32 v18, 0x7f, v22
-; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, 0, v[16:17]
; GISEL-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc
; GISEL-NEXT: v_or_b32_e32 v18, v18, v16
; GISEL-NEXT: v_or_b32_e32 v19, v23, v17
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
; GISEL-NEXT: v_cndmask_b32_e32 v24, v25, v24, vcc
-; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GISEL-NEXT: v_or_b32_e32 v19, v26, v24
; GISEL-NEXT: v_and_b32_e32 v24, 1, v19
-; GISEL-NEXT: v_or_b32_e32 v18, v19, v18
+; GISEL-NEXT: v_xor_b32_e32 v19, 1, v19
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24
; GISEL-NEXT: v_cndmask_b32_e64 v24, v4, 0, vcc
-; GISEL-NEXT: v_and_b32_e32 v26, 1, v18
+; GISEL-NEXT: v_or_b32_e32 v26, v19, v18
; GISEL-NEXT: v_cndmask_b32_e64 v25, v5, 0, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v18, v6, 0, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v19, v7, 0, vcc
+; GISEL-NEXT: v_and_b32_e32 v26, 1, v26
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
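
A note on the recurring churn in the hunks above: the paired change from
v_cmp_eq_u64 plus s_or_b64 to v_cmp_ne_u64 plus s_and_b64 is De Morgan's
law falling out of the inverted select condition, since
!(p || q) == (!p && !q). The SDAG hunks that trade s_mov_b64 s[8:9], 0x7f
with v_cmp_lt_u64 for 0x80 with v_cmp_gt_u64 are the same inversion applied
to an unsigned range check: for unsigned x, !(0x7f < x) is exactly
(0x80 > x). A minimal standalone check of the boolean identity, illustrative
only and not part of the patch:

  #include <cassert>

  // Exhaustively verify De Morgan's law over booleans: the identity that
  // turns the "either value is zero" guard into "both values are nonzero"
  // once the select condition is inverted.
  int main() {
    for (int p = 0; p < 2; ++p)
      for (int q = 0; q < 2; ++q)
        assert(!(p || q) == (!p && !q));
    return 0;
  }
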
diff --git a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
index 1c298014e33e7..2bfe5492263d3 100644
--- a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
@@ -27,27 +27,27 @@ define amdgpu_gfx [13 x i32] @issue130120() {
; CHECK-NEXT: s_mov_b32 s48, 0
; CHECK-NEXT: .LBB0_1: ; %bb3
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_cmp_eq_u32 s46, 0
+; CHECK-NEXT: s_cmp_lg_u32 s46, 0
; CHECK-NEXT: s_mov_b32 s49, s48
; CHECK-NEXT: s_mov_b32 s50, s48
-; CHECK-NEXT: s_cselect_b32 s51, 0, s1
-; CHECK-NEXT: s_cselect_b32 s55, 0, s35
+; CHECK-NEXT: s_cselect_b32 s51, s1, 0
+; CHECK-NEXT: s_cselect_b32 s55, s35, 0
; CHECK-NEXT: v_dual_mov_b32 v2, s48 :: v_dual_mov_b32 v3, s49
-; CHECK-NEXT: s_cselect_b32 s52, 0, s2
-; CHECK-NEXT: s_cselect_b32 s56, 0, s36
-; CHECK-NEXT: s_cselect_b32 vcc_lo, 0, s43
+; CHECK-NEXT: s_cselect_b32 s52, s2, 0
+; CHECK-NEXT: s_cselect_b32 s56, s36, 0
+; CHECK-NEXT: s_cselect_b32 vcc_lo, s43, 0
; CHECK-NEXT: v_mov_b32_e32 v4, s50
-; CHECK-NEXT: s_cselect_b32 s47, s45, 0xf0
-; CHECK-NEXT: s_cselect_b32 s53, 0, s3
-; CHECK-NEXT: s_cselect_b32 s54, 0, s34
-; CHECK-NEXT: s_cselect_b32 s57, 0, s37
-; CHECK-NEXT: s_cselect_b32 s58, 0, s38
-; CHECK-NEXT: s_cselect_b32 s59, 0, s0
-; CHECK-NEXT: s_cselect_b32 s60, 0, s39
-; CHECK-NEXT: s_cselect_b32 s61, 0, s40
-; CHECK-NEXT: s_cselect_b32 s62, 0, s41
-; CHECK-NEXT: s_cselect_b32 s63, 0, s42
-; CHECK-NEXT: s_cselect_b32 vcc_hi, 0, s44
+; CHECK-NEXT: s_cselect_b32 s47, 0xf0, s45
+; CHECK-NEXT: s_cselect_b32 s53, s3, 0
+; CHECK-NEXT: s_cselect_b32 s54, s34, 0
+; CHECK-NEXT: s_cselect_b32 s57, s37, 0
+; CHECK-NEXT: s_cselect_b32 s58, s38, 0
+; CHECK-NEXT: s_cselect_b32 s59, s0, 0
+; CHECK-NEXT: s_cselect_b32 s60, s39, 0
+; CHECK-NEXT: s_cselect_b32 s61, s40, 0
+; CHECK-NEXT: s_cselect_b32 s62, s41, 0
+; CHECK-NEXT: s_cselect_b32 s63, s42, 0
+; CHECK-NEXT: s_cselect_b32 vcc_hi, s44, 0
; CHECK-NEXT: s_mov_b32 s46, s48
; CHECK-NEXT: scratch_store_b32 off, v0, s51
; CHECK-NEXT: scratch_store_b32 off, v0, s52
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index 6512bee36e88b..59888b614a837 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -11,31 +11,28 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9: ; %bb.0: ; %_udiv-special-cases
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, 0, v0
-; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v1, vcc
-; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v2, vcc
-; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v1, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v2, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, 0, v3, vcc
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
; GFX9-NEXT: v_ashrrev_i32_e32 v20, 31, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc
; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, 0, v4
-; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v5, vcc
-; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v6, vcc
-; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v7, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v5, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v6, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, 0, v7, vcc
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7]
-; GFX9-NEXT: v_mov_b32_e32 v21, v20
-; GFX9-NEXT: v_cndmask_b32_e32 v22, v5, v9, vcc
+; GFX9-NEXT: v_ffbh_u32_e32 v13, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v22, v5, v10, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v23, v4, v8, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v11, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v10, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v11, vcc
; GFX9-NEXT: v_or_b32_e32 v7, v22, v5
; GFX9-NEXT: v_or_b32_e32 v6, v23, v4
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GFX9-NEXT: v_or_b32_e32 v7, v1, v3
-; GFX9-NEXT: v_or_b32_e32 v6, v0, v2
-; GFX9-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[6:7]
+; GFX9-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[6:7]
; GFX9-NEXT: v_ffbh_u32_e32 v6, v4
; GFX9-NEXT: v_add_u32_e32 v6, 32, v6
; GFX9-NEXT: v_ffbh_u32_e32 v7, v5
@@ -44,50 +41,47 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_add_u32_e32 v7, 32, v7
; GFX9-NEXT: v_ffbh_u32_e32 v8, v22
; GFX9-NEXT: v_min_u32_e32 v7, v7, v8
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 64, v7
; GFX9-NEXT: v_addc_co_u32_e64 v8, s[6:7], 0, 0, vcc
; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX9-NEXT: v_ffbh_u32_e32 v10, v3
+; GFX9-NEXT: v_ffbh_u32_e32 v12, v3
; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
; GFX9-NEXT: v_ffbh_u32_e32 v7, v2
; GFX9-NEXT: v_add_u32_e32 v7, 32, v7
-; GFX9-NEXT: v_min_u32_e32 v7, v7, v10
-; GFX9-NEXT: v_ffbh_u32_e32 v10, v0
-; GFX9-NEXT: v_add_u32_e32 v10, 32, v10
-; GFX9-NEXT: v_ffbh_u32_e32 v11, v1
-; GFX9-NEXT: v_min_u32_e32 v10, v10, v11
+; GFX9-NEXT: v_min_u32_e32 v7, v7, v12
+; GFX9-NEXT: v_ffbh_u32_e32 v12, v0
+; GFX9-NEXT: v_add_u32_e32 v12, 32, v12
+; GFX9-NEXT: v_min_u32_e32 v12, v12, v13
; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 64, v10
-; GFX9-NEXT: v_addc_co_u32_e64 v11, s[6:7], 0, 0, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 64, v12
+; GFX9-NEXT: v_addc_co_u32_e64 v13, s[6:7], 0, 0, vcc
; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v9, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, 0, vcc
; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v7
-; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v8, v11, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v8, v13, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v9, vcc
+; GFX9-NEXT: v_or_b32_e32 v11, v1, v3
+; GFX9-NEXT: v_or_b32_e32 v10, v0, v2
; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v9, vcc
-; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7]
-; GFX9-NEXT: v_or_b32_e32 v12, v7, v9
-; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
-; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc
-; GFX9-NEXT: v_and_b32_e32 v10, 1, v10
-; GFX9-NEXT: v_xor_b32_e32 v11, 0x7f, v6
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
-; GFX9-NEXT: v_or_b32_e32 v11, v11, v8
-; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[11:12]
-; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; GFX9-NEXT: s_mov_b64 s[6:7], 0x80
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
+; GFX9-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[8:9]
+; GFX9-NEXT: v_cmp_ne_u64_e64 s[8:9], 0, v[10:11]
+; GFX9-NEXT: v_xor_b32_e32 v10, 0x7f, v6
+; GFX9-NEXT: v_or_b32_e32 v11, v7, v9
+; GFX9-NEXT: v_or_b32_e32 v10, v10, v8
+; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], vcc
+; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v21, v20
; GFX9-NEXT: v_cndmask_b32_e64 v10, v3, 0, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v11, v1, 0, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v13, v0, 0, s[4:5]
-; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc
+; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB0_6
; GFX9-NEXT: ; %bb.1: ; %udiv-bb1
@@ -1503,10 +1497,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_or_b32_e32 v9, v5, v7
; GFX9-NEXT: v_or_b32_e32 v8, v4, v6
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; GFX9-NEXT: v_or_b32_e32 v9, v1, v3
-; GFX9-NEXT: v_or_b32_e32 v8, v0, v2
-; GFX9-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9]
+; GFX9-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[8:9]
; GFX9-NEXT: v_ffbh_u32_e32 v8, v6
; GFX9-NEXT: v_add_u32_e32 v8, 32, v8
; GFX9-NEXT: v_ffbh_u32_e32 v9, v7
@@ -1515,7 +1506,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_add_u32_e32 v9, 32, v9
; GFX9-NEXT: v_ffbh_u32_e32 v10, v5
; GFX9-NEXT: v_min_u32_e32 v9, v9, v10
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 64, v9
; GFX9-NEXT: v_addc_co_u32_e64 v10, s[6:7], 0, 0, vcc
; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
@@ -1526,39 +1516,37 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_min_u32_e32 v9, v9, v11
; GFX9-NEXT: v_ffbh_u32_e32 v11, v0
; GFX9-NEXT: v_add_u32_e32 v11, 32, v11
-; GFX9-NEXT: v_ffbh_u32_e32 v12, v1
-; GFX9-NEXT: v_min_u32_e32 v11, v11, v12
+; GFX9-NEXT: v_ffbh_u32_e32 v14, v1
+; GFX9-NEXT: v_min_u32_e32 v11, v11, v14
; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc
; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, 64, v11
-; GFX9-NEXT: v_addc_co_u32_e64 v12, s[6:7], 0, 0, vcc
+; GFX9-NEXT: v_addc_co_u32_e64 v14, s[6:7], 0, 0, vcc
; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
+; GFX9-NEXT: v_or_b32_e32 v13, v1, v3
; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, 0, vcc
; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v8, v9
-; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v10, v12, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v10, v14, vcc
; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v11, vcc
+; GFX9-NEXT: v_or_b32_e32 v12, v0, v2
; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v11, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
-; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc
-; GFX9-NEXT: v_and_b32_e32 v12, 1, v12
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
+; GFX9-NEXT: s_mov_b64 s[6:7], 0x80
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[8:9]
+; GFX9-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
+; GFX9-NEXT: v_cmp_ne_u64_e64 s[8:9], 0, v[12:13]
; GFX9-NEXT: v_xor_b32_e32 v12, 0x7f, v8
; GFX9-NEXT: v_or_b32_e32 v13, v9, v11
; GFX9-NEXT: v_or_b32_e32 v12, v12, v10
-; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13]
-; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], vcc
+; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9]
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13]
+; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v15, v3, 0, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v13, v1, 0, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v12, v0, 0, s[4:5]
-; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc
+; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB1_6
; GFX9-NEXT: ; %bb.1: ; %udiv-bb1
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
index 7ed27f008083e..d8163cdebad8c 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
@@ -1015,12 +1015,12 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
; VI-LABEL: add_select_negk_fabs_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
-; VI-NEXT: v_mov_b32_e32 v4, 0xbc00
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2
+; VI-NEXT: v_mov_b32_e32 v2, 0xbc00
+; VI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_add_f16_e32 v0, v0, v3
; VI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1029,12 +1029,12 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
; GFX9-LABEL: add_select_negk_fabs_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xbc00
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2
+; GFX9-NEXT: v_mov_b32_e32 v2, 0xbc00
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: v_pk_add_f16 v0, v0, v3
@@ -1043,12 +1043,12 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
; GFX11-SAFE-TRUE16-LABEL: add_select_negk_fabs_v2f16:
; GFX11-SAFE-TRUE16: ; %bb.0:
; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v2
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v2.l, vcc_lo
-; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xbc00, v2.h, s0
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v0.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xbc00, v0.h, s0
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v0, v3
; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -1071,12 +1071,12 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
; GFX11-NSZ-TRUE16-LABEL: add_select_negk_fabs_v2f16:
; GFX11-NSZ-TRUE16: ; %bb.0:
; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v2
; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v2.l, vcc_lo
-; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xbc00, v2.h, s0
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v0.l, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xbc00, v0.h, s0
; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v0, v3
; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -1126,12 +1126,12 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x
; VI-LABEL: add_select_negliteralk_fabs_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
-; VI-NEXT: v_mov_b32_e32 v4, 0xe400
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2
+; VI-NEXT: v_mov_b32_e32 v2, 0xe400
+; VI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_add_f16_e32 v0, v0, v3
; VI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1140,12 +1140,12 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x
; GFX9-LABEL: add_select_negliteralk_fabs_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xe400
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2
+; GFX9-NEXT: v_mov_b32_e32 v2, 0xe400
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: v_pk_add_f16 v0, v0, v3
@@ -1154,12 +1154,12 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x
; GFX11-SAFE-TRUE16-LABEL: add_select_negliteralk_fabs_v2f16:
; GFX11-SAFE-TRUE16: ; %bb.0:
; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v2
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xe400, v2.l, vcc_lo
-; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xe400, v2.h, s0
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xe400, v0.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xe400, v0.h, s0
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v0, v3
; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -1182,12 +1182,12 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x
; GFX11-NSZ-TRUE16-LABEL: add_select_negliteralk_fabs_v2f16:
; GFX11-NSZ-TRUE16: ; %bb.0:
; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v2
; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xe400, v2.l, vcc_lo
-; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xe400, v2.h, s0
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xe400, v0.l, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xe400, v0.h, s0
; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v0, v3
; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -1346,12 +1346,12 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
; VI-LABEL: add_select_posk_fabs_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
-; VI-NEXT: v_mov_b32_e32 v4, 0x3c00
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2
+; VI-NEXT: v_mov_b32_e32 v2, 0x3c00
+; VI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_add_f16_e32 v0, v0, v3
; VI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1360,12 +1360,12 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
; GFX9-LABEL: add_select_posk_fabs_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x3c00
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v2
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x3c00
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: v_pk_add_f16 v0, v0, v3
@@ -1374,12 +1374,12 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
; GFX11-SAFE-TRUE16-LABEL: add_select_posk_fabs_v2f16:
; GFX11-SAFE-TRUE16: ; %bb.0:
; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v2
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3c00, v2.l, vcc_lo
-; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x3c00, v2.h, s0
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3c00, v0.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x3c00, v0.h, s0
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v0, v3
; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -1402,12 +1402,12 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
; GFX11-NSZ-TRUE16-LABEL: add_select_posk_fabs_v2f16:
; GFX11-NSZ-TRUE16: ; %bb.0:
; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v2
; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3c00, v2.l, vcc_lo
-; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x3c00, v2.h, s0
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3c00, v0.l, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x3c00, v0.h, s0
; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v0, v3
; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -3836,12 +3836,12 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
; VI-LABEL: mul_select_posk_negfabs_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v2
-; VI-NEXT: v_mov_b32_e32 v4, 0x4400
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v1, 0x80008000, v2
+; VI-NEXT: v_mov_b32_e32 v2, 0x4400
+; VI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_mul_f16_e32 v0, v0, v3
; VI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -3850,12 +3850,12 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
; GFX9-LABEL: mul_select_posk_negfabs_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_or_b32_e32 v2, 0x80008000, v2
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x4400
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_or_b32_e32 v1, 0x80008000, v2
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x4400
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3
@@ -3864,12 +3864,12 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
; GFX11-SAFE-TRUE16-LABEL: mul_select_posk_negfabs_v2f16:
; GFX11-SAFE-TRUE16: ; %bb.0:
; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-TRUE16-NEXT: v_or_b32_e32 v2, 0x80008000, v2
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SAFE-TRUE16-NEXT: v_or_b32_e32 v0, 0x80008000, v2
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4400, v2.l, vcc_lo
-; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4400, v2.h, s0
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4400, v0.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4400, v0.h, s0
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SAFE-TRUE16-NEXT: v_pk_mul_f16 v0, v0, v3
; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -3892,12 +3892,12 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
; GFX11-NSZ-TRUE16-LABEL: mul_select_posk_negfabs_v2f16:
; GFX11-NSZ-TRUE16: ; %bb.0:
; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-TRUE16-NEXT: v_or_b32_e32 v2, 0x80008000, v2
; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-TRUE16-NEXT: v_or_b32_e32 v0, 0x80008000, v2
; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4400, v2.l, vcc_lo
-; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4400, v2.h, s0
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4400, v0.l, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4400, v0.h, s0
; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NSZ-TRUE16-NEXT: v_pk_mul_f16 v0, v0, v3
; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -4066,12 +4066,12 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
; VI-LABEL: mul_select_negk_negfabs_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v2
-; VI-NEXT: v_mov_b32_e32 v4, 0xc400
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v1, 0x80008000, v2
+; VI-NEXT: v_mov_b32_e32 v2, 0xc400
+; VI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_mul_f16_e32 v0, v0, v3
; VI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -4080,12 +4080,12 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
; GFX9-LABEL: mul_select_negk_negfabs_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_or_b32_e32 v2, 0x80008000, v2
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xc400
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT: v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_or_b32_e32 v1, 0x80008000, v2
+; GFX9-NEXT: v_mov_b32_e32 v2, 0xc400
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3
@@ -4094,12 +4094,12 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
; GFX11-SAFE-TRUE16-LABEL: mul_select_negk_negfabs_v2f16:
; GFX11-SAFE-TRUE16: ; %bb.0:
; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-TRUE16-NEXT: v_or_b32_e32 v2, 0x80008000, v2
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SAFE-TRUE16-NEXT: v_or_b32_e32 v0, 0x80008000, v2
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xc400, v2.l, vcc_lo
-; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xc400, v2.h, s0
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xc400, v0.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xc400, v0.h, s0
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SAFE-TRUE16-NEXT: v_pk_mul_f16 v0, v0, v3
; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -4122,12 +4122,12 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
; GFX11-NSZ-TRUE16-LABEL: mul_select_negk_negfabs_v2f16:
; GFX11-NSZ-TRUE16: ; %bb.0:
; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-TRUE16-NEXT: v_or_b32_e32 v2, 0x80008000, v2
; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-TRUE16-NEXT: v_or_b32_e32 v0, 0x80008000, v2
; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xc400, v2.l, vcc_lo
-; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xc400, v2.h, s0
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xc400, v0.l, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xc400, v0.h, s0
; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NSZ-TRUE16-NEXT: v_pk_mul_f16 v0, v0, v3
; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
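
The shrink-cndmask.ll updates below show the intended payoff: a
v_cndmask_b32_e64 whose constant sits in src1 cannot use the VOP2 encoding,
while after the swap the constant lands in src0 and pairs of selects shrink
into a single dual-issued v_dual_cndmask_b32. At the IR level the rewrite is
tiny; the following is a minimal sketch of the apply step, with a
hypothetical helper name, assuming the profitability vote has already
approved the swap and that every relevant user is a select guarded by this
compare:

  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Invert the compare once, then swap the operands of every select it
  // feeds, so a constant moves from the true slot (src1 of v_cndmask,
  // VGPR-only in VOP2) to the false slot (src0, which may hold an
  // immediate or SGPR).
  static void applySelectOperandSwap(ICmpInst &Cmp) {
    Cmp.setPredicate(Cmp.getInversePredicate());
    for (User *U : Cmp.users())
      if (auto *Sel = dyn_cast<SelectInst>(U))
        if (Sel->getCondition() == &Cmp)
          Sel->swapValues(); // exchanges the true and false values
  }
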
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll b/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll
index 12ccdfff07c6f..71e41659d41dd 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll
@@ -40,9 +40,8 @@ define amdgpu_cs void @test_i32_sle(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %ou
define amdgpu_cs void @test_i32_sgt(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
; GCN-LABEL: test_i32_sgt:
; GCN: ; %bb.0: ; %.entry
-; GCN-NEXT: v_cmp_gt_i32_e32 vcc_lo, 2, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, 1, v0
+; GCN-NEXT: v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
; GCN-NEXT: s_endpgm
.entry:
@@ -58,9 +57,8 @@ define amdgpu_cs void @test_i32_sgt(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %ou
define amdgpu_cs void @test_i32_slt(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
; GCN-LABEL: test_i32_slt:
; GCN: ; %bb.0: ; %.entry
-; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, 2, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT: v_cmp_gt_i32_e32 vcc_lo, 3, v0
+; GCN-NEXT: v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
; GCN-NEXT: s_endpgm
.entry:
@@ -113,11 +111,9 @@ define amdgpu_cs void @test_i64_sle(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %ou
define amdgpu_cs void @test_i64_sgt(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
; GCN-LABEL: test_i64_sgt:
; GCN: ; %bb.0: ; %.entry
-; GCN-NEXT: v_cmp_gt_i64_e32 vcc_lo, 2, v[0:1]
-; GCN-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT: v_cmp_lt_i64_e32 vcc_lo, 1, v[0:1]
+; GCN-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
; GCN-NEXT: global_store_b128 v[6:7], v[0:3], off
; GCN-NEXT: s_endpgm
.entry:
@@ -133,11 +129,9 @@ define amdgpu_cs void @test_i64_sgt(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %ou
define amdgpu_cs void @test_i64_slt(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
; GCN-LABEL: test_i64_slt:
; GCN: ; %bb.0: ; %.entry
-; GCN-NEXT: v_cmp_lt_i64_e32 vcc_lo, 2, v[0:1]
-; GCN-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT: v_cmp_gt_i64_e32 vcc_lo, 3, v[0:1]
+; GCN-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
; GCN-NEXT: global_store_b128 v[6:7], v[0:3], off
; GCN-NEXT: s_endpgm
.entry:
@@ -154,9 +148,8 @@ define amdgpu_cs void @test_i64_slt(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %ou
define amdgpu_cs void @test_u32_eq(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
; GCN-LABEL: test_u32_eq:
; GCN: ; %bb.0: ; %.entry
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v0
+; GCN-NEXT: v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
; GCN-NEXT: s_endpgm
.entry:
@@ -190,11 +183,10 @@ define amdgpu_cs void @test_negative_case(i32 %a, i32 %p, i32 %q, ptr addrspace(
define amdgpu_cs void @test_mixed(i32 %a, i32 %p, i32 %q, i32 %r, i32 %s, ptr addrspace(1) %out) {
; GCN-LABEL: test_mixed:
; GCN: ; %bb.0: ; %.entry
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, -1, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v2, v3, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v3, v4, 0, vcc_lo
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT: v_dual_cndmask_b32 v2, 0, v3 :: v_dual_cndmask_b32 v3, 0, v4
; GCN-NEXT: global_store_b128 v[5:6], v[0:3], off
; GCN-NEXT: s_endpgm
.entry:
@@ -214,10 +206,10 @@ define amdgpu_cs void @test_mixed(i32 %a, i32 %p, i32 %q, i32 %r, i32 %s, ptr ad
define amdgpu_cs void @test_sgpr(i32 %a, i32 %p, i32 inreg %q, i32 inreg %r, ptr addrspace(1) %out) {
; GCN-LABEL: test_sgpr:
; GCN: ; %bb.0: ; %.entry
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, -1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v5, s0, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v6, s1, 0, vcc_lo
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v0
+; GCN-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v5, 0, s0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v6, 0, s1, vcc_lo
; GCN-NEXT: global_store_b96 v[2:3], v[4:6], off
; GCN-NEXT: s_endpgm
.entry:
@@ -235,9 +227,8 @@ define amdgpu_cs void @test_sgpr(i32 %a, i32 %p, i32 inreg %q, i32 inreg %r, ptr
define amdgpu_cs void @test_u32_ne(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
; GCN-LABEL: test_u32_ne:
; GCN: ; %bb.0: ; %.entry
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 1, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GCN-NEXT: v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
; GCN-NEXT: s_endpgm
.entry:
@@ -287,9 +278,8 @@ define amdgpu_cs void @test_u32_ule(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %ou
define amdgpu_cs void @test_u32_ugt(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
; GCN-LABEL: test_u32_ugt:
; GCN: ; %bb.0: ; %.entry
-; GCN-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, 1, v0
+; GCN-NEXT: v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
; GCN-NEXT: s_endpgm
.entry:
@@ -305,9 +295,8 @@ define amdgpu_cs void @test_u32_ugt(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %ou
define amdgpu_cs void @test_u32_ult(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
; GCN-LABEL: test_u32_ult:
; GCN: ; %bb.0: ; %.entry
-; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, 2, v0
-; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT: v_cmp_gt_u32_e32 vcc_lo, 3, v0
+; GCN-NEXT: v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
; GCN-NEXT: global_store_b64 v[3:4], v[0:1], off
; GCN-NEXT: s_endpgm
.entry:
@@ -324,11 +313,9 @@ define amdgpu_cs void @test_u32_ult(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %ou
define amdgpu_cs void @test_u64_eq(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
; GCN-LABEL: test_u64_eq:
; GCN: ; %bb.0: ; %.entry
-; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, 1, v[0:1]
-; GCN-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT: v_cmp_ne_u64_e32 vcc_lo, 1, v[0:1]
+; GCN-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
; GCN-NEXT: global_store_b128 v[6:7], v[0:3], off
; GCN-NEXT: s_endpgm
.entry:
@@ -344,11 +331,9 @@ define amdgpu_cs void @test_u64_eq(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out
define amdgpu_cs void @test_u64_ne(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
; GCN-LABEL: test_u64_ne:
; GCN: ; %bb.0: ; %.entry
-; GCN-NEXT: v_cmp_ne_u64_e32 vcc_lo, 1, v[0:1]
-; GCN-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, 1, v[0:1]
+; GCN-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
; GCN-NEXT: global_store_b128 v[6:7], v[0:3], off
; GCN-NEXT: s_endpgm
.entry:
@@ -400,11 +385,9 @@ define amdgpu_cs void @test_u64_ule(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %ou
define amdgpu_cs void @test_u64_ugt(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
; GCN-LABEL: test_u64_ugt:
; GCN: ; %bb.0: ; %.entry
-; GCN-NEXT: v_cmp_gt_u64_e32 vcc_lo, 2, v[0:1]
-; GCN-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT: v_cmp_lt_u64_e32 vcc_lo, 1, v[0:1]
+; GCN-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
; GCN-NEXT: global_store_b128 v[6:7], v[0:3], off
; GCN-NEXT: s_endpgm
.entry:
@@ -420,11 +403,9 @@ define amdgpu_cs void @test_u64_ugt(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %ou
define amdgpu_cs void @test_u64_ult(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
; GCN-LABEL: test_u64_ult:
; GCN: ; %bb.0: ; %.entry
-; GCN-NEXT: v_cmp_lt_u64_e32 vcc_lo, 2, v[0:1]
-; GCN-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT: v_cmp_gt_u64_e32 vcc_lo, 3, v[0:1]
+; GCN-NEXT: v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
; GCN-NEXT: global_store_b128 v[6:7], v[0:3], off
; GCN-NEXT: s_endpgm
.entry:
>From 2589eb30bbb3d126807a544474874877a0f034ce Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Fri, 30 May 2025 15:30:21 +0200
Subject: [PATCH 3/3] Replace the isa<Constant> check with an isDivergent check
---
.../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 9 +-
.../atomic_optimizations_local_pointer.ll | 482 +++++++++---------
llvm/test/CodeGen/AMDGPU/ctlz.ll | 4 +-
llvm/test/CodeGen/AMDGPU/cttz.ll | 4 +-
llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 12 +-
llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll | 90 ++--
.../issue130120-eliminate-frame-index.ll | 34 +-
llvm/test/CodeGen/AMDGPU/known-never-snan.ll | 8 +-
llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll | 8 +-
9 files changed, 326 insertions(+), 325 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 7f17132be12aa..c3f9533d36323 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -1787,11 +1787,12 @@ bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
auto SelectI = dyn_cast<SelectInst>(User);
- if (isa<Constant>(SelectI->getOperand(1)) &&
- !isa<Constant>(SelectI->getOperand(2)))
+ auto Op1 = SelectI->getOperand(1);
+ auto Op2 = SelectI->getOperand(2);
+
+ if (!UA.isDivergent(Op1) && UA.isDivergent(Op2))
ShouldSwap++;
- else if (!isa<Constant>(SelectI->getOperand(1)) &&
- isa<Constant>(SelectI->getOperand(2)))
+ else if (UA.isDivergent(Op1) && !UA.isDivergent(Op2))
ShouldSwap--;
}
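
For reference, the logic in the hunk above is a vote over the select users
of the compare: a select whose true operand is uniform and whose false
operand is divergent gains from the swap, the opposite shape loses, and
anything else abstains. A self-contained sketch of that heuristic, with a
hypothetical function name and a null check added on the dyn_cast;
UniformityInfo stands in for the pass's UA member:

  #include "llvm/Analysis/UniformityAnalysis.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  static bool shouldSwapSelectOperands(ICmpInst &Cmp,
                                       const UniformityInfo &UA) {
    int ShouldSwap = 0;
    for (User *U : Cmp.users()) {
      auto *Sel = dyn_cast<SelectInst>(U);
      if (!Sel || Sel->getCondition() != &Cmp)
        continue; // skip non-select users and selects fed elsewhere
      Value *Op1 = Sel->getTrueValue();
      Value *Op2 = Sel->getFalseValue();
      // A uniform true operand would end up in src1 of v_cndmask, which
      // blocks the VOP2 encoding; swapping moves it into src0.
      if (!UA.isDivergent(Op1) && UA.isDivergent(Op2))
        ++ShouldSwap;
      else if (UA.isDivergent(Op1) && !UA.isDivergent(Op2))
        --ShouldSwap; // swapping would pessimize this select
    }
    return ShouldSwap > 0;
  }

The surrounding legality checks and the exact threshold applied to
ShouldSwap live in the rest of visitICmpInst and are not visible in this
hunk, so treat the return condition here as an assumption of the sketch.
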
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 8e0b3cb9aa1d5..27a0b5e3a48bd 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -10964,10 +10964,10 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
-; GFX7LESS-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
-; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7LESS-NEXT: v_cmp_le_i64_e32 vcc, s[4:5], v[0:1]
+; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
-; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
@@ -10995,13 +10995,13 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX8-NEXT: v_cmp_le_i64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
@@ -11028,13 +11028,13 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT: v_cmp_le_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
@@ -11062,9 +11062,9 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
+; GFX1064-NEXT: v_cmp_le_i64_e32 vcc, s[2:3], v[0:1]
+; GFX1064-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc
+; GFX1064-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
@@ -11093,9 +11093,9 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
-; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX1032-NEXT: v_cmp_le_i64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1032-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX1032-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
@@ -11126,9 +11126,9 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
+; GFX1164-NEXT: v_cmp_le_i64_e32 vcc, s[2:3], v[0:1]
+; GFX1164-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc
+; GFX1164-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -11157,9 +11157,9 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX1132-NEXT: v_cmp_le_i64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1132-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX1132-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -11221,10 +11221,10 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4
; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3
; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5
-; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[1:2]
-; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_le_i64_e32 vcc, s[4:5], v[1:2]
+; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
-; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
; GFX7LESS_ITERATIVE-NEXT: s_endpgm
@@ -11274,13 +11274,13 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4
; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3
-; GFX8_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[1:2]
+; GFX8_ITERATIVE-NEXT: v_cmp_le_i64_e32 vcc, s[4:5], v[1:2]
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5
-; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
-; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
; GFX8_ITERATIVE-NEXT: s_endpgm
@@ -11329,13 +11329,13 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4
; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3
-; GFX9_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[1:2]
+; GFX9_ITERATIVE-NEXT: v_cmp_le_i64_e32 vcc, s[4:5], v[1:2]
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5
-; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
-; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
; GFX9_ITERATIVE-NEXT: s_endpgm
@@ -11383,9 +11383,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1064_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[1:2]
-; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc
-; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc
+; GFX1064_ITERATIVE-NEXT: v_cmp_le_i64_e32 vcc, s[2:3], v[1:2]
+; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, s3, v2, vcc
+; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, s2, v1, vcc
; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -11434,9 +11434,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1032_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[1:2]
-; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo
-; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
+; GFX1032_ITERATIVE-NEXT: v_cmp_le_i64_e32 vcc_lo, s[2:3], v[1:2]
+; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, s3, v2, vcc_lo
+; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, s2, v1, vcc_lo
; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -11492,9 +11492,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
+; GFX1164_ITERATIVE-NEXT: v_cmp_le_i64_e32 vcc, s[2:3], v[0:1]
+; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc
+; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -11546,9 +11546,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132_ITERATIVE-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX1132_ITERATIVE-NEXT: v_cmp_le_i64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -11647,13 +11647,13 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v7
; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8]
+; GFX8_DPP-NEXT: v_cmp_le_i64_e32 vcc, s[4:5], v[7:8]
; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc
; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX8_DPP-NEXT: s_mov_b32 s2, -1
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v0, v7, vcc
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
; GFX8_DPP-NEXT: s_endpgm
@@ -11736,13 +11736,13 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v7
; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8]
+; GFX9_DPP-NEXT: v_cmp_le_i64_e32 vcc, s[4:5], v[7:8]
; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc
; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4
; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX9_DPP-NEXT: s_mov_b32 s2, -1
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v0, v7, vcc
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
; GFX9_DPP-NEXT: s_endpgm
@@ -11848,9 +11848,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v4
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5
; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc
+; GFX1064_DPP-NEXT: v_cmp_le_i64_e32 vcc, s[4:5], v[7:8]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v8, s5, v8, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v7, s4, v7, vcc
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
; GFX1064_DPP-NEXT: s_endpgm
@@ -11933,9 +11933,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5
; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[7:8]
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo
+; GFX1032_DPP-NEXT: v_cmp_le_i64_e32 vcc_lo, s[4:5], v[7:8]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v8, s5, v8, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v7, s4, v7, vcc_lo
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
; GFX1032_DPP-NEXT: s_endpgm
@@ -12055,9 +12055,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5
; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8]
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc
+; GFX1164_DPP-NEXT: v_cmp_le_i64_e32 vcc, s[4:5], v[7:8]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v8, s5, v8, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v7, s4, v7, vcc
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0
; GFX1164_DPP-NEXT: s_endpgm
@@ -12145,9 +12145,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5
; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[7:8]
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo
+; GFX1132_DPP-NEXT: v_cmp_le_i64_e32 vcc_lo, s[4:5], v[7:8]
+; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v8, s5, v8, vcc_lo
+; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v7, s4, v7, vcc_lo
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0
; GFX1132_DPP-NEXT: s_endpgm
@@ -12790,10 +12790,10 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
-; GFX7LESS-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
-; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7LESS-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[0:1]
+; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
-; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
@@ -12821,13 +12821,13 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
@@ -12854,13 +12854,13 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
@@ -12888,9 +12888,9 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
+; GFX1064-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1]
+; GFX1064-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc
+; GFX1064-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
@@ -12919,9 +12919,9 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
-; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX1032-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1032-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX1032-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
@@ -12952,9 +12952,9 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
+; GFX1164-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1]
+; GFX1164-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc
+; GFX1164-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -12983,9 +12983,9 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX1132-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1132-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX1132-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -13047,10 +13047,10 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4
; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3
; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5
-; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[1:2]
-; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[1:2]
+; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
-; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
; GFX7LESS_ITERATIVE-NEXT: s_endpgm
@@ -13100,13 +13100,13 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4
; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3
-; GFX8_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[1:2]
+; GFX8_ITERATIVE-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[1:2]
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5
-; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
-; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
; GFX8_ITERATIVE-NEXT: s_endpgm
@@ -13155,13 +13155,13 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4
; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3
-; GFX9_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[1:2]
+; GFX9_ITERATIVE-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[1:2]
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5
-; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
-; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
; GFX9_ITERATIVE-NEXT: s_endpgm
@@ -13209,9 +13209,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1064_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
-; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc
-; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc
+; GFX1064_ITERATIVE-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[1:2]
+; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, s3, v2, vcc
+; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, s2, v1, vcc
; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -13260,9 +13260,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1032_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[1:2]
-; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo
-; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
+; GFX1032_ITERATIVE-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[2:3], v[1:2]
+; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, s3, v2, vcc_lo
+; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, s2, v1, vcc_lo
; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -13318,9 +13318,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
+; GFX1164_ITERATIVE-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1]
+; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc
+; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -13372,9 +13372,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132_ITERATIVE-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX1132_ITERATIVE-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -13473,13 +13473,13 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v7
; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8]
+; GFX8_DPP-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[7:8]
; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc
; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX8_DPP-NEXT: s_mov_b32 s2, -1
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v0, v7, vcc
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
; GFX8_DPP-NEXT: s_endpgm
@@ -13562,13 +13562,13 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v7
; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8]
+; GFX9_DPP-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[7:8]
; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc
; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4
; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX9_DPP-NEXT: s_mov_b32 s2, -1
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v0, v7, vcc
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
; GFX9_DPP-NEXT: s_endpgm
@@ -13674,9 +13674,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v4
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5
; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc
+; GFX1064_DPP-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[7:8]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v8, s5, v8, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v7, s4, v7, vcc
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
; GFX1064_DPP-NEXT: s_endpgm
@@ -13759,9 +13759,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5
; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[7:8]
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo
+; GFX1032_DPP-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[4:5], v[7:8]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v8, s5, v8, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v7, s4, v7, vcc_lo
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
; GFX1032_DPP-NEXT: s_endpgm
@@ -13881,9 +13881,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5
; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8]
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc
+; GFX1164_DPP-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[7:8]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v8, s5, v8, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v7, s4, v7, vcc
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0
; GFX1164_DPP-NEXT: s_endpgm
@@ -13971,9 +13971,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5
; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[7:8]
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo
+; GFX1132_DPP-NEXT: v_cmp_ge_i64_e32 vcc_lo, s[4:5], v[7:8]
+; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v8, s5, v8, vcc_lo
+; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v7, s4, v7, vcc_lo
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0
; GFX1132_DPP-NEXT: s_endpgm
@@ -14614,10 +14614,10 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
-; GFX7LESS-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
-; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7LESS-NEXT: v_cmp_le_u64_e32 vcc, s[4:5], v[0:1]
+; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5
-; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
@@ -14644,13 +14644,13 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
+; GFX8-NEXT: v_cmp_le_u64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
@@ -14676,13 +14676,13 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT: v_cmp_le_u64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
@@ -14710,9 +14710,9 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc
+; GFX1064-NEXT: v_cmp_le_u64_e32 vcc, s[2:3], v[0:1]
+; GFX1064-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc
+; GFX1064-NEXT: v_cndmask_b32_e64 v1, s3, 0, vcc
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
@@ -14741,9 +14741,9 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
-; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo
+; GFX1032-NEXT: v_cmp_le_u64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1032-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
+; GFX1032-NEXT: v_cndmask_b32_e64 v1, s3, 0, vcc_lo
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
@@ -14774,9 +14774,9 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc
+; GFX1164-NEXT: v_cmp_le_u64_e32 vcc, s[2:3], v[0:1]
+; GFX1164-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc
+; GFX1164-NEXT: v_cndmask_b32_e64 v1, s3, 0, vcc
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -14802,12 +14802,12 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
+; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo
+; GFX1132-NEXT: v_cmp_le_u64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1132-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
+; GFX1132-NEXT: v_cndmask_b32_e64 v1, s3, 0, vcc_lo
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -14868,10 +14868,10 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4
; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3
; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5
-; GFX7LESS_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[1:2]
-; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_le_u64_e32 vcc, s[4:5], v[1:2]
+; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
-; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
; GFX7LESS_ITERATIVE-NEXT: s_endpgm
@@ -14920,13 +14920,13 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4
; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3
-; GFX8_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[1:2]
+; GFX8_ITERATIVE-NEXT: v_cmp_le_u64_e32 vcc, s[4:5], v[1:2]
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5
-; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
-; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
; GFX8_ITERATIVE-NEXT: s_endpgm
@@ -14974,13 +14974,13 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4
; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3
-; GFX9_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[1:2]
+; GFX9_ITERATIVE-NEXT: v_cmp_le_u64_e32 vcc, s[4:5], v[1:2]
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5
-; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
-; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
; GFX9_ITERATIVE-NEXT: s_endpgm
@@ -15027,9 +15027,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1064_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[1:2]
-; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc
-; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc
+; GFX1064_ITERATIVE-NEXT: v_cmp_le_u64_e32 vcc, s[2:3], v[1:2]
+; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, s3, v2, vcc
+; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, s2, v1, vcc
; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -15077,9 +15077,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1032_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[1:2]
-; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo
-; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
+; GFX1032_ITERATIVE-NEXT: v_cmp_le_u64_e32 vcc_lo, s[2:3], v[1:2]
+; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, s3, v2, vcc_lo
+; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, s2, v1, vcc_lo
; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -15135,9 +15135,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
+; GFX1164_ITERATIVE-NEXT: v_cmp_le_u64_e32 vcc, s[2:3], v[0:1]
+; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc
+; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -15189,9 +15189,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132_ITERATIVE-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX1132_ITERATIVE-NEXT: v_cmp_le_u64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -15291,13 +15291,13 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v5
; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[5:6]
+; GFX8_DPP-NEXT: v_cmp_le_u64_e32 vcc, s[4:5], v[5:6]
; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v6, v0, v6, vcc
; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX8_DPP-NEXT: s_mov_b32 s2, -1
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0
; GFX8_DPP-NEXT: s_endpgm
@@ -15381,13 +15381,13 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v5
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
-; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[5:6]
+; GFX9_DPP-NEXT: v_cmp_le_u64_e32 vcc, s[4:5], v[5:6]
; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v6, v0, v6, vcc
; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4
; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX9_DPP-NEXT: s_mov_b32 s2, -1
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0
; GFX9_DPP-NEXT: s_endpgm
@@ -15493,9 +15493,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v4
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5
; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc
+; GFX1064_DPP-NEXT: v_cmp_le_u64_e32 vcc, s[4:5], v[7:8]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v8, s5, v8, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v7, s4, v7, vcc
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
; GFX1064_DPP-NEXT: s_endpgm
@@ -15578,9 +15578,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5
; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[7:8]
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo
+; GFX1032_DPP-NEXT: v_cmp_le_u64_e32 vcc_lo, s[4:5], v[7:8]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v8, s5, v8, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v7, s4, v7, vcc_lo
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
; GFX1032_DPP-NEXT: s_endpgm
@@ -15700,9 +15700,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5
; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8]
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc
+; GFX1164_DPP-NEXT: v_cmp_le_u64_e32 vcc, s[4:5], v[7:8]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v8, s5, v8, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v7, s4, v7, vcc
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0
; GFX1164_DPP-NEXT: s_endpgm
@@ -15784,9 +15784,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5
; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[7:8]
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo
+; GFX1132_DPP-NEXT: v_cmp_le_u64_e32 vcc_lo, s[4:5], v[7:8]
+; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v8, s5, v8, vcc_lo
+; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v7, s4, v7, vcc_lo
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0
; GFX1132_DPP-NEXT: s_endpgm
@@ -16428,10 +16428,10 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
-; GFX7LESS-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
-; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7LESS-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[0:1]
+; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4
-; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
@@ -16458,13 +16458,13 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
+; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
@@ -16490,13 +16490,13 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
@@ -16524,9 +16524,9 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
+; GFX1064-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[0:1]
+; GFX1064-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc
+; GFX1064-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
@@ -16555,9 +16555,9 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
-; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX1032-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1032-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX1032-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
@@ -16588,9 +16588,9 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
+; GFX1164-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[0:1]
+; GFX1164-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc
+; GFX1164-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -16619,9 +16619,9 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX1132-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1132-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX1132-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
@@ -16682,10 +16682,10 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4
; GFX7LESS_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3
; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5
-; GFX7LESS_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[1:2]
-; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX7LESS_ITERATIVE-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[1:2]
+; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX7LESS_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
-; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX7LESS_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GFX7LESS_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
; GFX7LESS_ITERATIVE-NEXT: s_endpgm
@@ -16734,13 +16734,13 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4
; GFX8_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3
-; GFX8_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[1:2]
+; GFX8_ITERATIVE-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[1:2]
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5
-; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX8_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
; GFX8_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
; GFX8_ITERATIVE-NEXT: s_mov_b32 s2, -1
-; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GFX8_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
; GFX8_ITERATIVE-NEXT: s_endpgm
@@ -16788,13 +16788,13 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s5, v4
; GFX9_ITERATIVE-NEXT: v_readfirstlane_b32 s4, v3
-; GFX9_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[1:2]
+; GFX9_ITERATIVE-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[1:2]
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s5
-; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
; GFX9_ITERATIVE-NEXT: v_mov_b32_e32 v0, s4
; GFX9_ITERATIVE-NEXT: s_mov_b32 s3, 0xf000
; GFX9_ITERATIVE-NEXT: s_mov_b32 s2, -1
-; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX9_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GFX9_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_ITERATIVE-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
; GFX9_ITERATIVE-NEXT: s_endpgm
@@ -16841,9 +16841,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
; GFX1064_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1064_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[1:2]
-; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc
-; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc
+; GFX1064_ITERATIVE-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[1:2]
+; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, s3, v2, vcc
+; GFX1064_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, s2, v1, vcc
; GFX1064_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1064_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -16891,9 +16891,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v4
; GFX1032_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v3
-; GFX1032_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[1:2]
-; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo
-; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
+; GFX1032_ITERATIVE-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[2:3], v[1:2]
+; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e32 v2, s3, v2, vcc_lo
+; GFX1032_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, s2, v1, vcc_lo
; GFX1032_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1032_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -16949,9 +16949,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
+; GFX1164_ITERATIVE-NEXT: v_cmp_ge_u64_e32 vcc, s[2:3], v[0:1]
+; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc
+; GFX1164_ITERATIVE-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -17003,9 +17003,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s3, v3
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v2
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132_ITERATIVE-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX1132_ITERATIVE-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX1132_ITERATIVE-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
@@ -17104,13 +17104,13 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6
; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v1
; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v2
-; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7]
+; GFX8_DPP-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[6:7]
; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v0, v7, vcc
; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX8_DPP-NEXT: s_mov_b32 s2, -1
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v6, v0, v6, vcc
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: buffer_store_dwordx2 v[6:7], off, s[0:3], 0
; GFX8_DPP-NEXT: s_endpgm
@@ -17193,13 +17193,13 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v6
; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v1
; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2
-; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7]
+; GFX9_DPP-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[6:7]
; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v0, v7, vcc
; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4
; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX9_DPP-NEXT: s_mov_b32 s2, -1
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v6, v0, v6, vcc
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: buffer_store_dwordx2 v[6:7], off, s[0:3], 0
; GFX9_DPP-NEXT: s_endpgm
@@ -17305,9 +17305,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v4
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5
; GFX1064_DPP-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc
+; GFX1064_DPP-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[7:8]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v8, s5, v8, vcc
+; GFX1064_DPP-NEXT: v_cndmask_b32_e32 v7, s4, v7, vcc
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
; GFX1064_DPP-NEXT: s_endpgm
@@ -17390,9 +17390,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5
; GFX1032_DPP-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[7:8]
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo
+; GFX1032_DPP-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v8, s5, v8, vcc_lo
+; GFX1032_DPP-NEXT: v_cndmask_b32_e32 v7, s4, v7, vcc_lo
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
; GFX1032_DPP-NEXT: s_endpgm
@@ -17512,9 +17512,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5
; GFX1164_DPP-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8]
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc
+; GFX1164_DPP-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[7:8]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v8, s5, v8, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v7, s4, v7, vcc
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0
; GFX1164_DPP-NEXT: s_endpgm
@@ -17596,9 +17596,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5
; GFX1132_DPP-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[7:8]
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v8, v8, s5, vcc_lo
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v7, v7, s4, vcc_lo
+; GFX1132_DPP-NEXT: v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
+; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v8, s5, v8, vcc_lo
+; GFX1132_DPP-NEXT: v_cndmask_b32_e32 v7, s4, v7, vcc_lo
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[0:3], 0
; GFX1132_DPP-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index e3cc8ee340f0c..b9a6f782fc682 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -1676,10 +1676,10 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_ffbh_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-GISEL-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2
; GFX10-GISEL-NEXT: v_add_nc_u16 v2, v2, -16
-; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index d997904d81d54..cabe0a017df8c 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -1455,9 +1455,9 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1
-; GFX10-GISEL-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1
; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2
-; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
%val = load i16, ptr addrspace(1) %valptr
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index c1abc3002a990..97bcd8b5ee68a 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -1168,8 +1168,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
; GFX9-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1
; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1
; GFX9-GISEL-NEXT: v_min_u32_e32 v2, 32, v2
-; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %arrayidx, align 1
@@ -1502,8 +1502,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_or_b32_e32 v3, 0x100, v1
; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3
-; GFX9-GISEL-NEXT: v_cmp_ne_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%val = load i8, ptr addrspace(1) %arrayidx, align 1
@@ -1598,8 +1598,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
; GFX9-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1
; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v2
-; GFX9-GISEL-NEXT: v_cmp_ne_u16_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-GISEL-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%val = load i16, ptr addrspace(1) %arrayidx, align 1
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index ffe0596a95e33..e45dd57554675 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -12225,9 +12225,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v4, s3
; GCN1-NEXT: s_waitcnt vmcnt(0)
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1]
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
@@ -12278,9 +12278,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
; GCN2-NEXT: v_mov_b32_e32 v5, s2
; GCN2-NEXT: v_mov_b32_e32 v4, s3
; GCN2-NEXT: s_waitcnt vmcnt(0)
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
@@ -12317,9 +12317,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
@@ -12376,9 +12376,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v4, s3
; GCN1-NEXT: s_waitcnt vmcnt(0)
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1]
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
@@ -12429,9 +12429,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol
; GCN2-NEXT: v_mov_b32_e32 v5, s2
; GCN2-NEXT: v_mov_b32_e32 v4, s3
; GCN2-NEXT: s_waitcnt vmcnt(0)
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
@@ -12468,9 +12468,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
@@ -12680,9 +12680,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i
; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v4, s11
; GCN1-NEXT: s_waitcnt vmcnt(0)
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[14:15], v[0:1]
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, s[14:15], v[0:1]
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN1-NEXT: buffer_store_dword v0, v2, s[16:19], 0 offen
; GCN1-NEXT: buffer_store_dword v1, v3, s[16:19], 0 offen
; GCN1-NEXT: s_endpgm
@@ -12735,9 +12735,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i
; GCN2-NEXT: v_mov_b32_e32 v5, s10
; GCN2-NEXT: v_mov_b32_e32 v4, s11
; GCN2-NEXT: s_waitcnt vmcnt(0)
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[14:15], v[0:1]
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, s[14:15], v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
@@ -12775,9 +12775,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[6:7], v[0:1]
+; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
@@ -12998,9 +12998,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) {
; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v4, s3
; GCN1-NEXT: s_waitcnt vmcnt(0)
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1]
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
@@ -13049,9 +13049,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) {
; GCN2-NEXT: v_mov_b32_e32 v5, s2
; GCN2-NEXT: v_mov_b32_e32 v4, s3
; GCN2-NEXT: s_waitcnt vmcnt(0)
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
@@ -13087,9 +13087,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) {
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
@@ -13290,9 +13290,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind
; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen
; GCN1-NEXT: v_mov_b32_e32 v4, s11
; GCN1-NEXT: s_waitcnt vmcnt(0)
-; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[14:15], v[0:1]
-; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, s[14:15], v[0:1]
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN1-NEXT: buffer_store_dword v0, v2, s[16:19], 0 offen
; GCN1-NEXT: buffer_store_dword v1, v3, s[16:19], 0 offen
; GCN1-NEXT: s_endpgm
@@ -13343,9 +13343,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind
; GCN2-NEXT: v_mov_b32_e32 v5, s10
; GCN2-NEXT: v_mov_b32_e32 v4, s11
; GCN2-NEXT: s_waitcnt vmcnt(0)
-; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[14:15], v[0:1]
-; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, s[14:15], v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
@@ -13382,9 +13382,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind
; GFX12-NEXT: s_cselect_b32 s0, s0, -1
; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[6:7], v[0:1]
+; GFX12-NEXT: v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0
; GFX12-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
index 2bfe5492263d3..1c298014e33e7 100644
--- a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
@@ -27,27 +27,27 @@ define amdgpu_gfx [13 x i32] @issue130120() {
; CHECK-NEXT: s_mov_b32 s48, 0
; CHECK-NEXT: .LBB0_1: ; %bb3
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_cmp_lg_u32 s46, 0
+; CHECK-NEXT: s_cmp_eq_u32 s46, 0
; CHECK-NEXT: s_mov_b32 s49, s48
; CHECK-NEXT: s_mov_b32 s50, s48
-; CHECK-NEXT: s_cselect_b32 s51, s1, 0
-; CHECK-NEXT: s_cselect_b32 s55, s35, 0
+; CHECK-NEXT: s_cselect_b32 s51, 0, s1
+; CHECK-NEXT: s_cselect_b32 s55, 0, s35
; CHECK-NEXT: v_dual_mov_b32 v2, s48 :: v_dual_mov_b32 v3, s49
-; CHECK-NEXT: s_cselect_b32 s52, s2, 0
-; CHECK-NEXT: s_cselect_b32 s56, s36, 0
-; CHECK-NEXT: s_cselect_b32 vcc_lo, s43, 0
+; CHECK-NEXT: s_cselect_b32 s52, 0, s2
+; CHECK-NEXT: s_cselect_b32 s56, 0, s36
+; CHECK-NEXT: s_cselect_b32 vcc_lo, 0, s43
; CHECK-NEXT: v_mov_b32_e32 v4, s50
-; CHECK-NEXT: s_cselect_b32 s47, 0xf0, s45
-; CHECK-NEXT: s_cselect_b32 s53, s3, 0
-; CHECK-NEXT: s_cselect_b32 s54, s34, 0
-; CHECK-NEXT: s_cselect_b32 s57, s37, 0
-; CHECK-NEXT: s_cselect_b32 s58, s38, 0
-; CHECK-NEXT: s_cselect_b32 s59, s0, 0
-; CHECK-NEXT: s_cselect_b32 s60, s39, 0
-; CHECK-NEXT: s_cselect_b32 s61, s40, 0
-; CHECK-NEXT: s_cselect_b32 s62, s41, 0
-; CHECK-NEXT: s_cselect_b32 s63, s42, 0
-; CHECK-NEXT: s_cselect_b32 vcc_hi, s44, 0
+; CHECK-NEXT: s_cselect_b32 s47, s45, 0xf0
+; CHECK-NEXT: s_cselect_b32 s53, 0, s3
+; CHECK-NEXT: s_cselect_b32 s54, 0, s34
+; CHECK-NEXT: s_cselect_b32 s57, 0, s37
+; CHECK-NEXT: s_cselect_b32 s58, 0, s38
+; CHECK-NEXT: s_cselect_b32 s59, 0, s0
+; CHECK-NEXT: s_cselect_b32 s60, 0, s39
+; CHECK-NEXT: s_cselect_b32 s61, 0, s40
+; CHECK-NEXT: s_cselect_b32 s62, 0, s41
+; CHECK-NEXT: s_cselect_b32 s63, 0, s42
+; CHECK-NEXT: s_cselect_b32 vcc_hi, 0, s44
; CHECK-NEXT: s_mov_b32 s46, s48
; CHECK-NEXT: scratch_store_b32 off, v0, s51
; CHECK-NEXT: scratch_store_b32 off, v0, s52
diff --git a/llvm/test/CodeGen/AMDGPU/known-never-snan.ll b/llvm/test/CodeGen/AMDGPU/known-never-snan.ll
index 64948c374e4dd..2bbfa8f7a47ed 100644
--- a/llvm/test/CodeGen/AMDGPU/known-never-snan.ll
+++ b/llvm/test/CodeGen/AMDGPU/known-never-snan.ll
@@ -228,8 +228,8 @@ define float @v_test_known_not_snan_select_input_fmed3_r_i_i_f32(float %a, float
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0
@@ -264,8 +264,8 @@ define float @v_select_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll b/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll
index 71e41659d41dd..2051e8011d296 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll
@@ -206,10 +206,10 @@ define amdgpu_cs void @test_mixed(i32 %a, i32 %p, i32 %q, i32 %r, i32 %s, ptr ad
define amdgpu_cs void @test_sgpr(i32 %a, i32 %p, i32 inreg %q, i32 inreg %r, ptr addrspace(1) %out) {
; GCN-LABEL: test_sgpr:
; GCN: ; %bb.0: ; %.entry
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v0
-; GCN-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v5, 0, s0, vcc_lo
-; GCN-NEXT: v_cndmask_b32_e64 v6, 0, s1, vcc_lo
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, -1, v0
+; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v5, s0, 0, vcc_lo
+; GCN-NEXT: v_cndmask_b32_e64 v6, s1, 0, vcc_lo
; GCN-NEXT: global_store_b96 v[2:3], v[4:6], off
; GCN-NEXT: s_endpgm
.entry:
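For context, the pattern these check-line updates reflect can be sketched at the IR level. When the zero constant sits on the false side of a select, the compare can be emitted inverted with the cndmask operands swapped, so the immediate lands in the first source slot and v_cndmask_b32 fits the VOP2 (_e32) encoding instead of VOP3 (_e64), as in the test_sgpr diff above. A minimal hypothetical reduction (function and value names are illustrative, not taken from the patch):

define amdgpu_cs void @swap_sketch(i32 %a, i32 %p, ptr addrspace(1) %out) {
.entry:
  ; icmp ne + select(%cc, %p, 0) may be lowered as the inverted
  ; icmp eq with swapped v_cndmask_b32 operands, enabling the e32 form.
  %cc = icmp ne i32 %a, -1
  %val = select i1 %cc, i32 %p, i32 0
  store i32 %val, ptr addrspace(1) %out
  ret void
}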