[llvm] 1c9a93a - [GlobalIsel][AMDGPU] Changing legalize rule for G_{UADDO|UADDE|USUBO|USUBE|SADDE|SSUBE}
Yashwant Singh via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 14 10:13:02 PST 2022
Author: Yashwant Singh
Date: 2022-11-14T23:42:23+05:30
New Revision: 1c9a93ae3ad0d8d085efe3af38ca65e4a7b2f307
URL: https://github.com/llvm/llvm-project/commit/1c9a93ae3ad0d8d085efe3af38ca65e4a7b2f307
DIFF: https://github.com/llvm/llvm-project/commit/1c9a93ae3ad0d8d085efe3af38ca65e4a7b2f307.diff
LOG: [GlobalIsel][AMDGPU] Changing legalize rule for G_{UADDO|UADDE|USUBO|USUBE|SADDE|SSUBE}
Generic add and sub with carry were being legalized in a way that explicitly recalculated the carry/borrow output, i.e.
%6:_(s64), %7:_(s1) = G_UADDO %0, %1
became
%13:_(s32), %14:_(s1) = G_UADDO %2, %4
%15:_(s32), %16:_(s1) = G_UADDE %3, %5, %14
%6:_(s64) = G_MERGE_VALUES %13(s32), %15(s32)
%7:_(s1) = G_ICMP intpred(ult), %6(s64), %1
Here the G_MERGE_VALUES and G_ICMP instructions are redundant: they recalculate a carry output that the G_UADDE already produces. (The same applies to sub with borrow.) This change fixes that.
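For contrast, with the new rule the carry comes straight out of the carry chain; a sketch of the new expansion of the same G_UADDO (register numbers illustrative, following the updated legalize-uaddo.mir checks):
%13:_(s32), %14:_(s1) = G_UADDO %2, %4
%15:_(s32), %16:_(s1) = G_UADDE %3, %5, %14
%6:_(s64) = G_MERGE_VALUES %13(s32), %15(s32)
; %7, the carry output, is now just %16 -- no trailing G_ICMP is emitted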
Reviewed By: arsenm, #amdgpu
Differential Revision: https://reviews.llvm.org/D137932
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sadde.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssube.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uadde.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddo.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usube.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubo.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 79dc60c93f403..6b05b017bc6f3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -656,12 +656,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.widenScalarToNextPow2(0)
.scalarize(0);
- getActionDefinitionsBuilder({G_UADDO, G_USUBO,
- G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
- .legalFor({{S32, S1}, {S32, S32}})
- .minScalar(0, S32)
- .scalarize(0)
- .lower();
+ getActionDefinitionsBuilder(
+ {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
+ .legalFor({{S32, S1}, {S32, S32}})
+ .clampScalar(0, S32, S32)
+ .scalarize(0);
getActionDefinitionsBuilder(G_BITCAST)
// Don't worry about the size constraint.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
index c8d3b6d7b894f..a1013f3803e78 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
@@ -41,7 +41,6 @@ define i64 @v_uaddo_i64(i64 %a, i64 %b) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -52,7 +51,6 @@ define i64 @v_uaddo_i64(i64 %a, i64 %b) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -63,7 +61,6 @@ define i64 @v_uaddo_i64(i64 %a, i64 %b) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
@@ -480,46 +477,28 @@ define amdgpu_ps i64 @s_uaddo_i64(i64 inreg %a, i64 inreg %b) {
; GFX7-LABEL: s_uaddo_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_add_u32 s0, s0, s2
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: s_addc_u32 s1, s1, s3
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: v_readfirstlane_b32 s0, v0
-; GFX7-NEXT: v_readfirstlane_b32 s1, v1
+; GFX7-NEXT: s_cselect_b32 s2, 1, 0
+; GFX7-NEXT: s_add_u32 s0, s0, s2
+; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_uaddo_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: s_addc_u32 s1, s1, s3
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: s_cselect_b32 s2, 1, 0
+; GFX8-NEXT: s_add_u32 s0, s0, s2
+; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_uaddo_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, s2
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: s_addc_u32 s1, s1, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: s_cselect_b32 s2, 1, 0
+; GFX9-NEXT: s_add_u32 s0, s0, s2
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: ; return to shader part epilog
%uaddo = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
%add = extractvalue {i64, i1} %uaddo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sadde.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sadde.mir
index ae8c16ab76d11..a276a5210b6f9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sadde.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sadde.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck %s
---
name: test_sadde_s32
@@ -120,9 +120,13 @@ body: |
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr4
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
- ; CHECK-NEXT: [[SADDE:%[0-9]+]]:_(s64), [[SADDE1:%[0-9]+]]:_(s1) = G_SADDE [[COPY]], [[COPY1]], [[ICMP]]
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
+ ; CHECK-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV]], [[UV2]], [[ICMP]]
+ ; CHECK-NEXT: [[SADDE:%[0-9]+]]:_(s32), [[SADDE1:%[0-9]+]]:_(s1) = G_SADDE [[UV1]], [[UV3]], [[UADDE1]]
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDE]](s32), [[SADDE]](s32)
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[SADDE1]](s1)
- ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[SADDE]](s64)
+ ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
; CHECK-NEXT: $vgpr2 = COPY [[ZEXT]](s32)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = COPY $vgpr2_vgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssube.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssube.mir
index b18454e6fec37..1698450ea18c0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssube.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssube.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck %s
---
name: test_ssube_s32
@@ -119,9 +119,13 @@ body: |
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr4
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
- ; CHECK-NEXT: [[SSUBE:%[0-9]+]]:_(s64), [[SSUBE1:%[0-9]+]]:_(s1) = G_SSUBE [[COPY]], [[COPY1]], [[ICMP]]
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+ ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
+ ; CHECK-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV]], [[UV2]], [[ICMP]]
+ ; CHECK-NEXT: [[SSUBE:%[0-9]+]]:_(s32), [[SSUBE1:%[0-9]+]]:_(s1) = G_SSUBE [[UV1]], [[UV3]], [[USUBE1]]
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBE]](s32), [[SSUBE]](s32)
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[SSUBE1]](s1)
- ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[SSUBE]](s64)
+ ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
; CHECK-NEXT: $vgpr2 = COPY [[ZEXT]](s32)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = COPY $vgpr2_vgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uadde.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uadde.mir
index 2d0bc0535fe75..9f43d664d1edd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uadde.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uadde.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck %s
---
name: test_uadde_s32
@@ -87,9 +87,9 @@ body: |
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND %13, [[C1]]
- ; CHECK-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[AND]], [[AND1]], [[ICMP]]
- ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UADDE]], [[C1]]
- ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UADDE]](s32), [[AND2]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[AND1]](s32)
+ ; CHECK-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[AND]], [[COPY2]], [[ICMP]]
+ ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UADDE]](s32), [[AND1]]
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP1]](s1)
; CHECK-NEXT: $vgpr0 = COPY [[UADDE]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[ZEXT]](s32)
@@ -123,17 +123,12 @@ body: |
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
- ; CHECK-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV2]]
- ; CHECK-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]]
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[ICMP]](s1)
- ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ZEXT]](s64)
- ; CHECK-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UADDO]], [[UV4]]
- ; CHECK-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[UV5]], [[UADDO3]]
- ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32)
- ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[MV]](s64), [[COPY]]
- ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP1]](s1)
+ ; CHECK-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV]], [[UV2]], [[ICMP]]
+ ; CHECK-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDE1]]
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDE]](s32), [[UADDE2]](s32)
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[UADDE3]](s1)
; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
- ; CHECK-NEXT: $vgpr2 = COPY [[ZEXT1]](s32)
+ ; CHECK-NEXT: $vgpr2 = COPY [[ZEXT]](s32)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = COPY $vgpr2_vgpr3
%2:_(s32) = COPY $vgpr4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddo.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddo.mir
index a4e9dfe742e64..54ce45c0dc088 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddo.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddo.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck %s
---
name: test_uaddo_s32
@@ -41,9 +41,9 @@ body: |
; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND]], [[AND1]]
; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C]]
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[ADD]](s32), [[AND2]]
- ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[AND2]](s32)
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1)
- ; CHECK-NEXT: $vgpr0 = COPY [[AND3]](s32)
+ ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[ZEXT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
@@ -74,9 +74,9 @@ body: |
; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND]], [[AND1]]
; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C]]
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[ADD]](s32), [[AND2]]
- ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[AND2]](s32)
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1)
- ; CHECK-NEXT: $vgpr0 = COPY [[AND3]](s32)
+ ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[ZEXT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
@@ -106,8 +106,7 @@ body: |
; CHECK-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV2]]
; CHECK-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]]
; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32)
- ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[MV]](s64), [[COPY1]]
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1)
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[UADDE1]](s1)
; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
; CHECK-NEXT: $vgpr2 = COPY [[ZEXT]](s32)
%0:_(s64) = COPY $vgpr0_vgpr1
@@ -145,17 +144,17 @@ body: |
; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[AND3]], [[AND4]]
; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C1]]
; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[ADD1]](s32), [[AND5]]
- ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C1]]
- ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C1]]
- ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C]](s32)
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[AND2]](s32)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[AND5]](s32)
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[SHL]]
; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP]](s1)
; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP1]](s1)
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
- ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND8]](s32), [[AND9]](s32)
+ ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
+ ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND6]](s32), [[AND7]](s32)
; CHECK-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
; CHECK-NEXT: $vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<2 x s32>)
%0:_(<2 x s16>) = COPY $vgpr0
@@ -209,27 +208,27 @@ body: |
; CHECK-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>)
; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32)
; CHECK-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>)
- ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C1]]
- ; CHECK-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C1]]
- ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C]](s32)
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND9]], [[SHL]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[AND2]](s32)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[AND5]](s32)
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[SHL]]
; CHECK-NEXT: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
- ; CHECK-NEXT: [[AND11:%[0-9]+]]:_(s32) = G_AND [[ADD2]], [[C1]]
- ; CHECK-NEXT: [[AND12:%[0-9]+]]:_(s32) = G_AND [[BITCAST4]], [[C1]]
- ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND12]], [[C]](s32)
- ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND11]], [[SHL1]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[AND8]](s32)
+ ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[BITCAST4]], [[C1]]
+ ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C]](s32)
+ ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[COPY4]], [[SHL1]]
; CHECK-NEXT: [[BITCAST7:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
- ; CHECK-NEXT: [[AND13:%[0-9]+]]:_(s32) = G_AND [[LSHR2]], [[C1]]
- ; CHECK-NEXT: [[AND14:%[0-9]+]]:_(s32) = G_AND [[BITCAST5]], [[C1]]
- ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C]](s32)
- ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND13]], [[SHL2]]
+ ; CHECK-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[LSHR2]], [[C1]]
+ ; CHECK-NEXT: [[AND11:%[0-9]+]]:_(s32) = G_AND [[BITCAST5]], [[C1]]
+ ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C]](s32)
+ ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL2]]
; CHECK-NEXT: [[BITCAST8:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST6]](<2 x s16>), [[BITCAST7]](<2 x s16>), [[BITCAST8]](<2 x s16>)
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: [[AND15:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
- ; CHECK-NEXT: [[AND16:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
- ; CHECK-NEXT: [[AND17:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C2]]
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[AND15]](s32), [[AND16]](s32), [[AND17]](s32)
+ ; CHECK-NEXT: [[AND12:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
+ ; CHECK-NEXT: [[AND13:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
+ ; CHECK-NEXT: [[AND14:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C2]]
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[AND12]](s32), [[AND13]](s32), [[AND14]](s32)
; CHECK-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
; CHECK-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
%0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2
@@ -287,15 +286,15 @@ body: |
; CHECK-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[AND9]], [[AND10]]
; CHECK-NEXT: [[AND11:%[0-9]+]]:_(s32) = G_AND [[ADD3]], [[C1]]
; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[ADD3]](s32), [[AND11]]
- ; CHECK-NEXT: [[AND12:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C1]]
- ; CHECK-NEXT: [[AND13:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C1]]
- ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C]](s32)
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND12]], [[SHL]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[AND2]](s32)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[AND5]](s32)
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[SHL]]
; CHECK-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
- ; CHECK-NEXT: [[AND14:%[0-9]+]]:_(s32) = G_AND [[ADD2]], [[C1]]
- ; CHECK-NEXT: [[AND15:%[0-9]+]]:_(s32) = G_AND [[ADD3]], [[C1]]
- ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C]](s32)
- ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND14]], [[SHL1]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[AND8]](s32)
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[AND11]](s32)
+ ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY5]], [[C]](s32)
+ ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[COPY4]], [[SHL1]]
; CHECK-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP]](s1)
@@ -303,11 +302,11 @@ body: |
; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP2]](s1)
; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP3]](s1)
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: [[AND16:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
- ; CHECK-NEXT: [[AND17:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
- ; CHECK-NEXT: [[AND18:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C2]]
- ; CHECK-NEXT: [[AND19:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C2]]
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[AND16]](s32), [[AND17]](s32), [[AND18]](s32), [[AND19]](s32)
+ ; CHECK-NEXT: [[AND12:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
+ ; CHECK-NEXT: [[AND13:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
+ ; CHECK-NEXT: [[AND14:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C2]]
+ ; CHECK-NEXT: [[AND15:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C2]]
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[AND12]](s32), [[AND13]](s32), [[AND14]](s32), [[AND15]](s32)
; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
; CHECK-NEXT: $vgpr2_vgpr3_vgpr4_vgpr5 = COPY [[BUILD_VECTOR]](<4 x s32>)
%0:_(<4 x s16>) = COPY $vgpr0_vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir
index 869cccad2b5f5..ca2dacce5457e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-uaddsat.mir
@@ -720,9 +720,8 @@ body: |
; GFX6-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV2]]
; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]]
; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32)
- ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[MV]](s64), [[COPY1]]
; GFX6-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
- ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[C]], [[MV]]
+ ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[UADDE1]](s1), [[C]], [[MV]]
; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64)
; GFX8-LABEL: name: uaddsat_s64
; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
@@ -734,9 +733,8 @@ body: |
; GFX8-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV2]]
; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]]
; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32)
- ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[MV]](s64), [[COPY1]]
; GFX8-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
- ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[C]], [[MV]]
+ ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[UADDE1]](s1), [[C]], [[MV]]
; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64)
; GFX9-LABEL: name: uaddsat_s64
; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
@@ -748,9 +746,8 @@ body: |
; GFX9-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV]], [[UV2]]
; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDO1]]
; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32)
- ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[MV]](s64), [[COPY1]]
; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
- ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[C]], [[MV]]
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[UADDE1]](s1), [[C]], [[MV]]
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = COPY $vgpr2_vgpr3
@@ -776,16 +773,14 @@ body: |
; GFX6-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]]
; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO1]]
; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32)
- ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[MV]](s64), [[UV2]]
; GFX6-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
- ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[C]], [[MV]]
+ ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[UADDE1]](s1), [[C]], [[MV]]
; GFX6-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
; GFX6-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
; GFX6-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]]
; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO3]]
; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32)
- ; GFX6-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[MV1]](s64), [[UV3]]
- ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[ICMP1]](s1), [[C]], [[MV1]]
+ ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[UADDE3]](s1), [[C]], [[MV1]]
; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64)
; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
; GFX8-LABEL: name: uaddsat_v2s64
@@ -800,16 +795,14 @@ body: |
; GFX8-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]]
; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO1]]
; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32)
- ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[MV]](s64), [[UV2]]
; GFX8-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
- ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[C]], [[MV]]
+ ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[UADDE1]](s1), [[C]], [[MV]]
; GFX8-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
; GFX8-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]]
; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO3]]
; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32)
- ; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[MV1]](s64), [[UV3]]
- ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[ICMP1]](s1), [[C]], [[MV1]]
+ ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[UADDE3]](s1), [[C]], [[MV1]]
; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64)
; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
; GFX9-LABEL: name: uaddsat_v2s64
@@ -824,16 +817,14 @@ body: |
; GFX9-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]]
; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO1]]
; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32)
- ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[MV]](s64), [[UV2]]
; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
- ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[C]], [[MV]]
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[UADDE1]](s1), [[C]], [[MV]]
; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
; GFX9-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]]
; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO3]]
; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32)
- ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[MV1]](s64), [[UV3]]
- ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[ICMP1]](s1), [[C]], [[MV1]]
+ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[UADDE3]](s1), [[C]], [[MV1]]
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64)
; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
%0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usube.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usube.mir
index 302566a112579..e630eaefced33 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usube.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usube.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck %s
---
name: test_usube_s32
@@ -87,9 +87,9 @@ body: |
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND %13, [[C1]]
- ; CHECK-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[AND]], [[AND1]], [[ICMP]]
- ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[USUBE]], [[C1]]
- ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[USUBE]](s32), [[AND2]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[AND1]](s32)
+ ; CHECK-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[AND]], [[COPY2]], [[ICMP]]
+ ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[USUBE]](s32), [[AND1]]
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP1]](s1)
; CHECK-NEXT: $vgpr0 = COPY [[USUBE]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[ZEXT]](s32)
@@ -123,23 +123,12 @@ body: |
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
- ; CHECK-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]]
- ; CHECK-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]]
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[ICMP]](s1)
- ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ZEXT]](s64)
- ; CHECK-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[USUBO]], [[UV4]]
- ; CHECK-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[USUBE]], [[UV5]], [[USUBO3]]
- ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32)
- ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s64), [[COPY1]]
- ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]]
- ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[ICMP]](s1)
- ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s16) = G_ANYEXT [[ICMP2]](s1)
- ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[ICMP1]](s1), [[ANYEXT]], [[ANYEXT1]]
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[SELECT]](s16)
- ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C1]]
+ ; CHECK-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV]], [[UV2]], [[ICMP]]
+ ; CHECK-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBE1]]
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBE]](s32), [[USUBE2]](s32)
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[USUBE3]](s1)
; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
- ; CHECK-NEXT: $vgpr2 = COPY [[AND]](s32)
+ ; CHECK-NEXT: $vgpr2 = COPY [[ZEXT]](s32)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = COPY $vgpr2_vgpr3
%2:_(s32) = COPY $vgpr4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubo.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubo.mir
index 51737b33b9e95..249695a761636 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubo.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubo.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck %s
---
name: test_usubo_s32
@@ -41,9 +41,9 @@ body: |
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[AND1]]
; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C]]
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SUB]](s32), [[AND2]]
- ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[AND2]](s32)
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1)
- ; CHECK-NEXT: $vgpr0 = COPY [[AND3]](s32)
+ ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[ZEXT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
@@ -74,9 +74,9 @@ body: |
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[AND1]]
; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C]]
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SUB]](s32), [[AND2]]
- ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[AND2]](s32)
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1)
- ; CHECK-NEXT: $vgpr0 = COPY [[AND3]](s32)
+ ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[ZEXT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
@@ -106,8 +106,7 @@ body: |
; CHECK-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]]
; CHECK-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]]
; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32)
- ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]]
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1)
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[USUBE1]](s1)
; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
; CHECK-NEXT: $vgpr2 = COPY [[ZEXT]](s32)
%0:_(s64) = COPY $vgpr0_vgpr1
@@ -145,17 +144,17 @@ body: |
; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[AND3]], [[AND4]]
; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[SUB1]], [[C1]]
; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SUB1]](s32), [[AND5]]
- ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C1]]
- ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[SUB1]], [[C1]]
- ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C]](s32)
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[AND2]](s32)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[AND5]](s32)
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[SHL]]
; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP]](s1)
; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP1]](s1)
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
- ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND8]](s32), [[AND9]](s32)
+ ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
+ ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND6]](s32), [[AND7]](s32)
; CHECK-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
; CHECK-NEXT: $vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<2 x s32>)
%0:_(<2 x s16>) = COPY $vgpr0
@@ -209,27 +208,27 @@ body: |
; CHECK-NEXT: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV6]](<2 x s16>)
; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32)
; CHECK-NEXT: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV7]](<2 x s16>)
- ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C1]]
- ; CHECK-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[SUB1]], [[C1]]
- ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C]](s32)
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND9]], [[SHL]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[AND2]](s32)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[AND5]](s32)
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[SHL]]
; CHECK-NEXT: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
- ; CHECK-NEXT: [[AND11:%[0-9]+]]:_(s32) = G_AND [[SUB2]], [[C1]]
- ; CHECK-NEXT: [[AND12:%[0-9]+]]:_(s32) = G_AND [[BITCAST4]], [[C1]]
- ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND12]], [[C]](s32)
- ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND11]], [[SHL1]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[AND8]](s32)
+ ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[BITCAST4]], [[C1]]
+ ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C]](s32)
+ ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[COPY4]], [[SHL1]]
; CHECK-NEXT: [[BITCAST7:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
- ; CHECK-NEXT: [[AND13:%[0-9]+]]:_(s32) = G_AND [[LSHR2]], [[C1]]
- ; CHECK-NEXT: [[AND14:%[0-9]+]]:_(s32) = G_AND [[BITCAST5]], [[C1]]
- ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C]](s32)
- ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND13]], [[SHL2]]
+ ; CHECK-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[LSHR2]], [[C1]]
+ ; CHECK-NEXT: [[AND11:%[0-9]+]]:_(s32) = G_AND [[BITCAST5]], [[C1]]
+ ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C]](s32)
+ ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL2]]
; CHECK-NEXT: [[BITCAST8:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST6]](<2 x s16>), [[BITCAST7]](<2 x s16>), [[BITCAST8]](<2 x s16>)
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: [[AND15:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
- ; CHECK-NEXT: [[AND16:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
- ; CHECK-NEXT: [[AND17:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C2]]
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[AND15]](s32), [[AND16]](s32), [[AND17]](s32)
+ ; CHECK-NEXT: [[AND12:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
+ ; CHECK-NEXT: [[AND13:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
+ ; CHECK-NEXT: [[AND14:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C2]]
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[AND12]](s32), [[AND13]](s32), [[AND14]](s32)
; CHECK-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
; CHECK-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
%0:_(<6 x s16>) = COPY $vgpr0_vgpr1_vgpr2
@@ -287,15 +286,15 @@ body: |
; CHECK-NEXT: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[AND9]], [[AND10]]
; CHECK-NEXT: [[AND11:%[0-9]+]]:_(s32) = G_AND [[SUB3]], [[C1]]
; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SUB3]](s32), [[AND11]]
- ; CHECK-NEXT: [[AND12:%[0-9]+]]:_(s32) = G_AND [[SUB]], [[C1]]
- ; CHECK-NEXT: [[AND13:%[0-9]+]]:_(s32) = G_AND [[SUB1]], [[C1]]
- ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C]](s32)
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND12]], [[SHL]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[AND2]](s32)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[AND5]](s32)
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C]](s32)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[SHL]]
; CHECK-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
- ; CHECK-NEXT: [[AND14:%[0-9]+]]:_(s32) = G_AND [[SUB2]], [[C1]]
- ; CHECK-NEXT: [[AND15:%[0-9]+]]:_(s32) = G_AND [[SUB3]], [[C1]]
- ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C]](s32)
- ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND14]], [[SHL1]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[AND8]](s32)
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[AND11]](s32)
+ ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY5]], [[C]](s32)
+ ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[COPY4]], [[SHL1]]
; CHECK-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP]](s1)
@@ -303,11 +302,11 @@ body: |
; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP2]](s1)
; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[ICMP3]](s1)
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: [[AND16:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
- ; CHECK-NEXT: [[AND17:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
- ; CHECK-NEXT: [[AND18:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C2]]
- ; CHECK-NEXT: [[AND19:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C2]]
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[AND16]](s32), [[AND17]](s32), [[AND18]](s32), [[AND19]](s32)
+ ; CHECK-NEXT: [[AND12:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]]
+ ; CHECK-NEXT: [[AND13:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]]
+ ; CHECK-NEXT: [[AND14:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C2]]
+ ; CHECK-NEXT: [[AND15:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C2]]
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[AND12]](s32), [[AND13]](s32), [[AND14]](s32), [[AND15]](s32)
; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
; CHECK-NEXT: $vgpr2_vgpr3_vgpr4_vgpr5 = COPY [[BUILD_VECTOR]](<4 x s32>)
%0:_(<4 x s16>) = COPY $vgpr0_vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir
index 1cfde3549f28f..4b8a067ab74a9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-usubsat.mir
@@ -694,9 +694,8 @@ body: |
; GFX6-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]]
; GFX6-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]]
; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32)
- ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]]
; GFX6-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[C]], [[MV]]
+ ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[USUBE1]](s1), [[C]], [[MV]]
; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64)
; GFX8-LABEL: name: usubsat_s64
; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
@@ -708,9 +707,8 @@ body: |
; GFX8-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]]
; GFX8-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]]
; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32)
- ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]]
; GFX8-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[C]], [[MV]]
+ ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[USUBE1]](s1), [[C]], [[MV]]
; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64)
; GFX9-LABEL: name: usubsat_s64
; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
@@ -722,9 +720,8 @@ body: |
; GFX9-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV]], [[UV2]]
; GFX9-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV1]], [[UV3]], [[USUBO1]]
; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32)
- ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]]
; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[C]], [[MV]]
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[USUBE1]](s1), [[C]], [[MV]]
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = COPY $vgpr2_vgpr3
@@ -750,16 +747,14 @@ body: |
; GFX6-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV4]], [[UV6]]
; GFX6-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV5]], [[UV7]], [[USUBO1]]
; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32)
- ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[UV]](s64), [[UV2]]
; GFX6-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[C]], [[MV]]
+ ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[USUBE1]](s1), [[C]], [[MV]]
; GFX6-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
; GFX6-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV8]], [[UV10]]
; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV9]], [[UV11]], [[USUBO3]]
; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32)
- ; GFX6-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[UV1]](s64), [[UV3]]
- ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[ICMP1]](s1), [[C]], [[MV1]]
+ ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[USUBE3]](s1), [[C]], [[MV1]]
; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64)
; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
; GFX8-LABEL: name: usubsat_v2s64
@@ -774,16 +769,14 @@ body: |
; GFX8-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV4]], [[UV6]]
; GFX8-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV5]], [[UV7]], [[USUBO1]]
; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32)
- ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[UV]](s64), [[UV2]]
; GFX8-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[C]], [[MV]]
+ ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[USUBE1]](s1), [[C]], [[MV]]
; GFX8-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV8]], [[UV10]]
; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV9]], [[UV11]], [[USUBO3]]
; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32)
- ; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[UV1]](s64), [[UV3]]
- ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[ICMP1]](s1), [[C]], [[MV1]]
+ ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[USUBE3]](s1), [[C]], [[MV1]]
; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64)
; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
; GFX9-LABEL: name: usubsat_v2s64
@@ -798,16 +791,14 @@ body: |
; GFX9-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[UV4]], [[UV6]]
; GFX9-NEXT: [[USUBE:%[0-9]+]]:_(s32), [[USUBE1:%[0-9]+]]:_(s1) = G_USUBE [[UV5]], [[UV7]], [[USUBO1]]
; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO]](s32), [[USUBE]](s32)
- ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[UV]](s64), [[UV2]]
; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s1), [[C]], [[MV]]
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[USUBE1]](s1), [[C]], [[MV]]
; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV8]], [[UV10]]
; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV9]], [[UV11]], [[USUBO3]]
; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32)
- ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[UV1]](s64), [[UV3]]
- ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[ICMP1]](s1), [[C]], [[MV1]]
+ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[USUBE3]](s1), [[C]], [[MV1]]
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64)
; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
%0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
index 5431466c8d8c0..f8e7e5ecd6260 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
@@ -39,34 +39,31 @@ define i64 @v_usubo_i64(i64 %a, i64 %b) {
; GFX7-LABEL: v_usubo_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
-; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc
-; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v4, v0
-; GFX7-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v5, vcc
+; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX7-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubo_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2
-; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v4, v0
-; GFX8-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v5, vcc
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubo_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v0
-; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v5, vcc
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
%usubo = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
%sub = extractvalue {i64, i1} %usubo, 0
@@ -479,47 +476,29 @@ define amdgpu_ps i32 @s_usubo_i32(i32 inreg %a, i32 inreg %b) {
define amdgpu_ps i64 @s_usubo_i64(i64 inreg %a, i64 inreg %b) {
; GFX7-LABEL: s_usubo_i64:
; GFX7: ; %bb.0:
-; GFX7-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX7-NEXT: s_sub_u32 s4, s0, s2
-; GFX7-NEXT: s_subb_u32 s5, s1, s3
-; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT: v_mov_b32_e32 v1, s5
-; GFX7-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
-; GFX7-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: v_readfirstlane_b32 s0, v0
-; GFX7-NEXT: v_readfirstlane_b32 s1, v1
+; GFX7-NEXT: s_sub_u32 s0, s0, s2
+; GFX7-NEXT: s_subb_u32 s1, s1, s3
+; GFX7-NEXT: s_cselect_b32 s2, 1, 0
+; GFX7-NEXT: s_sub_u32 s0, s0, s2
+; GFX7-NEXT: s_subb_u32 s1, s1, 0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_usubo_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX8-NEXT: s_sub_u32 s4, s0, s2
-; GFX8-NEXT: s_subb_u32 s5, s1, s3
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
-; GFX8-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: s_sub_u32 s0, s0, s2
+; GFX8-NEXT: s_subb_u32 s1, s1, s3
+; GFX8-NEXT: s_cselect_b32 s2, 1, 0
+; GFX8-NEXT: s_sub_u32 s0, s0, s2
+; GFX8-NEXT: s_subb_u32 s1, s1, 0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_usubo_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT: s_sub_u32 s4, s0, s2
-; GFX9-NEXT: s_subb_u32 s5, s1, s3
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0
-; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: s_sub_u32 s0, s0, s2
+; GFX9-NEXT: s_subb_u32 s1, s1, s3
+; GFX9-NEXT: s_cselect_b32 s2, 1, 0
+; GFX9-NEXT: s_sub_u32 s0, s0, s2
+; GFX9-NEXT: s_subb_u32 s1, s1, 0
; GFX9-NEXT: ; return to shader part epilog
%usubo = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
%sub = extractvalue {i64, i1} %usubo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 9ba5baf10af7c..0ca215bb838f3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -2640,13 +2640,18 @@ define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) {
; GFX6-LABEL: v_uaddsat_i48:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v1, v3, vcc
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v4
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v4, -1, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uaddsat_i48:
@@ -2656,7 +2661,6 @@ define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) {
; GFX8-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
@@ -2669,7 +2673,6 @@ define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) {
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
@@ -2683,7 +2686,6 @@ define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) {
; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
@@ -2695,21 +2697,21 @@ define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) {
define amdgpu_ps i48 @s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX6-LABEL: s_uaddsat_i48:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_mov_b32 s4, -1
; GFX6-NEXT: s_add_u32 s0, s0, s2
-; GFX6-NEXT: s_mov_b32 s5, 0xffff
-; GFX6-NEXT: s_addc_u32 s1, s1, s3
-; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v0, s2
-; GFX6-NEXT: s_and_b64 s[6:7], s[0:1], s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v1, s3
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: v_mov_b32_e32 v3, s1
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
+; GFX6-NEXT: s_cselect_b32 s2, 1, 0
+; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX6-NEXT: s_cmp_lg_u32 s2, 0
+; GFX6-NEXT: s_addc_u32 s2, s1, s3
+; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
+; GFX6-NEXT: s_cmp_lg_u32 s2, s1
+; GFX6-NEXT: s_cselect_b32 s2, 1, 0
+; GFX6-NEXT: s_lshr_b32 s3, s0, 16
+; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT: s_lshl_b32 s3, s3, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s3
+; GFX6-NEXT: s_cmp_lg_u32 s2, 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_uaddsat_i48:
@@ -2717,17 +2719,9 @@ define amdgpu_ps i48 @s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: s_addc_u32 s1, s1, s3
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
+; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_uaddsat_i48:
@@ -2735,17 +2729,9 @@ define amdgpu_ps i48 @s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
; GFX9-NEXT: s_add_u32 s0, s0, s2
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: s_addc_u32 s1, s1, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_uaddsat_i48:
@@ -2754,12 +2740,8 @@ define amdgpu_ps i48 @s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
; GFX10PLUS-NEXT: s_add_u32 s0, s0, s2
; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s3
-; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s2, s[0:1], s[2:3]
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, s0, -1, s2
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, s1, -1, s2
-; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
+; GFX10PLUS-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs)
ret i48 %result
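
The widened i48 sequences above are easier to read against the semantic definition of the saturate: the carry out of the low 32 bits is exact, but overflow past bit 47 has to be detected by masking the high half, which is why the new GFX6 code masks with 0xffff and compares the raw high-half sum against the masked one (s_cmp_lg_u32). An equivalent IR-level expansion, written out as an illustrative sketch rather than anything taken from the test file, is:

; Illustrative expansion of @llvm.uadd.sat.i48 (sketch only):
define i48 @uaddsat_i48_expanded(i48 %lhs, i48 %rhs) {
  %l = zext i48 %lhs to i64
  %r = zext i48 %rhs to i64
  %sum = add i64 %l, %r                          ; cannot wrap once widened to i64
  %ov = icmp ugt i64 %sum, 281474976710655       ; 2^48 - 1
  %sat = select i1 %ov, i64 281474976710655, i64 %sum
  %res = trunc i64 %sat to i48
  ret i48 %res
}
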
@@ -2768,14 +2750,19 @@ define amdgpu_ps i48 @s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
define amdgpu_ps <2 x float> @uaddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX6-LABEL: uaddsat_i48_sv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_mov_b32_e32 v3, s1
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0
-; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v3, v1, vcc
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v4
+; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v4, -1, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc
+; GFX6-NEXT: v_mov_b32_e32 v2, s1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: ; return to shader part epilog
;
@@ -2783,12 +2770,11 @@ define amdgpu_ps <2 x float> @uaddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX8: ; %bb.0:
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
; GFX8-NEXT: ; return to shader part epilog
;
@@ -2796,12 +2782,11 @@ define amdgpu_ps <2 x float> @uaddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
; GFX9-NEXT: ; return to shader part epilog
;
@@ -2809,11 +2794,10 @@ define amdgpu_ps <2 x float> @uaddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
-; GFX10PLUS-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0
-; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc_lo
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc_lo
+; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs)
@@ -2825,14 +2809,17 @@ define amdgpu_ps <2 x float> @uaddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
define amdgpu_ps <2 x float> @uaddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX6-LABEL: uaddsat_i48_vs:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: v_mov_b32_e32 v2, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v1, v2, vcc
-; GFX6-NEXT: s_mov_b32 s3, 0xffff
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2
-; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -2845,7 +2832,6 @@ define amdgpu_ps <2 x float> @uaddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
-; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
@@ -2858,7 +2844,6 @@ define amdgpu_ps <2 x float> @uaddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
@@ -2870,7 +2855,6 @@ define amdgpu_ps <2 x float> @uaddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, v0, s0
; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
@@ -2887,7 +2871,6 @@ define i64 @v_uaddsat_i64(i64 %lhs, i64 %rhs) {
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -2897,7 +2880,6 @@ define i64 @v_uaddsat_i64(i64 %lhs, i64 %rhs) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -2907,7 +2889,6 @@ define i64 @v_uaddsat_i64(i64 %lhs, i64 %rhs) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2918,7 +2899,6 @@ define i64 @v_uaddsat_i64(i64 %lhs, i64 %rhs) {
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
@@ -2930,57 +2910,29 @@ define amdgpu_ps i64 @s_uaddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX6-LABEL: s_uaddsat_i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_add_u32 s0, s0, s2
-; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: s_addc_u32 s1, s1, s3
-; GFX6-NEXT: v_mov_b32_e32 v1, s3
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: v_mov_b32_e32 v3, s1
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
+; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_uaddsat_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: s_addc_u32 s1, s1, s3
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_uaddsat_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, s2
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: s_addc_u32 s1, s1, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_uaddsat_i64:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_add_u32 s0, s0, s2
; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s3
-; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s2, s[0:1], s[2:3]
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, s0, -1, s2
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, s1, -1, s2
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs)
ret i64 %result
@@ -2989,41 +2941,37 @@ define amdgpu_ps i64 @s_uaddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
define amdgpu_ps <2 x float> @uaddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
; GFX6-LABEL: uaddsat_i64_sv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_mov_b32_e32 v3, s1
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0
-; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc
+; GFX6-NEXT: v_mov_b32_e32 v2, s1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: uaddsat_i64_sv:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: uaddsat_i64_sv:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: uaddsat_i64_sv:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0
-; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc_lo
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc_lo
+; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs)
%cast = bitcast i64 %result to <2 x float>
@@ -3036,7 +2984,6 @@ define amdgpu_ps <2 x float> @uaddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX6-NEXT: v_mov_b32_e32 v2, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
-; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX6-NEXT: ; return to shader part epilog
@@ -3046,7 +2993,6 @@ define amdgpu_ps <2 x float> @uaddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
-; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX8-NEXT: ; return to shader part epilog
@@ -3056,7 +3002,6 @@ define amdgpu_ps <2 x float> @uaddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX9-NEXT: ; return to shader part epilog
@@ -3065,7 +3010,6 @@ define amdgpu_ps <2 x float> @uaddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, v0, s0
; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
; GFX10PLUS-NEXT: ; return to shader part epilog
@@ -3080,12 +3024,10 @@ define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[6:7]
; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -3095,12 +3037,10 @@ define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[6:7]
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -3110,12 +3050,10 @@ define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3125,11 +3063,9 @@ define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
+; GFX10-NEXT: v_add_co_u32 v2, s4, v2, v6
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
-; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
-; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[6:7]
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s4, v3, v7, s4
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -1, s4
@@ -3141,11 +3077,9 @@ define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
+; GFX11-NEXT: v_add_co_u32 v2, s0, v2, v6
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
-; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[6:7]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, v3, v7, s0
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, -1, s0
@@ -3159,97 +3093,41 @@ define amdgpu_ps <2 x i64> @s_uaddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX6-LABEL: s_uaddsat_v2i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_add_u32 s0, s0, s4
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_addc_u32 s1, s1, s5
-; GFX6-NEXT: v_mov_b32_e32 v1, s5
-; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX6-NEXT: s_add_u32 s0, s2, s6
-; GFX6-NEXT: v_mov_b32_e32 v0, s6
-; GFX6-NEXT: v_mov_b32_e32 v3, s1
-; GFX6-NEXT: s_addc_u32 s1, s3, s7
-; GFX6-NEXT: v_mov_b32_e32 v1, s7
-; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX6-NEXT: v_mov_b32_e32 v4, s0
-; GFX6-NEXT: v_mov_b32_e32 v5, s1
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v4, -1, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v5, -1, vcc
-; GFX6-NEXT: v_readfirstlane_b32 s0, v2
-; GFX6-NEXT: v_readfirstlane_b32 s1, v3
-; GFX6-NEXT: v_readfirstlane_b32 s2, v0
-; GFX6-NEXT: v_readfirstlane_b32 s3, v1
+; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
+; GFX6-NEXT: s_add_u32 s2, s2, s6
+; GFX6-NEXT: s_addc_u32 s3, s3, s7
+; GFX6-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_uaddsat_v2i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s0, s0, s4
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX8-NEXT: s_add_u32 s0, s2, s6
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: s_addc_u32 s1, s3, s7
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, -1, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, -1, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s0, v2
-; GFX8-NEXT: v_readfirstlane_b32 s1, v3
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_readfirstlane_b32 s3, v1
+; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
+; GFX8-NEXT: s_add_u32 s2, s2, s6
+; GFX8-NEXT: s_addc_u32 s3, s3, s7
+; GFX8-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_uaddsat_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: s_addc_u32 s1, s1, s5
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT: s_add_u32 s0, s2, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: s_addc_u32 s1, s3, s7
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, -1, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, -1, vcc
-; GFX9-NEXT: v_readfirstlane_b32 s0, v2
-; GFX9-NEXT: v_readfirstlane_b32 s1, v3
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
-; GFX9-NEXT: v_readfirstlane_b32 s3, v1
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
+; GFX9-NEXT: s_add_u32 s2, s2, s6
+; GFX9-NEXT: s_addc_u32 s3, s3, s7
+; GFX9-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_uaddsat_v2i64:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_add_u32 s0, s0, s4
; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s5
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
; GFX10PLUS-NEXT: s_add_u32 s2, s2, s6
; GFX10PLUS-NEXT: s_addc_u32 s3, s3, s7
-; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
-; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s5, s[2:3], s[6:7]
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, s0, -1, s4
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, s1, -1, s4
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, s2, -1, s5
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, s3, -1, s5
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3
+; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
ret <2 x i64> %result
@@ -3259,103 +3137,31 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX6-LABEL: s_uaddsat_i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_add_u32 s0, s0, s4
-; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: s_addc_u32 s1, s1, s5
-; GFX6-NEXT: v_mov_b32_e32 v3, s5
; GFX6-NEXT: s_addc_u32 s2, s2, s6
-; GFX6-NEXT: v_mov_b32_e32 v0, s6
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
; GFX6-NEXT: s_addc_u32 s3, s3, s7
-; GFX6-NEXT: v_mov_b32_e32 v1, s7
-; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: v_mov_b32_e32 v2, s1
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
-; GFX6-NEXT: v_mov_b32_e32 v2, s2
-; GFX6-NEXT: v_mov_b32_e32 v3, s3
-; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v2
-; GFX6-NEXT: v_readfirstlane_b32 s3, v3
+; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
+; GFX6-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_uaddsat_i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_addc_u32 s2, s2, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_addc_u32 s3, s3, s7
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
-; GFX8-NEXT: s_cselect_b32 s6, 1, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX8-NEXT: s_and_b32 s4, 1, s6
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: v_readfirstlane_b32 s2, v2
-; GFX8-NEXT: v_readfirstlane_b32 s3, v3
+; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
+; GFX8-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_uaddsat_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, s4
; GFX9-NEXT: s_addc_u32 s1, s1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_addc_u32 s2, s2, s6
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: s_addc_u32 s3, s3, s7
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
-; GFX9-NEXT: s_cselect_b32 s6, 1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX9-NEXT: s_and_b32 s4, 1, s6
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s2, v2
-; GFX9-NEXT: v_readfirstlane_b32 s3, v3
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
+; GFX9-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_uaddsat_i128:
@@ -3363,26 +3169,9 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX10PLUS-NEXT: s_add_u32 s0, s0, s4
; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s5
; GFX10PLUS-NEXT: s_addc_u32 s2, s2, s6
-; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
; GFX10PLUS-NEXT: s_addc_u32 s3, s3, s7
-; GFX10PLUS-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
-; GFX10PLUS-NEXT: s_cselect_b32 s8, 1, 0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s4, s[2:3], s[6:7]
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4
-; GFX10PLUS-NEXT: s_and_b32 s4, 1, s8
-; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10PLUS-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10PLUS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, s0, -1, vcc_lo
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, s1, -1, vcc_lo
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, s2, -1, vcc_lo
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, s3, -1, vcc_lo
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
+; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i128 @llvm.uadd.sat.i128(i128 %lhs, i128 %rhs)
ret i128 %result
@@ -3391,91 +3180,59 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
define amdgpu_ps <4 x float> @uaddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX6-LABEL: uaddsat_i128_sv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_mov_b32_e32 v5, s1
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, s0, v0
-; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; GFX6-NEXT: v_mov_b32_e32 v6, s2
-; GFX6-NEXT: v_mov_b32_e32 v7, s3
-; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v6, v2, vcc
-; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
-; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v4, -1, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v5, -1, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v2, v6, -1, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v3, v7, -1, vcc
+; GFX6-NEXT: v_mov_b32_e32 v4, s1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX6-NEXT: v_mov_b32_e32 v4, s2
+; GFX6-NEXT: v_mov_b32_e32 v5, s3
+; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc
+; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: uaddsat_i128_sv:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, s0, v0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v6, s2
-; GFX8-NEXT: v_mov_b32_e32 v7, s3
-; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v6, v2, vcc
-; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, -1, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, -1, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, -1, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, -1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: uaddsat_i128_sv:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v6, s2
-; GFX9-NEXT: v_mov_b32_e32 v7, s3
-; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v2, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v3, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, -1, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, -1, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, -1, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, -1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v4, s2
+; GFX9-NEXT: v_mov_b32_e32 v5, s3
+; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v2, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: uaddsat_i128_sv:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_add_co_u32 v4, vcc_lo, s0, v0
-; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
-; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[2:3]
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10PLUS-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10PLUS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v4, -1, vcc_lo
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v5, -1, vcc_lo
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v6, -1, vcc_lo
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v7, -1, vcc_lo
+; GFX10PLUS-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
+; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc_lo
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i128 @llvm.uadd.sat.i128(i128 %lhs, i128 %rhs)
%cast = bitcast i128 %result to <4 x float>
@@ -3492,14 +3249,6 @@ define amdgpu_ps <4 x float> @uaddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX6-NEXT: v_mov_b32_e32 v5, s3
; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
-; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
-; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3]
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX6-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
@@ -3515,14 +3264,6 @@ define amdgpu_ps <4 x float> @uaddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
-; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
@@ -3538,14 +3279,6 @@ define amdgpu_ps <4 x float> @uaddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX9-NEXT: v_mov_b32_e32 v5, s3
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
@@ -3558,14 +3291,6 @@ define amdgpu_ps <4 x float> @uaddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[2:3]
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[2:3]
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo
-; GFX10PLUS-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX10PLUS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc_lo
@@ -3584,14 +3309,6 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v2, v10, vcc
; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9]
-; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11]
-; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11]
-; GFX6-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
-; GFX6-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
@@ -3600,14 +3317,6 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v13, vcc
; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v6, v14, vcc
; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v7, v15, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13]
-; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[14:15]
-; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[14:15]
-; GFX6-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
-; GFX6-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, -1, vcc
@@ -3621,14 +3330,6 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v2, v10, vcc
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9]
-; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11]
-; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11]
-; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
-; GFX8-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
@@ -3637,14 +3338,6 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v13, vcc
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v6, v14, vcc
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v15, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13]
-; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[14:15]
-; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[14:15]
-; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
-; GFX8-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, -1, vcc
@@ -3658,14 +3351,6 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v9, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v10, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v11, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9]
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11]
-; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11]
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
-; GFX9-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
@@ -3674,14 +3359,6 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v13, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v14, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v15, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13]
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[14:15]
-; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[14:15]
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
-; GFX9-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, -1, vcc
@@ -3693,31 +3370,15 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v8
+; GFX10-NEXT: v_add_co_u32 v4, s4, v4, v12
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s4, v5, v13, s4
; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s4, v6, v14, s4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
-; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[8:9]
-; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v12
-; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo
-; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v6, v14, vcc_lo
-; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v15, vcc_lo
-; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[10:11]
-; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo
-; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13]
-; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo
-; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[14:15]
-; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[10:11]
-; GFX10-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[14:15]
-; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX10-NEXT: v_cndmask_b32_e32 v9, v13, v12, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX10-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v7, v15, s4
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v9
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, -1, s4
@@ -3731,34 +3392,18 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v8
+; GFX11-NEXT: v_add_co_u32 v4, s0, v4, v12
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s0, v5, v13, s0
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, v6, v14, s0
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
-; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[8:9]
-; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v12
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo
-; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v6, v14, vcc_lo
-; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v15, vcc_lo
-; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[10:11]
-; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo
-; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13]
-; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo
-; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[14:15]
-; GFX11-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[10:11]
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[14:15]
-; GFX11-NEXT: v_cndmask_b32_e32 v9, v13, v12, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v9, 1, v9
-; GFX11-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v9
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, -1, s0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, s0, v7, v15, s0
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, -1, s0
; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, -1, s0
; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, -1, s0
; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, -1, s0
@@ -3771,293 +3416,66 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-LABEL: s_uaddsat_v2i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_add_u32 s0, s0, s8
-; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: s_addc_u32 s1, s1, s9
-; GFX6-NEXT: v_mov_b32_e32 v3, s9
; GFX6-NEXT: s_addc_u32 s2, s2, s10
-; GFX6-NEXT: v_mov_b32_e32 v0, s10
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
; GFX6-NEXT: s_addc_u32 s3, s3, s11
-; GFX6-NEXT: v_mov_b32_e32 v1, s11
-; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: v_mov_b32_e32 v2, s1
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc
-; GFX6-NEXT: s_add_u32 s0, s4, s12
-; GFX6-NEXT: v_mov_b32_e32 v2, s12
-; GFX6-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc
-; GFX6-NEXT: v_mov_b32_e32 v0, s2
-; GFX6-NEXT: v_mov_b32_e32 v1, s3
-; GFX6-NEXT: s_addc_u32 s1, s5, s13
-; GFX6-NEXT: v_mov_b32_e32 v3, s13
-; GFX6-NEXT: v_cndmask_b32_e64 v6, v0, -1, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v7, v1, -1, vcc
-; GFX6-NEXT: s_addc_u32 s2, s6, s14
-; GFX6-NEXT: v_mov_b32_e32 v0, s14
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
-; GFX6-NEXT: s_addc_u32 s3, s7, s15
-; GFX6-NEXT: v_mov_b32_e32 v1, s15
-; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: v_mov_b32_e32 v2, s1
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
-; GFX6-NEXT: v_mov_b32_e32 v2, s2
-; GFX6-NEXT: v_mov_b32_e32 v3, s3
-; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
-; GFX6-NEXT: v_readfirstlane_b32 s0, v4
-; GFX6-NEXT: v_readfirstlane_b32 s1, v5
-; GFX6-NEXT: v_readfirstlane_b32 s2, v6
-; GFX6-NEXT: v_readfirstlane_b32 s3, v7
-; GFX6-NEXT: v_readfirstlane_b32 s4, v0
-; GFX6-NEXT: v_readfirstlane_b32 s5, v1
-; GFX6-NEXT: v_readfirstlane_b32 s6, v2
-; GFX6-NEXT: v_readfirstlane_b32 s7, v3
+; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
+; GFX6-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
+; GFX6-NEXT: s_add_u32 s4, s4, s12
+; GFX6-NEXT: s_addc_u32 s5, s5, s13
+; GFX6-NEXT: s_addc_u32 s6, s6, s14
+; GFX6-NEXT: s_addc_u32 s7, s7, s15
+; GFX6-NEXT: s_cselect_b64 s[4:5], -1, s[4:5]
+; GFX6-NEXT: s_cselect_b64 s[6:7], -1, s[6:7]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_uaddsat_v2i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s0, s0, s8
; GFX8-NEXT: s_addc_u32 s1, s1, s9
-; GFX8-NEXT: v_mov_b32_e32 v2, s8
; GFX8-NEXT: s_addc_u32 s2, s2, s10
-; GFX8-NEXT: v_mov_b32_e32 v3, s9
; GFX8-NEXT: s_addc_u32 s3, s3, s11
-; GFX8-NEXT: v_mov_b32_e32 v0, s10
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v1, s11
-; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[10:11]
-; GFX8-NEXT: s_cselect_b32 s10, 1, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX8-NEXT: s_and_b32 s8, 1, s10
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s8
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: s_add_u32 s0, s4, s12
-; GFX8-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc
-; GFX8-NEXT: s_addc_u32 s1, s5, s13
-; GFX8-NEXT: v_mov_b32_e32 v2, s12
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_addc_u32 s2, s6, s14
-; GFX8-NEXT: v_mov_b32_e32 v3, s13
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, -1, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v7, v1, -1, vcc
-; GFX8-NEXT: s_addc_u32 s3, s7, s15
-; GFX8-NEXT: v_mov_b32_e32 v0, s14
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v1, s15
-; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[14:15]
-; GFX8-NEXT: s_cselect_b32 s4, 1, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX8-NEXT: s_and_b32 s4, 1, s4
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s0, v4
-; GFX8-NEXT: v_readfirstlane_b32 s1, v5
-; GFX8-NEXT: v_readfirstlane_b32 s2, v6
-; GFX8-NEXT: v_readfirstlane_b32 s3, v7
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
-; GFX8-NEXT: v_readfirstlane_b32 s6, v2
-; GFX8-NEXT: v_readfirstlane_b32 s7, v3
+; GFX8-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
+; GFX8-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
+; GFX8-NEXT: s_add_u32 s4, s4, s12
+; GFX8-NEXT: s_addc_u32 s5, s5, s13
+; GFX8-NEXT: s_addc_u32 s6, s6, s14
+; GFX8-NEXT: s_addc_u32 s7, s7, s15
+; GFX8-NEXT: s_cselect_b64 s[4:5], -1, s[4:5]
+; GFX8-NEXT: s_cselect_b64 s[6:7], -1, s[6:7]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_uaddsat_v2i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, s8
; GFX9-NEXT: s_addc_u32 s1, s1, s9
-; GFX9-NEXT: v_mov_b32_e32 v2, s8
; GFX9-NEXT: s_addc_u32 s2, s2, s10
-; GFX9-NEXT: v_mov_b32_e32 v3, s9
; GFX9-NEXT: s_addc_u32 s3, s3, s11
-; GFX9-NEXT: v_mov_b32_e32 v0, s10
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, s11
-; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[10:11]
-; GFX9-NEXT: s_cselect_b32 s10, 1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX9-NEXT: s_and_b32 s8, 1, s10
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s8
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_add_u32 s0, s4, s12
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc
-; GFX9-NEXT: s_addc_u32 s1, s5, s13
-; GFX9-NEXT: v_mov_b32_e32 v2, s12
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: s_addc_u32 s2, s6, s14
-; GFX9-NEXT: v_mov_b32_e32 v3, s13
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, -1, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v1, -1, vcc
-; GFX9-NEXT: s_addc_u32 s3, s7, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s14
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, s15
-; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[14:15]
-; GFX9-NEXT: s_cselect_b32 s4, 1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX9-NEXT: s_and_b32 s4, 1, s4
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
-; GFX9-NEXT: v_readfirstlane_b32 s0, v4
-; GFX9-NEXT: v_readfirstlane_b32 s1, v5
-; GFX9-NEXT: v_readfirstlane_b32 s2, v6
-; GFX9-NEXT: v_readfirstlane_b32 s3, v7
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: v_readfirstlane_b32 s7, v3
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
+; GFX9-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
+; GFX9-NEXT: s_add_u32 s4, s4, s12
+; GFX9-NEXT: s_addc_u32 s5, s5, s13
+; GFX9-NEXT: s_addc_u32 s6, s6, s14
+; GFX9-NEXT: s_addc_u32 s7, s7, s15
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, s[4:5]
+; GFX9-NEXT: s_cselect_b64 s[6:7], -1, s[6:7]
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_uaddsat_v2i128:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s0, s0, s8
-; GFX10-NEXT: s_addc_u32 s1, s1, s9
-; GFX10-NEXT: s_addc_u32 s2, s2, s10
-; GFX10-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[8:9]
-; GFX10-NEXT: s_addc_u32 s3, s3, s11
-; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[10:11]
-; GFX10-NEXT: s_cselect_b32 s16, 1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s8
-; GFX10-NEXT: v_cmp_lt_u64_e64 s8, s[2:3], s[10:11]
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s8
-; GFX10-NEXT: s_and_b32 s8, 1, s16
-; GFX10-NEXT: s_add_u32 s4, s4, s12
-; GFX10-NEXT: s_addc_u32 s5, s5, s13
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8
-; GFX10-NEXT: v_cmp_lt_u64_e64 s9, s[4:5], s[12:13]
-; GFX10-NEXT: s_addc_u32 s6, s6, s14
-; GFX10-NEXT: s_addc_u32 s7, s7, s15
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT: s_cmp_eq_u64 s[6:7], s[14:15]
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s9
-; GFX10-NEXT: v_cmp_lt_u64_e64 s9, s[6:7], s[14:15]
-; GFX10-NEXT: s_cselect_b32 s8, 1, 0
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: s_and_b32 s8, 1, s8
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s9
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, -1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, s2, -1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, s3, -1, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, -1, vcc_lo
-; GFX10-NEXT: v_readfirstlane_b32 s2, v2
-; GFX10-NEXT: v_readfirstlane_b32 s3, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v4, s4, -1, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v5, s5, -1, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v6, s6, -1, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v7, s7, -1, s0
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: v_readfirstlane_b32 s4, v4
-; GFX10-NEXT: v_readfirstlane_b32 s5, v5
-; GFX10-NEXT: v_readfirstlane_b32 s6, v6
-; GFX10-NEXT: v_readfirstlane_b32 s7, v7
-; GFX10-NEXT: ; return to shader part epilog
-;
-; GFX11-LABEL: s_uaddsat_v2i128:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_add_u32 s0, s0, s8
-; GFX11-NEXT: s_addc_u32 s1, s1, s9
-; GFX11-NEXT: s_addc_u32 s2, s2, s10
-; GFX11-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[8:9]
-; GFX11-NEXT: s_addc_u32 s3, s3, s11
-; GFX11-NEXT: s_cmp_eq_u64 s[2:3], s[10:11]
-; GFX11-NEXT: s_cselect_b32 s16, 1, 0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s8
-; GFX11-NEXT: v_cmp_lt_u64_e64 s8, s[2:3], s[10:11]
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s8
-; GFX11-NEXT: s_and_b32 s8, 1, s16
-; GFX11-NEXT: s_add_u32 s4, s4, s12
-; GFX11-NEXT: s_addc_u32 s5, s5, s13
-; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8
-; GFX11-NEXT: v_cmp_lt_u64_e64 s9, s[4:5], s[12:13]
-; GFX11-NEXT: s_addc_u32 s6, s6, s14
-; GFX11-NEXT: s_addc_u32 s7, s7, s15
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT: s_cmp_eq_u64 s[6:7], s[14:15]
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s9
-; GFX11-NEXT: v_cmp_lt_u64_e64 s9, s[6:7], s[14:15]
-; GFX11-NEXT: s_cselect_b32 s8, 1, 0
-; GFX11-NEXT: s_and_b32 s8, 1, s8
-; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s9
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v2, v1 :: v_dual_and_b32 v0, 1, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, s0, -1, vcc_lo
-; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, s1, -1, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, s2, -1, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, s3, -1, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v4, s4, -1, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v5, s5, -1, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v6, s6, -1, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v7, s7, -1, s0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11-NEXT: v_readfirstlane_b32 s2, v2
-; GFX11-NEXT: v_readfirstlane_b32 s3, v3
-; GFX11-NEXT: v_readfirstlane_b32 s4, v4
-; GFX11-NEXT: v_readfirstlane_b32 s5, v5
-; GFX11-NEXT: v_readfirstlane_b32 s6, v6
-; GFX11-NEXT: v_readfirstlane_b32 s7, v7
-; GFX11-NEXT: ; return to shader part epilog
+; GFX10PLUS-LABEL: s_uaddsat_v2i128:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_add_u32 s0, s0, s8
+; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s9
+; GFX10PLUS-NEXT: s_addc_u32 s2, s2, s10
+; GFX10PLUS-NEXT: s_addc_u32 s3, s3, s11
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], -1, s[0:1]
+; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], -1, s[2:3]
+; GFX10PLUS-NEXT: s_add_u32 s4, s4, s12
+; GFX10PLUS-NEXT: s_addc_u32 s5, s5, s13
+; GFX10PLUS-NEXT: s_addc_u32 s6, s6, s14
+; GFX10PLUS-NEXT: s_addc_u32 s7, s7, s15
+; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], -1, s[4:5]
+; GFX10PLUS-NEXT: s_cselect_b64 s[6:7], -1, s[6:7]
+; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
ret <2 x i128> %result
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index 356f28f06389b..53a9c987d8fdb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -2510,13 +2510,18 @@ define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) {
; GFX6-LABEL: v_usubsat_i48:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
-; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_i48:
@@ -2524,11 +2529,10 @@ define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
; GFX8-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
-; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2
-; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -2537,11 +2541,10 @@ define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
-; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -2551,11 +2554,10 @@ define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) {
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
-; GFX10PLUS-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
-; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc_lo
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc_lo
+; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
@@ -2565,71 +2567,51 @@ define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) {
define amdgpu_ps i48 @s_usubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX6-LABEL: s_usubsat_i48:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_mov_b32 s6, -1
-; GFX6-NEXT: s_sub_u32 s4, s0, s2
-; GFX6-NEXT: s_mov_b32 s7, 0xffff
-; GFX6-NEXT: s_subb_u32 s5, s1, s3
-; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v0, s2
-; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v1, s3
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: v_mov_b32_e32 v3, s5
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
+; GFX6-NEXT: s_sub_u32 s0, s0, s2
+; GFX6-NEXT: s_cselect_b32 s2, 1, 0
+; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX6-NEXT: s_cmp_lg_u32 s2, 0
+; GFX6-NEXT: s_subb_u32 s2, s1, s3
+; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
+; GFX6-NEXT: s_cmp_lg_u32 s2, s1
+; GFX6-NEXT: s_cselect_b32 s2, 1, 0
+; GFX6-NEXT: s_lshr_b32 s3, s0, 16
+; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT: s_lshl_b32 s3, s3, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s3
+; GFX6-NEXT: s_cmp_lg_u32 s2, 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_usubsat_i48:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: s_sub_u32 s4, s0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_subb_u32 s5, s1, s3
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc
-; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
+; GFX8-NEXT: s_sub_u32 s0, s0, s2
+; GFX8-NEXT: s_subb_u32 s1, s1, s3
+; GFX8-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_usubsat_i48:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: s_sub_u32 s4, s0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: s_subb_u32 s5, s1, s3
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
+; GFX9-NEXT: s_sub_u32 s0, s0, s2
+; GFX9-NEXT: s_subb_u32 s1, s1, s3
+; GFX9-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_usubsat_i48:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
-; GFX10PLUS-NEXT: s_sub_u32 s4, s0, s2
-; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[2:3]
+; GFX10PLUS-NEXT: s_sub_u32 s0, s0, s2
; GFX10PLUS-NEXT: s_subb_u32 s1, s1, s3
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, s4, 0, s0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, s1, 0, s0
-; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX10PLUS-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
ret i48 %result
@@ -2638,17 +2620,19 @@ define amdgpu_ps i48 @s_usubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
define amdgpu_ps <2 x float> @usubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX6-LABEL: usubsat_i48_sv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: v_mov_b32_e32 v2, s1
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s0, v0
-; GFX6-NEXT: s_mov_b32 s3, 0xffff
-; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v2, v1, vcc
-; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v2, s1
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
+; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: usubsat_i48_sv:
@@ -2656,11 +2640,10 @@ define amdgpu_ps <2 x float> @usubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s0, v0
-; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v1, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
; GFX8-NEXT: ; return to shader part epilog
;
@@ -2669,11 +2652,10 @@ define amdgpu_ps <2 x float> @usubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s0, v0
-; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v1, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
; GFX9-NEXT: ; return to shader part epilog
;
@@ -2681,11 +2663,10 @@ define amdgpu_ps <2 x float> @usubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
-; GFX10PLUS-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0
-; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1]
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
@@ -2697,17 +2678,19 @@ define amdgpu_ps <2 x float> @usubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
define amdgpu_ps <2 x float> @usubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX6-LABEL: usubsat_i48_vs:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: v_mov_b32_e32 v2, s1
-; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s0, v0
-; GFX6-NEXT: s_mov_b32 s3, 0xffff
-; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v1, v2, vcc
+; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v2, s1
+; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
+; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v1
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: usubsat_i48_vs:
@@ -2715,11 +2698,10 @@ define amdgpu_ps <2 x float> @usubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s0, v0
-; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v1, v2, vcc
-; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
; GFX8-NEXT: ; return to shader part epilog
;
@@ -2728,11 +2710,10 @@ define amdgpu_ps <2 x float> @usubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s0, v0
-; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v1, v2, vcc
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
; GFX9-NEXT: ; return to shader part epilog
;
@@ -2740,11 +2721,10 @@ define amdgpu_ps <2 x float> @usubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
-; GFX10PLUS-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0
-; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s0
+; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
@@ -2757,42 +2737,38 @@ define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
; GFX6-LABEL: v_usubsat_i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
-; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2
-; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_i64:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10PLUS-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
-; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc_lo
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc_lo
+; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
ret i64 %result
@@ -2801,58 +2777,30 @@ define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
define amdgpu_ps i64 @s_usubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX6-LABEL: s_usubsat_i64:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_mov_b32_e32 v0, s2
-; GFX6-NEXT: s_sub_u32 s4, s0, s2
-; GFX6-NEXT: v_mov_b32_e32 v1, s3
-; GFX6-NEXT: s_subb_u32 s5, s1, s3
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: v_mov_b32_e32 v3, s5
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
+; GFX6-NEXT: s_sub_u32 s0, s0, s2
+; GFX6-NEXT: s_subb_u32 s1, s1, s3
+; GFX6-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_usubsat_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: s_sub_u32 s4, s0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_subb_u32 s5, s1, s3
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: s_sub_u32 s0, s0, s2
+; GFX8-NEXT: s_subb_u32 s1, s1, s3
+; GFX8-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_usubsat_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: s_sub_u32 s4, s0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: s_subb_u32 s5, s1, s3
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: s_sub_u32 s0, s0, s2
+; GFX9-NEXT: s_subb_u32 s1, s1, s3
+; GFX9-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_usubsat_i64:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_sub_u32 s4, s0, s2
-; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[2:3]
+; GFX10PLUS-NEXT: s_sub_u32 s0, s0, s2
; GFX10PLUS-NEXT: s_subb_u32 s1, s1, s3
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, s4, 0, s0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, s1, 0, s0
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
ret i64 %result
@@ -2862,40 +2810,36 @@ define amdgpu_ps <2 x float> @usubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
; GFX6-LABEL: usubsat_i64_sv:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v2, s1
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s0, v0
-; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v2, v1, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
+; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: usubsat_i64_sv:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s0, v0
-; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v1, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: usubsat_i64_sv:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s0, v0
-; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v1, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: usubsat_i64_sv:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0
-; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1]
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
%cast = bitcast i64 %result to <2 x float>
@@ -2906,40 +2850,36 @@ define amdgpu_ps <2 x float> @usubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX6-LABEL: usubsat_i64_vs:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v2, s1
-; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s0, v0
-; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v1, v2, vcc
-; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
+; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: usubsat_i64_vs:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s0, v0
-; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v1, v2, vcc
-; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: usubsat_i64_vs:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s0, v0
-; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v1, v2, vcc
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: usubsat_i64_vs:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0
-; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s0
+; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
%cast = bitcast i64 %result to <2 x float>
@@ -2950,78 +2890,68 @@ define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX6-LABEL: v_usubsat_v2i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v0, v4
-; GFX6-NEXT: v_subb_u32_e32 v9, vcc, v1, v5, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v2, v6
-; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v3, v7, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[6:7]
-; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
+; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v2i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v0, v4
-; GFX8-NEXT: v_subb_u32_e32 v9, vcc, v1, v5, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc
-; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v2, v6
-; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v3, v7, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[6:7]
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v4
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
+; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v4
-; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v1, v5, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc
-; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v2, v6
-; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v7, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[6:7]
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_usubsat_v2i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, v4
-; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
-; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX10-NEXT: v_sub_co_u32 v4, s4, v2, v6
-; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s4, v3, v7, s4
-; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[6:7]
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, 0, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, 0, s4
+; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4
+; GFX10-NEXT: v_sub_co_u32 v2, s4, v2, v6
+; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
+; GFX10-NEXT: v_sub_co_ci_u32_e64 v3, s4, v3, v7, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, 0, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_usubsat_v2i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_sub_co_u32 v8, vcc_lo, v0, v4
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
-; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11-NEXT: v_sub_co_u32 v4, s0, v2, v6
-; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, s0, v3, v7, s0
-; GFX11-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[6:7]
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, 0, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, 0, s0
+; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4
+; GFX11-NEXT: v_sub_co_u32 v2, s0, v2, v6
+; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, s0, v3, v7, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, 0, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, 0, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
ret <2 x i64> %result
@@ -3030,98 +2960,42 @@ define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs) {
; GFX6-LABEL: s_usubsat_v2i64:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NEXT: s_sub_u32 s8, s0, s4
-; GFX6-NEXT: v_mov_b32_e32 v1, s5
-; GFX6-NEXT: s_subb_u32 s9, s1, s5
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX6-NEXT: v_mov_b32_e32 v0, s6
-; GFX6-NEXT: v_mov_b32_e32 v2, s8
-; GFX6-NEXT: v_mov_b32_e32 v3, s9
-; GFX6-NEXT: s_sub_u32 s0, s2, s6
-; GFX6-NEXT: v_mov_b32_e32 v1, s7
-; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
-; GFX6-NEXT: s_subb_u32 s1, s3, s7
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX6-NEXT: v_mov_b32_e32 v4, s0
-; GFX6-NEXT: v_mov_b32_e32 v5, s1
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc
-; GFX6-NEXT: v_readfirstlane_b32 s0, v2
-; GFX6-NEXT: v_readfirstlane_b32 s1, v3
-; GFX6-NEXT: v_readfirstlane_b32 s2, v0
-; GFX6-NEXT: v_readfirstlane_b32 s3, v1
+; GFX6-NEXT: s_sub_u32 s0, s0, s4
+; GFX6-NEXT: s_subb_u32 s1, s1, s5
+; GFX6-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX6-NEXT: s_sub_u32 s2, s2, s6
+; GFX6-NEXT: s_subb_u32 s3, s3, s7
+; GFX6-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_usubsat_v2i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: s_sub_u32 s8, s0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_subb_u32 s9, s1, s5
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v2, s8
-; GFX8-NEXT: v_mov_b32_e32 v3, s9
-; GFX8-NEXT: s_sub_u32 s0, s2, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
-; GFX8-NEXT: s_subb_u32 s1, s3, s7
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s0, v2
-; GFX8-NEXT: v_readfirstlane_b32 s1, v3
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_readfirstlane_b32 s3, v1
+; GFX8-NEXT: s_sub_u32 s0, s0, s4
+; GFX8-NEXT: s_subb_u32 s1, s1, s5
+; GFX8-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX8-NEXT: s_sub_u32 s2, s2, s6
+; GFX8-NEXT: s_subb_u32 s3, s3, s7
+; GFX8-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_usubsat_v2i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: s_sub_u32 s8, s0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_subb_u32 s9, s1, s5
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: s_sub_u32 s0, s2, s6
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
-; GFX9-NEXT: s_subb_u32 s1, s3, s7
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, 0, vcc
-; GFX9-NEXT: v_readfirstlane_b32 s0, v2
-; GFX9-NEXT: v_readfirstlane_b32 s1, v3
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
-; GFX9-NEXT: v_readfirstlane_b32 s3, v1
+; GFX9-NEXT: s_sub_u32 s0, s0, s4
+; GFX9-NEXT: s_subb_u32 s1, s1, s5
+; GFX9-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX9-NEXT: s_sub_u32 s2, s2, s6
+; GFX9-NEXT: s_subb_u32 s3, s3, s7
+; GFX9-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_usubsat_v2i64:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_sub_u32 s8, s0, s4
-; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_sub_u32 s0, s0, s4
; GFX10PLUS-NEXT: s_subb_u32 s1, s1, s5
-; GFX10PLUS-NEXT: s_sub_u32 s0, s2, s6
-; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s2, s[2:3], s[6:7]
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, s1, 0, s4
-; GFX10PLUS-NEXT: s_subb_u32 s1, s3, s7
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, s8, 0, s4
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, s0, 0, s2
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, s1, 0, s2
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX10PLUS-NEXT: s_sub_u32 s2, s2, s6
+; GFX10PLUS-NEXT: s_subb_u32 s3, s3, s7
+; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
ret <2 x i64> %result
@@ -3130,131 +3004,42 @@ define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX6-LABEL: s_usubsat_i128:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: v_mov_b32_e32 v3, s5
-; GFX6-NEXT: v_mov_b32_e32 v0, s6
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
-; GFX6-NEXT: v_mov_b32_e32 v1, s7
-; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX6-NEXT: s_sub_u32 s8, s0, s4
-; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
-; GFX6-NEXT: s_subb_u32 s9, s1, s5
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX6-NEXT: s_subb_u32 s10, s2, s6
-; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_subb_u32 s11, s3, s7
-; GFX6-NEXT: v_mov_b32_e32 v1, s8
-; GFX6-NEXT: v_mov_b32_e32 v2, s9
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
-; GFX6-NEXT: v_mov_b32_e32 v2, s10
-; GFX6-NEXT: v_mov_b32_e32 v3, s11
-; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v2
-; GFX6-NEXT: v_readfirstlane_b32 s3, v3
+; GFX6-NEXT: s_sub_u32 s0, s0, s4
+; GFX6-NEXT: s_subb_u32 s1, s1, s5
+; GFX6-NEXT: s_subb_u32 s2, s2, s6
+; GFX6-NEXT: s_subb_u32 s3, s3, s7
+; GFX6-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX6-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_usubsat_i128:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sub_u32 s8, s0, s4
-; GFX8-NEXT: s_subb_u32 s9, s1, s5
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: s_subb_u32 s10, s2, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: s_subb_u32 s11, s3, s7
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
-; GFX8-NEXT: s_cselect_b32 s6, 1, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX8-NEXT: s_and_b32 s0, 1, s6
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_mov_b32_e32 v1, s8
-; GFX8-NEXT: v_mov_b32_e32 v2, s9
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, s10
-; GFX8-NEXT: v_mov_b32_e32 v3, s11
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: v_readfirstlane_b32 s2, v2
-; GFX8-NEXT: v_readfirstlane_b32 s3, v3
+; GFX8-NEXT: s_sub_u32 s0, s0, s4
+; GFX8-NEXT: s_subb_u32 s1, s1, s5
+; GFX8-NEXT: s_subb_u32 s2, s2, s6
+; GFX8-NEXT: s_subb_u32 s3, s3, s7
+; GFX8-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX8-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_usubsat_i128:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_sub_u32 s8, s0, s4
-; GFX9-NEXT: s_subb_u32 s9, s1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: s_subb_u32 s10, s2, s6
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: s_subb_u32 s11, s3, s7
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
-; GFX9-NEXT: s_cselect_b32 s6, 1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX9-NEXT: s_and_b32 s0, 1, s6
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-NEXT: v_mov_b32_e32 v2, s9
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, s10
-; GFX9-NEXT: v_mov_b32_e32 v3, s11
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s2, v2
-; GFX9-NEXT: v_readfirstlane_b32 s3, v3
+; GFX9-NEXT: s_sub_u32 s0, s0, s4
+; GFX9-NEXT: s_subb_u32 s1, s1, s5
+; GFX9-NEXT: s_subb_u32 s2, s2, s6
+; GFX9-NEXT: s_subb_u32 s3, s3, s7
+; GFX9-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX9-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_usubsat_i128:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_sub_u32 s8, s0, s4
-; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[4:5]
-; GFX10PLUS-NEXT: s_subb_u32 s9, s1, s5
-; GFX10PLUS-NEXT: s_subb_u32 s10, s2, s6
-; GFX10PLUS-NEXT: s_subb_u32 s11, s3, s7
-; GFX10PLUS-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], s[6:7]
-; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10PLUS-NEXT: s_and_b32 s0, 1, s12
-; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10PLUS-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10PLUS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, s8, 0, vcc_lo
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, s9, 0, vcc_lo
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, s10, 0, vcc_lo
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, s11, 0, vcc_lo
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2
-; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3
+; GFX10PLUS-NEXT: s_sub_u32 s0, s0, s4
+; GFX10PLUS-NEXT: s_subb_u32 s1, s1, s5
+; GFX10PLUS-NEXT: s_subb_u32 s2, s2, s6
+; GFX10PLUS-NEXT: s_subb_u32 s3, s3, s7
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs)
ret i128 %result
@@ -3264,90 +3049,58 @@ define amdgpu_ps <4 x float> @usubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX6-LABEL: usubsat_i128_sv:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v4, s1
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s0, v0
-; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
-; GFX6-NEXT: v_mov_b32_e32 v6, s2
-; GFX6-NEXT: v_mov_b32_e32 v7, s3
-; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v6, v2, vcc
-; GFX6-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3]
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
+; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
+; GFX6-NEXT: v_mov_b32_e32 v4, s2
+; GFX6-NEXT: v_mov_b32_e32 v5, s3
+; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v4, v2, vcc
+; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: usubsat_i128_sv:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v4, s1
-; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s0, v0
-; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v6, s2
-; GFX8-NEXT: v_mov_b32_e32 v7, s3
-; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v2, vcc
-; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v4, v2, vcc
+; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: usubsat_i128_sv:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v4, s1
-; GFX9-NEXT: v_sub_co_u32_e32 v5, vcc, s0, v0
-; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v6, s2
-; GFX9-NEXT: v_mov_b32_e32 v7, s3
-; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v2, vcc
-; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v3, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v4, s2
+; GFX9-NEXT: v_mov_b32_e32 v5, s3
+; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v4, v2, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: usubsat_i128_sv:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1]
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[2:3]
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[2:3]
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo
; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT: v_and_b32_e32 v4, 1, v4
; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 s0, 0, v4
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, s0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, 0, s0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v3, 0, s0
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs)
%cast = bitcast i128 %result to <4 x float>
@@ -3358,90 +3111,58 @@ define amdgpu_ps <4 x float> @usubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX6-LABEL: usubsat_i128_vs:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v4, s1
-; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s0, v0
-; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc
-; GFX6-NEXT: v_mov_b32_e32 v6, s2
-; GFX6-NEXT: v_mov_b32_e32 v7, s3
-; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v2, v6, vcc
-; GFX6-NEXT: v_subb_u32_e32 v7, vcc, v3, v7, vcc
-; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3]
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc
+; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
+; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; GFX6-NEXT: v_mov_b32_e32 v4, s2
+; GFX6-NEXT: v_mov_b32_e32 v5, s3
+; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v2, v4, vcc
+; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: usubsat_i128_vs:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v4, s1
-; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s0, v0
-; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc
-; GFX8-NEXT: v_mov_b32_e32 v6, s2
-; GFX8-NEXT: v_mov_b32_e32 v7, s3
-; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v2, v6, vcc
-; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v3, v7, vcc
-; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3]
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc
+; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v4, vcc
+; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: usubsat_i128_vs:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v4, s1
-; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s0, v0
-; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v4, vcc
-; GFX9-NEXT: v_mov_b32_e32 v6, s2
-; GFX9-NEXT: v_mov_b32_e32 v7, s3
-; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v6, vcc
-; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v7, vcc
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc
+; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX9-NEXT: v_mov_b32_e32 v4, s2
+; GFX9-NEXT: v_mov_b32_e32 v5, s3
+; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: usubsat_i128_vs:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[2:3]
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[2:3]
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo
; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s0
; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
-; GFX10PLUS-NEXT: v_and_b32_e32 v4, 1, v4
; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 s0, 0, v4
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, s0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, 0, s0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v3, 0, s0
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs)
%cast = bitcast i128 %result to <4 x float>
@@ -3452,188 +3173,108 @@ define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX6-LABEL: v_usubsat_v2i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v0, v8
-; GFX6-NEXT: v_subb_u32_e32 v17, vcc, v1, v9, vcc
-; GFX6-NEXT: v_subb_u32_e32 v18, vcc, v2, v10, vcc
-; GFX6-NEXT: v_subb_u32_e32 v19, vcc, v3, v11, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9]
-; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11]
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11]
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v17, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v2, v18, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v3, v19, 0, vcc
-; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v4, v12
-; GFX6-NEXT: v_subb_u32_e32 v9, vcc, v5, v13, vcc
-; GFX6-NEXT: v_subb_u32_e32 v10, vcc, v6, v14, vcc
-; GFX6-NEXT: v_subb_u32_e32 v11, vcc, v7, v15, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13]
-; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[14:15]
-; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[14:15]
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX6-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v5, v9, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v6, v10, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v7, v11, 0, vcc
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
+; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v2, v10, vcc
+; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v12
+; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v5, v13, vcc
+; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v6, v14, vcc
+; GFX6-NEXT: v_subb_u32_e32 v7, vcc, v7, v15, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v7, v7, 0, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v2i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v0, v8
-; GFX8-NEXT: v_subb_u32_e32 v17, vcc, v1, v9, vcc
-; GFX8-NEXT: v_subb_u32_e32 v18, vcc, v2, v10, vcc
-; GFX8-NEXT: v_subb_u32_e32 v19, vcc, v3, v11, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11]
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11]
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v17, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v18, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v19, 0, vcc
-; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v4, v12
-; GFX8-NEXT: v_subb_u32_e32 v9, vcc, v5, v13, vcc
-; GFX8-NEXT: v_subb_u32_e32 v10, vcc, v6, v14, vcc
-; GFX8-NEXT: v_subb_u32_e32 v11, vcc, v7, v15, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13]
-; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[14:15]
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[14:15]
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v10, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v7, v11, 0, vcc
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v8
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v10, vcc
+; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v12
+; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v5, v13, vcc
+; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v14, vcc
+; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v15, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, 0, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v2i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_sub_co_u32_e32 v16, vcc, v0, v8
-; GFX9-NEXT: v_subb_co_u32_e32 v17, vcc, v1, v9, vcc
-; GFX9-NEXT: v_subb_co_u32_e32 v18, vcc, v2, v10, vcc
-; GFX9-NEXT: v_subb_co_u32_e32 v19, vcc, v3, v11, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11]
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11]
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v17, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v18, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v19, 0, vcc
-; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v4, v12
-; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v5, v13, vcc
-; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v6, v14, vcc
-; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, v7, v15, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13]
-; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[14:15]
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[14:15]
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
-; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, 0, vcc
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v8
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v9, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v10, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v11, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v12
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v13, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v14, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v15, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, 0, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_usubsat_v2i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[8:9]
-; GFX10-NEXT: v_cmp_eq_u64_e64 s5, v[6:7], v[14:15]
-; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc_lo
-; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[10:11]
-; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[10:11]
-; GFX10-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc_lo
-; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13]
-; GFX10-NEXT: v_and_b32_e32 v16, 1, v16
-; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc_lo
-; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[14:15]
-; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v16
-; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v8
+; GFX10-NEXT: v_sub_co_u32 v4, s4, v4, v12
; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v18, v17, s5
+; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s4, v5, v13, s4
; GFX10-NEXT: v_sub_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo
+; GFX10-NEXT: v_sub_co_ci_u32_e64 v6, s4, v6, v14, s4
; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v4, v12
-; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo
-; GFX10-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, v6, v14, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v8
-; GFX10-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v15, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, 0, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, 0, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, 0, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, 0, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, 0, s5
+; GFX10-NEXT: v_sub_co_ci_u32_e64 v7, s4, v7, v15, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, 0, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, 0, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, 0, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, 0, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_usubsat_v2i128:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[8:9]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s1, v[6:7], v[14:15]
-; GFX11-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc_lo
-; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[10:11]
-; GFX11-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[10:11]
-; GFX11-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc_lo
-; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13]
-; GFX11-NEXT: v_and_b32_e32 v16, 1, v16
-; GFX11-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc_lo
-; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[14:15]
-; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v16
-; GFX11-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc_lo
; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v8
+; GFX11-NEXT: v_sub_co_u32 v4, s0, v4, v12
; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v8, v18, v17, s1
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, s0, v5, v13, s0
; GFX11-NEXT: v_sub_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v6, s0, v6, v14, s0
; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
-; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v4, v12
-; GFX11-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, v6, v14, vcc_lo
-; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, v8
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, v7, v15, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, 0, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, 0, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, 0, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, 0, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, 0, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, 0, s1
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v7, s0, v7, v15, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, 0, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, 0, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, 0, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, 0, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
ret <2 x i128> %result
@@ -3642,294 +3283,67 @@ define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs) {
; GFX6-LABEL: s_usubsat_v2i128:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_mov_b32_e32 v2, s8
-; GFX6-NEXT: v_mov_b32_e32 v3, s9
-; GFX6-NEXT: v_mov_b32_e32 v0, s10
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
-; GFX6-NEXT: v_mov_b32_e32 v1, s11
-; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX6-NEXT: s_sub_u32 s16, s0, s8
-; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
-; GFX6-NEXT: s_subb_u32 s17, s1, s9
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_subb_u32 s18, s2, s10
-; GFX6-NEXT: v_mov_b32_e32 v2, s17
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: s_subb_u32 s19, s3, s11
-; GFX6-NEXT: v_mov_b32_e32 v1, s16
-; GFX6-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc
-; GFX6-NEXT: v_mov_b32_e32 v2, s12
-; GFX6-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc
-; GFX6-NEXT: v_mov_b32_e32 v0, s18
-; GFX6-NEXT: v_mov_b32_e32 v1, s19
-; GFX6-NEXT: v_mov_b32_e32 v3, s13
-; GFX6-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc
-; GFX6-NEXT: v_mov_b32_e32 v0, s14
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
-; GFX6-NEXT: v_mov_b32_e32 v1, s15
-; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX6-NEXT: s_sub_u32 s0, s4, s12
-; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1]
-; GFX6-NEXT: s_subb_u32 s1, s5, s13
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX6-NEXT: s_subb_u32 s2, s6, s14
-; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_subb_u32 s3, s7, s15
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
-; GFX6-NEXT: v_mov_b32_e32 v2, s1
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
-; GFX6-NEXT: v_mov_b32_e32 v2, s2
-; GFX6-NEXT: v_mov_b32_e32 v3, s3
-; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
-; GFX6-NEXT: v_readfirstlane_b32 s0, v4
-; GFX6-NEXT: v_readfirstlane_b32 s1, v5
-; GFX6-NEXT: v_readfirstlane_b32 s2, v6
-; GFX6-NEXT: v_readfirstlane_b32 s3, v7
-; GFX6-NEXT: v_readfirstlane_b32 s4, v0
-; GFX6-NEXT: v_readfirstlane_b32 s5, v1
-; GFX6-NEXT: v_readfirstlane_b32 s6, v2
-; GFX6-NEXT: v_readfirstlane_b32 s7, v3
+; GFX6-NEXT: s_sub_u32 s0, s0, s8
+; GFX6-NEXT: s_subb_u32 s1, s1, s9
+; GFX6-NEXT: s_subb_u32 s2, s2, s10
+; GFX6-NEXT: s_subb_u32 s3, s3, s11
+; GFX6-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX6-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX6-NEXT: s_sub_u32 s4, s4, s12
+; GFX6-NEXT: s_subb_u32 s5, s5, s13
+; GFX6-NEXT: s_subb_u32 s6, s6, s14
+; GFX6-NEXT: s_subb_u32 s7, s7, s15
+; GFX6-NEXT: s_cselect_b64 s[4:5], 0, s[4:5]
+; GFX6-NEXT: s_cselect_b64 s[6:7], 0, s[6:7]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_usubsat_v2i128:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sub_u32 s16, s0, s8
-; GFX8-NEXT: s_subb_u32 s17, s1, s9
-; GFX8-NEXT: v_mov_b32_e32 v2, s8
-; GFX8-NEXT: s_subb_u32 s18, s2, s10
-; GFX8-NEXT: v_mov_b32_e32 v3, s9
-; GFX8-NEXT: s_subb_u32 s19, s3, s11
-; GFX8-NEXT: v_mov_b32_e32 v0, s10
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v1, s11
-; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[10:11]
-; GFX8-NEXT: s_cselect_b32 s10, 1, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX8-NEXT: s_and_b32 s0, 1, s10
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_mov_b32_e32 v2, s17
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: s_sub_u32 s0, s4, s12
-; GFX8-NEXT: v_mov_b32_e32 v1, s16
-; GFX8-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc
-; GFX8-NEXT: s_subb_u32 s1, s5, s13
-; GFX8-NEXT: v_mov_b32_e32 v2, s12
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc
-; GFX8-NEXT: v_mov_b32_e32 v0, s18
-; GFX8-NEXT: v_mov_b32_e32 v1, s19
-; GFX8-NEXT: s_subb_u32 s2, s6, s14
-; GFX8-NEXT: v_mov_b32_e32 v3, s13
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc
-; GFX8-NEXT: s_subb_u32 s3, s7, s15
-; GFX8-NEXT: v_mov_b32_e32 v0, s14
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v1, s15
-; GFX8-NEXT: s_cmp_eq_u64 s[6:7], s[14:15]
-; GFX8-NEXT: s_cselect_b32 s8, 1, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX8-NEXT: s_and_b32 s4, 1, s8
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s0, v4
-; GFX8-NEXT: v_readfirstlane_b32 s1, v5
-; GFX8-NEXT: v_readfirstlane_b32 s2, v6
-; GFX8-NEXT: v_readfirstlane_b32 s3, v7
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
-; GFX8-NEXT: v_readfirstlane_b32 s6, v2
-; GFX8-NEXT: v_readfirstlane_b32 s7, v3
+; GFX8-NEXT: s_sub_u32 s0, s0, s8
+; GFX8-NEXT: s_subb_u32 s1, s1, s9
+; GFX8-NEXT: s_subb_u32 s2, s2, s10
+; GFX8-NEXT: s_subb_u32 s3, s3, s11
+; GFX8-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX8-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX8-NEXT: s_sub_u32 s4, s4, s12
+; GFX8-NEXT: s_subb_u32 s5, s5, s13
+; GFX8-NEXT: s_subb_u32 s6, s6, s14
+; GFX8-NEXT: s_subb_u32 s7, s7, s15
+; GFX8-NEXT: s_cselect_b64 s[4:5], 0, s[4:5]
+; GFX8-NEXT: s_cselect_b64 s[6:7], 0, s[6:7]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_usubsat_v2i128:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_sub_u32 s16, s0, s8
-; GFX9-NEXT: s_subb_u32 s17, s1, s9
-; GFX9-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-NEXT: s_subb_u32 s18, s2, s10
-; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: s_subb_u32 s19, s3, s11
-; GFX9-NEXT: v_mov_b32_e32 v0, s10
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, s11
-; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[10:11]
-; GFX9-NEXT: s_cselect_b32 s10, 1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX9-NEXT: s_and_b32 s0, 1, s10
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, s17
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_sub_u32 s0, s4, s12
-; GFX9-NEXT: v_mov_b32_e32 v1, s16
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc
-; GFX9-NEXT: s_subb_u32 s1, s5, s13
-; GFX9-NEXT: v_mov_b32_e32 v2, s12
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc
-; GFX9-NEXT: v_mov_b32_e32 v0, s18
-; GFX9-NEXT: v_mov_b32_e32 v1, s19
-; GFX9-NEXT: s_subb_u32 s2, s6, s14
-; GFX9-NEXT: v_mov_b32_e32 v3, s13
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc
-; GFX9-NEXT: s_subb_u32 s3, s7, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s14
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, s15
-; GFX9-NEXT: s_cmp_eq_u64 s[6:7], s[14:15]
-; GFX9-NEXT: s_cselect_b32 s8, 1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX9-NEXT: s_and_b32 s4, 1, s8
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
-; GFX9-NEXT: v_readfirstlane_b32 s0, v4
-; GFX9-NEXT: v_readfirstlane_b32 s1, v5
-; GFX9-NEXT: v_readfirstlane_b32 s2, v6
-; GFX9-NEXT: v_readfirstlane_b32 s3, v7
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v2
-; GFX9-NEXT: v_readfirstlane_b32 s7, v3
+; GFX9-NEXT: s_sub_u32 s0, s0, s8
+; GFX9-NEXT: s_subb_u32 s1, s1, s9
+; GFX9-NEXT: s_subb_u32 s2, s2, s10
+; GFX9-NEXT: s_subb_u32 s3, s3, s11
+; GFX9-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX9-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX9-NEXT: s_sub_u32 s4, s4, s12
+; GFX9-NEXT: s_subb_u32 s5, s5, s13
+; GFX9-NEXT: s_subb_u32 s6, s6, s14
+; GFX9-NEXT: s_subb_u32 s7, s7, s15
+; GFX9-NEXT: s_cselect_b64 s[4:5], 0, s[4:5]
+; GFX9-NEXT: s_cselect_b64 s[6:7], 0, s[6:7]
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_usubsat_v2i128:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_sub_u32 s16, s0, s8
-; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9]
-; GFX10-NEXT: s_subb_u32 s17, s1, s9
-; GFX10-NEXT: s_subb_u32 s18, s2, s10
-; GFX10-NEXT: s_subb_u32 s19, s3, s11
-; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[10:11]
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], s[10:11]
-; GFX10-NEXT: s_cselect_b32 s20, 1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10-NEXT: s_and_b32 s0, 1, s20
-; GFX10-NEXT: s_sub_u32 s2, s4, s12
-; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[4:5], s[12:13]
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT: s_subb_u32 s1, s5, s13
-; GFX10-NEXT: s_subb_u32 s8, s6, s14
-; GFX10-NEXT: s_subb_u32 s3, s7, s15
-; GFX10-NEXT: s_cmp_eq_u64 s[6:7], s[14:15]
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4
-; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[14:15]
-; GFX10-NEXT: s_cselect_b32 s0, 1, 0
-; GFX10-NEXT: s_and_b32 s0, 1, s0
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, s16, 0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, s18, 0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, s19, 0, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, s17, 0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v4, s2, 0, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v5, s1, 0, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v6, s8, 0, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v7, s3, 0, s0
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: v_readfirstlane_b32 s2, v2
-; GFX10-NEXT: v_readfirstlane_b32 s3, v3
-; GFX10-NEXT: v_readfirstlane_b32 s4, v4
-; GFX10-NEXT: v_readfirstlane_b32 s5, v5
-; GFX10-NEXT: v_readfirstlane_b32 s6, v6
-; GFX10-NEXT: v_readfirstlane_b32 s7, v7
-; GFX10-NEXT: ; return to shader part epilog
-;
-; GFX11-LABEL: s_usubsat_v2i128:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_sub_u32 s16, s0, s8
-; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9]
-; GFX11-NEXT: s_subb_u32 s17, s1, s9
-; GFX11-NEXT: s_subb_u32 s18, s2, s10
-; GFX11-NEXT: s_subb_u32 s19, s3, s11
-; GFX11-NEXT: s_cmp_eq_u64 s[2:3], s[10:11]
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], s[10:11]
-; GFX11-NEXT: s_cselect_b32 s20, 1, 0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX11-NEXT: s_and_b32 s0, 1, s20
-; GFX11-NEXT: s_sub_u32 s2, s4, s12
-; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[4:5], s[12:13]
-; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX11-NEXT: s_subb_u32 s1, s5, s13
-; GFX11-NEXT: s_subb_u32 s8, s6, s14
-; GFX11-NEXT: s_subb_u32 s3, s7, s15
-; GFX11-NEXT: s_cmp_eq_u64 s[6:7], s[14:15]
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4
-; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[14:15]
-; GFX11-NEXT: s_cselect_b32 s0, 1, 0
-; GFX11-NEXT: s_and_b32 s0, 1, s0
-; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v2, v1 :: v_dual_and_b32 v0, 1, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, s16, 0, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v1, s17, 0, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, s18, 0, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, s19, 0, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v4, s2, 0, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v5, s1, 0, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v6, s8, 0, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v7, s3, 0, s0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11-NEXT: v_readfirstlane_b32 s2, v2
-; GFX11-NEXT: v_readfirstlane_b32 s3, v3
-; GFX11-NEXT: v_readfirstlane_b32 s4, v4
-; GFX11-NEXT: v_readfirstlane_b32 s5, v5
-; GFX11-NEXT: v_readfirstlane_b32 s6, v6
-; GFX11-NEXT: v_readfirstlane_b32 s7, v7
-; GFX11-NEXT: ; return to shader part epilog
+; GFX10PLUS-LABEL: s_usubsat_v2i128:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_sub_u32 s0, s0, s8
+; GFX10PLUS-NEXT: s_subb_u32 s1, s1, s9
+; GFX10PLUS-NEXT: s_subb_u32 s2, s2, s10
+; GFX10PLUS-NEXT: s_subb_u32 s3, s3, s11
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
+; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
+; GFX10PLUS-NEXT: s_sub_u32 s4, s4, s12
+; GFX10PLUS-NEXT: s_subb_u32 s5, s5, s13
+; GFX10PLUS-NEXT: s_subb_u32 s6, s6, s14
+; GFX10PLUS-NEXT: s_subb_u32 s7, s7, s15
+; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], 0, s[4:5]
+; GFX10PLUS-NEXT: s_cselect_b64 s[6:7], 0, s[6:7]
+; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
ret <2 x i128> %result
}