[llvm-branch-commits] [llvm] [AMDGPU][GlobalISel] Add RegBankLegalize rules for [us]addsat/[us]subsat (PR #176255)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Jan 15 14:12:58 PST 2026
https://github.com/vangthao95 updated https://github.com/llvm/llvm-project/pull/176255
>From 9d04ddcc3f005c5e8fca1b9398d36a063e78e4b3 Mon Sep 17 00:00:00 2001
From: Vang Thao <vang.thao at amd.com>
Date: Thu, 15 Jan 2026 13:58:22 -0800
Subject: [PATCH] [AMDGPU][GlobalISel] Add RegBankLegalize rules for
[us]addsat/[us]subsat
---
.../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 8 +
.../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 1835 +++++++---------
.../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 1917 +++++++----------
.../test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll | 551 +++--
.../test/CodeGen/AMDGPU/GlobalISel/usubsat.ll | 551 +++--
5 files changed, 2287 insertions(+), 2575 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index aacfa14975fa2..ce80a94f29222 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -487,6 +487,14 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32, Sgpr32AExtBoolInReg}})
.Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32, Vcc}});
+ addRulesForGOpcs({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT}, Standard)
+ .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}})
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
+ .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+ .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}})
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
+
addRulesForGOpcs({G_MUL}, Standard).Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
bool hasMulHi = ST->hasScalarMulHiInsts();
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 2f956d7a0a534..f6e36241a05dc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define i7 @v_saddsat_i7(i7 %lhs, i7 %rhs) {
; GFX6-LABEL: v_saddsat_i7:
@@ -98,8 +98,9 @@ define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX9-NEXT: s_lshl_b32 s0, s0, 9
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp
-; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_sext_i32_i16 s0, s0
+; GFX9-NEXT: s_ashr_i32 s0, s0, 9
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_saddsat_i7:
@@ -107,8 +108,9 @@ define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9
; GFX10PLUS-NEXT: v_add_nc_i16 v0, s0, s1 clamp
-; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 9, v0
; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0
+; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 9
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i7 @llvm.sadd.sat.i7(i7 %lhs, i7 %rhs)
ret i7 %result
@@ -207,8 +209,9 @@ define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX9-NEXT: s_lshl_b32 s0, s0, 8
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp
-; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_sext_i32_i16 s0, s0
+; GFX9-NEXT: s_ashr_i32 s0, s0, 8
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_saddsat_i8:
@@ -216,8 +219,9 @@ define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
; GFX10PLUS-NEXT: v_add_nc_i16 v0, s0, s1 clamp
-; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 8, v0
; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0
+; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 8
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs)
ret i8 %result
@@ -436,58 +440,48 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp
-; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_sext_i32_i16 s1, s0
+; GFX9-NEXT: s_ashr_i32 s0, s0, 16
+; GFX9-NEXT: s_sext_i32_i16 s2, 0x80008
+; GFX9-NEXT: s_ashr_i32 s1, s1, s2
+; GFX9-NEXT: s_ashr_i32 s0, s0, 8
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s0
+; GFX9-NEXT: s_lshr_b32 s1, s0, 16
+; GFX9-NEXT: s_and_b32 s1, s1, 0xff
+; GFX9-NEXT: s_and_b32 s0, s0, 0xff
+; GFX9-NEXT: s_lshl_b32 s1, s1, 8
+; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_saddsat_v2i8:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshr_b32 s2, s0, 8
-; GFX10-NEXT: s_lshr_b32 s3, s1, 8
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
-; GFX10-NEXT: s_lshr_b32 s2, s0, 16
-; GFX10-NEXT: s_lshr_b32 s3, s1, 16
-; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008
-; GFX10-NEXT: s_lshl_b32 s2, s2, 8
-; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX10-NEXT: s_lshl_b32 s3, s3, 8
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
-; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp
-; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
-;
-; GFX11-LABEL: s_saddsat_v2i8:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s2, s0, 8
-; GFX11-NEXT: s_lshr_b32 s3, s1, 8
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3
-; GFX11-NEXT: s_lshr_b32 s2, s0, 16
-; GFX11-NEXT: s_lshr_b32 s3, s1, 16
-; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008
-; GFX11-NEXT: s_lshl_b32 s2, s2, 8
-; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX11-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3
-; GFX11-NEXT: v_pk_add_i16 v0, s0, s1 clamp
-; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: ; return to shader part epilog
+; GFX10PLUS-LABEL: s_saddsat_v2i8:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 8
+; GFX10PLUS-NEXT: s_lshr_b32 s3, s1, 8
+; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16
+; GFX10PLUS-NEXT: s_lshr_b32 s3, s1, 16
+; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 0x80008
+; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, 8
+; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 0x80008
+; GFX10PLUS-NEXT: s_lshl_b32 s3, s3, 8
+; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX10PLUS-NEXT: v_pk_add_i16 v0, s0, s1 clamp
+; GFX10PLUS-NEXT: s_sext_i32_i16 s1, 0x80008
+; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10PLUS-NEXT: s_sext_i32_i16 s2, s0
+; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 16
+; GFX10PLUS-NEXT: s_ashr_i32 s1, s2, s1
+; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 8
+; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s1, s0
+; GFX10PLUS-NEXT: s_lshr_b32 s1, s0, 16
+; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xff
+; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xff
+; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
+; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1
+; GFX10PLUS-NEXT: ; return to shader part epilog
%lhs = bitcast i16 %lhs.arg to <2 x i8>
%rhs = bitcast i16 %rhs.arg to <2 x i8>
%result = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
@@ -886,66 +880,89 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX9-NEXT: s_lshr_b32 s4, s3, 16
; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008
; GFX9-NEXT: s_lshl_b32 s4, s4, 8
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_pk_add_i16 v1, s2, v1 clamp
-; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: v_mov_b32_e32 v3, 8
-; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT: v_mov_b32_e32 v2, 0xff
-; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 24
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_pk_add_i16 v0, s2, v0 clamp
+; GFX9-NEXT: s_sext_i32_i16 s2, s0
+; GFX9-NEXT: s_ashr_i32 s0, s0, 16
+; GFX9-NEXT: s_sext_i32_i16 s3, 0x80008
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-NEXT: s_ashr_i32 s2, s2, s3
+; GFX9-NEXT: s_ashr_i32 s0, s0, 8
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s0
+; GFX9-NEXT: s_sext_i32_i16 s2, s1
+; GFX9-NEXT: s_ashr_i32 s1, s1, 16
+; GFX9-NEXT: s_ashr_i32 s2, s2, s3
+; GFX9-NEXT: s_ashr_i32 s1, s1, 8
+; GFX9-NEXT: s_pack_ll_b32_b16 s1, s2, s1
+; GFX9-NEXT: s_and_b32 s2, s0, 0xff
+; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX9-NEXT: s_lshl_b32 s0, s0, 8
+; GFX9-NEXT: s_or_b32 s0, s2, s0
+; GFX9-NEXT: s_and_b32 s2, s1, 0xff
+; GFX9-NEXT: s_lshl_b32 s2, s2, 16
+; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: s_lshl_b32 s1, s1, 24
+; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_saddsat_v4i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshr_b32 s2, s0, 8
+; GFX10-NEXT: s_lshr_b32 s5, s1, 8
; GFX10-NEXT: s_lshr_b32 s3, s0, 16
; GFX10-NEXT: s_lshr_b32 s4, s0, 24
-; GFX10-NEXT: s_lshr_b32 s5, s1, 8
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
; GFX10-NEXT: s_lshr_b32 s7, s1, 24
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
-; GFX10-NEXT: s_lshr_b32 s4, s0, 16
-; GFX10-NEXT: s_lshr_b32 s5, s2, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7
+; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
+; GFX10-NEXT: s_lshr_b32 s3, s0, 16
+; GFX10-NEXT: s_lshr_b32 s5, s1, 16
; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008
-; GFX10-NEXT: s_lshl_b32 s4, s4, 8
-; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008
-; GFX10-NEXT: s_lshl_b32 s5, s5, 8
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s5
-; GFX10-NEXT: s_lshr_b32 s4, s1, 16
-; GFX10-NEXT: s_lshr_b32 s5, s3, 16
+; GFX10-NEXT: s_lshl_b32 s3, s3, 8
+; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s7
; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX10-NEXT: s_lshl_b32 s4, s4, 8
-; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008
; GFX10-NEXT: s_lshl_b32 s5, s5, 8
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3
+; GFX10-NEXT: s_lshr_b32 s3, s2, 16
+; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
+; GFX10-NEXT: s_lshr_b32 s5, s4, 16
+; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008
+; GFX10-NEXT: s_lshl_b32 s3, s3, 8
+; GFX10-NEXT: s_lshl_b32 s4, s4, 0x80008
+; GFX10-NEXT: s_lshl_b32 s5, s5, 8
; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp
-; GFX10-NEXT: v_pk_add_i16 v1, s2, s3 clamp
-; GFX10-NEXT: v_mov_b32_e32 v2, 8
-; GFX10-NEXT: v_mov_b32_e32 v4, 24
-; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s2, s3
+; GFX10-NEXT: s_pack_ll_b32_b16 s1, s4, s5
+; GFX10-NEXT: s_sext_i32_i16 s3, 0x80008
+; GFX10-NEXT: v_pk_add_i16 v1, s0, s1 clamp
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: s_sext_i32_i16 s2, s0
+; GFX10-NEXT: s_ashr_i32 s0, s0, 16
+; GFX10-NEXT: s_ashr_i32 s2, s2, s3
+; GFX10-NEXT: s_ashr_i32 s0, s0, 8
+; GFX10-NEXT: s_sext_i32_i16 s4, s1
+; GFX10-NEXT: s_ashr_i32 s1, s1, 16
+; GFX10-NEXT: s_ashr_i32 s3, s4, s3
+; GFX10-NEXT: s_ashr_i32 s1, s1, 8
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s2, s0
+; GFX10-NEXT: s_pack_ll_b32_b16 s1, s3, s1
+; GFX10-NEXT: s_bfe_u32 s2, s0, 0x80010
+; GFX10-NEXT: s_and_b32 s0, s0, 0xff
+; GFX10-NEXT: s_lshl_b32 s2, s2, 8
+; GFX10-NEXT: s_and_b32 s3, s1, 0xff
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: s_lshl_b32 s2, s3, 16
+; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: s_lshl_b32 s1, s1, 24
+; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_saddsat_v4i8:
@@ -965,28 +982,40 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX11-NEXT: s_lshl_b32 s3, s3, 0x80008
; GFX11-NEXT: s_lshl_b32 s5, s5, 8
; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5
; GFX11-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5
; GFX11-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-NEXT: v_pk_add_i16 v0, s2, s3 clamp
; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008
; GFX11-NEXT: s_lshl_b32 s4, s4, 8
; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX11-NEXT: s_lshl_b32 s2, s5, 8
+; GFX11-NEXT: s_lshl_b32 s5, s5, 8
+; GFX11-NEXT: v_pk_add_i16 v0, s2, s3 clamp
; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s4
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2
-; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s5
+; GFX11-NEXT: s_sext_i32_i16 s3, 0x80008
; GFX11-NEXT: v_pk_add_i16 v1, s0, s1 clamp
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8
-; GFX11-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1
-; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8
-; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: s_sext_i32_i16 s2, s0
+; GFX11-NEXT: s_ashr_i32 s0, s0, 16
+; GFX11-NEXT: s_ashr_i32 s2, s2, s3
+; GFX11-NEXT: s_ashr_i32 s0, s0, 8
+; GFX11-NEXT: s_sext_i32_i16 s4, s1
+; GFX11-NEXT: s_ashr_i32 s1, s1, 16
+; GFX11-NEXT: s_ashr_i32 s3, s4, s3
+; GFX11-NEXT: s_ashr_i32 s1, s1, 8
+; GFX11-NEXT: s_pack_ll_b32_b16 s0, s2, s0
+; GFX11-NEXT: s_pack_ll_b32_b16 s1, s3, s1
+; GFX11-NEXT: s_bfe_u32 s2, s0, 0x80010
+; GFX11-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s2, s2, 8
+; GFX11-NEXT: s_and_b32 s3, s1, 0xff
+; GFX11-NEXT: s_or_b32 s0, s0, s2
+; GFX11-NEXT: s_lshl_b32 s2, s3, 16
+; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX11-NEXT: s_or_b32 s0, s0, s2
+; GFX11-NEXT: s_lshl_b32 s1, s1, 24
+; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
%lhs = bitcast i32 %lhs.arg to <4 x i8>
%rhs = bitcast i32 %rhs.arg to <4 x i8>
@@ -1085,8 +1114,8 @@ define amdgpu_ps i24 @s_saddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX9-NEXT: s_lshl_b32 s0, s0, 8
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp
-; GFX9-NEXT: v_ashrrev_i32_e32 v0, 8, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_ashr_i32 s0, s0, 8
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_saddsat_i24:
@@ -1094,8 +1123,8 @@ define amdgpu_ps i24 @s_saddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
; GFX10PLUS-NEXT: v_add_nc_i32 v0, s0, s1 clamp
-; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, 8, v0
; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 8
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i24 @llvm.sadd.sat.i24(i24 %lhs, i24 %rhs)
ret i24 %result
@@ -4090,9 +4119,9 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v4, vcc, v0, v2
; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v1, v3, vcc
-; GFX6-NEXT: v_bfe_i32 v5, v4, 0, 16
-; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16
-; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16
+; GFX6-NEXT: v_bfe_i32 v5, v6, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
; GFX6-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3]
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5
@@ -4108,9 +4137,9 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v1, v3, vcc
-; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 16
-; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16
-; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16
+; GFX8-NEXT: v_bfe_i32 v5, v6, 0, 16
+; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 16
; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
; GFX8-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3]
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5
@@ -4178,51 +4207,49 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX6-LABEL: s_saddsat_i48:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_add_u32 s4, s0, s2
-; GFX6-NEXT: s_addc_u32 s3, s1, s3
+; GFX6-NEXT: s_addc_u32 s5, s1, s3
; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000
; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
+; GFX6-NEXT: s_or_b64 s[0:1], vcc, vcc
+; GFX6-NEXT: s_cselect_b32 s3, 1, 0
+; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000
; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
-; GFX6-NEXT: s_ashr_i32 s2, s7, 31
-; GFX6-NEXT: s_ashr_i32 s5, s7, 15
-; GFX6-NEXT: s_addk_i32 s2, 0x8000
-; GFX6-NEXT: v_mov_b32_e32 v0, s5
-; GFX6-NEXT: v_mov_b32_e32 v1, s2
-; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: v_mov_b32_e32 v3, s3
-; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
+; GFX6-NEXT: s_cselect_b32 s0, 1, 0
+; GFX6-NEXT: s_xor_b32 s2, s0, s3
+; GFX6-NEXT: s_ashr_i32 s1, s7, 31
+; GFX6-NEXT: s_ashr_i32 s0, s7, 15
+; GFX6-NEXT: s_addk_i32 s1, 0x8000
+; GFX6-NEXT: s_and_b32 s2, s2, 1
+; GFX6-NEXT: s_cmp_lg_u32 s2, 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_saddsat_i48:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s4, s0, s2
-; GFX8-NEXT: s_addc_u32 s3, s1, s3
+; GFX8-NEXT: s_addc_u32 s5, s1, s3
; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
+; GFX8-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX8-NEXT: s_cselect_b32 s3, 1, 0
+; GFX8-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000
; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
-; GFX8-NEXT: s_ashr_i32 s2, s7, 31
-; GFX8-NEXT: s_ashr_i32 s5, s7, 15
-; GFX8-NEXT: s_addk_i32 s2, 0x8000
-; GFX8-NEXT: v_mov_b32_e32 v0, s5
-; GFX8-NEXT: v_mov_b32_e32 v1, s2
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX8-NEXT: s_cselect_b32 s0, 1, 0
+; GFX8-NEXT: s_xor_b32 s2, s0, s3
+; GFX8-NEXT: s_ashr_i32 s1, s7, 31
+; GFX8-NEXT: s_ashr_i32 s0, s7, 15
+; GFX8-NEXT: s_addk_i32 s1, 0x8000
+; GFX8-NEXT: s_and_b32 s2, s2, 1
+; GFX8-NEXT: s_cmp_lg_u32 s2, 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_saddsat_i48:
@@ -4235,58 +4262,39 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
-; GFX9-NEXT: s_ashr_i32 s2, s5, 31
-; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT: s_cselect_b32 s6, 1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cselect_b32 s0, 1, 0
+; GFX9-NEXT: s_xor_b32 s2, s0, s6
+; GFX9-NEXT: s_ashr_i32 s0, s5, 31
+; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX9-NEXT: s_and_b32 s2, s2, 1
+; GFX9-NEXT: s_cmp_lg_u32 s2, 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 16
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_saddsat_i48:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
-; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
-; GFX10-NEXT: s_add_u32 s4, s0, s2
-; GFX10-NEXT: s_addc_u32 s5, s1, s3
-; GFX10-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
-; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
-; GFX10-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-NEXT: s_ashr_i32 s2, s5, 31
-; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000
-; GFX10-NEXT: s_xor_b32 s0, s1, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
-; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: ; return to shader part epilog
-;
-; GFX11-LABEL: s_saddsat_i48:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
-; GFX11-NEXT: s_add_u32 s4, s0, s2
-; GFX11-NEXT: s_addc_u32 s5, s1, s3
-; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
-; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
-; GFX11-NEXT: s_ashr_i32 s2, s5, 31
-; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000
-; GFX11-NEXT: s_xor_b32 s0, s1, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
-; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11-NEXT: ; return to shader part epilog
+; GFX10PLUS-LABEL: s_saddsat_i48:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
+; GFX10PLUS-NEXT: s_add_u32 s4, s0, s2
+; GFX10PLUS-NEXT: s_addc_u32 s5, s1, s3
+; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
+; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s0, 1, 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s1, 1, 0
+; GFX10PLUS-NEXT: s_xor_b32 s1, s1, s0
+; GFX10PLUS-NEXT: s_ashr_i32 s0, s5, 31
+; GFX10PLUS-NEXT: s_and_b32 s2, s1, 1
+; GFX10PLUS-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_ashr_i64 s[0:1], s[0:1], 16
+; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs)
ret i48 %result
}
@@ -4297,9 +4305,9 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX6-NEXT: v_mov_b32_e32 v3, s1
; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0
; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v3, v1, vcc
-; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16
+; GFX6-NEXT: v_bfe_i32 v3, v4, 0, 16
; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
-; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1]
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3
@@ -4316,9 +4324,9 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v3, v1, vcc
-; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16
+; GFX8-NEXT: v_bfe_i32 v3, v4, 0, 16
; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
-; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16
+; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1]
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3
@@ -4391,16 +4399,18 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v3, s1
; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0
-; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v1, v3, vcc
-; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16
-; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16
; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
-; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
-; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
+; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v1, v3, vcc
+; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0
+; GFX6-NEXT: v_bfe_i32 v3, v4, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1]
+; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[2:3]
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3
+; GFX6-NEXT: s_cselect_b64 s[2:3], exec, 0
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v3
; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xffff8000, v0
-; GFX6-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
+; GFX6-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -4410,16 +4420,18 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v1, v3, vcc
-; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16
-; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16
; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
-; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
-; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v1, v3, vcc
+; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0
+; GFX8-NEXT: v_bfe_i32 v3, v4, 0, 16
+; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1]
+; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3
+; GFX8-NEXT: s_cselect_b64 s[2:3], exec, 0
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v3
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xffff8000, v0
-; GFX8-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
+; GFX8-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -4431,9 +4443,11 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], exec, 0
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
@@ -4452,6 +4466,8 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX10-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0
; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
@@ -4469,6 +4485,8 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_cselect_b32 s0, exec_lo, 0
; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
@@ -4562,18 +4580,17 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX6-NEXT: s_addc_u32 s5, s1, s3
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX6-NEXT: s_or_b64 s[0:1], vcc, vcc
; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
-; GFX6-NEXT: s_ashr_i32 s2, s5, 31
-; GFX6-NEXT: s_add_i32 s3, s2, 0x80000000
-; GFX6-NEXT: v_mov_b32_e32 v0, s2
-; GFX6-NEXT: v_mov_b32_e32 v1, s3
-; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: v_mov_b32_e32 v3, s5
-; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
+; GFX6-NEXT: s_cselect_b32 s6, 1, 0
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
+; GFX6-NEXT: s_cselect_b32 s0, 1, 0
+; GFX6-NEXT: s_xor_b32 s2, s0, s6
+; GFX6-NEXT: s_ashr_i32 s0, s5, 31
+; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX6-NEXT: s_and_b32 s2, s2, 1
+; GFX6-NEXT: s_cmp_lg_u32 s2, 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_saddsat_i64:
@@ -4584,17 +4601,16 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
-; GFX8-NEXT: s_ashr_i32 s2, s5, 31
-; GFX8-NEXT: s_add_i32 s3, s2, 0x80000000
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX8-NEXT: s_cselect_b32 s6, 1, 0
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX8-NEXT: s_cselect_b32 s0, 1, 0
+; GFX8-NEXT: s_xor_b32 s2, s0, s6
+; GFX8-NEXT: s_ashr_i32 s0, s5, 31
+; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX8-NEXT: s_and_b32 s2, s2, 1
+; GFX8-NEXT: s_cmp_lg_u32 s2, 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_saddsat_i64:
@@ -4605,51 +4621,35 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
-; GFX9-NEXT: s_ashr_i32 s2, s5, 31
-; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT: s_cselect_b32 s6, 1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cselect_b32 s0, 1, 0
+; GFX9-NEXT: s_xor_b32 s2, s0, s6
+; GFX9-NEXT: s_ashr_i32 s0, s5, 31
+; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX9-NEXT: s_and_b32 s2, s2, 1
+; GFX9-NEXT: s_cmp_lg_u32 s2, 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_saddsat_i64:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s4, s0, s2
-; GFX10-NEXT: s_addc_u32 s5, s1, s3
-; GFX10-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
-; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
-; GFX10-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-NEXT: s_ashr_i32 s2, s5, 31
-; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000
-; GFX10-NEXT: s_xor_b32 s0, s1, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: ; return to shader part epilog
-;
-; GFX11-LABEL: s_saddsat_i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_add_u32 s4, s0, s2
-; GFX11-NEXT: s_addc_u32 s5, s1, s3
-; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
-; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
-; GFX11-NEXT: s_ashr_i32 s2, s5, 31
-; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000
-; GFX11-NEXT: s_xor_b32 s0, s1, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11-NEXT: ; return to shader part epilog
+; GFX10PLUS-LABEL: s_saddsat_i64:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_add_u32 s4, s0, s2
+; GFX10PLUS-NEXT: s_addc_u32 s5, s1, s3
+; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
+; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s0, 1, 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s1, 1, 0
+; GFX10PLUS-NEXT: s_xor_b32 s1, s1, s0
+; GFX10PLUS-NEXT: s_ashr_i32 s0, s5, 31
+; GFX10PLUS-NEXT: s_and_b32 s2, s1, 1
+; GFX10PLUS-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
ret i64 %result
}
@@ -4731,12 +4731,14 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v3, s1
; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0
+; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0
; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
-; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
-; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
+; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1]
+; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[2:3]
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3
+; GFX6-NEXT: s_cselect_b64 s[2:3], exec, 0
; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0
-; GFX6-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
+; GFX6-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX6-NEXT: ; return to shader part epilog
@@ -4745,12 +4747,14 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
-; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
-; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
+; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1]
+; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3
+; GFX8-NEXT: s_cselect_b64 s[2:3], exec, 0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0
-; GFX8-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
+; GFX8-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX8-NEXT: ; return to shader part epilog
@@ -4759,9 +4763,11 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], exec, 0
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
@@ -4776,6 +4782,8 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX10-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0
; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
@@ -4789,6 +4797,8 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_cselect_b32 s0, exec_lo, 0
; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
@@ -4925,35 +4935,33 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX6-NEXT: s_addc_u32 s9, s1, s5
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
-; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
-; GFX6-NEXT: s_ashr_i32 s4, s9, 31
-; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NEXT: v_mov_b32_e32 v1, s5
-; GFX6-NEXT: v_mov_b32_e32 v2, s8
-; GFX6-NEXT: v_mov_b32_e32 v3, s9
-; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX6-NEXT: s_add_u32 s0, s2, s6
; GFX6-NEXT: v_mov_b32_e32 v0, s2
-; GFX6-NEXT: s_addc_u32 s1, s3, s7
+; GFX6-NEXT: s_or_b64 s[0:1], vcc, vcc
+; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
+; GFX6-NEXT: s_cselect_b32 s10, 1, 0
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
+; GFX6-NEXT: s_cselect_b32 s0, 1, 0
+; GFX6-NEXT: s_xor_b32 s4, s0, s10
+; GFX6-NEXT: s_ashr_i32 s0, s9, 31
+; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX6-NEXT: s_and_b32 s4, s4, 1
+; GFX6-NEXT: s_cmp_lg_u32 s4, 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9]
+; GFX6-NEXT: s_add_u32 s4, s2, s6
+; GFX6-NEXT: s_addc_u32 s5, s3, s7
; GFX6-NEXT: v_mov_b32_e32 v1, s3
-; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX6-NEXT: s_or_b64 s[2:3], vcc, vcc
; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
-; GFX6-NEXT: s_ashr_i32 s4, s1, 31
-; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NEXT: v_mov_b32_e32 v1, s5
-; GFX6-NEXT: v_mov_b32_e32 v4, s0
-; GFX6-NEXT: v_mov_b32_e32 v5, s1
-; GFX6-NEXT: s_xor_b64 vcc, s[2:3], vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX6-NEXT: v_readfirstlane_b32 s0, v2
-; GFX6-NEXT: v_readfirstlane_b32 s1, v3
-; GFX6-NEXT: v_readfirstlane_b32 s2, v0
-; GFX6-NEXT: v_readfirstlane_b32 s3, v1
+; GFX6-NEXT: s_cselect_b32 s8, 1, 0
+; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[2:3]
+; GFX6-NEXT: s_cselect_b32 s2, 1, 0
+; GFX6-NEXT: s_xor_b32 s6, s2, s8
+; GFX6-NEXT: s_ashr_i32 s2, s5, 31
+; GFX6-NEXT: s_add_i32 s3, s2, 0x80000000
+; GFX6-NEXT: s_and_b32 s6, s6, 1
+; GFX6-NEXT: s_cmp_lg_u32 s6, 0
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_saddsat_v2i64:
@@ -4964,34 +4972,32 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
-; GFX8-NEXT: s_ashr_i32 s4, s9, 31
-; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_mov_b32_e32 v2, s8
-; GFX8-NEXT: v_mov_b32_e32 v3, s9
-; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX8-NEXT: s_add_u32 s0, s2, s6
+; GFX8-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX8-NEXT: s_cselect_b32 s10, 1, 0
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX8-NEXT: s_cselect_b32 s0, 1, 0
+; GFX8-NEXT: s_xor_b32 s4, s0, s10
+; GFX8-NEXT: s_ashr_i32 s0, s9, 31
+; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX8-NEXT: s_and_b32 s4, s4, 1
+; GFX8-NEXT: s_cmp_lg_u32 s4, 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9]
+; GFX8-NEXT: s_add_u32 s4, s2, s6
; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: s_addc_u32 s1, s3, s7
+; GFX8-NEXT: s_addc_u32 s5, s3, s7
; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
-; GFX8-NEXT: s_ashr_i32 s4, s1, 31
-; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: s_xor_b64 vcc, s[2:3], vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s0, v2
-; GFX8-NEXT: v_readfirstlane_b32 s1, v3
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_readfirstlane_b32 s3, v1
+; GFX8-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX8-NEXT: s_cselect_b32 s8, 1, 0
+; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT: s_cselect_b32 s2, 1, 0
+; GFX8-NEXT: s_xor_b32 s6, s2, s8
+; GFX8-NEXT: s_ashr_i32 s2, s5, 31
+; GFX8-NEXT: s_add_i32 s3, s2, 0x80000000
+; GFX8-NEXT: s_and_b32 s6, s6, 1
+; GFX8-NEXT: s_cmp_lg_u32 s6, 0
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_saddsat_v2i64:
@@ -5002,93 +5008,65 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
-; GFX9-NEXT: s_ashr_i32 s4, s9, 31
-; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX9-NEXT: s_add_u32 s0, s2, s6
+; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT: s_cselect_b32 s10, 1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cselect_b32 s0, 1, 0
+; GFX9-NEXT: s_xor_b32 s4, s0, s10
+; GFX9-NEXT: s_ashr_i32 s0, s9, 31
+; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX9-NEXT: s_and_b32 s4, s4, 1
+; GFX9-NEXT: s_cmp_lg_u32 s4, 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9]
+; GFX9-NEXT: s_add_u32 s4, s2, s6
; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: s_addc_u32 s1, s3, s7
+; GFX9-NEXT: s_addc_u32 s5, s3, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
-; GFX9-NEXT: s_ashr_i32 s4, s1, 31
-; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: s_xor_b64 vcc, s[2:3], vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX9-NEXT: v_readfirstlane_b32 s0, v2
-; GFX9-NEXT: v_readfirstlane_b32 s1, v3
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
-; GFX9-NEXT: v_readfirstlane_b32 s3, v1
+; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT: s_cselect_b32 s2, 1, 0
+; GFX9-NEXT: s_xor_b32 s6, s2, s8
+; GFX9-NEXT: s_ashr_i32 s2, s5, 31
+; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000
+; GFX9-NEXT: s_and_b32 s6, s6, 1
+; GFX9-NEXT: s_cmp_lg_u32 s6, 0
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_saddsat_v2i64:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s8, s0, s4
-; GFX10-NEXT: s_addc_u32 s9, s1, s5
-; GFX10-NEXT: v_mov_b32_e32 v0, s8
-; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
-; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], 0
-; GFX10-NEXT: s_ashr_i32 s4, s9, 31
-; GFX10-NEXT: v_mov_b32_e32 v1, s9
-; GFX10-NEXT: s_add_i32 s8, s4, 0x80000000
-; GFX10-NEXT: s_xor_b32 s5, s1, s0
-; GFX10-NEXT: s_add_u32 s0, s2, s6
-; GFX10-NEXT: s_addc_u32 s1, s3, s7
-; GFX10-NEXT: v_mov_b32_e32 v2, s0
-; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[0:1], s[2:3]
-; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[6:7], 0
-; GFX10-NEXT: v_mov_b32_e32 v3, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s5
-; GFX10-NEXT: s_ashr_i32 s4, s1, 31
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, s5
-; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000
-; GFX10-NEXT: s_xor_b32 s1, s3, s2
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: v_readfirstlane_b32 s2, v2
-; GFX10-NEXT: v_readfirstlane_b32 s3, v3
-; GFX10-NEXT: ; return to shader part epilog
-;
-; GFX11-LABEL: s_saddsat_v2i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_add_u32 s8, s0, s4
-; GFX11-NEXT: s_addc_u32 s9, s1, s5
-; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
-; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
-; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], 0
-; GFX11-NEXT: s_ashr_i32 s4, s9, 31
-; GFX11-NEXT: s_add_i32 s8, s4, 0x80000000
-; GFX11-NEXT: s_xor_b32 s5, s1, s0
-; GFX11-NEXT: s_add_u32 s0, s2, s6
-; GFX11-NEXT: s_addc_u32 s1, s3, s7
-; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[0:1], s[2:3]
-; GFX11-NEXT: v_cmp_lt_i64_e64 s3, s[6:7], 0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s5
-; GFX11-NEXT: s_ashr_i32 s4, s1, 31
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s8, s5
-; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000
-; GFX11-NEXT: s_xor_b32 s1, s3, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11-NEXT: v_readfirstlane_b32 s2, v2
-; GFX11-NEXT: v_readfirstlane_b32 s3, v3
-; GFX11-NEXT: ; return to shader part epilog
+; GFX10PLUS-LABEL: s_saddsat_v2i64:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_add_u32 s8, s0, s4
+; GFX10PLUS-NEXT: s_addc_u32 s9, s1, s5
+; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
+; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s0, 1, 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s1, 1, 0
+; GFX10PLUS-NEXT: s_xor_b32 s1, s1, s0
+; GFX10PLUS-NEXT: s_ashr_i32 s0, s9, 31
+; GFX10PLUS-NEXT: s_and_b32 s4, s1, 1
+; GFX10PLUS-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s4, 0
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9]
+; GFX10PLUS-NEXT: s_add_u32 s4, s2, s6
+; GFX10PLUS-NEXT: s_addc_u32 s5, s3, s7
+; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3]
+; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s3, s[6:7], 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s2, 1, 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s3, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s3, 1, 0
+; GFX10PLUS-NEXT: s_xor_b32 s3, s3, s2
+; GFX10PLUS-NEXT: s_ashr_i32 s2, s5, 31
+; GFX10PLUS-NEXT: s_and_b32 s6, s3, 1
+; GFX10PLUS-NEXT: s_add_i32 s3, s2, 0x80000000
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s6, 0
+; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
ret <2 x i64> %result
}
@@ -5101,214 +5079,138 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX6-NEXT: s_addc_u32 s5, s1, s5
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: s_addc_u32 s8, s2, s6
-; GFX6-NEXT: v_mov_b32_e32 v2, s2
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: s_addc_u32 s9, s3, s7
-; GFX6-NEXT: v_mov_b32_e32 v3, s3
-; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[2:3]
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
+; GFX6-NEXT: s_or_b64 s[0:1], vcc, vcc
+; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT: s_cselect_b32 s10, 1, 0
+; GFX6-NEXT: s_or_b64 s[0:1], vcc, vcc
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT: s_cselect_b32 s2, 1, 0
+; GFX6-NEXT: s_or_b64 s[0:1], vcc, vcc
+; GFX6-NEXT: s_cselect_b32 s0, 1, 0
+; GFX6-NEXT: s_and_b32 s3, s0, 1
; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
-; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX6-NEXT: s_cmp_lg_u32 s3, 0
+; GFX6-NEXT: s_cselect_b32 s2, s10, s2
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], 0
-; GFX6-NEXT: v_mov_b32_e32 v3, s5
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
-; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
+; GFX6-NEXT: s_cselect_b32 s3, 1, 0
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
+; GFX6-NEXT: s_cselect_b32 s0, 1, 0
+; GFX6-NEXT: s_and_b32 s0, s0, 1
+; GFX6-NEXT: s_cmp_lg_u32 s0, 0
+; GFX6-NEXT: s_cselect_b32 s0, 0, s3
+; GFX6-NEXT: s_xor_b32 s6, s0, s2
; GFX6-NEXT: s_ashr_i32 s0, s9, 31
-; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
-; GFX6-NEXT: v_mov_b32_e32 v3, s1
-; GFX6-NEXT: v_mov_b32_e32 v4, s8
-; GFX6-NEXT: v_mov_b32_e32 v5, s9
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v2
-; GFX6-NEXT: v_readfirstlane_b32 s2, v1
-; GFX6-NEXT: v_readfirstlane_b32 s3, v3
+; GFX6-NEXT: s_add_i32 s3, s0, 0x80000000
+; GFX6-NEXT: s_and_b32 s6, s6, 1
+; GFX6-NEXT: s_mov_b32 s1, s0
+; GFX6-NEXT: s_mov_b32 s2, s0
+; GFX6-NEXT: s_cmp_lg_u32 s6, 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[8:9]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_saddsat_i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s4, s0, s4
-; GFX8-NEXT: s_addc_u32 s5, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_addc_u32 s8, s2, s6
+; GFX8-NEXT: s_addc_u32 s5, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: s_addc_u32 s9, s3, s7
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_addc_u32 s8, s2, s6
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s9, s3, s7
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
+; GFX8-NEXT: s_cselect_b32 s10, 1, 0
+; GFX8-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0
+; GFX8-NEXT: s_cselect_b32 s11, 1, 0
; GFX8-NEXT: s_cmp_eq_u64 s[8:9], s[2:3]
+; GFX8-NEXT: s_cselect_b32 s2, s10, s11
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cselect_b32 s0, 1, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[2:3]
-; GFX8-NEXT: s_and_b32 s0, 1, s0
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0
-; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0
-; GFX8-NEXT: s_cselect_b32 s2, 1, 0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX8-NEXT: s_and_b32 s0, 1, s2
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
-; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
+; GFX8-NEXT: s_cselect_b32 s0, 0, s0
+; GFX8-NEXT: s_xor_b32 s6, s0, s2
; GFX8-NEXT: s_ashr_i32 s0, s9, 31
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s8
-; GFX8-NEXT: v_mov_b32_e32 v5, s9
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v2
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_readfirstlane_b32 s3, v3
+; GFX8-NEXT: s_add_i32 s3, s0, 0x80000000
+; GFX8-NEXT: s_and_b32 s6, s6, 1
+; GFX8-NEXT: s_mov_b32 s1, s0
+; GFX8-NEXT: s_mov_b32 s2, s0
+; GFX8-NEXT: s_cmp_lg_u32 s6, 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[8:9]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_saddsat_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s4, s0, s4
-; GFX9-NEXT: s_addc_u32 s5, s1, s5
; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: s_addc_u32 s8, s2, s6
+; GFX9-NEXT: s_addc_u32 s5, s1, s5
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: s_addc_u32 s9, s3, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: s_addc_u32 s8, s2, s6
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: s_addc_u32 s9, s3, s7
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
+; GFX9-NEXT: s_cselect_b32 s10, 1, 0
+; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0
+; GFX9-NEXT: s_cselect_b32 s11, 1, 0
; GFX9-NEXT: s_cmp_eq_u64 s[8:9], s[2:3]
+; GFX9-NEXT: s_cselect_b32 s2, s10, s11
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cselect_b32 s0, 1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[2:3]
-; GFX9-NEXT: s_and_b32 s0, 1, s0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0
-; GFX9-NEXT: s_cselect_b32 s2, 1, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX9-NEXT: s_and_b32 s0, 1, s2
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
-; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
+; GFX9-NEXT: s_cselect_b32 s0, 0, s0
+; GFX9-NEXT: s_xor_b32 s6, s0, s2
; GFX9-NEXT: s_ashr_i32 s0, s9, 31
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-NEXT: v_mov_b32_e32 v5, s9
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v2
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
-; GFX9-NEXT: v_readfirstlane_b32 s3, v3
+; GFX9-NEXT: s_add_i32 s3, s0, 0x80000000
+; GFX9-NEXT: s_and_b32 s6, s6, 1
+; GFX9-NEXT: s_mov_b32 s1, s0
+; GFX9-NEXT: s_mov_b32 s2, s0
+; GFX9-NEXT: s_cmp_lg_u32 s6, 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[8:9]
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_saddsat_i128:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s4, s0, s4
-; GFX10-NEXT: s_addc_u32 s5, s1, s5
-; GFX10-NEXT: s_addc_u32 s8, s2, s6
-; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], s[0:1]
-; GFX10-NEXT: s_addc_u32 s9, s3, s7
-; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[2:3]
-; GFX10-NEXT: v_mov_b32_e32 v3, s9
-; GFX10-NEXT: s_cselect_b32 s10, 1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[2:3]
-; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], 0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10-NEXT: s_and_b32 s0, 1, s10
-; GFX10-NEXT: s_cmp_eq_u64 s[6:7], 0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX10-NEXT: s_cselect_b32 s1, 1, 0
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT: s_and_b32 s1, 1, s1
-; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
-; GFX10-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-NEXT: s_ashr_i32 s0, s9, 31
-; GFX10-NEXT: s_add_i32 s1, s0, 0x80000000
-; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, s8
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, vcc_lo
-; GFX10-NEXT: v_readfirstlane_b32 s0, v1
-; GFX10-NEXT: v_readfirstlane_b32 s1, v2
-; GFX10-NEXT: v_readfirstlane_b32 s2, v0
-; GFX10-NEXT: v_readfirstlane_b32 s3, v3
-; GFX10-NEXT: ; return to shader part epilog
-;
-; GFX11-LABEL: s_saddsat_i128:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_add_u32 s4, s0, s4
-; GFX11-NEXT: s_addc_u32 s5, s1, s5
-; GFX11-NEXT: s_addc_u32 s8, s2, s6
-; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], s[0:1]
-; GFX11-NEXT: s_addc_u32 s9, s3, s7
-; GFX11-NEXT: s_cmp_eq_u64 s[8:9], s[2:3]
-; GFX11-NEXT: v_mov_b32_e32 v3, s9
-; GFX11-NEXT: s_cselect_b32 s10, 1, 0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[2:3]
-; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], 0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX11-NEXT: s_and_b32 s0, 1, s10
-; GFX11-NEXT: s_cmp_eq_u64 s[6:7], 0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX11-NEXT: s_cselect_b32 s1, 1, 0
-; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX11-NEXT: s_and_b32 s1, 1, s1
-; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
-; GFX11-NEXT: v_mov_b32_e32 v2, s5
-; GFX11-NEXT: s_ashr_i32 s0, s9, 31
-; GFX11-NEXT: s_add_i32 s1, s0, 0x80000000
-; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 1, v0
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, s8
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s1, vcc_lo
-; GFX11-NEXT: v_readfirstlane_b32 s0, v1
-; GFX11-NEXT: v_readfirstlane_b32 s1, v2
-; GFX11-NEXT: v_readfirstlane_b32 s2, v0
-; GFX11-NEXT: v_readfirstlane_b32 s3, v3
-; GFX11-NEXT: ; return to shader part epilog
+; GFX10PLUS-LABEL: s_saddsat_i128:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_add_u32 s4, s0, s4
+; GFX10PLUS-NEXT: s_addc_u32 s5, s1, s5
+; GFX10PLUS-NEXT: s_addc_u32 s8, s2, s6
+; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], s[0:1]
+; GFX10PLUS-NEXT: s_addc_u32 s9, s3, s7
+; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s10, s[6:7], 0
+; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[2:3]
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s0, 1, 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s1, 1, 0
+; GFX10PLUS-NEXT: s_cmp_eq_u64 s[8:9], s[2:3]
+; GFX10PLUS-NEXT: s_cselect_b32 s1, s0, s1
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s10, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s0, 1, 0
+; GFX10PLUS-NEXT: s_cmp_eq_u64 s[6:7], 0
+; GFX10PLUS-NEXT: s_cselect_b32 s2, 0, s0
+; GFX10PLUS-NEXT: s_ashr_i32 s0, s9, 31
+; GFX10PLUS-NEXT: s_xor_b32 s2, s2, s1
+; GFX10PLUS-NEXT: s_add_i32 s3, s0, 0x80000000
+; GFX10PLUS-NEXT: s_and_b32 s6, s2, 1
+; GFX10PLUS-NEXT: s_mov_b32 s1, s0
+; GFX10PLUS-NEXT: s_mov_b32 s2, s0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s6, 0
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[2:3], s[8:9]
+; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs)
ret i128 %result
}
@@ -5335,9 +5237,8 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v5
; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc
; GFX6-NEXT: v_xor_b32_e32 v2, v2, v6
-; GFX6-NEXT: v_bfrev_b32_e32 v6, 1
-; GFX6-NEXT: v_add_i32_e32 v6, vcc, v3, v6
; GFX6-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX6-NEXT: v_add_i32_e32 v6, vcc, 0x80000000, v3
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -5366,10 +5267,8 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v5
; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc
; GFX8-NEXT: v_xor_b32_e32 v2, v2, v6
-; GFX8-NEXT: v_bfrev_b32_e32 v6, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v3, v6
-; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x80000000, v3
+; GFX8-NEXT: v_cmp_ne_u16_e32 vcc, 0, v2
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
@@ -5397,9 +5296,8 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5
; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc
; GFX9-NEXT: v_xor_b32_e32 v2, v2, v6
-; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
; GFX9-NEXT: v_add_u32_e32 v6, 0x80000000, v3
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cmp_ne_u16_e32 vcc, 0, v2
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
@@ -5425,8 +5323,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX10-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo
; GFX10-NEXT: v_xor_b32_e32 v2, v2, v6
; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x80000000, v3
-; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v2
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc_lo
@@ -5452,8 +5349,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX11-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo
; GFX11-NEXT: v_xor_b32_e32 v2, v2, v6
; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x80000000, v3
-; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v2
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_cndmask_b32 v3, v5, v6
@@ -5470,24 +5366,28 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX6-NEXT: v_add_i32_e32 v4, vcc, s0, v0
; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v5, vcc
; GFX6-NEXT: v_mov_b32_e32 v6, s2
+; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
; GFX6-NEXT: v_mov_b32_e32 v7, s3
; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v2, v6, vcc
; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v3, v7, vcc
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
-; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
+; GFX6-NEXT: s_cselect_b32 s4, 1, 0
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
+; GFX6-NEXT: s_cselect_b32 s0, 1, 0
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
-; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v7
+; GFX6-NEXT: s_and_b32 s0, s0, 1
+; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
-; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX6-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1
+; GFX6-NEXT: s_cselect_b32 s0, 0, s4
+; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0
+; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v7
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v2
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
@@ -5505,24 +5405,21 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v2, v6, vcc
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v3, v7, vcc
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
-; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0
+; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
-; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
-; GFX8-NEXT: s_cselect_b32 s4, 1, 0
+; GFX8-NEXT: s_cselect_b32 s0, 1, 0
+; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX8-NEXT: s_and_b32 s0, 1, s4
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
-; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
+; GFX8-NEXT: s_cselect_b32 s0, 0, s0
+; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v7
-; GFX8-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x80000000, v2
+; GFX8-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
@@ -5539,23 +5436,21 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v2, v6, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v3, v7, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
-; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
-; GFX9-NEXT: s_cselect_b32 s4, 1, 0
+; GFX9-NEXT: s_cselect_b32 s0, 1, 0
+; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX9-NEXT: s_and_b32 s0, 1, s4
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
-; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7
+; GFX9-NEXT: s_cselect_b32 s0, 0, s0
+; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7
; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
@@ -5569,23 +5464,21 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0
-; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
-; GFX10-NEXT: s_cselect_b32 s0, 1, 0
-; GFX10-NEXT: s_and_b32 s0, 1, s0
+; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[2:3], 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
-; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1
-; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; GFX10-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10-NEXT: s_cselect_b32 s0, 1, 0
+; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
+; GFX10-NEXT: s_cselect_b32 s0, 0, s0
; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7
-; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0
-; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2
+; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
@@ -5599,22 +5492,20 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, s3, v3, vcc_lo
; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX11-NEXT: s_cmp_eq_u64 s[2:3], 0
-; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
-; GFX11-NEXT: s_cselect_b32 s0, 1, 0
-; GFX11-NEXT: s_and_b32 s0, 1, s0
+; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[2:3], 0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
-; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1
-; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_cselect_b32 s0, 1, 0
+; GFX11-NEXT: s_cmp_eq_u64 s[2:3], 0
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7
+; GFX11-NEXT: s_cselect_b32 s0, 0, s0
; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0
-; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
+; GFX11-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0
; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v3, v7, v3
; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
@@ -5645,8 +5536,8 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX6-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc
@@ -5668,8 +5559,8 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
; GFX6-NEXT: v_xor_b32_e32 v4, v5, v4
-; GFX6-NEXT: v_add_i32_e32 v7, vcc, 0x80000000, v6
; GFX6-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX6-NEXT: v_add_i32_e32 v7, vcc, 0x80000000, v6
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
@@ -5698,8 +5589,7 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX8-NEXT: v_bfrev_b32_e32 v1, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc
@@ -5721,8 +5611,7 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
; GFX8-NEXT: v_xor_b32_e32 v4, v5, v4
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x80000000, v6
-; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX8-NEXT: v_cmp_ne_u16_e32 vcc, 0, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc
@@ -5749,8 +5638,7 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc
@@ -5772,8 +5660,7 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4
-; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cmp_ne_u16_e32 vcc, 0, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc
@@ -5805,7 +5692,6 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[6:7]
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[14:15]
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
@@ -5813,17 +5699,16 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v19
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v17
; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6
; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1
-; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v17
-; GFX10-NEXT: v_and_b32_e32 v3, 1, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v3
+; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v3
+; GFX10-NEXT: v_xor_b32_e32 v2, v2, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u16_e64 s4, 0, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v16, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v17, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v4, v12, v6, s4
; GFX10-NEXT: v_cndmask_b32_e64 v5, v13, v6, s4
@@ -5863,17 +5748,16 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v19
; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v17
; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6
; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
-; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1
-; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v17
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v2 :: v_dual_and_b32 v3, 1, v1
-; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc_lo
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v2 :: v_dual_cndmask_b32 v3, v17, v4
+; GFX11-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v3
+; GFX11-NEXT: v_xor_b32_e32 v2, v2, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v3, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u16_e64 s0, 0, v2
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v3 :: v_dual_cndmask_b32 v3, v17, v4
; GFX11-NEXT: v_cndmask_b32_e64 v4, v12, v6, s0
; GFX11-NEXT: v_cndmask_b32_e64 v5, v13, v6, s0
; GFX11-NEXT: v_cndmask_b32_e64 v6, v18, v6, s0
@@ -5891,411 +5775,262 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-NEXT: s_addc_u32 s9, s1, s9
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: s_addc_u32 s16, s2, s10
-; GFX6-NEXT: v_mov_b32_e32 v2, s2
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: s_addc_u32 s17, s3, s11
-; GFX6-NEXT: v_mov_b32_e32 v3, s3
-; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[2:3]
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
+; GFX6-NEXT: s_or_b64 s[0:1], vcc, vcc
+; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[0:1]
+; GFX6-NEXT: s_cselect_b32 s18, 1, 0
+; GFX6-NEXT: s_or_b64 s[0:1], vcc, vcc
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
+; GFX6-NEXT: s_cselect_b32 s2, 1, 0
+; GFX6-NEXT: s_or_b64 s[0:1], vcc, vcc
+; GFX6-NEXT: s_cselect_b32 s0, 1, 0
+; GFX6-NEXT: s_and_b32 s3, s0, 1
; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[2:3]
-; GFX6-NEXT: v_mov_b32_e32 v2, s8
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX6-NEXT: s_cmp_lg_u32 s3, 0
+; GFX6-NEXT: s_cselect_b32 s2, s18, s2
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], 0
-; GFX6-NEXT: v_mov_b32_e32 v3, s9
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
-; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
+; GFX6-NEXT: s_cselect_b32 s3, 1, 0
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
+; GFX6-NEXT: s_cselect_b32 s0, 1, 0
+; GFX6-NEXT: s_and_b32 s0, s0, 1
+; GFX6-NEXT: s_cmp_lg_u32 s0, 0
+; GFX6-NEXT: s_cselect_b32 s0, 0, s3
+; GFX6-NEXT: s_xor_b32 s10, s0, s2
; GFX6-NEXT: s_ashr_i32 s0, s17, 31
-; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX6-NEXT: v_mov_b32_e32 v0, s1
-; GFX6-NEXT: v_mov_b32_e32 v2, s16
-; GFX6-NEXT: v_mov_b32_e32 v3, s17
-; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc
-; GFX6-NEXT: s_add_u32 s0, s4, s12
+; GFX6-NEXT: s_add_i32 s3, s0, 0x80000000
+; GFX6-NEXT: s_and_b32 s10, s10, 1
+; GFX6-NEXT: s_mov_b32 s1, s0
+; GFX6-NEXT: s_mov_b32 s2, s0
+; GFX6-NEXT: s_cmp_lg_u32 s10, 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9]
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[16:17]
+; GFX6-NEXT: s_add_u32 s8, s4, s12
; GFX6-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NEXT: s_addc_u32 s1, s5, s13
+; GFX6-NEXT: s_addc_u32 s9, s5, s13
; GFX6-NEXT: v_mov_b32_e32 v1, s5
-; GFX6-NEXT: s_addc_u32 s2, s6, s14
-; GFX6-NEXT: v_mov_b32_e32 v2, s6
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX6-NEXT: s_addc_u32 s3, s7, s15
-; GFX6-NEXT: v_mov_b32_e32 v3, s7
-; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
+; GFX6-NEXT: s_addc_u32 s10, s6, s14
+; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v0, s6
+; GFX6-NEXT: s_addc_u32 s11, s7, s15
+; GFX6-NEXT: v_mov_b32_e32 v1, s7
+; GFX6-NEXT: s_or_b64 s[4:5], vcc, vcc
+; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
+; GFX6-NEXT: s_cselect_b32 s12, 1, 0
+; GFX6-NEXT: s_or_b64 s[4:5], vcc, vcc
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1]
+; GFX6-NEXT: s_cselect_b32 s6, 1, 0
+; GFX6-NEXT: s_or_b64 s[4:5], vcc, vcc
+; GFX6-NEXT: s_cselect_b32 s4, 1, 0
+; GFX6-NEXT: s_and_b32 s7, s4, 1
; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], s[14:15], 0
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3]
-; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX6-NEXT: s_cmp_lg_u32 s7, 0
+; GFX6-NEXT: s_cselect_b32 s6, s12, s6
+; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[4:5]
; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[14:15], 0
-; GFX6-NEXT: v_mov_b32_e32 v3, s1
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5]
-; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX6-NEXT: s_ashr_i32 s4, s3, 31
-; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000
-; GFX6-NEXT: v_mov_b32_e32 v1, s4
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
-; GFX6-NEXT: v_mov_b32_e32 v3, s5
-; GFX6-NEXT: v_mov_b32_e32 v8, s2
-; GFX6-NEXT: v_mov_b32_e32 v9, s3
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
-; GFX6-NEXT: v_readfirstlane_b32 s0, v4
-; GFX6-NEXT: v_readfirstlane_b32 s1, v5
-; GFX6-NEXT: v_readfirstlane_b32 s2, v6
-; GFX6-NEXT: v_readfirstlane_b32 s3, v7
-; GFX6-NEXT: v_readfirstlane_b32 s4, v0
-; GFX6-NEXT: v_readfirstlane_b32 s5, v2
-; GFX6-NEXT: v_readfirstlane_b32 s6, v1
-; GFX6-NEXT: v_readfirstlane_b32 s7, v3
+; GFX6-NEXT: s_cselect_b32 s7, 1, 0
+; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[4:5]
+; GFX6-NEXT: s_cselect_b32 s4, 1, 0
+; GFX6-NEXT: s_and_b32 s4, s4, 1
+; GFX6-NEXT: s_cmp_lg_u32 s4, 0
+; GFX6-NEXT: s_cselect_b32 s4, 0, s7
+; GFX6-NEXT: s_xor_b32 s12, s4, s6
+; GFX6-NEXT: s_ashr_i32 s4, s11, 31
+; GFX6-NEXT: s_add_i32 s7, s4, 0x80000000
+; GFX6-NEXT: s_and_b32 s12, s12, 1
+; GFX6-NEXT: s_mov_b32 s5, s4
+; GFX6-NEXT: s_mov_b32 s6, s4
+; GFX6-NEXT: s_cmp_lg_u32 s12, 0
+; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9]
+; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_saddsat_v2i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s8, s0, s8
-; GFX8-NEXT: s_addc_u32 s9, s1, s9
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_addc_u32 s16, s2, s10
+; GFX8-NEXT: s_addc_u32 s9, s1, s9
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: s_addc_u32 s17, s3, s11
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_addc_u32 s16, s2, s10
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s17, s3, s11
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[0:1]
+; GFX8-NEXT: s_cselect_b32 s18, 1, 0
+; GFX8-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0
+; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: s_cmp_eq_u64 s[16:17], s[2:3]
+; GFX8-NEXT: s_cselect_b32 s2, s18, s19
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cselect_b32 s0, 1, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[2:3]
-; GFX8-NEXT: s_and_b32 s0, 1, s0
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
; GFX8-NEXT: s_cmp_eq_u64 s[10:11], 0
-; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0
-; GFX8-NEXT: s_cselect_b32 s2, 1, 0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX8-NEXT: s_and_b32 s0, 1, s2
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
-; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
+; GFX8-NEXT: s_cselect_b32 s0, 0, s0
+; GFX8-NEXT: s_xor_b32 s10, s0, s2
; GFX8-NEXT: s_ashr_i32 s0, s17, 31
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: v_mov_b32_e32 v2, s8
-; GFX8-NEXT: v_mov_b32_e32 v3, s9
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s16
-; GFX8-NEXT: v_mov_b32_e32 v3, s17
-; GFX8-NEXT: s_add_u32 s0, s4, s12
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc
-; GFX8-NEXT: s_addc_u32 s1, s5, s13
+; GFX8-NEXT: s_add_i32 s3, s0, 0x80000000
+; GFX8-NEXT: s_and_b32 s10, s10, 1
+; GFX8-NEXT: s_mov_b32 s1, s0
+; GFX8-NEXT: s_mov_b32 s2, s0
+; GFX8-NEXT: s_cmp_lg_u32 s10, 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9]
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[16:17]
+; GFX8-NEXT: s_add_u32 s8, s4, s12
; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: s_addc_u32 s2, s6, s14
+; GFX8-NEXT: s_addc_u32 s9, s5, s13
; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_addc_u32 s3, s7, s15
-; GFX8-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
+; GFX8-NEXT: s_addc_u32 s10, s6, s14
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: s_addc_u32 s11, s7, s15
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
+; GFX8-NEXT: s_cselect_b32 s12, 1, 0
+; GFX8-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], s[14:15], 0
+; GFX8-NEXT: s_cselect_b32 s13, 1, 0
+; GFX8-NEXT: s_cmp_eq_u64 s[10:11], s[6:7]
+; GFX8-NEXT: s_cselect_b32 s6, s12, s13
+; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX8-NEXT: s_cselect_b32 s4, 1, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GFX8-NEXT: s_and_b32 s4, 1, s4
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
; GFX8-NEXT: s_cmp_eq_u64 s[14:15], 0
-; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], s[14:15], 0
-; GFX8-NEXT: s_cselect_b32 s6, 1, 0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; GFX8-NEXT: s_and_b32 s4, 1, s6
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5]
-; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX8-NEXT: s_ashr_i32 s4, s3, 31
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000
-; GFX8-NEXT: v_mov_b32_e32 v1, s4
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: v_mov_b32_e32 v8, s2
-; GFX8-NEXT: v_mov_b32_e32 v9, s3
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s0, v4
-; GFX8-NEXT: v_readfirstlane_b32 s1, v5
-; GFX8-NEXT: v_readfirstlane_b32 s2, v6
-; GFX8-NEXT: v_readfirstlane_b32 s3, v7
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_readfirstlane_b32 s5, v2
-; GFX8-NEXT: v_readfirstlane_b32 s6, v1
-; GFX8-NEXT: v_readfirstlane_b32 s7, v3
+; GFX8-NEXT: s_cselect_b32 s4, 0, s4
+; GFX8-NEXT: s_xor_b32 s12, s4, s6
+; GFX8-NEXT: s_ashr_i32 s4, s11, 31
+; GFX8-NEXT: s_add_i32 s7, s4, 0x80000000
+; GFX8-NEXT: s_and_b32 s12, s12, 1
+; GFX8-NEXT: s_mov_b32 s5, s4
+; GFX8-NEXT: s_mov_b32 s6, s4
+; GFX8-NEXT: s_cmp_lg_u32 s12, 0
+; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9]
+; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_saddsat_v2i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s8, s0, s8
-; GFX9-NEXT: s_addc_u32 s9, s1, s9
; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: s_addc_u32 s16, s2, s10
+; GFX9-NEXT: s_addc_u32 s9, s1, s9
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: s_addc_u32 s17, s3, s11
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: s_addc_u32 s16, s2, s10
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: s_addc_u32 s17, s3, s11
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[0:1]
+; GFX9-NEXT: s_cselect_b32 s18, 1, 0
+; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0
+; GFX9-NEXT: s_cselect_b32 s19, 1, 0
; GFX9-NEXT: s_cmp_eq_u64 s[16:17], s[2:3]
+; GFX9-NEXT: s_cselect_b32 s2, s18, s19
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cselect_b32 s0, 1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[2:3]
-; GFX9-NEXT: s_and_b32 s0, 1, s0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
; GFX9-NEXT: s_cmp_eq_u64 s[10:11], 0
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0
-; GFX9-NEXT: s_cselect_b32 s2, 1, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX9-NEXT: s_and_b32 s0, 1, s2
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
-; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
+; GFX9-NEXT: s_cselect_b32 s0, 0, s0
+; GFX9-NEXT: s_xor_b32 s10, s0, s2
; GFX9-NEXT: s_ashr_i32 s0, s17, 31
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: v_mov_b32_e32 v2, s16
-; GFX9-NEXT: v_mov_b32_e32 v3, s17
-; GFX9-NEXT: s_add_u32 s0, s4, s12
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc
-; GFX9-NEXT: s_addc_u32 s1, s5, s13
+; GFX9-NEXT: s_add_i32 s3, s0, 0x80000000
+; GFX9-NEXT: s_and_b32 s10, s10, 1
+; GFX9-NEXT: s_mov_b32 s1, s0
+; GFX9-NEXT: s_mov_b32 s2, s0
+; GFX9-NEXT: s_cmp_lg_u32 s10, 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9]
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[16:17]
+; GFX9-NEXT: s_add_u32 s8, s4, s12
; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: s_addc_u32 s2, s6, s14
+; GFX9-NEXT: s_addc_u32 s9, s5, s13
; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_addc_u32 s3, s7, s15
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
+; GFX9-NEXT: s_addc_u32 s10, s6, s14
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: s_addc_u32 s11, s7, s15
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
+; GFX9-NEXT: s_cselect_b32 s12, 1, 0
+; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], s[14:15], 0
+; GFX9-NEXT: s_cselect_b32 s13, 1, 0
+; GFX9-NEXT: s_cmp_eq_u64 s[10:11], s[6:7]
+; GFX9-NEXT: s_cselect_b32 s6, s12, s13
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-NEXT: s_cselect_b32 s4, 1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GFX9-NEXT: s_and_b32 s4, 1, s4
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
; GFX9-NEXT: s_cmp_eq_u64 s[14:15], 0
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], s[14:15], 0
-; GFX9-NEXT: s_cselect_b32 s6, 1, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; GFX9-NEXT: s_and_b32 s4, 1, s6
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5]
-; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX9-NEXT: s_ashr_i32 s4, s3, 31
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: v_mov_b32_e32 v8, s2
-; GFX9-NEXT: v_mov_b32_e32 v9, s3
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
-; GFX9-NEXT: v_readfirstlane_b32 s0, v4
-; GFX9-NEXT: v_readfirstlane_b32 s1, v5
-; GFX9-NEXT: v_readfirstlane_b32 s2, v6
-; GFX9-NEXT: v_readfirstlane_b32 s3, v7
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v2
-; GFX9-NEXT: v_readfirstlane_b32 s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s7, v3
+; GFX9-NEXT: s_cselect_b32 s4, 0, s4
+; GFX9-NEXT: s_xor_b32 s12, s4, s6
+; GFX9-NEXT: s_ashr_i32 s4, s11, 31
+; GFX9-NEXT: s_add_i32 s7, s4, 0x80000000
+; GFX9-NEXT: s_and_b32 s12, s12, 1
+; GFX9-NEXT: s_mov_b32 s5, s4
+; GFX9-NEXT: s_mov_b32 s6, s4
+; GFX9-NEXT: s_cmp_lg_u32 s12, 0
+; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9]
+; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11]
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_saddsat_v2i128:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s8, s0, s8
-; GFX10-NEXT: s_addc_u32 s9, s1, s9
-; GFX10-NEXT: s_addc_u32 s16, s2, s10
-; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
-; GFX10-NEXT: s_addc_u32 s17, s3, s11
-; GFX10-NEXT: s_cmp_eq_u64 s[16:17], s[2:3]
-; GFX10-NEXT: v_mov_b32_e32 v4, s17
-; GFX10-NEXT: s_cselect_b32 s18, 1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3]
-; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10-NEXT: s_and_b32 s0, 1, s18
-; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX10-NEXT: s_cselect_b32 s1, 1, 0
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT: s_and_b32 s1, 1, s1
-; GFX10-NEXT: s_ashr_i32 s10, s17, 31
-; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
-; GFX10-NEXT: s_add_i32 s11, s10, 0x80000000
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
-; GFX10-NEXT: s_add_u32 s0, s4, s12
-; GFX10-NEXT: s_addc_u32 s1, s5, s13
-; GFX10-NEXT: s_addc_u32 s2, s6, s14
-; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
-; GFX10-NEXT: s_addc_u32 s3, s7, s15
-; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
-; GFX10-NEXT: v_mov_b32_e32 v5, s0
-; GFX10-NEXT: s_cselect_b32 s12, 1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4
-; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[2:3], s[6:7]
-; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[14:15], 0
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_mov_b32_e32 v6, s2
-; GFX10-NEXT: v_mov_b32_e32 v7, s3
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
-; GFX10-NEXT: s_and_b32 s4, 1, s12
-; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
-; GFX10-NEXT: s_cselect_b32 s5, 1, 0
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
-; GFX10-NEXT: s_and_b32 s5, 1, s5
-; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, s9
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, 0, s4
-; GFX10-NEXT: v_mov_b32_e32 v3, s8
-; GFX10-NEXT: s_ashr_i32 s4, s3, 31
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s11, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s10, vcc_lo
-; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v2, s16
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s10, vcc_lo
-; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000
-; GFX10-NEXT: v_readfirstlane_b32 s3, v4
-; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s10, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: v_readfirstlane_b32 s1, v0
-; GFX10-NEXT: v_readfirstlane_b32 s2, v2
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc_lo
-; GFX10-NEXT: v_readfirstlane_b32 s0, v3
-; GFX10-NEXT: v_readfirstlane_b32 s4, v5
-; GFX10-NEXT: v_readfirstlane_b32 s5, v1
-; GFX10-NEXT: v_readfirstlane_b32 s6, v6
-; GFX10-NEXT: v_readfirstlane_b32 s7, v7
-; GFX10-NEXT: ; return to shader part epilog
-;
-; GFX11-LABEL: s_saddsat_v2i128:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_add_u32 s8, s0, s8
-; GFX11-NEXT: s_addc_u32 s9, s1, s9
-; GFX11-NEXT: s_addc_u32 s16, s2, s10
-; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
-; GFX11-NEXT: s_addc_u32 s17, s3, s11
-; GFX11-NEXT: s_cmp_eq_u64 s[16:17], s[2:3]
-; GFX11-NEXT: s_cselect_b32 s18, 1, 0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3]
-; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX11-NEXT: s_and_b32 s0, 1, s18
-; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX11-NEXT: s_cselect_b32 s1, 1, 0
-; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX11-NEXT: s_and_b32 s1, 1, s1
-; GFX11-NEXT: s_ashr_i32 s10, s17, 31
-; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
-; GFX11-NEXT: s_add_i32 s11, s10, 0x80000000
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
-; GFX11-NEXT: s_add_u32 s0, s4, s12
-; GFX11-NEXT: s_addc_u32 s1, s5, s13
-; GFX11-NEXT: s_addc_u32 s2, s6, s14
-; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
-; GFX11-NEXT: s_addc_u32 s3, s7, s15
-; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX11-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
-; GFX11-NEXT: v_mov_b32_e32 v4, s17
-; GFX11-NEXT: s_cselect_b32 s12, 1, 0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4
-; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[2:3], s[6:7]
-; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[14:15], 0
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
-; GFX11-NEXT: v_mov_b32_e32 v5, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
-; GFX11-NEXT: s_and_b32 s4, 1, s12
-; GFX11-NEXT: s_cmp_eq_u64 s[14:15], 0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
-; GFX11-NEXT: s_cselect_b32 s5, 1, 0
-; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
-; GFX11-NEXT: s_and_b32 s5, 1, s5
-; GFX11-NEXT: v_cmp_ne_u32_e64 s4, 0, s5
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v3, 0, s4
-; GFX11-NEXT: v_mov_b32_e32 v3, s8
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: s_ashr_i32 s4, s3, 31
-; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v0, s9
-; GFX11-NEXT: v_mov_b32_e32 v2, s16
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s11, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s10, vcc_lo
-; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000
-; GFX11-NEXT: v_readfirstlane_b32 s3, v4
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-NEXT: v_readfirstlane_b32 s1, v0
-; GFX11-NEXT: v_readfirstlane_b32 s2, v2
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s4, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s4, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc_lo
-; GFX11-NEXT: v_readfirstlane_b32 s0, v3
-; GFX11-NEXT: v_readfirstlane_b32 s4, v5
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v6
-; GFX11-NEXT: v_readfirstlane_b32 s7, v7
-; GFX11-NEXT: ; return to shader part epilog
+; GFX10PLUS-LABEL: s_saddsat_v2i128:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_add_u32 s8, s0, s8
+; GFX10PLUS-NEXT: s_addc_u32 s9, s1, s9
+; GFX10PLUS-NEXT: s_addc_u32 s16, s2, s10
+; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
+; GFX10PLUS-NEXT: s_addc_u32 s17, s3, s11
+; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s18, s[10:11], 0
+; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s1, s[16:17], s[2:3]
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s0, 1, 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s1, 1, 0
+; GFX10PLUS-NEXT: s_cmp_eq_u64 s[16:17], s[2:3]
+; GFX10PLUS-NEXT: s_cselect_b32 s1, s0, s1
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s18, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s0, 1, 0
+; GFX10PLUS-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10PLUS-NEXT: s_cselect_b32 s2, 0, s0
+; GFX10PLUS-NEXT: s_ashr_i32 s0, s17, 31
+; GFX10PLUS-NEXT: s_xor_b32 s2, s2, s1
+; GFX10PLUS-NEXT: s_add_i32 s3, s0, 0x80000000
+; GFX10PLUS-NEXT: s_and_b32 s10, s2, 1
+; GFX10PLUS-NEXT: s_mov_b32 s1, s0
+; GFX10PLUS-NEXT: s_mov_b32 s2, s0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s10, 0
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9]
+; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[2:3], s[16:17]
+; GFX10PLUS-NEXT: s_add_u32 s8, s4, s12
+; GFX10PLUS-NEXT: s_addc_u32 s9, s5, s13
+; GFX10PLUS-NEXT: s_addc_u32 s10, s6, s14
+; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s4, s[8:9], s[4:5]
+; GFX10PLUS-NEXT: s_addc_u32 s11, s7, s15
+; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s12, s[14:15], 0
+; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s5, s[10:11], s[6:7]
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s4, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s5, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s5, 1, 0
+; GFX10PLUS-NEXT: s_cmp_eq_u64 s[10:11], s[6:7]
+; GFX10PLUS-NEXT: s_cselect_b32 s5, s4, s5
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT: s_cmp_eq_u64 s[14:15], 0
+; GFX10PLUS-NEXT: s_cselect_b32 s6, 0, s4
+; GFX10PLUS-NEXT: s_ashr_i32 s4, s11, 31
+; GFX10PLUS-NEXT: s_xor_b32 s6, s6, s5
+; GFX10PLUS-NEXT: s_add_i32 s7, s4, 0x80000000
+; GFX10PLUS-NEXT: s_and_b32 s12, s6, 1
+; GFX10PLUS-NEXT: s_mov_b32 s5, s4
+; GFX10PLUS-NEXT: s_mov_b32 s6, s4
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0
+; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9]
+; GFX10PLUS-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11]
+; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
ret <2 x i128> %result
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index c1b225562b77b..0a67d1a84ed7a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) {
; GFX6-LABEL: v_ssubsat_i7:
@@ -98,8 +98,9 @@ define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX9-NEXT: s_lshl_b32 s0, s0, 9
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp
-; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_sext_i32_i16 s0, s0
+; GFX9-NEXT: s_ashr_i32 s0, s0, 9
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_ssubsat_i7:
@@ -107,8 +108,9 @@ define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9
; GFX10PLUS-NEXT: v_sub_nc_i16 v0, s0, s1 clamp
-; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 9, v0
; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0
+; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 9
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i7 @llvm.ssub.sat.i7(i7 %lhs, i7 %rhs)
ret i7 %result
@@ -207,8 +209,9 @@ define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX9-NEXT: s_lshl_b32 s0, s0, 8
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp
-; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_sext_i32_i16 s0, s0
+; GFX9-NEXT: s_ashr_i32 s0, s0, 8
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_ssubsat_i8:
@@ -216,8 +219,9 @@ define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
; GFX10PLUS-NEXT: v_sub_nc_i16 v0, s0, s1 clamp
-; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 8, v0
; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0
+; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 8
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i8 @llvm.ssub.sat.i8(i8 %lhs, i8 %rhs)
ret i8 %result
@@ -436,58 +440,48 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp
-; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_sext_i32_i16 s1, s0
+; GFX9-NEXT: s_ashr_i32 s0, s0, 16
+; GFX9-NEXT: s_sext_i32_i16 s2, 0x80008
+; GFX9-NEXT: s_ashr_i32 s1, s1, s2
+; GFX9-NEXT: s_ashr_i32 s0, s0, 8
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s0
+; GFX9-NEXT: s_lshr_b32 s1, s0, 16
+; GFX9-NEXT: s_and_b32 s1, s1, 0xff
+; GFX9-NEXT: s_and_b32 s0, s0, 0xff
+; GFX9-NEXT: s_lshl_b32 s1, s1, 8
+; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_ssubsat_v2i8:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshr_b32 s2, s0, 8
-; GFX10-NEXT: s_lshr_b32 s3, s1, 8
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
-; GFX10-NEXT: s_lshr_b32 s2, s0, 16
-; GFX10-NEXT: s_lshr_b32 s3, s1, 16
-; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008
-; GFX10-NEXT: s_lshl_b32 s2, s2, 8
-; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX10-NEXT: s_lshl_b32 s3, s3, 8
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
-; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 clamp
-; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
-;
-; GFX11-LABEL: s_ssubsat_v2i8:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s2, s0, 8
-; GFX11-NEXT: s_lshr_b32 s3, s1, 8
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3
-; GFX11-NEXT: s_lshr_b32 s2, s0, 16
-; GFX11-NEXT: s_lshr_b32 s3, s1, 16
-; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008
-; GFX11-NEXT: s_lshl_b32 s2, s2, 8
-; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX11-NEXT: s_lshl_b32 s3, s3, 8
-; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3
-; GFX11-NEXT: v_pk_sub_i16 v0, s0, s1 clamp
-; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: ; return to shader part epilog
+; GFX10PLUS-LABEL: s_ssubsat_v2i8:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 8
+; GFX10PLUS-NEXT: s_lshr_b32 s3, s1, 8
+; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16
+; GFX10PLUS-NEXT: s_lshr_b32 s3, s1, 16
+; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 0x80008
+; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, 8
+; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 0x80008
+; GFX10PLUS-NEXT: s_lshl_b32 s3, s3, 8
+; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX10PLUS-NEXT: v_pk_sub_i16 v0, s0, s1 clamp
+; GFX10PLUS-NEXT: s_sext_i32_i16 s1, 0x80008
+; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10PLUS-NEXT: s_sext_i32_i16 s2, s0
+; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 16
+; GFX10PLUS-NEXT: s_ashr_i32 s1, s2, s1
+; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 8
+; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s1, s0
+; GFX10PLUS-NEXT: s_lshr_b32 s1, s0, 16
+; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xff
+; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xff
+; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
+; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1
+; GFX10PLUS-NEXT: ; return to shader part epilog
%lhs = bitcast i16 %lhs.arg to <2 x i8>
%rhs = bitcast i16 %rhs.arg to <2 x i8>
%result = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
@@ -886,66 +880,89 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX9-NEXT: s_lshr_b32 s4, s3, 16
; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008
; GFX9-NEXT: s_lshl_b32 s4, s4, 8
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_pk_sub_i16 v1, s2, v1 clamp
-; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: v_mov_b32_e32 v3, 8
-; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT: v_mov_b32_e32 v2, 0xff
-; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 24
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_pk_sub_i16 v0, s2, v0 clamp
+; GFX9-NEXT: s_sext_i32_i16 s2, s0
+; GFX9-NEXT: s_ashr_i32 s0, s0, 16
+; GFX9-NEXT: s_sext_i32_i16 s3, 0x80008
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-NEXT: s_ashr_i32 s2, s2, s3
+; GFX9-NEXT: s_ashr_i32 s0, s0, 8
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s0
+; GFX9-NEXT: s_sext_i32_i16 s2, s1
+; GFX9-NEXT: s_ashr_i32 s1, s1, 16
+; GFX9-NEXT: s_ashr_i32 s2, s2, s3
+; GFX9-NEXT: s_ashr_i32 s1, s1, 8
+; GFX9-NEXT: s_pack_ll_b32_b16 s1, s2, s1
+; GFX9-NEXT: s_and_b32 s2, s0, 0xff
+; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX9-NEXT: s_lshl_b32 s0, s0, 8
+; GFX9-NEXT: s_or_b32 s0, s2, s0
+; GFX9-NEXT: s_and_b32 s2, s1, 0xff
+; GFX9-NEXT: s_lshl_b32 s2, s2, 16
+; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: s_lshl_b32 s1, s1, 24
+; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_ssubsat_v4i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshr_b32 s2, s0, 8
+; GFX10-NEXT: s_lshr_b32 s5, s1, 8
; GFX10-NEXT: s_lshr_b32 s3, s0, 16
; GFX10-NEXT: s_lshr_b32 s4, s0, 24
-; GFX10-NEXT: s_lshr_b32 s5, s1, 8
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
; GFX10-NEXT: s_lshr_b32 s7, s1, 24
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
-; GFX10-NEXT: s_lshr_b32 s4, s0, 16
-; GFX10-NEXT: s_lshr_b32 s5, s2, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7
+; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
+; GFX10-NEXT: s_lshr_b32 s3, s0, 16
+; GFX10-NEXT: s_lshr_b32 s5, s1, 16
; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008
-; GFX10-NEXT: s_lshl_b32 s4, s4, 8
-; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008
-; GFX10-NEXT: s_lshl_b32 s5, s5, 8
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s5
-; GFX10-NEXT: s_lshr_b32 s4, s1, 16
-; GFX10-NEXT: s_lshr_b32 s5, s3, 16
+; GFX10-NEXT: s_lshl_b32 s3, s3, 8
+; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s7
; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX10-NEXT: s_lshl_b32 s4, s4, 8
-; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008
; GFX10-NEXT: s_lshl_b32 s5, s5, 8
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3
+; GFX10-NEXT: s_lshr_b32 s3, s2, 16
+; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
+; GFX10-NEXT: s_lshr_b32 s5, s4, 16
+; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008
+; GFX10-NEXT: s_lshl_b32 s3, s3, 8
+; GFX10-NEXT: s_lshl_b32 s4, s4, 0x80008
+; GFX10-NEXT: s_lshl_b32 s5, s5, 8
; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 clamp
-; GFX10-NEXT: v_pk_sub_i16 v1, s2, s3 clamp
-; GFX10-NEXT: v_mov_b32_e32 v2, 8
-; GFX10-NEXT: v_mov_b32_e32 v4, 24
-; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s2, s3
+; GFX10-NEXT: s_pack_ll_b32_b16 s1, s4, s5
+; GFX10-NEXT: s_sext_i32_i16 s3, 0x80008
+; GFX10-NEXT: v_pk_sub_i16 v1, s0, s1 clamp
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: s_sext_i32_i16 s2, s0
+; GFX10-NEXT: s_ashr_i32 s0, s0, 16
+; GFX10-NEXT: s_ashr_i32 s2, s2, s3
+; GFX10-NEXT: s_ashr_i32 s0, s0, 8
+; GFX10-NEXT: s_sext_i32_i16 s4, s1
+; GFX10-NEXT: s_ashr_i32 s1, s1, 16
+; GFX10-NEXT: s_ashr_i32 s3, s4, s3
+; GFX10-NEXT: s_ashr_i32 s1, s1, 8
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s2, s0
+; GFX10-NEXT: s_pack_ll_b32_b16 s1, s3, s1
+; GFX10-NEXT: s_bfe_u32 s2, s0, 0x80010
+; GFX10-NEXT: s_and_b32 s0, s0, 0xff
+; GFX10-NEXT: s_lshl_b32 s2, s2, 8
+; GFX10-NEXT: s_and_b32 s3, s1, 0xff
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: s_lshl_b32 s2, s3, 16
+; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: s_lshl_b32 s1, s1, 24
+; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_ssubsat_v4i8:
@@ -965,28 +982,40 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX11-NEXT: s_lshl_b32 s3, s3, 0x80008
; GFX11-NEXT: s_lshl_b32 s5, s5, 8
; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5
; GFX11-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5
; GFX11-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-NEXT: v_pk_sub_i16 v0, s2, s3 clamp
; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008
; GFX11-NEXT: s_lshl_b32 s4, s4, 8
; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX11-NEXT: s_lshl_b32 s2, s5, 8
+; GFX11-NEXT: s_lshl_b32 s5, s5, 8
+; GFX11-NEXT: v_pk_sub_i16 v0, s2, s3 clamp
; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s4
-; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2
-; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s5
+; GFX11-NEXT: s_sext_i32_i16 s3, 0x80008
; GFX11-NEXT: v_pk_sub_i16 v1, s0, s1 clamp
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8
-; GFX11-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1
-; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8
-; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-NEXT: s_sext_i32_i16 s2, s0
+; GFX11-NEXT: s_ashr_i32 s0, s0, 16
+; GFX11-NEXT: s_ashr_i32 s2, s2, s3
+; GFX11-NEXT: s_ashr_i32 s0, s0, 8
+; GFX11-NEXT: s_sext_i32_i16 s4, s1
+; GFX11-NEXT: s_ashr_i32 s1, s1, 16
+; GFX11-NEXT: s_ashr_i32 s3, s4, s3
+; GFX11-NEXT: s_ashr_i32 s1, s1, 8
+; GFX11-NEXT: s_pack_ll_b32_b16 s0, s2, s0
+; GFX11-NEXT: s_pack_ll_b32_b16 s1, s3, s1
+; GFX11-NEXT: s_bfe_u32 s2, s0, 0x80010
+; GFX11-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-NEXT: s_lshl_b32 s2, s2, 8
+; GFX11-NEXT: s_and_b32 s3, s1, 0xff
+; GFX11-NEXT: s_or_b32 s0, s0, s2
+; GFX11-NEXT: s_lshl_b32 s2, s3, 16
+; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX11-NEXT: s_or_b32 s0, s0, s2
+; GFX11-NEXT: s_lshl_b32 s1, s1, 24
+; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
%lhs = bitcast i32 %lhs.arg to <4 x i8>
%rhs = bitcast i32 %rhs.arg to <4 x i8>
@@ -1085,8 +1114,8 @@ define amdgpu_ps i24 @s_ssubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX9-NEXT: s_lshl_b32 s0, s0, 8
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp
-; GFX9-NEXT: v_ashrrev_i32_e32 v0, 8, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_ashr_i32 s0, s0, 8
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_ssubsat_i24:
@@ -1094,8 +1123,8 @@ define amdgpu_ps i24 @s_ssubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
; GFX10PLUS-NEXT: v_sub_nc_i32 v0, s0, s1 clamp
-; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, 8, v0
; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 8
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i24 @llvm.ssub.sat.i24(i24 %lhs, i24 %rhs)
ret i24 %result
@@ -4095,9 +4124,9 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v1, v3, vcc
-; GFX6-NEXT: v_bfe_i32 v5, v4, 0, 16
-; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16
-; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16
+; GFX6-NEXT: v_bfe_i32 v5, v6, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
; GFX6-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3]
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5
@@ -4113,9 +4142,9 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2
; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v1, v3, vcc
-; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 16
-; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16
-; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16
+; GFX8-NEXT: v_bfe_i32 v5, v6, 0, 16
+; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 16
; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
; GFX8-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3]
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5
@@ -4183,51 +4212,49 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX6-LABEL: s_ssubsat_i48:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_sub_u32 s4, s0, s2
-; GFX6-NEXT: s_subb_u32 s3, s1, s3
+; GFX6-NEXT: s_subb_u32 s5, s1, s3
; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000
; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
+; GFX6-NEXT: s_or_b64 s[0:1], vcc, vcc
+; GFX6-NEXT: s_cselect_b32 s3, 1, 0
+; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000
; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
-; GFX6-NEXT: s_ashr_i32 s2, s7, 31
-; GFX6-NEXT: s_ashr_i32 s5, s7, 15
-; GFX6-NEXT: s_addk_i32 s2, 0x8000
-; GFX6-NEXT: v_mov_b32_e32 v0, s5
-; GFX6-NEXT: v_mov_b32_e32 v1, s2
-; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: v_mov_b32_e32 v3, s3
-; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
+; GFX6-NEXT: s_cselect_b32 s0, 1, 0
+; GFX6-NEXT: s_xor_b32 s2, s0, s3
+; GFX6-NEXT: s_ashr_i32 s1, s7, 31
+; GFX6-NEXT: s_ashr_i32 s0, s7, 15
+; GFX6-NEXT: s_addk_i32 s1, 0x8000
+; GFX6-NEXT: s_and_b32 s2, s2, 1
+; GFX6-NEXT: s_cmp_lg_u32 s2, 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_ssubsat_i48:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sub_u32 s4, s0, s2
-; GFX8-NEXT: s_subb_u32 s3, s1, s3
+; GFX8-NEXT: s_subb_u32 s5, s1, s3
; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1]
+; GFX8-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX8-NEXT: s_cselect_b32 s3, 1, 0
+; GFX8-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000
; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
-; GFX8-NEXT: s_ashr_i32 s2, s7, 31
-; GFX8-NEXT: s_ashr_i32 s5, s7, 15
-; GFX8-NEXT: s_addk_i32 s2, 0x8000
-; GFX8-NEXT: v_mov_b32_e32 v0, s5
-; GFX8-NEXT: v_mov_b32_e32 v1, s2
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX8-NEXT: s_cselect_b32 s0, 1, 0
+; GFX8-NEXT: s_xor_b32 s2, s0, s3
+; GFX8-NEXT: s_ashr_i32 s1, s7, 31
+; GFX8-NEXT: s_ashr_i32 s0, s7, 15
+; GFX8-NEXT: s_addk_i32 s1, 0x8000
+; GFX8-NEXT: s_and_b32 s2, s2, 1
+; GFX8-NEXT: s_cmp_lg_u32 s2, 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_ssubsat_i48:
@@ -4240,58 +4267,39 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
-; GFX9-NEXT: s_ashr_i32 s2, s5, 31
-; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT: s_cselect_b32 s6, 1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cselect_b32 s0, 1, 0
+; GFX9-NEXT: s_xor_b32 s2, s0, s6
+; GFX9-NEXT: s_ashr_i32 s0, s5, 31
+; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX9-NEXT: s_and_b32 s2, s2, 1
+; GFX9-NEXT: s_cmp_lg_u32 s2, 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 16
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_ssubsat_i48:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
-; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
-; GFX10-NEXT: s_sub_u32 s4, s0, s2
-; GFX10-NEXT: s_subb_u32 s5, s1, s3
-; GFX10-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
-; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0
-; GFX10-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-NEXT: s_ashr_i32 s2, s5, 31
-; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000
-; GFX10-NEXT: s_xor_b32 s0, s1, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
-; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: ; return to shader part epilog
-;
-; GFX11-LABEL: s_ssubsat_i48:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
-; GFX11-NEXT: s_sub_u32 s4, s0, s2
-; GFX11-NEXT: s_subb_u32 s5, s1, s3
-; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
-; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0
-; GFX11-NEXT: s_ashr_i32 s2, s5, 31
-; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000
-; GFX11-NEXT: s_xor_b32 s0, s1, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
-; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11-NEXT: ; return to shader part epilog
+; GFX10PLUS-LABEL: s_ssubsat_i48:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 16
+; GFX10PLUS-NEXT: s_sub_u32 s4, s0, s2
+; GFX10PLUS-NEXT: s_subb_u32 s5, s1, s3
+; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
+; GFX10PLUS-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s0, 1, 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s1, 1, 0
+; GFX10PLUS-NEXT: s_xor_b32 s1, s1, s0
+; GFX10PLUS-NEXT: s_ashr_i32 s0, s5, 31
+; GFX10PLUS-NEXT: s_and_b32 s2, s1, 1
+; GFX10PLUS-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_ashr_i64 s[0:1], s[0:1], 16
+; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs)
ret i48 %result
}
@@ -4302,9 +4310,9 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX6-NEXT: v_mov_b32_e32 v3, s1
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s0, v0
; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v3, v1, vcc
-; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16
+; GFX6-NEXT: v_bfe_i32 v3, v4, 0, 16
; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
-; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1]
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3
@@ -4321,9 +4329,9 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v3, v1, vcc
-; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16
+; GFX8-NEXT: v_bfe_i32 v3, v4, 0, 16
; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
-; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16
+; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1]
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3
@@ -4396,16 +4404,18 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v3, s1
; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v0
-; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v1, v3, vcc
-; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16
-; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16
; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
-; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
-; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
+; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v1, v3, vcc
+; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], s[0:1], 0
+; GFX6-NEXT: v_bfe_i32 v3, v4, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1]
+; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[2:3]
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3
+; GFX6-NEXT: s_cselect_b64 s[2:3], exec, 0
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v3
; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xffff8000, v0
-; GFX6-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
+; GFX6-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -4415,16 +4425,18 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v0
-; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v1, v3, vcc
-; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16
-; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16
; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000
-; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
-; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
+; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v1, v3, vcc
+; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], s[0:1], 0
+; GFX8-NEXT: v_bfe_i32 v3, v4, 0, 16
+; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1]
+; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3
+; GFX8-NEXT: s_cselect_b64 s[2:3], exec, 0
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v3
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xffff8000, v0
-; GFX8-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
+; GFX8-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
@@ -4436,9 +4448,11 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v0
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], exec, 0
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
@@ -4457,6 +4471,8 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX10-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0
; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
@@ -4474,6 +4490,8 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_cselect_b32 s0, exec_lo, 0
; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
@@ -4567,18 +4585,17 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX6-NEXT: s_subb_u32 s5, s1, s3
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX6-NEXT: s_or_b64 s[0:1], vcc, vcc
; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
-; GFX6-NEXT: s_ashr_i32 s2, s5, 31
-; GFX6-NEXT: s_add_i32 s3, s2, 0x80000000
-; GFX6-NEXT: v_mov_b32_e32 v0, s2
-; GFX6-NEXT: v_mov_b32_e32 v1, s3
-; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: v_mov_b32_e32 v3, s5
-; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
+; GFX6-NEXT: s_cselect_b32 s6, 1, 0
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
+; GFX6-NEXT: s_cselect_b32 s0, 1, 0
+; GFX6-NEXT: s_xor_b32 s2, s0, s6
+; GFX6-NEXT: s_ashr_i32 s0, s5, 31
+; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX6-NEXT: s_and_b32 s2, s2, 1
+; GFX6-NEXT: s_cmp_lg_u32 s2, 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_ssubsat_i64:
@@ -4589,17 +4606,16 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
-; GFX8-NEXT: s_ashr_i32 s2, s5, 31
-; GFX8-NEXT: s_add_i32 s3, s2, 0x80000000
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX8-NEXT: s_cselect_b32 s6, 1, 0
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX8-NEXT: s_cselect_b32 s0, 1, 0
+; GFX8-NEXT: s_xor_b32 s2, s0, s6
+; GFX8-NEXT: s_ashr_i32 s0, s5, 31
+; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX8-NEXT: s_and_b32 s2, s2, 1
+; GFX8-NEXT: s_cmp_lg_u32 s2, 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_ssubsat_i64:
@@ -4610,51 +4626,35 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
-; GFX9-NEXT: s_ashr_i32 s2, s5, 31
-; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT: s_cselect_b32 s6, 1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cselect_b32 s0, 1, 0
+; GFX9-NEXT: s_xor_b32 s2, s0, s6
+; GFX9-NEXT: s_ashr_i32 s0, s5, 31
+; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX9-NEXT: s_and_b32 s2, s2, 1
+; GFX9-NEXT: s_cmp_lg_u32 s2, 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_ssubsat_i64:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_sub_u32 s4, s0, s2
-; GFX10-NEXT: s_subb_u32 s5, s1, s3
-; GFX10-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
-; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0
-; GFX10-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-NEXT: s_ashr_i32 s2, s5, 31
-; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000
-; GFX10-NEXT: s_xor_b32 s0, s1, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: ; return to shader part epilog
-;
-; GFX11-LABEL: s_ssubsat_i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_sub_u32 s4, s0, s2
-; GFX11-NEXT: s_subb_u32 s5, s1, s3
-; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
-; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0
-; GFX11-NEXT: s_ashr_i32 s2, s5, 31
-; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000
-; GFX11-NEXT: s_xor_b32 s0, s1, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11-NEXT: ; return to shader part epilog
+; GFX10PLUS-LABEL: s_ssubsat_i64:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_sub_u32 s4, s0, s2
+; GFX10PLUS-NEXT: s_subb_u32 s5, s1, s3
+; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
+; GFX10PLUS-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s0, 1, 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s1, 1, 0
+; GFX10PLUS-NEXT: s_xor_b32 s1, s1, s0
+; GFX10PLUS-NEXT: s_ashr_i32 s0, s5, 31
+; GFX10PLUS-NEXT: s_and_b32 s2, s1, 1
+; GFX10PLUS-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
ret i64 %result
}
@@ -4736,12 +4736,14 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v3, s1
; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v0
+; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], s[0:1], 0
; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
-; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
-; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
+; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1]
+; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[2:3]
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3
+; GFX6-NEXT: s_cselect_b64 s[2:3], exec, 0
; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0
-; GFX6-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
+; GFX6-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX6-NEXT: ; return to shader part epilog
@@ -4750,12 +4752,14 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], s[0:1], 0
; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
-; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
-; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
+; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1]
+; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3
+; GFX8-NEXT: s_cselect_b64 s[2:3], exec, 0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0
-; GFX8-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
+; GFX8-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX8-NEXT: ; return to shader part epilog
@@ -4764,9 +4768,11 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v0
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], exec, 0
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
@@ -4781,6 +4787,8 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX10-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10-NEXT: s_cselect_b32 s0, exec_lo, 0
; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
@@ -4794,6 +4802,8 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_cselect_b32 s0, exec_lo, 0
; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
@@ -4930,35 +4940,33 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX6-NEXT: s_subb_u32 s9, s1, s5
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
-; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
-; GFX6-NEXT: s_ashr_i32 s4, s9, 31
-; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NEXT: v_mov_b32_e32 v1, s5
-; GFX6-NEXT: v_mov_b32_e32 v2, s8
-; GFX6-NEXT: v_mov_b32_e32 v3, s9
-; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX6-NEXT: s_sub_u32 s0, s2, s6
; GFX6-NEXT: v_mov_b32_e32 v0, s2
-; GFX6-NEXT: s_subb_u32 s1, s3, s7
+; GFX6-NEXT: s_or_b64 s[0:1], vcc, vcc
+; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
+; GFX6-NEXT: s_cselect_b32 s10, 1, 0
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
+; GFX6-NEXT: s_cselect_b32 s0, 1, 0
+; GFX6-NEXT: s_xor_b32 s4, s0, s10
+; GFX6-NEXT: s_ashr_i32 s0, s9, 31
+; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX6-NEXT: s_and_b32 s4, s4, 1
+; GFX6-NEXT: s_cmp_lg_u32 s4, 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9]
+; GFX6-NEXT: s_sub_u32 s4, s2, s6
+; GFX6-NEXT: s_subb_u32 s5, s3, s7
; GFX6-NEXT: v_mov_b32_e32 v1, s3
-; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX6-NEXT: s_or_b64 s[2:3], vcc, vcc
; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
-; GFX6-NEXT: s_ashr_i32 s4, s1, 31
-; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NEXT: v_mov_b32_e32 v1, s5
-; GFX6-NEXT: v_mov_b32_e32 v4, s0
-; GFX6-NEXT: v_mov_b32_e32 v5, s1
-; GFX6-NEXT: s_xor_b64 vcc, s[2:3], vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX6-NEXT: v_readfirstlane_b32 s0, v2
-; GFX6-NEXT: v_readfirstlane_b32 s1, v3
-; GFX6-NEXT: v_readfirstlane_b32 s2, v0
-; GFX6-NEXT: v_readfirstlane_b32 s3, v1
+; GFX6-NEXT: s_cselect_b32 s8, 1, 0
+; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[2:3]
+; GFX6-NEXT: s_cselect_b32 s2, 1, 0
+; GFX6-NEXT: s_xor_b32 s6, s2, s8
+; GFX6-NEXT: s_ashr_i32 s2, s5, 31
+; GFX6-NEXT: s_add_i32 s3, s2, 0x80000000
+; GFX6-NEXT: s_and_b32 s6, s6, 1
+; GFX6-NEXT: s_cmp_lg_u32 s6, 0
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_ssubsat_v2i64:
@@ -4969,34 +4977,32 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
-; GFX8-NEXT: s_ashr_i32 s4, s9, 31
-; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_mov_b32_e32 v2, s8
-; GFX8-NEXT: v_mov_b32_e32 v3, s9
-; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX8-NEXT: s_sub_u32 s0, s2, s6
+; GFX8-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX8-NEXT: s_cselect_b32 s10, 1, 0
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX8-NEXT: s_cselect_b32 s0, 1, 0
+; GFX8-NEXT: s_xor_b32 s4, s0, s10
+; GFX8-NEXT: s_ashr_i32 s0, s9, 31
+; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX8-NEXT: s_and_b32 s4, s4, 1
+; GFX8-NEXT: s_cmp_lg_u32 s4, 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9]
+; GFX8-NEXT: s_sub_u32 s4, s2, s6
; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: s_subb_u32 s1, s3, s7
+; GFX8-NEXT: s_subb_u32 s5, s3, s7
; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
-; GFX8-NEXT: s_ashr_i32 s4, s1, 31
-; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: s_xor_b64 vcc, s[2:3], vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s0, v2
-; GFX8-NEXT: v_readfirstlane_b32 s1, v3
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_readfirstlane_b32 s3, v1
+; GFX8-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX8-NEXT: s_cselect_b32 s8, 1, 0
+; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT: s_cselect_b32 s2, 1, 0
+; GFX8-NEXT: s_xor_b32 s6, s2, s8
+; GFX8-NEXT: s_ashr_i32 s2, s5, 31
+; GFX8-NEXT: s_add_i32 s3, s2, 0x80000000
+; GFX8-NEXT: s_and_b32 s6, s6, 1
+; GFX8-NEXT: s_cmp_lg_u32 s6, 0
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_ssubsat_v2i64:
@@ -5007,93 +5013,65 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
-; GFX9-NEXT: s_ashr_i32 s4, s9, 31
-; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
-; GFX9-NEXT: s_sub_u32 s0, s2, s6
+; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT: s_cselect_b32 s10, 1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cselect_b32 s0, 1, 0
+; GFX9-NEXT: s_xor_b32 s4, s0, s10
+; GFX9-NEXT: s_ashr_i32 s0, s9, 31
+; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX9-NEXT: s_and_b32 s4, s4, 1
+; GFX9-NEXT: s_cmp_lg_u32 s4, 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9]
+; GFX9-NEXT: s_sub_u32 s4, s2, s6
; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: s_subb_u32 s1, s3, s7
+; GFX9-NEXT: s_subb_u32 s5, s3, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
-; GFX9-NEXT: s_ashr_i32 s4, s1, 31
-; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v4, s0
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: s_xor_b64 vcc, s[2:3], vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX9-NEXT: v_readfirstlane_b32 s0, v2
-; GFX9-NEXT: v_readfirstlane_b32 s1, v3
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
-; GFX9-NEXT: v_readfirstlane_b32 s3, v1
+; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT: s_cselect_b32 s8, 1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT: s_cselect_b32 s2, 1, 0
+; GFX9-NEXT: s_xor_b32 s6, s2, s8
+; GFX9-NEXT: s_ashr_i32 s2, s5, 31
+; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000
+; GFX9-NEXT: s_and_b32 s6, s6, 1
+; GFX9-NEXT: s_cmp_lg_u32 s6, 0
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_ssubsat_v2i64:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_sub_u32 s8, s0, s4
-; GFX10-NEXT: s_subb_u32 s9, s1, s5
-; GFX10-NEXT: v_mov_b32_e32 v0, s8
-; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
-; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[4:5], 0
-; GFX10-NEXT: s_ashr_i32 s4, s9, 31
-; GFX10-NEXT: v_mov_b32_e32 v1, s9
-; GFX10-NEXT: s_add_i32 s8, s4, 0x80000000
-; GFX10-NEXT: s_xor_b32 s5, s1, s0
-; GFX10-NEXT: s_sub_u32 s0, s2, s6
-; GFX10-NEXT: s_subb_u32 s1, s3, s7
-; GFX10-NEXT: v_mov_b32_e32 v2, s0
-; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[0:1], s[2:3]
-; GFX10-NEXT: v_cmp_gt_i64_e64 s3, s[6:7], 0
-; GFX10-NEXT: v_mov_b32_e32 v3, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s5
-; GFX10-NEXT: s_ashr_i32 s4, s1, 31
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, s5
-; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000
-; GFX10-NEXT: s_xor_b32 s1, s3, s2
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: v_readfirstlane_b32 s2, v2
-; GFX10-NEXT: v_readfirstlane_b32 s3, v3
-; GFX10-NEXT: ; return to shader part epilog
-;
-; GFX11-LABEL: s_ssubsat_v2i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_sub_u32 s8, s0, s4
-; GFX11-NEXT: s_subb_u32 s9, s1, s5
-; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
-; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
-; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[4:5], 0
-; GFX11-NEXT: s_ashr_i32 s4, s9, 31
-; GFX11-NEXT: s_add_i32 s8, s4, 0x80000000
-; GFX11-NEXT: s_xor_b32 s5, s1, s0
-; GFX11-NEXT: s_sub_u32 s0, s2, s6
-; GFX11-NEXT: s_subb_u32 s1, s3, s7
-; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[0:1], s[2:3]
-; GFX11-NEXT: v_cmp_gt_i64_e64 s3, s[6:7], 0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s5
-; GFX11-NEXT: s_ashr_i32 s4, s1, 31
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s8, s5
-; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000
-; GFX11-NEXT: s_xor_b32 s1, s3, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11-NEXT: v_readfirstlane_b32 s2, v2
-; GFX11-NEXT: v_readfirstlane_b32 s3, v3
-; GFX11-NEXT: ; return to shader part epilog
+; GFX10PLUS-LABEL: s_ssubsat_v2i64:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_sub_u32 s8, s0, s4
+; GFX10PLUS-NEXT: s_subb_u32 s9, s1, s5
+; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
+; GFX10PLUS-NEXT: v_cmp_gt_i64_e64 s1, s[4:5], 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s0, 1, 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s1, 1, 0
+; GFX10PLUS-NEXT: s_xor_b32 s1, s1, s0
+; GFX10PLUS-NEXT: s_ashr_i32 s0, s9, 31
+; GFX10PLUS-NEXT: s_and_b32 s4, s1, 1
+; GFX10PLUS-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s4, 0
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9]
+; GFX10PLUS-NEXT: s_sub_u32 s4, s2, s6
+; GFX10PLUS-NEXT: s_subb_u32 s5, s3, s7
+; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3]
+; GFX10PLUS-NEXT: v_cmp_gt_i64_e64 s3, s[6:7], 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s2, 1, 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s3, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s3, 1, 0
+; GFX10PLUS-NEXT: s_xor_b32 s3, s3, s2
+; GFX10PLUS-NEXT: s_ashr_i32 s2, s5, 31
+; GFX10PLUS-NEXT: s_and_b32 s6, s3, 1
+; GFX10PLUS-NEXT: s_add_i32 s3, s2, 0x80000000
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s6, 0
+; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
ret <2 x i64> %result
}
@@ -5106,223 +5084,150 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX6-NEXT: s_subb_u32 s9, s1, s5
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: s_subb_u32 s10, s2, s6
-; GFX6-NEXT: v_mov_b32_e32 v2, s2
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: s_subb_u32 s11, s3, s7
-; GFX6-NEXT: v_mov_b32_e32 v3, s3
-; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[2:3]
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
+; GFX6-NEXT: s_or_b64 s[0:1], vcc, vcc
+; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
+; GFX6-NEXT: s_cselect_b32 s12, 1, 0
+; GFX6-NEXT: s_or_b64 s[0:1], vcc, vcc
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1]
+; GFX6-NEXT: s_cselect_b32 s2, 1, 0
+; GFX6-NEXT: s_or_b64 s[0:1], vcc, vcc
+; GFX6-NEXT: s_cselect_b32 s0, 1, 0
+; GFX6-NEXT: s_and_b32 s3, s0, 1
; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[4:5], 0
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[2:3]
-; GFX6-NEXT: v_mov_b32_e32 v3, s9
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX6-NEXT: s_cmp_lg_u32 s3, 0
+; GFX6-NEXT: s_cselect_b32 s2, s12, s2
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0
-; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[6:7], 0
-; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
+; GFX6-NEXT: s_cselect_b32 s3, 1, 0
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], 0
+; GFX6-NEXT: s_cselect_b32 s4, 1, 0
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
+; GFX6-NEXT: s_cselect_b32 s0, 1, 0
+; GFX6-NEXT: s_and_b32 s0, s0, 1
+; GFX6-NEXT: s_cmp_lg_u32 s0, 0
+; GFX6-NEXT: s_cselect_b32 s0, s3, s4
+; GFX6-NEXT: s_xor_b32 s4, s0, s2
; GFX6-NEXT: s_ashr_i32 s0, s11, 31
-; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
-; GFX6-NEXT: v_mov_b32_e32 v2, s8
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
-; GFX6-NEXT: v_mov_b32_e32 v3, s1
-; GFX6-NEXT: v_mov_b32_e32 v4, s10
-; GFX6-NEXT: v_mov_b32_e32 v5, s11
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v2
-; GFX6-NEXT: v_readfirstlane_b32 s2, v1
-; GFX6-NEXT: v_readfirstlane_b32 s3, v3
+; GFX6-NEXT: s_add_i32 s3, s0, 0x80000000
+; GFX6-NEXT: s_and_b32 s4, s4, 1
+; GFX6-NEXT: s_mov_b32 s1, s0
+; GFX6-NEXT: s_mov_b32 s2, s0
+; GFX6-NEXT: s_cmp_lg_u32 s4, 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9]
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_ssubsat_i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sub_u32 s8, s0, s4
-; GFX8-NEXT: s_subb_u32 s9, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_subb_u32 s10, s2, s6
+; GFX8-NEXT: s_subb_u32 s9, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: s_subb_u32 s11, s3, s7
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_subb_u32 s10, s2, s6
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_subb_u32 s11, s3, s7
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
+; GFX8-NEXT: s_cselect_b32 s12, 1, 0
+; GFX8-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX8-NEXT: v_cmp_gt_u64_e64 s[0:1], s[4:5], 0
+; GFX8-NEXT: s_cselect_b32 s13, 1, 0
; GFX8-NEXT: s_cmp_eq_u64 s[10:11], s[2:3]
+; GFX8-NEXT: s_cselect_b32 s2, s12, s13
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0
+; GFX8-NEXT: s_cselect_b32 s3, 1, 0
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cselect_b32 s0, 1, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[2:3]
-; GFX8-NEXT: s_and_b32 s0, 1, s0
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT: v_cmp_gt_u64_e64 s[0:1], s[4:5], 0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0
-; GFX8-NEXT: s_cselect_b32 s2, 1, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX8-NEXT: s_and_b32 s0, 1, s2
-; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
+; GFX8-NEXT: s_cselect_b32 s0, s3, s0
+; GFX8-NEXT: s_xor_b32 s4, s0, s2
; GFX8-NEXT: s_ashr_i32 s0, s11, 31
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: v_mov_b32_e32 v2, s8
-; GFX8-NEXT: v_mov_b32_e32 v3, s9
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s10
-; GFX8-NEXT: v_mov_b32_e32 v5, s11
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v2
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_readfirstlane_b32 s3, v3
+; GFX8-NEXT: s_add_i32 s3, s0, 0x80000000
+; GFX8-NEXT: s_and_b32 s4, s4, 1
+; GFX8-NEXT: s_mov_b32 s1, s0
+; GFX8-NEXT: s_mov_b32 s2, s0
+; GFX8-NEXT: s_cmp_lg_u32 s4, 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9]
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_ssubsat_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_sub_u32 s8, s0, s4
-; GFX9-NEXT: s_subb_u32 s9, s1, s5
; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: s_subb_u32 s10, s2, s6
+; GFX9-NEXT: s_subb_u32 s9, s1, s5
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: s_subb_u32 s11, s3, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: s_subb_u32 s10, s2, s6
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: s_subb_u32 s11, s3, s7
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
+; GFX9-NEXT: s_cselect_b32 s12, 1, 0
+; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT: v_cmp_gt_u64_e64 s[0:1], s[4:5], 0
+; GFX9-NEXT: s_cselect_b32 s13, 1, 0
; GFX9-NEXT: s_cmp_eq_u64 s[10:11], s[2:3]
+; GFX9-NEXT: s_cselect_b32 s2, s12, s13
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0
+; GFX9-NEXT: s_cselect_b32 s3, 1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cselect_b32 s0, 1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[2:3]
-; GFX9-NEXT: s_and_b32 s0, 1, s0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT: v_cmp_gt_u64_e64 s[0:1], s[4:5], 0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0
-; GFX9-NEXT: s_cselect_b32 s2, 1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX9-NEXT: s_and_b32 s0, 1, s2
-; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
+; GFX9-NEXT: s_cselect_b32 s0, s3, s0
+; GFX9-NEXT: s_xor_b32 s4, s0, s2
; GFX9-NEXT: s_ashr_i32 s0, s11, 31
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_mov_b32_e32 v4, s10
-; GFX9-NEXT: v_mov_b32_e32 v5, s11
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v2
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
-; GFX9-NEXT: v_readfirstlane_b32 s3, v3
+; GFX9-NEXT: s_add_i32 s3, s0, 0x80000000
+; GFX9-NEXT: s_and_b32 s4, s4, 1
+; GFX9-NEXT: s_mov_b32 s1, s0
+; GFX9-NEXT: s_mov_b32 s2, s0
+; GFX9-NEXT: s_cmp_lg_u32 s4, 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9]
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11]
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_ssubsat_i128:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_sub_u32 s8, s0, s4
-; GFX10-NEXT: s_subb_u32 s9, s1, s5
-; GFX10-NEXT: s_subb_u32 s10, s2, s6
-; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
-; GFX10-NEXT: s_subb_u32 s11, s3, s7
-; GFX10-NEXT: s_cmp_eq_u64 s[10:11], s[2:3]
-; GFX10-NEXT: s_cselect_b32 s12, 1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[10:11], s[2:3]
-; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], 0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10-NEXT: s_and_b32 s0, 1, s12
-; GFX10-NEXT: s_cmp_eq_u64 s[6:7], 0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[6:7], 0
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT: s_cselect_b32 s1, 1, 0
-; GFX10-NEXT: s_ashr_i32 s0, s11, 31
-; GFX10-NEXT: s_and_b32 s1, 1, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
-; GFX10-NEXT: s_add_i32 s1, s0, 0x80000000
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
-; GFX10-NEXT: v_mov_b32_e32 v2, s10
-; GFX10-NEXT: v_mov_b32_e32 v3, s11
-; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v1, s8
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, s9
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, vcc_lo
-; GFX10-NEXT: v_readfirstlane_b32 s0, v1
-; GFX10-NEXT: v_readfirstlane_b32 s1, v0
-; GFX10-NEXT: v_readfirstlane_b32 s2, v2
-; GFX10-NEXT: v_readfirstlane_b32 s3, v3
-; GFX10-NEXT: ; return to shader part epilog
-;
-; GFX11-LABEL: s_ssubsat_i128:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_sub_u32 s8, s0, s4
-; GFX11-NEXT: s_subb_u32 s9, s1, s5
-; GFX11-NEXT: s_subb_u32 s10, s2, s6
-; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
-; GFX11-NEXT: s_subb_u32 s11, s3, s7
-; GFX11-NEXT: s_cmp_eq_u64 s[10:11], s[2:3]
-; GFX11-NEXT: s_cselect_b32 s12, 1, 0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[10:11], s[2:3]
-; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], 0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX11-NEXT: s_and_b32 s0, 1, s12
-; GFX11-NEXT: s_cmp_eq_u64 s[6:7], 0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX11-NEXT: v_cmp_gt_i64_e64 s2, s[6:7], 0
-; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX11-NEXT: s_cselect_b32 s1, 1, 0
-; GFX11-NEXT: s_ashr_i32 s0, s11, 31
-; GFX11-NEXT: s_and_b32 s1, 1, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
-; GFX11-NEXT: s_add_i32 s1, s0, 0x80000000
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v2 :: v_dual_mov_b32 v2, s10
-; GFX11-NEXT: v_mov_b32_e32 v3, s11
-; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX11-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_and_b32 v0, 1, v0
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, s9
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s1, vcc_lo
-; GFX11-NEXT: v_readfirstlane_b32 s0, v1
-; GFX11-NEXT: v_readfirstlane_b32 s1, v0
-; GFX11-NEXT: v_readfirstlane_b32 s2, v2
-; GFX11-NEXT: v_readfirstlane_b32 s3, v3
-; GFX11-NEXT: ; return to shader part epilog
+; GFX10PLUS-LABEL: s_ssubsat_i128:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_sub_u32 s8, s0, s4
+; GFX10PLUS-NEXT: s_subb_u32 s9, s1, s5
+; GFX10PLUS-NEXT: s_subb_u32 s10, s2, s6
+; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
+; GFX10PLUS-NEXT: s_subb_u32 s11, s3, s7
+; GFX10PLUS-NEXT: v_cmp_gt_u64_e64 s4, s[4:5], 0
+; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s1, s[10:11], s[2:3]
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s0, 1, 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s1, 1, 0
+; GFX10PLUS-NEXT: s_cmp_eq_u64 s[10:11], s[2:3]
+; GFX10PLUS-NEXT: v_cmp_gt_i64_e64 s2, s[6:7], 0
+; GFX10PLUS-NEXT: s_cselect_b32 s1, s0, s1
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s4, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s0, 1, 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s2, 1, 0
+; GFX10PLUS-NEXT: s_cmp_eq_u64 s[6:7], 0
+; GFX10PLUS-NEXT: s_cselect_b32 s2, s0, s2
+; GFX10PLUS-NEXT: s_ashr_i32 s0, s11, 31
+; GFX10PLUS-NEXT: s_xor_b32 s2, s2, s1
+; GFX10PLUS-NEXT: s_add_i32 s3, s0, 0x80000000
+; GFX10PLUS-NEXT: s_and_b32 s4, s2, 1
+; GFX10PLUS-NEXT: s_mov_b32 s1, s0
+; GFX10PLUS-NEXT: s_mov_b32 s2, s0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s4, 0
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9]
+; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11]
+; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs)
ret i128 %result
}
@@ -5351,9 +5256,8 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v7
; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX6-NEXT: v_xor_b32_e32 v0, v0, v8
-; GFX6-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v2
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
@@ -5384,10 +5288,8 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v7
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT: v_xor_b32_e32 v0, v0, v8
-; GFX8-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x80000000, v2
+; GFX8-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
@@ -5417,9 +5319,8 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX9-NEXT: v_xor_b32_e32 v0, v0, v8
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
@@ -5447,8 +5348,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2
; GFX10-NEXT: v_xor_b32_e32 v0, v0, v8
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
@@ -5475,8 +5375,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7
; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v8
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0
; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3
@@ -5491,28 +5390,33 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v5, s1
; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s0, v0
+; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], 0
; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v1, v5, vcc
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
; GFX6-NEXT: v_mov_b32_e32 v6, s2
+; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
; GFX6-NEXT: v_mov_b32_e32 v7, s3
; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v2, v6, vcc
; GFX6-NEXT: v_subb_u32_e32 v7, vcc, v3, v7, vcc
+; GFX6-NEXT: s_cselect_b32 s4, 1, 0
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
-; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], 0
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
+; GFX6-NEXT: s_cselect_b32 s5, 1, 0
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
+; GFX6-NEXT: s_cselect_b32 s0, 1, 0
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX6-NEXT: s_and_b32 s0, s0, 1
+; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
-; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[2:3], 0
-; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
+; GFX6-NEXT: s_cselect_b32 s0, s4, s5
+; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v7
-; GFX6-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v2
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
@@ -5533,23 +5437,21 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX8-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
-; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
; GFX8-NEXT: s_cselect_b32 s4, 1, 0
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX8-NEXT: s_cselect_b32 s0, 1, 0
+; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX8-NEXT: s_and_b32 s0, 1, s4
-; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
+; GFX8-NEXT: s_cselect_b32 s0, s4, s0
+; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v7
-; GFX8-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x80000000, v2
+; GFX8-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
@@ -5569,22 +5471,21 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX9-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
-; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
; GFX9-NEXT: s_cselect_b32 s4, 1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cselect_b32 s0, 1, 0
+; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX9-NEXT: s_and_b32 s0, 1, s4
-; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7
+; GFX9-NEXT: s_cselect_b32 s0, s4, s0
+; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7
; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
@@ -5599,24 +5500,23 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], 0
-; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0
-; GFX10-NEXT: s_cselect_b32 s4, 1, 0
+; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
-; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0
-; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0
+; GFX10-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10-NEXT: s_cselect_b32 s0, 1, 0
+; GFX10-NEXT: s_cmp_lg_u32 s1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
-; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0
-; GFX10-NEXT: s_and_b32 s0, 1, s4
+; GFX10-NEXT: s_cselect_b32 s1, 1, 0
+; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0
; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7
+; GFX10-NEXT: s_cselect_b32 s0, s0, s1
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo
-; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
+; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
@@ -5631,27 +5531,25 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX11-NEXT: v_subrev_co_ci_u32_e64 v7, null, s3, v3, vcc_lo
; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
; GFX11-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], 0
-; GFX11-NEXT: s_cmp_eq_u64 s[2:3], 0
-; GFX11-NEXT: s_cselect_b32 s4, 1, 0
+; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
-; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0
-; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0
+; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_cselect_b32 s0, 1, 0
+; GFX11-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
-; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0
-; GFX11-NEXT: s_and_b32 s0, 1, s4
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo
-; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
+; GFX11-NEXT: s_cselect_b32 s1, 1, 0
+; GFX11-NEXT: s_cmp_eq_u64 s[2:3], 0
+; GFX11-NEXT: s_cselect_b32 s0, s0, s1
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2
+; GFX11-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v3, v7, v3
; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
; GFX11-NEXT: ; return to shader part epilog
%result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs)
%cast = bitcast i128 %result to <4 x float>
@@ -5681,8 +5579,8 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v19
; GFX6-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc
@@ -5706,8 +5604,8 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX6-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
; GFX6-NEXT: v_xor_b32_e32 v4, v5, v4
; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v11
-; GFX6-NEXT: v_add_i32_e32 v7, vcc, 0x80000000, v6
; GFX6-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX6-NEXT: v_add_i32_e32 v7, vcc, 0x80000000, v6
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
@@ -5738,8 +5636,7 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v19
; GFX8-NEXT: v_bfrev_b32_e32 v1, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX8-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc
@@ -5760,11 +5657,10 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX8-NEXT: v_xor_b32_e32 v4, v5, v4
; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v11
+; GFX8-NEXT: v_xor_b32_e32 v4, v5, v4
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x80000000, v6
-; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX8-NEXT: v_cmp_ne_u16_e32 vcc, 0, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc
@@ -5792,9 +5688,8 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v19
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc
@@ -5817,9 +5712,8 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4
; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v11
-; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
; GFX9-NEXT: v_add_u32_e32 v7, 0x80000000, v6
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cmp_ne_u16_e32 vcc, 0, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc
@@ -5853,7 +5747,6 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[20:21], v[6:7]
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[12:13]
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
@@ -5863,17 +5756,16 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v21
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v19
; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6
; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1
-; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v19
-; GFX10-NEXT: v_and_b32_e32 v3, 1, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v3
+; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v3
+; GFX10-NEXT: v_xor_b32_e32 v2, v2, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u16_e64 s4, 0, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v6, s4
; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v6, s4
@@ -5917,16 +5809,15 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v21
; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v19
; GFX11-NEXT: v_dual_cndmask_b32 v2, v5, v4 :: v_dual_add_nc_u32 v7, 0x80000000, v6
-; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1
-; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v19
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v16, v2 :: v_dual_and_b32 v3, 1, v1
-; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc_lo
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v3, v19, v4
+; GFX11-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v3
+; GFX11-NEXT: v_xor_b32_e32 v2, v2, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v16, v3, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u16_e64 s0, 0, v2
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v3 :: v_dual_cndmask_b32 v3, v19, v4
; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v6, s0
; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v6, s0
; GFX11-NEXT: v_cndmask_b32_e64 v6, v20, v6, s0
@@ -5944,430 +5835,286 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-NEXT: s_subb_u32 s17, s1, s9
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: s_subb_u32 s18, s2, s10
-; GFX6-NEXT: v_mov_b32_e32 v2, s2
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: s_subb_u32 s19, s3, s11
-; GFX6-NEXT: v_mov_b32_e32 v3, s3
-; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[2:3]
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
+; GFX6-NEXT: s_or_b64 s[0:1], vcc, vcc
+; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[0:1]
+; GFX6-NEXT: s_cselect_b32 s20, 1, 0
+; GFX6-NEXT: s_or_b64 s[0:1], vcc, vcc
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[18:19], v[0:1]
+; GFX6-NEXT: s_cselect_b32 s2, 1, 0
+; GFX6-NEXT: s_or_b64 s[0:1], vcc, vcc
+; GFX6-NEXT: s_cselect_b32 s0, 1, 0
+; GFX6-NEXT: s_and_b32 s3, s0, 1
; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[8:9], 0
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[18:19], v[2:3]
-; GFX6-NEXT: v_mov_b32_e32 v3, s17
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX6-NEXT: s_cmp_lg_u32 s3, 0
+; GFX6-NEXT: s_cselect_b32 s2, s20, s2
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0
-; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[10:11], 0
-; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
+; GFX6-NEXT: s_cselect_b32 s3, 1, 0
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], 0
+; GFX6-NEXT: s_cselect_b32 s8, 1, 0
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[0:1]
+; GFX6-NEXT: s_cselect_b32 s0, 1, 0
+; GFX6-NEXT: s_and_b32 s0, s0, 1
+; GFX6-NEXT: s_cmp_lg_u32 s0, 0
+; GFX6-NEXT: s_cselect_b32 s0, s3, s8
+; GFX6-NEXT: s_xor_b32 s8, s0, s2
; GFX6-NEXT: s_ashr_i32 s0, s19, 31
-; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000
-; GFX6-NEXT: v_mov_b32_e32 v1, s0
-; GFX6-NEXT: v_mov_b32_e32 v2, s16
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX6-NEXT: v_mov_b32_e32 v0, s1
-; GFX6-NEXT: v_mov_b32_e32 v2, s18
-; GFX6-NEXT: v_mov_b32_e32 v3, s19
-; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc
-; GFX6-NEXT: s_sub_u32 s0, s4, s12
+; GFX6-NEXT: s_add_i32 s3, s0, 0x80000000
+; GFX6-NEXT: s_and_b32 s8, s8, 1
+; GFX6-NEXT: s_mov_b32 s1, s0
+; GFX6-NEXT: s_mov_b32 s2, s0
+; GFX6-NEXT: s_cmp_lg_u32 s8, 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[16:17]
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[18:19]
+; GFX6-NEXT: s_sub_u32 s8, s4, s12
; GFX6-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NEXT: s_subb_u32 s1, s5, s13
+; GFX6-NEXT: s_subb_u32 s9, s5, s13
; GFX6-NEXT: v_mov_b32_e32 v1, s5
-; GFX6-NEXT: s_subb_u32 s2, s6, s14
-; GFX6-NEXT: v_mov_b32_e32 v2, s6
-; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX6-NEXT: s_subb_u32 s3, s7, s15
-; GFX6-NEXT: v_mov_b32_e32 v3, s7
-; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
+; GFX6-NEXT: s_subb_u32 s10, s6, s14
+; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT: v_mov_b32_e32 v0, s6
+; GFX6-NEXT: s_subb_u32 s11, s7, s15
+; GFX6-NEXT: v_mov_b32_e32 v1, s7
+; GFX6-NEXT: s_or_b64 s[4:5], vcc, vcc
+; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
+; GFX6-NEXT: s_cselect_b32 s16, 1, 0
+; GFX6-NEXT: s_or_b64 s[4:5], vcc, vcc
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1]
+; GFX6-NEXT: s_cselect_b32 s6, 1, 0
+; GFX6-NEXT: s_or_b64 s[4:5], vcc, vcc
+; GFX6-NEXT: s_cselect_b32 s4, 1, 0
+; GFX6-NEXT: s_and_b32 s7, s4, 1
; GFX6-NEXT: v_cmp_gt_u64_e64 s[4:5], s[12:13], 0
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3]
-; GFX6-NEXT: v_mov_b32_e32 v3, s1
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX6-NEXT: s_cmp_lg_u32 s7, 0
+; GFX6-NEXT: s_cselect_b32 s6, s16, s6
+; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[4:5]
; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0
-; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[14:15], 0
-; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX6-NEXT: s_ashr_i32 s4, s3, 31
-; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000
-; GFX6-NEXT: v_mov_b32_e32 v1, s4
-; GFX6-NEXT: v_mov_b32_e32 v2, s0
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
-; GFX6-NEXT: v_mov_b32_e32 v3, s5
-; GFX6-NEXT: v_mov_b32_e32 v8, s2
-; GFX6-NEXT: v_mov_b32_e32 v9, s3
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
-; GFX6-NEXT: v_readfirstlane_b32 s0, v4
-; GFX6-NEXT: v_readfirstlane_b32 s1, v5
-; GFX6-NEXT: v_readfirstlane_b32 s2, v6
-; GFX6-NEXT: v_readfirstlane_b32 s3, v7
-; GFX6-NEXT: v_readfirstlane_b32 s4, v0
-; GFX6-NEXT: v_readfirstlane_b32 s5, v2
-; GFX6-NEXT: v_readfirstlane_b32 s6, v1
-; GFX6-NEXT: v_readfirstlane_b32 s7, v3
+; GFX6-NEXT: s_cselect_b32 s7, 1, 0
+; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[4:5]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[14:15], 0
+; GFX6-NEXT: s_cselect_b32 s12, 1, 0
+; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[4:5]
+; GFX6-NEXT: s_cselect_b32 s4, 1, 0
+; GFX6-NEXT: s_and_b32 s4, s4, 1
+; GFX6-NEXT: s_cmp_lg_u32 s4, 0
+; GFX6-NEXT: s_cselect_b32 s4, s7, s12
+; GFX6-NEXT: s_xor_b32 s12, s4, s6
+; GFX6-NEXT: s_ashr_i32 s4, s11, 31
+; GFX6-NEXT: s_add_i32 s7, s4, 0x80000000
+; GFX6-NEXT: s_and_b32 s12, s12, 1
+; GFX6-NEXT: s_mov_b32 s5, s4
+; GFX6-NEXT: s_mov_b32 s6, s4
+; GFX6-NEXT: s_cmp_lg_u32 s12, 0
+; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9]
+; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_ssubsat_v2i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sub_u32 s16, s0, s8
-; GFX8-NEXT: s_subb_u32 s17, s1, s9
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_subb_u32 s18, s2, s10
+; GFX8-NEXT: s_subb_u32 s17, s1, s9
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: s_subb_u32 s19, s3, s11
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_subb_u32 s18, s2, s10
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_subb_u32 s19, s3, s11
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[0:1]
+; GFX8-NEXT: s_cselect_b32 s20, 1, 0
+; GFX8-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX8-NEXT: v_cmp_gt_u64_e64 s[0:1], s[8:9], 0
+; GFX8-NEXT: s_cselect_b32 s21, 1, 0
; GFX8-NEXT: s_cmp_eq_u64 s[18:19], s[2:3]
+; GFX8-NEXT: s_cselect_b32 s2, s20, s21
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0
+; GFX8-NEXT: s_cselect_b32 s3, 1, 0
+; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cselect_b32 s0, 1, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[2:3]
-; GFX8-NEXT: s_and_b32 s0, 1, s0
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT: v_cmp_gt_u64_e64 s[0:1], s[8:9], 0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT: s_cmp_eq_u64 s[10:11], 0
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0
-; GFX8-NEXT: s_cselect_b32 s2, 1, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX8-NEXT: s_and_b32 s0, 1, s2
-; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
+; GFX8-NEXT: s_cselect_b32 s0, s3, s0
+; GFX8-NEXT: s_xor_b32 s8, s0, s2
; GFX8-NEXT: s_ashr_i32 s0, s19, 31
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: v_mov_b32_e32 v2, s16
-; GFX8-NEXT: v_mov_b32_e32 v3, s17
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s18
-; GFX8-NEXT: v_mov_b32_e32 v3, s19
-; GFX8-NEXT: s_sub_u32 s0, s4, s12
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc
-; GFX8-NEXT: s_subb_u32 s1, s5, s13
+; GFX8-NEXT: s_add_i32 s3, s0, 0x80000000
+; GFX8-NEXT: s_and_b32 s8, s8, 1
+; GFX8-NEXT: s_mov_b32 s1, s0
+; GFX8-NEXT: s_mov_b32 s2, s0
+; GFX8-NEXT: s_cmp_lg_u32 s8, 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[16:17]
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[18:19]
+; GFX8-NEXT: s_sub_u32 s8, s4, s12
; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: s_subb_u32 s2, s6, s14
+; GFX8-NEXT: s_subb_u32 s9, s5, s13
; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_subb_u32 s3, s7, s15
-; GFX8-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
-; GFX8-NEXT: s_cselect_b32 s4, 1, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GFX8-NEXT: s_and_b32 s4, 1, s4
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT: s_subb_u32 s10, s6, s14
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: s_subb_u32 s11, s7, s15
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
+; GFX8-NEXT: s_cselect_b32 s16, 1, 0
+; GFX8-NEXT: s_cmp_lg_u64 vcc, 0
; GFX8-NEXT: v_cmp_gt_u64_e64 s[4:5], s[12:13], 0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX8-NEXT: s_cmp_eq_u64 s[14:15], 0
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX8-NEXT: s_cselect_b32 s17, 1, 0
+; GFX8-NEXT: s_cmp_eq_u64 s[10:11], s[6:7]
+; GFX8-NEXT: s_cselect_b32 s6, s16, s17
+; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0
-; GFX8-NEXT: s_cselect_b32 s6, 1, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
-; GFX8-NEXT: s_and_b32 s4, 1, s6
-; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX8-NEXT: s_ashr_i32 s4, s3, 31
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000
-; GFX8-NEXT: v_mov_b32_e32 v1, s4
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: v_mov_b32_e32 v8, s2
-; GFX8-NEXT: v_mov_b32_e32 v9, s3
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
-; GFX8-NEXT: v_readfirstlane_b32 s0, v4
-; GFX8-NEXT: v_readfirstlane_b32 s1, v5
-; GFX8-NEXT: v_readfirstlane_b32 s2, v6
-; GFX8-NEXT: v_readfirstlane_b32 s3, v7
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_readfirstlane_b32 s5, v2
-; GFX8-NEXT: v_readfirstlane_b32 s6, v1
-; GFX8-NEXT: v_readfirstlane_b32 s7, v3
+; GFX8-NEXT: s_cselect_b32 s7, 1, 0
+; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8-NEXT: s_cselect_b32 s4, 1, 0
+; GFX8-NEXT: s_cmp_eq_u64 s[14:15], 0
+; GFX8-NEXT: s_cselect_b32 s4, s7, s4
+; GFX8-NEXT: s_xor_b32 s12, s4, s6
+; GFX8-NEXT: s_ashr_i32 s4, s11, 31
+; GFX8-NEXT: s_add_i32 s7, s4, 0x80000000
+; GFX8-NEXT: s_and_b32 s12, s12, 1
+; GFX8-NEXT: s_mov_b32 s5, s4
+; GFX8-NEXT: s_mov_b32 s6, s4
+; GFX8-NEXT: s_cmp_lg_u32 s12, 0
+; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9]
+; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_ssubsat_v2i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_sub_u32 s16, s0, s8
-; GFX9-NEXT: s_subb_u32 s17, s1, s9
; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: s_subb_u32 s18, s2, s10
+; GFX9-NEXT: s_subb_u32 s17, s1, s9
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: s_subb_u32 s19, s3, s11
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: s_subb_u32 s18, s2, s10
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: s_subb_u32 s19, s3, s11
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[0:1]
+; GFX9-NEXT: s_cselect_b32 s20, 1, 0
+; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT: v_cmp_gt_u64_e64 s[0:1], s[8:9], 0
+; GFX9-NEXT: s_cselect_b32 s21, 1, 0
; GFX9-NEXT: s_cmp_eq_u64 s[18:19], s[2:3]
+; GFX9-NEXT: s_cselect_b32 s2, s20, s21
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0
+; GFX9-NEXT: s_cselect_b32 s3, 1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cselect_b32 s0, 1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[2:3]
-; GFX9-NEXT: s_and_b32 s0, 1, s0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT: v_cmp_gt_u64_e64 s[0:1], s[8:9], 0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX9-NEXT: s_cmp_eq_u64 s[10:11], 0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0
-; GFX9-NEXT: s_cselect_b32 s2, 1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX9-NEXT: s_and_b32 s0, 1, s2
-; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
+; GFX9-NEXT: s_cselect_b32 s0, s3, s0
+; GFX9-NEXT: s_xor_b32 s8, s0, s2
; GFX9-NEXT: s_ashr_i32 s0, s19, 31
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s16
-; GFX9-NEXT: v_mov_b32_e32 v3, s17
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: v_mov_b32_e32 v2, s18
-; GFX9-NEXT: v_mov_b32_e32 v3, s19
-; GFX9-NEXT: s_sub_u32 s0, s4, s12
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc
-; GFX9-NEXT: s_subb_u32 s1, s5, s13
+; GFX9-NEXT: s_add_i32 s3, s0, 0x80000000
+; GFX9-NEXT: s_and_b32 s8, s8, 1
+; GFX9-NEXT: s_mov_b32 s1, s0
+; GFX9-NEXT: s_mov_b32 s2, s0
+; GFX9-NEXT: s_cmp_lg_u32 s8, 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[16:17]
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[18:19]
+; GFX9-NEXT: s_sub_u32 s8, s4, s12
; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: s_subb_u32 s2, s6, s14
+; GFX9-NEXT: s_subb_u32 s9, s5, s13
; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_subb_u32 s3, s7, s15
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
-; GFX9-NEXT: s_cselect_b32 s4, 1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
-; GFX9-NEXT: s_and_b32 s4, 1, s4
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT: s_subb_u32 s10, s6, s14
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: s_subb_u32 s11, s7, s15
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
+; GFX9-NEXT: s_cselect_b32 s16, 1, 0
+; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], s[12:13], 0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX9-NEXT: s_cmp_eq_u64 s[14:15], 0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX9-NEXT: s_cselect_b32 s17, 1, 0
+; GFX9-NEXT: s_cmp_eq_u64 s[10:11], s[6:7]
+; GFX9-NEXT: s_cselect_b32 s6, s16, s17
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0
-; GFX9-NEXT: s_cselect_b32 s6, 1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
-; GFX9-NEXT: s_and_b32 s4, 1, s6
-; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX9-NEXT: s_ashr_i32 s4, s3, 31
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: v_mov_b32_e32 v8, s2
-; GFX9-NEXT: v_mov_b32_e32 v9, s3
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
-; GFX9-NEXT: v_readfirstlane_b32 s0, v4
-; GFX9-NEXT: v_readfirstlane_b32 s1, v5
-; GFX9-NEXT: v_readfirstlane_b32 s2, v6
-; GFX9-NEXT: v_readfirstlane_b32 s3, v7
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v2
-; GFX9-NEXT: v_readfirstlane_b32 s6, v1
-; GFX9-NEXT: v_readfirstlane_b32 s7, v3
+; GFX9-NEXT: s_cselect_b32 s7, 1, 0
+; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_cselect_b32 s4, 1, 0
+; GFX9-NEXT: s_cmp_eq_u64 s[14:15], 0
+; GFX9-NEXT: s_cselect_b32 s4, s7, s4
+; GFX9-NEXT: s_xor_b32 s12, s4, s6
+; GFX9-NEXT: s_ashr_i32 s4, s11, 31
+; GFX9-NEXT: s_add_i32 s7, s4, 0x80000000
+; GFX9-NEXT: s_and_b32 s12, s12, 1
+; GFX9-NEXT: s_mov_b32 s5, s4
+; GFX9-NEXT: s_mov_b32 s6, s4
+; GFX9-NEXT: s_cmp_lg_u32 s12, 0
+; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9]
+; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11]
; GFX9-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_ssubsat_v2i128:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_sub_u32 s18, s0, s8
-; GFX10-NEXT: s_subb_u32 s19, s1, s9
-; GFX10-NEXT: s_subb_u32 s16, s2, s10
-; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[18:19], s[0:1]
-; GFX10-NEXT: s_subb_u32 s17, s3, s11
-; GFX10-NEXT: s_cmp_eq_u64 s[16:17], s[2:3]
-; GFX10-NEXT: s_cselect_b32 s20, 1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3]
-; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10-NEXT: s_and_b32 s0, 1, s20
-; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT: s_cselect_b32 s1, 1, 0
-; GFX10-NEXT: s_ashr_i32 s8, s17, 31
-; GFX10-NEXT: s_and_b32 s1, 1, s1
-; GFX10-NEXT: s_add_i32 s9, s8, 0x80000000
-; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
-; GFX10-NEXT: s_sub_u32 s0, s4, s12
-; GFX10-NEXT: s_subb_u32 s1, s5, s13
-; GFX10-NEXT: s_subb_u32 s2, s6, s14
-; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
-; GFX10-NEXT: s_subb_u32 s3, s7, s15
-; GFX10-NEXT: v_mov_b32_e32 v5, s0
-; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
-; GFX10-NEXT: v_mov_b32_e32 v6, s2
-; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4
-; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[2:3], s[6:7]
-; GFX10-NEXT: v_cmp_gt_u64_e64 s6, s[12:13], 0
-; GFX10-NEXT: s_cselect_b32 s10, 1, 0
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_mov_b32_e32 v7, s3
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
-; GFX10-NEXT: s_and_b32 s4, 1, s10
-; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
-; GFX10-NEXT: v_cmp_gt_i64_e64 s6, s[14:15], 0
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
-; GFX10-NEXT: s_cselect_b32 s5, 1, 0
-; GFX10-NEXT: s_ashr_i32 s4, s3, 31
-; GFX10-NEXT: s_and_b32 s5, 1, s5
-; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000
-; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc_lo
-; GFX10-NEXT: v_mov_b32_e32 v3, s18
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, s19
-; GFX10-NEXT: v_mov_b32_e32 v4, s17
-; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v2, s16
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s8, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s8, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s9, vcc_lo
-; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s8, vcc_lo
-; GFX10-NEXT: v_readfirstlane_b32 s3, v4
-; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: v_readfirstlane_b32 s1, v0
-; GFX10-NEXT: v_readfirstlane_b32 s2, v2
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc_lo
-; GFX10-NEXT: v_readfirstlane_b32 s0, v3
-; GFX10-NEXT: v_readfirstlane_b32 s4, v5
-; GFX10-NEXT: v_readfirstlane_b32 s5, v1
-; GFX10-NEXT: v_readfirstlane_b32 s6, v6
-; GFX10-NEXT: v_readfirstlane_b32 s7, v7
-; GFX10-NEXT: ; return to shader part epilog
-;
-; GFX11-LABEL: s_ssubsat_v2i128:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_sub_u32 s18, s0, s8
-; GFX11-NEXT: s_subb_u32 s19, s1, s9
-; GFX11-NEXT: s_subb_u32 s16, s2, s10
-; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[18:19], s[0:1]
-; GFX11-NEXT: s_subb_u32 s17, s3, s11
-; GFX11-NEXT: s_cmp_eq_u64 s[16:17], s[2:3]
-; GFX11-NEXT: s_cselect_b32 s20, 1, 0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3]
-; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX11-NEXT: s_and_b32 s0, 1, s20
-; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX11-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0
-; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX11-NEXT: s_cselect_b32 s1, 1, 0
-; GFX11-NEXT: s_ashr_i32 s8, s17, 31
-; GFX11-NEXT: s_and_b32 s1, 1, s1
-; GFX11-NEXT: s_add_i32 s9, s8, 0x80000000
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
-; GFX11-NEXT: s_sub_u32 s0, s4, s12
-; GFX11-NEXT: s_subb_u32 s1, s5, s13
-; GFX11-NEXT: s_subb_u32 s2, s6, s14
-; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
-; GFX11-NEXT: s_subb_u32 s3, s7, s15
-; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
-; GFX11-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
-; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4
-; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[2:3], s[6:7]
-; GFX11-NEXT: v_cmp_gt_u64_e64 s6, s[12:13], 0
-; GFX11-NEXT: s_cselect_b32 s10, 1, 0
-; GFX11-NEXT: v_mov_b32_e32 v5, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
-; GFX11-NEXT: s_and_b32 s4, 1, s10
-; GFX11-NEXT: s_cmp_eq_u64 s[14:15], 0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
-; GFX11-NEXT: v_cmp_gt_i64_e64 s6, s[14:15], 0
-; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
-; GFX11-NEXT: s_cselect_b32 s5, 1, 0
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_and_b32 s5, 1, s5
-; GFX11-NEXT: s_ashr_i32 s4, s3, 31
-; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
-; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5
-; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_mov_b32 v3, s18
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: v_mov_b32_e32 v4, s17
-; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v0, s19
-; GFX11-NEXT: v_mov_b32_e32 v2, s16
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s8, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s9, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s8, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s8, vcc_lo
-; GFX11-NEXT: v_readfirstlane_b32 s3, v4
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-NEXT: v_readfirstlane_b32 s1, v0
-; GFX11-NEXT: v_readfirstlane_b32 s2, v2
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s4, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s4, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc_lo
-; GFX11-NEXT: v_readfirstlane_b32 s0, v3
-; GFX11-NEXT: v_readfirstlane_b32 s4, v5
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v6
-; GFX11-NEXT: v_readfirstlane_b32 s7, v7
-; GFX11-NEXT: ; return to shader part epilog
+; GFX10PLUS-LABEL: s_ssubsat_v2i128:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_sub_u32 s16, s0, s8
+; GFX10PLUS-NEXT: s_subb_u32 s17, s1, s9
+; GFX10PLUS-NEXT: s_subb_u32 s18, s2, s10
+; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1]
+; GFX10PLUS-NEXT: s_subb_u32 s19, s3, s11
+; GFX10PLUS-NEXT: v_cmp_gt_u64_e64 s8, s[8:9], 0
+; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s1, s[18:19], s[2:3]
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s0, 1, 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s1, 1, 0
+; GFX10PLUS-NEXT: s_cmp_eq_u64 s[18:19], s[2:3]
+; GFX10PLUS-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0
+; GFX10PLUS-NEXT: s_cselect_b32 s1, s0, s1
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s8, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s0, 1, 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s2, 1, 0
+; GFX10PLUS-NEXT: s_cmp_eq_u64 s[10:11], 0
+; GFX10PLUS-NEXT: s_cselect_b32 s2, s0, s2
+; GFX10PLUS-NEXT: s_ashr_i32 s0, s19, 31
+; GFX10PLUS-NEXT: s_xor_b32 s2, s2, s1
+; GFX10PLUS-NEXT: s_add_i32 s3, s0, 0x80000000
+; GFX10PLUS-NEXT: s_and_b32 s8, s2, 1
+; GFX10PLUS-NEXT: s_mov_b32 s1, s0
+; GFX10PLUS-NEXT: s_mov_b32 s2, s0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s8, 0
+; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[16:17]
+; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[2:3], s[18:19]
+; GFX10PLUS-NEXT: s_sub_u32 s8, s4, s12
+; GFX10PLUS-NEXT: s_subb_u32 s9, s5, s13
+; GFX10PLUS-NEXT: s_subb_u32 s10, s6, s14
+; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s4, s[8:9], s[4:5]
+; GFX10PLUS-NEXT: s_subb_u32 s11, s7, s15
+; GFX10PLUS-NEXT: v_cmp_gt_u64_e64 s12, s[12:13], 0
+; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s5, s[10:11], s[6:7]
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s4, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s5, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s5, 1, 0
+; GFX10PLUS-NEXT: s_cmp_eq_u64 s[10:11], s[6:7]
+; GFX10PLUS-NEXT: v_cmp_gt_i64_e64 s6, s[14:15], 0
+; GFX10PLUS-NEXT: s_cselect_b32 s5, s4, s5
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s4, 1, 0
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s6, 0
+; GFX10PLUS-NEXT: s_cselect_b32 s6, 1, 0
+; GFX10PLUS-NEXT: s_cmp_eq_u64 s[14:15], 0
+; GFX10PLUS-NEXT: s_cselect_b32 s6, s4, s6
+; GFX10PLUS-NEXT: s_ashr_i32 s4, s11, 31
+; GFX10PLUS-NEXT: s_xor_b32 s6, s6, s5
+; GFX10PLUS-NEXT: s_add_i32 s7, s4, 0x80000000
+; GFX10PLUS-NEXT: s_and_b32 s12, s6, 1
+; GFX10PLUS-NEXT: s_mov_b32 s5, s4
+; GFX10PLUS-NEXT: s_mov_b32 s6, s4
+; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0
+; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9]
+; GFX10PLUS-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11]
+; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
ret <2 x i128> %result
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index d9158e3558395..2c9519fa9d8a5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) {
; GFX6-LABEL: v_uaddsat_i7:
@@ -83,8 +83,9 @@ define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX8-NEXT: s_lshl_b32 s0, s0, 9
; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
-; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshr_b32 s0, s0, 9
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_uaddsat_i7:
@@ -93,8 +94,9 @@ define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX9-NEXT: s_lshl_b32 s0, s0, 9
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp
-; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT: s_lshr_b32 s0, s0, 9
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_uaddsat_i7:
@@ -102,8 +104,9 @@ define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX10-NEXT: s_lshl_b32 s0, s0, 9
; GFX10-NEXT: s_lshl_b32 s1, s1, 9
; GFX10-NEXT: v_add_nc_u16 v0, s0, s1 clamp
-; GFX10-NEXT: v_lshrrev_b16 v0, 9, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX10-NEXT: s_lshr_b32 s0, s0, 9
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: s_uaddsat_i7:
@@ -111,8 +114,9 @@ define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 9
; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 9
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, s0, s1 clamp
-; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 9, v0.l
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 9
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: s_uaddsat_i7:
@@ -120,8 +124,9 @@ define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 9
; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 9
; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, s0, s1 clamp
-; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 9, v0
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 9
; GFX11-FAKE16-NEXT: ; return to shader part epilog
%result = call i7 @llvm.uadd.sat.i7(i7 %lhs, i7 %rhs)
ret i7 %result
@@ -204,8 +209,9 @@ define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
-; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshr_b32 s0, s0, 8
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_uaddsat_i8:
@@ -214,8 +220,9 @@ define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX9-NEXT: s_lshl_b32 s0, s0, 8
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp
-; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT: s_lshr_b32 s0, s0, 8
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_uaddsat_i8:
@@ -223,8 +230,9 @@ define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX10-NEXT: s_lshl_b32 s0, s0, 8
; GFX10-NEXT: s_lshl_b32 s1, s1, 8
; GFX10-NEXT: v_add_nc_u16 v0, s0, s1 clamp
-; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX10-NEXT: s_lshr_b32 s0, s0, 8
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: s_uaddsat_i8:
@@ -232,8 +240,9 @@ define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 8
; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, s0, s1 clamp
-; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 8, v0.l
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 8
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: s_uaddsat_i8:
@@ -241,8 +250,9 @@ define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 8
; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
; GFX11-FAKE16-NEXT: v_add_nc_u16 v0, s0, s1 clamp
-; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 8, v0
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 8
; GFX11-FAKE16-NEXT: ; return to shader part epilog
%result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs)
ret i8 %result
@@ -388,15 +398,19 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX8-NEXT: s_lshr_b32 s2, s0, 8
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
; GFX8-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NEXT: s_lshl_b32 s1, s3, 8
; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
-; GFX8-NEXT: s_lshl_b32 s0, s2, 8
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u16_e64 v1, s0, v1 clamp
-; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1
-; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: s_lshl_b32 s1, s2, 8
+; GFX8-NEXT: s_lshl_b32 s2, s3, 8
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_add_u16_e64 v0, s1, v0 clamp
+; GFX8-NEXT: v_readfirstlane_b32 s1, v0
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshr_b32 s1, s1, 8
+; GFX8-NEXT: s_lshr_b32 s0, s0, 8
+; GFX8-NEXT: s_lshl_b32 s1, s1, 8
+; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_uaddsat_v2i8:
@@ -415,11 +429,17 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp
-; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_and_b32 s1, s0, 0xffff
+; GFX9-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-NEXT: s_lshr_b32 s1, s1, 0x80008
+; GFX9-NEXT: s_lshr_b32 s0, s0, 8
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s0
+; GFX9-NEXT: s_lshr_b32 s1, s0, 16
+; GFX9-NEXT: s_and_b32 s1, s1, 0xff
+; GFX9-NEXT: s_and_b32 s0, s0, 0xff
+; GFX9-NEXT: s_lshl_b32 s1, s1, 8
+; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_uaddsat_v2i8:
@@ -436,12 +456,18 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX10-NEXT: s_lshl_b32 s3, s3, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
-; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp
-; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: s_and_b32 s1, s0, 0xffff
+; GFX10-NEXT: s_lshr_b32 s0, s0, 16
+; GFX10-NEXT: s_lshr_b32 s1, s1, 0x80008
+; GFX10-NEXT: s_lshr_b32 s0, s0, 8
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s1, s0
+; GFX10-NEXT: s_lshr_b32 s1, s0, 16
+; GFX10-NEXT: s_and_b32 s0, s0, 0xff
+; GFX10-NEXT: s_and_b32 s1, s1, 0xff
+; GFX10-NEXT: s_lshl_b32 s1, s1, 8
+; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: s_uaddsat_v2i8:
@@ -459,10 +485,14 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s0, s1 clamp
-; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff
+; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 0x80008
+; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s0, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: s_uaddsat_v2i8:
@@ -480,13 +510,17 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3
; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s0, s1 clamp
-; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s0, 0xffff
+; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 0x80008
+; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s1, s0
+; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: ; return to shader part epilog
%lhs = bitcast i16 %lhs.arg to <2 x i8>
%rhs = bitcast i16 %rhs.arg to <2 x i8>
@@ -726,18 +760,18 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX6-NEXT: s_lshl_b32 s3, s4, 24
; GFX6-NEXT: s_lshl_b32 s4, s7, 24
; GFX6-NEXT: s_not_b32 s5, s3
-; GFX6-NEXT: s_min_u32 s4, s5, s4
; GFX6-NEXT: s_lshr_b32 s1, s1, 24
+; GFX6-NEXT: s_min_u32 s4, s5, s4
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_lshr_b32 s2, s2, 24
; GFX6-NEXT: s_add_i32 s3, s3, s4
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: s_lshr_b32 s3, s3, 24
; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24
-; GFX6-NEXT: s_lshl_b32 s0, s2, 16
-; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX6-NEXT: s_lshl_b32 s0, s3, 24
-; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX6-NEXT: s_lshr_b32 s3, s3, 24
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: s_lshl_b32 s1, s2, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_lshl_b32 s1, s3, 24
+; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_uaddsat_v4i8:
@@ -751,29 +785,37 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX8-NEXT: s_lshr_b32 s4, s0, 24
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
; GFX8-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NEXT: s_lshl_b32 s1, s5, 8
; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
-; GFX8-NEXT: s_lshl_b32 s0, s2, 8
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u16_e64 v1, s0, v1 clamp
-; GFX8-NEXT: s_lshl_b32 s1, s6, 8
-; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
-; GFX8-NEXT: s_lshl_b32 s0, s3, 8
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: s_lshl_b32 s1, s7, 8
-; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_add_u16_e64 v2, s0, v2 clamp
-; GFX8-NEXT: s_lshl_b32 s0, s4, 8
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX8-NEXT: v_add_u16_e64 v3, s0, v3 clamp
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_lshl_b32 s1, s2, 8
+; GFX8-NEXT: s_lshl_b32 s2, s5, 8
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_add_u16_e64 v0, s1, v0 clamp
+; GFX8-NEXT: s_lshl_b32 s2, s3, 8
+; GFX8-NEXT: s_lshl_b32 s3, s6, 8
+; GFX8-NEXT: v_readfirstlane_b32 s1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_add_u16_e64 v0, s2, v0 clamp
+; GFX8-NEXT: s_lshl_b32 s3, s4, 8
+; GFX8-NEXT: s_lshl_b32 s4, s7, 8
+; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: v_add_u16_e64 v0, s3, v0 clamp
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshr_b32 s1, s1, 8
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: v_readfirstlane_b32 s3, v0
+; GFX8-NEXT: s_lshr_b32 s0, s0, 8
+; GFX8-NEXT: s_lshr_b32 s2, s2, 8
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT: s_lshl_b32 s1, s1, 8
+; GFX8-NEXT: s_lshr_b32 s3, s3, 8
+; GFX8-NEXT: s_or_b32 s0, s0, s1
+; GFX8-NEXT: s_lshl_b32 s1, s2, 16
+; GFX8-NEXT: s_or_b32 s0, s0, s1
+; GFX8-NEXT: s_lshl_b32 s1, s3, 24
+; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_uaddsat_v4i8:
@@ -803,66 +845,87 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX9-NEXT: s_lshr_b32 s4, s3, 16
; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008
; GFX9-NEXT: s_lshl_b32 s4, s4, 8
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_pk_add_u16 v1, s2, v1 clamp
-; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: v_mov_b32_e32 v3, 8
-; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT: v_mov_b32_e32 v2, 0xff
-; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 24
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_pk_add_u16 v0, s2, v0 clamp
+; GFX9-NEXT: s_and_b32 s2, s0, 0xffff
+; GFX9-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-NEXT: s_lshr_b32 s2, s2, 0x80008
+; GFX9-NEXT: s_lshr_b32 s0, s0, 8
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s0
+; GFX9-NEXT: s_and_b32 s2, s1, 0xffff
+; GFX9-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-NEXT: s_lshr_b32 s2, s2, 0x80008
+; GFX9-NEXT: s_lshr_b32 s1, s1, 8
+; GFX9-NEXT: s_pack_ll_b32_b16 s1, s2, s1
+; GFX9-NEXT: s_and_b32 s2, s0, 0xff
+; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX9-NEXT: s_lshl_b32 s0, s0, 8
+; GFX9-NEXT: s_or_b32 s0, s2, s0
+; GFX9-NEXT: s_and_b32 s2, s1, 0xff
+; GFX9-NEXT: s_lshl_b32 s2, s2, 16
+; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: s_lshl_b32 s1, s1, 24
+; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_uaddsat_v4i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshr_b32 s2, s0, 8
+; GFX10-NEXT: s_lshr_b32 s5, s1, 8
; GFX10-NEXT: s_lshr_b32 s3, s0, 16
; GFX10-NEXT: s_lshr_b32 s4, s0, 24
-; GFX10-NEXT: s_lshr_b32 s5, s1, 8
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
; GFX10-NEXT: s_lshr_b32 s7, s1, 24
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
-; GFX10-NEXT: s_lshr_b32 s4, s0, 16
-; GFX10-NEXT: s_lshr_b32 s5, s2, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7
+; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
+; GFX10-NEXT: s_lshr_b32 s3, s0, 16
+; GFX10-NEXT: s_lshr_b32 s5, s1, 16
; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008
-; GFX10-NEXT: s_lshl_b32 s4, s4, 8
-; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008
-; GFX10-NEXT: s_lshl_b32 s5, s5, 8
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s5
-; GFX10-NEXT: s_lshr_b32 s4, s1, 16
-; GFX10-NEXT: s_lshr_b32 s5, s3, 16
+; GFX10-NEXT: s_lshl_b32 s3, s3, 8
+; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s7
; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX10-NEXT: s_lshl_b32 s4, s4, 8
-; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008
; GFX10-NEXT: s_lshl_b32 s5, s5, 8
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3
+; GFX10-NEXT: s_lshr_b32 s3, s2, 16
+; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
+; GFX10-NEXT: s_lshr_b32 s5, s4, 16
+; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008
+; GFX10-NEXT: s_lshl_b32 s3, s3, 8
+; GFX10-NEXT: s_lshl_b32 s4, s4, 0x80008
+; GFX10-NEXT: s_lshl_b32 s5, s5, 8
; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp
-; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 clamp
-; GFX10-NEXT: v_mov_b32_e32 v2, 8
-; GFX10-NEXT: v_mov_b32_e32 v4, 24
-; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s2, s3
+; GFX10-NEXT: s_pack_ll_b32_b16 s1, s4, s5
+; GFX10-NEXT: v_pk_add_u16 v1, s0, s1 clamp
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: s_and_b32 s2, s0, 0xffff
+; GFX10-NEXT: s_lshr_b32 s0, s0, 16
+; GFX10-NEXT: s_lshr_b32 s2, s2, 0x80008
+; GFX10-NEXT: s_lshr_b32 s0, s0, 8
+; GFX10-NEXT: s_and_b32 s3, s1, 0xffff
+; GFX10-NEXT: s_lshr_b32 s1, s1, 16
+; GFX10-NEXT: s_lshr_b32 s3, s3, 0x80008
+; GFX10-NEXT: s_lshr_b32 s1, s1, 8
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s2, s0
+; GFX10-NEXT: s_pack_ll_b32_b16 s1, s3, s1
+; GFX10-NEXT: s_bfe_u32 s2, s0, 0x80010
+; GFX10-NEXT: s_and_b32 s0, s0, 0xff
+; GFX10-NEXT: s_lshl_b32 s2, s2, 8
+; GFX10-NEXT: s_and_b32 s3, s1, 0xff
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: s_lshl_b32 s2, s3, 16
+; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: s_lshl_b32 s1, s1, 24
+; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: s_uaddsat_v4i8:
@@ -882,30 +945,37 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 0x80008
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s2, s3 clamp
; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 0x80008
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8
; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s5, 8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, s2, s3 clamp
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s2
-; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, s0, s1 clamp
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
-; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xff, v0, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 24, v3
-; GFX11-TRUE16-NEXT: v_or3_b32 v0, v0, v1, v2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s0, 0xffff
+; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 0x80008
+; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff
+; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 0x80008
+; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s2, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s0, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s1, 16
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 24
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: s_uaddsat_v4i8:
@@ -925,28 +995,39 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 0x80008
; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 8
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s4
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s2, s3 clamp
; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 0x80008
; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 8
; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s5, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 8
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, s2, s3 clamp
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s2
-; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX11-FAKE16-NEXT: v_pk_add_u16 v1, s0, s1 clamp
-; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 8
-; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v1
-; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v1, 16, 8
-; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xff, v0, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX11-FAKE16-NEXT: v_or3_b32 v0, v0, v2, v1
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s0, 0xffff
+; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 0x80008
+; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s1, 0xffff
+; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 0x80008
+; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s2, s0
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s3, s1
+; GFX11-FAKE16-NEXT: s_bfe_u32 s2, s0, 0x80010
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s1, 0xff
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s2
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s3, 16
+; GFX11-FAKE16-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s2
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 24
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: ; return to shader part epilog
%lhs = bitcast i32 %lhs.arg to <4 x i8>
%rhs = bitcast i32 %rhs.arg to <4 x i8>
@@ -1014,8 +1095,8 @@ define amdgpu_ps i24 @s_uaddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 clamp
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_lshr_b32 s0, s0, 8
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_uaddsat_i24:
@@ -1024,8 +1105,8 @@ define amdgpu_ps i24 @s_uaddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX9-NEXT: s_lshl_b32 s0, s0, 8
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_lshr_b32 s0, s0, 8
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_uaddsat_i24:
@@ -1033,8 +1114,8 @@ define amdgpu_ps i24 @s_uaddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
; GFX10PLUS-NEXT: v_add_nc_u32_e64 v0, s0, s1 clamp
-; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 8
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i24 @llvm.uadd.sat.i24(i24 %lhs, i24 %rhs)
ret i24 %result
@@ -2149,15 +2230,18 @@ define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
;
; GFX8-LABEL: s_uaddsat_v2i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s2, s0, 16
-; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
-; GFX8-NEXT: v_add_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_lshr_b32 s2, s0, 16
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_add_u16_e64 v0, s2, v0 clamp
+; GFX8-NEXT: v_readfirstlane_b32 s1, v0
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshl_b32 s1, s1, 16
+; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_uaddsat_v2i16:
@@ -2362,24 +2446,30 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
;
; GFX8-LABEL: s_uaddsat_v4i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s4, s0, 16
-; GFX8-NEXT: s_lshr_b32 s6, s2, 16
-; GFX8-NEXT: s_lshr_b32 s5, s1, 16
-; GFX8-NEXT: s_lshr_b32 s7, s3, 16
-; GFX8-NEXT: v_mov_b32_e32 v1, s6
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_add_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: s_lshr_b32 s6, s2, 16
; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
-; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp
-; GFX8-NEXT: v_add_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_or_b32_e32 v1, v2, v3
+; GFX8-NEXT: s_lshr_b32 s4, s0, 16
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_add_u16_e64 v0, s4, v0 clamp
+; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: s_lshr_b32 s7, s3, 16
+; GFX8-NEXT: v_add_u16_e64 v0, s1, v0 clamp
+; GFX8-NEXT: s_lshr_b32 s5, s1, 16
+; GFX8-NEXT: v_readfirstlane_b32 s1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s7
+; GFX8-NEXT: v_add_u16_e64 v0, s5, v0 clamp
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: v_readfirstlane_b32 s3, v0
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshl_b32 s2, s2, 16
+; GFX8-NEXT: s_or_b32 s0, s0, s2
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s3
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_lshl_b32 s2, s2, 16
+; GFX8-NEXT: s_or_b32 s1, s1, s2
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_uaddsat_v4i16:
@@ -2540,33 +2630,42 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
;
; GFX8-LABEL: s_uaddsat_v6i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s6, s0, 16
-; GFX8-NEXT: s_lshr_b32 s7, s1, 16
-; GFX8-NEXT: s_lshr_b32 s9, s3, 16
-; GFX8-NEXT: s_lshr_b32 s10, s4, 16
-; GFX8-NEXT: s_lshr_b32 s8, s2, 16
-; GFX8-NEXT: s_lshr_b32 s11, s5, 16
-; GFX8-NEXT: v_mov_b32_e32 v1, s9
-; GFX8-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, s10
-; GFX8-NEXT: v_mov_b32_e32 v4, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_add_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_add_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mov_b32_e32 v4, s5
-; GFX8-NEXT: v_mov_b32_e32 v5, s11
-; GFX8-NEXT: v_mov_b32_e32 v6, s8
+; GFX8-NEXT: s_lshr_b32 s9, s3, 16
; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
-; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp
-; GFX8-NEXT: v_add_u16_e64 v4, s2, v4 clamp
-; GFX8-NEXT: v_add_u16_sdwa v5, v6, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_or_b32_e32 v1, v2, v3
-; GFX8-NEXT: v_or_b32_e32 v2, v4, v5
+; GFX8-NEXT: s_lshr_b32 s6, s0, 16
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: v_readfirstlane_b32 s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, s9
+; GFX8-NEXT: v_add_u16_e64 v0, s6, v0 clamp
+; GFX8-NEXT: v_readfirstlane_b32 s3, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_lshr_b32 s10, s4, 16
+; GFX8-NEXT: v_add_u16_e64 v0, s1, v0 clamp
+; GFX8-NEXT: s_lshr_b32 s7, s1, 16
+; GFX8-NEXT: v_readfirstlane_b32 s1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_add_u16_e64 v0, s7, v0 clamp
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s5
+; GFX8-NEXT: s_lshr_b32 s11, s5, 16
+; GFX8-NEXT: v_add_u16_e64 v0, s2, v0 clamp
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT: s_lshr_b32 s8, s2, 16
+; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s11
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshl_b32 s3, s3, 16
+; GFX8-NEXT: v_add_u16_e64 v0, s8, v0 clamp
+; GFX8-NEXT: s_or_b32 s0, s0, s3
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s4
+; GFX8-NEXT: v_readfirstlane_b32 s5, v0
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_lshl_b32 s3, s3, 16
+; GFX8-NEXT: s_or_b32 s1, s1, s3
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s5
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_lshl_b32 s3, s3, 16
+; GFX8-NEXT: s_or_b32 s2, s2, s3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_uaddsat_v6i16:
@@ -2752,42 +2851,54 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
;
; GFX8-LABEL: s_uaddsat_v8i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s8, s0, 16
-; GFX8-NEXT: s_lshr_b32 s9, s1, 16
-; GFX8-NEXT: s_lshr_b32 s10, s2, 16
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_lshr_b32 s12, s4, 16
+; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
+; GFX8-NEXT: s_lshr_b32 s8, s0, 16
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s12
+; GFX8-NEXT: v_add_u16_e64 v0, s8, v0 clamp
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s5
; GFX8-NEXT: s_lshr_b32 s13, s5, 16
+; GFX8-NEXT: v_add_u16_e64 v0, s1, v0 clamp
+; GFX8-NEXT: s_lshr_b32 s9, s1, 16
+; GFX8-NEXT: v_readfirstlane_b32 s1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s13
+; GFX8-NEXT: v_add_u16_e64 v0, s9, v0 clamp
+; GFX8-NEXT: v_readfirstlane_b32 s5, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_lshr_b32 s14, s6, 16
-; GFX8-NEXT: s_lshr_b32 s11, s3, 16
+; GFX8-NEXT: v_add_u16_e64 v0, s2, v0 clamp
+; GFX8-NEXT: s_lshr_b32 s10, s2, 16
+; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s14
+; GFX8-NEXT: v_add_u16_e64 v0, s10, v0 clamp
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX8-NEXT: v_readfirstlane_b32 s6, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s7
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshl_b32 s4, s4, 16
; GFX8-NEXT: s_lshr_b32 s15, s7, 16
-; GFX8-NEXT: v_mov_b32_e32 v1, s12
-; GFX8-NEXT: v_mov_b32_e32 v2, s8
-; GFX8-NEXT: v_mov_b32_e32 v3, s13
-; GFX8-NEXT: v_mov_b32_e32 v4, s9
-; GFX8-NEXT: v_mov_b32_e32 v5, s14
-; GFX8-NEXT: v_mov_b32_e32 v6, s10
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_add_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NEXT: v_add_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NEXT: v_add_u16_sdwa v5, v6, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mov_b32_e32 v6, s7
-; GFX8-NEXT: v_mov_b32_e32 v7, s15
-; GFX8-NEXT: v_mov_b32_e32 v8, s11
-; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
-; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp
-; GFX8-NEXT: v_add_u16_e64 v4, s2, v4 clamp
-; GFX8-NEXT: v_add_u16_e64 v6, s3, v6 clamp
-; GFX8-NEXT: v_add_u16_sdwa v7, v8, v7 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_or_b32_e32 v1, v2, v3
-; GFX8-NEXT: v_or_b32_e32 v2, v4, v5
-; GFX8-NEXT: v_or_b32_e32 v3, v6, v7
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: v_readfirstlane_b32 s2, v2
-; GFX8-NEXT: v_readfirstlane_b32 s3, v3
+; GFX8-NEXT: v_add_u16_e64 v0, s3, v0 clamp
+; GFX8-NEXT: s_or_b32 s0, s0, s4
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s5
+; GFX8-NEXT: s_lshr_b32 s11, s3, 16
+; GFX8-NEXT: v_readfirstlane_b32 s3, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s15
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_lshl_b32 s4, s4, 16
+; GFX8-NEXT: v_add_u16_e64 v0, s11, v0 clamp
+; GFX8-NEXT: s_or_b32 s1, s1, s4
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s6
+; GFX8-NEXT: v_readfirstlane_b32 s7, v0
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_lshl_b32 s4, s4, 16
+; GFX8-NEXT: s_or_b32 s2, s2, s4
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s7
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT: s_lshl_b32 s4, s4, 16
+; GFX8-NEXT: s_or_b32 s3, s3, s4
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_uaddsat_v8i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index 1fd139b06417f..0b5224b0079b5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
define i7 @v_usubsat_i7(i7 %lhs, i7 %rhs) {
; GFX6-LABEL: v_usubsat_i7:
@@ -81,8 +81,9 @@ define amdgpu_ps i7 @s_usubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX8-NEXT: s_lshl_b32 s0, s0, 9
; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
-; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshr_b32 s0, s0, 9
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_usubsat_i7:
@@ -91,8 +92,9 @@ define amdgpu_ps i7 @s_usubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX9-NEXT: s_lshl_b32 s0, s0, 9
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
-; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT: s_lshr_b32 s0, s0, 9
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_usubsat_i7:
@@ -100,8 +102,9 @@ define amdgpu_ps i7 @s_usubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX10-NEXT: s_lshl_b32 s0, s0, 9
; GFX10-NEXT: s_lshl_b32 s1, s1, 9
; GFX10-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
-; GFX10-NEXT: v_lshrrev_b16 v0, 9, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX10-NEXT: s_lshr_b32 s0, s0, 9
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: s_usubsat_i7:
@@ -109,8 +112,9 @@ define amdgpu_ps i7 @s_usubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 9
; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 9
; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, s0, s1 clamp
-; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 9, v0.l
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 9
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: s_usubsat_i7:
@@ -118,8 +122,9 @@ define amdgpu_ps i7 @s_usubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 9
; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 9
; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
-; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 9, v0
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 9
; GFX11-FAKE16-NEXT: ; return to shader part epilog
%result = call i7 @llvm.usub.sat.i7(i7 %lhs, i7 %rhs)
ret i7 %result
@@ -200,8 +205,9 @@ define amdgpu_ps i8 @s_usubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
-; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshr_b32 s0, s0, 8
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_usubsat_i8:
@@ -210,8 +216,9 @@ define amdgpu_ps i8 @s_usubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX9-NEXT: s_lshl_b32 s0, s0, 8
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
-; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT: s_lshr_b32 s0, s0, 8
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_usubsat_i8:
@@ -219,8 +226,9 @@ define amdgpu_ps i8 @s_usubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX10-NEXT: s_lshl_b32 s0, s0, 8
; GFX10-NEXT: s_lshl_b32 s1, s1, 8
; GFX10-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
-; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX10-NEXT: s_lshr_b32 s0, s0, 8
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: s_usubsat_i8:
@@ -228,8 +236,9 @@ define amdgpu_ps i8 @s_usubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 8
; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 8
; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, s0, s1 clamp
-; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, 8, v0.l
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 8
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: s_usubsat_i8:
@@ -237,8 +246,9 @@ define amdgpu_ps i8 @s_usubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 8
; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
; GFX11-FAKE16-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
-; GFX11-FAKE16-NEXT: v_lshrrev_b16 v0, 8, v0
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 8
; GFX11-FAKE16-NEXT: ; return to shader part epilog
%result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs)
ret i8 %result
@@ -380,15 +390,19 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX8-NEXT: s_lshr_b32 s2, s0, 8
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
; GFX8-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NEXT: s_lshl_b32 s1, s3, 8
; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
-; GFX8-NEXT: s_lshl_b32 s0, s2, 8
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_sub_u16_e64 v1, s0, v1 clamp
-; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1
-; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: s_lshl_b32 s1, s2, 8
+; GFX8-NEXT: s_lshl_b32 s2, s3, 8
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_sub_u16_e64 v0, s1, v0 clamp
+; GFX8-NEXT: v_readfirstlane_b32 s1, v0
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshr_b32 s1, s1, 8
+; GFX8-NEXT: s_lshr_b32 s0, s0, 8
+; GFX8-NEXT: s_lshl_b32 s1, s1, 8
+; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_usubsat_v2i8:
@@ -407,11 +421,17 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
-; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_and_b32 s1, s0, 0xffff
+; GFX9-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-NEXT: s_lshr_b32 s1, s1, 0x80008
+; GFX9-NEXT: s_lshr_b32 s0, s0, 8
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s0
+; GFX9-NEXT: s_lshr_b32 s1, s0, 16
+; GFX9-NEXT: s_and_b32 s1, s1, 0xff
+; GFX9-NEXT: s_and_b32 s0, s0, 0xff
+; GFX9-NEXT: s_lshl_b32 s1, s1, 8
+; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_usubsat_v2i8:
@@ -428,12 +448,18 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX10-NEXT: s_lshl_b32 s3, s3, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
-; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp
-; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: s_and_b32 s1, s0, 0xffff
+; GFX10-NEXT: s_lshr_b32 s0, s0, 16
+; GFX10-NEXT: s_lshr_b32 s1, s1, 0x80008
+; GFX10-NEXT: s_lshr_b32 s0, s0, 8
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s1, s0
+; GFX10-NEXT: s_lshr_b32 s1, s0, 16
+; GFX10-NEXT: s_and_b32 s0, s0, 0xff
+; GFX10-NEXT: s_and_b32 s1, s1, 0xff
+; GFX10-NEXT: s_lshl_b32 s1, s1, 8
+; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: s_usubsat_v2i8:
@@ -451,10 +477,14 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3
; GFX11-TRUE16-NEXT: v_pk_sub_u16 v0, s0, s1 clamp
-; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v0 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v1.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s0, 0xffff
+; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 0x80008
+; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s1, s0
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s0, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: s_usubsat_v2i8:
@@ -472,13 +502,17 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s3
; GFX11-FAKE16-NEXT: v_pk_sub_u16 v0, s0, s1 clamp
-; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s0, 0xffff
+; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 0x80008
+; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s1, s0
+; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s0, 16
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: ; return to shader part epilog
%lhs = bitcast i16 %lhs.arg to <2 x i8>
%rhs = bitcast i16 %rhs.arg to <2 x i8>
@@ -710,18 +744,18 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX6-NEXT: s_sub_i32 s2, s2, s3
; GFX6-NEXT: s_lshl_b32 s3, s4, 24
; GFX6-NEXT: s_lshl_b32 s4, s7, 24
-; GFX6-NEXT: s_min_u32 s4, s3, s4
; GFX6-NEXT: s_lshr_b32 s1, s1, 24
+; GFX6-NEXT: s_min_u32 s4, s3, s4
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_lshr_b32 s2, s2, 24
; GFX6-NEXT: s_sub_i32 s3, s3, s4
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: s_lshr_b32 s3, s3, 24
; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24
-; GFX6-NEXT: s_lshl_b32 s0, s2, 16
-; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX6-NEXT: s_lshl_b32 s0, s3, 24
-; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX6-NEXT: s_lshr_b32 s3, s3, 24
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: s_lshl_b32 s1, s2, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_lshl_b32 s1, s3, 24
+; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_usubsat_v4i8:
@@ -735,29 +769,37 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX8-NEXT: s_lshr_b32 s4, s0, 24
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
; GFX8-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NEXT: s_lshl_b32 s1, s5, 8
; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
-; GFX8-NEXT: s_lshl_b32 s0, s2, 8
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_sub_u16_e64 v1, s0, v1 clamp
-; GFX8-NEXT: s_lshl_b32 s1, s6, 8
-; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
-; GFX8-NEXT: s_lshl_b32 s0, s3, 8
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: s_lshl_b32 s1, s7, 8
-; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_sub_u16_e64 v2, s0, v2 clamp
-; GFX8-NEXT: s_lshl_b32 s0, s4, 8
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX8-NEXT: v_sub_u16_e64 v3, s0, v3 clamp
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_lshl_b32 s1, s2, 8
+; GFX8-NEXT: s_lshl_b32 s2, s5, 8
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_sub_u16_e64 v0, s1, v0 clamp
+; GFX8-NEXT: s_lshl_b32 s2, s3, 8
+; GFX8-NEXT: s_lshl_b32 s3, s6, 8
+; GFX8-NEXT: v_readfirstlane_b32 s1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_sub_u16_e64 v0, s2, v0 clamp
+; GFX8-NEXT: s_lshl_b32 s3, s4, 8
+; GFX8-NEXT: s_lshl_b32 s4, s7, 8
+; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: v_sub_u16_e64 v0, s3, v0 clamp
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshr_b32 s1, s1, 8
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: v_readfirstlane_b32 s3, v0
+; GFX8-NEXT: s_lshr_b32 s0, s0, 8
+; GFX8-NEXT: s_lshr_b32 s2, s2, 8
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT: s_lshl_b32 s1, s1, 8
+; GFX8-NEXT: s_lshr_b32 s3, s3, 8
+; GFX8-NEXT: s_or_b32 s0, s0, s1
+; GFX8-NEXT: s_lshl_b32 s1, s2, 16
+; GFX8-NEXT: s_or_b32 s0, s0, s1
+; GFX8-NEXT: s_lshl_b32 s1, s3, 24
+; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_usubsat_v4i8:
@@ -787,66 +829,87 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX9-NEXT: s_lshr_b32 s4, s3, 16
; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008
; GFX9-NEXT: s_lshl_b32 s4, s4, 8
-; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_pk_sub_u16 v1, s2, v1 clamp
-; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: v_mov_b32_e32 v3, 8
-; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT: v_mov_b32_e32 v2, 0xff
-; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 24
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_pk_sub_u16 v0, s2, v0 clamp
+; GFX9-NEXT: s_and_b32 s2, s0, 0xffff
+; GFX9-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-NEXT: s_lshr_b32 s2, s2, 0x80008
+; GFX9-NEXT: s_lshr_b32 s0, s0, 8
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s0
+; GFX9-NEXT: s_and_b32 s2, s1, 0xffff
+; GFX9-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-NEXT: s_lshr_b32 s2, s2, 0x80008
+; GFX9-NEXT: s_lshr_b32 s1, s1, 8
+; GFX9-NEXT: s_pack_ll_b32_b16 s1, s2, s1
+; GFX9-NEXT: s_and_b32 s2, s0, 0xff
+; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX9-NEXT: s_lshl_b32 s0, s0, 8
+; GFX9-NEXT: s_or_b32 s0, s2, s0
+; GFX9-NEXT: s_and_b32 s2, s1, 0xff
+; GFX9-NEXT: s_lshl_b32 s2, s2, 16
+; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: s_lshl_b32 s1, s1, 24
+; GFX9-NEXT: s_or_b32 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_usubsat_v4i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshr_b32 s2, s0, 8
+; GFX10-NEXT: s_lshr_b32 s5, s1, 8
; GFX10-NEXT: s_lshr_b32 s3, s0, 16
; GFX10-NEXT: s_lshr_b32 s4, s0, 24
-; GFX10-NEXT: s_lshr_b32 s5, s1, 8
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
; GFX10-NEXT: s_lshr_b32 s7, s1, 24
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
-; GFX10-NEXT: s_lshr_b32 s4, s0, 16
-; GFX10-NEXT: s_lshr_b32 s5, s2, 16
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7
+; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
+; GFX10-NEXT: s_lshr_b32 s3, s0, 16
+; GFX10-NEXT: s_lshr_b32 s5, s1, 16
; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008
-; GFX10-NEXT: s_lshl_b32 s4, s4, 8
-; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008
-; GFX10-NEXT: s_lshl_b32 s5, s5, 8
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s5
-; GFX10-NEXT: s_lshr_b32 s4, s1, 16
-; GFX10-NEXT: s_lshr_b32 s5, s3, 16
+; GFX10-NEXT: s_lshl_b32 s3, s3, 8
+; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s7
; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX10-NEXT: s_lshl_b32 s4, s4, 8
-; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008
; GFX10-NEXT: s_lshl_b32 s5, s5, 8
-; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4
-; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3
+; GFX10-NEXT: s_lshr_b32 s3, s2, 16
+; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
+; GFX10-NEXT: s_lshr_b32 s5, s4, 16
+; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008
+; GFX10-NEXT: s_lshl_b32 s3, s3, 8
+; GFX10-NEXT: s_lshl_b32 s4, s4, 0x80008
+; GFX10-NEXT: s_lshl_b32 s5, s5, 8
; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp
-; GFX10-NEXT: v_pk_sub_u16 v1, s2, s3 clamp
-; GFX10-NEXT: v_mov_b32_e32 v2, 8
-; GFX10-NEXT: v_mov_b32_e32 v4, 24
-; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s2, s3
+; GFX10-NEXT: s_pack_ll_b32_b16 s1, s4, s5
+; GFX10-NEXT: v_pk_sub_u16 v1, s0, s1 clamp
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: s_and_b32 s2, s0, 0xffff
+; GFX10-NEXT: s_lshr_b32 s0, s0, 16
+; GFX10-NEXT: s_lshr_b32 s2, s2, 0x80008
+; GFX10-NEXT: s_lshr_b32 s0, s0, 8
+; GFX10-NEXT: s_and_b32 s3, s1, 0xffff
+; GFX10-NEXT: s_lshr_b32 s1, s1, 16
+; GFX10-NEXT: s_lshr_b32 s3, s3, 0x80008
+; GFX10-NEXT: s_lshr_b32 s1, s1, 8
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s2, s0
+; GFX10-NEXT: s_pack_ll_b32_b16 s1, s3, s1
+; GFX10-NEXT: s_bfe_u32 s2, s0, 0x80010
+; GFX10-NEXT: s_and_b32 s0, s0, 0xff
+; GFX10-NEXT: s_lshl_b32 s2, s2, 8
+; GFX10-NEXT: s_and_b32 s3, s1, 0xff
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: s_lshl_b32 s2, s3, 16
+; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX10-NEXT: s_or_b32 s0, s0, s2
+; GFX10-NEXT: s_lshl_b32 s1, s1, 24
+; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: s_usubsat_v4i8:
@@ -866,30 +929,37 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s3, 0x80008
; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
; GFX11-TRUE16-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-TRUE16-NEXT: v_pk_sub_u16 v0, s2, s3 clamp
; GFX11-TRUE16-NEXT: s_lshl_b32 s0, s0, 0x80008
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 8
; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s5, 8
+; GFX11-TRUE16-NEXT: s_lshl_b32 s5, s5, 8
+; GFX11-TRUE16-NEXT: v_pk_sub_u16 v0, s2, s3 clamp
; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s2
-; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX11-TRUE16-NEXT: v_pk_sub_u16 v1, s0, s1 clamp
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
-; GFX11-TRUE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xff, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xff, v0, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 24, v3
-; GFX11-TRUE16-NEXT: v_or3_b32 v0, v0, v1, v2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s0, 0xffff
+; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 0x80008
+; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 8
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s0, s2, s0
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s1, 0xffff
+; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 0x80008
+; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s1, s2, s1
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s0, 8
+; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xff
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s2
+; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s1, 16
+; GFX11-TRUE16-NEXT: s_lshl_b32 s1, s1, 24
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: s_usubsat_v4i8:
@@ -909,28 +979,39 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s3, 0x80008
; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 8
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s4
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s3, s5
; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s1, 16
-; GFX11-FAKE16-NEXT: v_pk_sub_u16 v0, s2, s3 clamp
; GFX11-FAKE16-NEXT: s_lshl_b32 s0, s0, 0x80008
; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s4, 8
; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 0x80008
-; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s5, 8
+; GFX11-FAKE16-NEXT: s_lshl_b32 s5, s5, 8
+; GFX11-FAKE16-NEXT: v_pk_sub_u16 v0, s2, s3 clamp
; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s0, s4
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s2
-; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX11-FAKE16-NEXT: v_pk_sub_u16 v1, s0, s1 clamp
-; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 8
-; GFX11-FAKE16-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v1
-; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v1, 16, 8
-; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xff, v0, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; GFX11-FAKE16-NEXT: v_or3_b32 v0, v0, v2, v1
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s1, v1
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s0, 0xffff
+; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 0x80008
+; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s1, 0xffff
+; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 0x80008
+; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s0, s2, s0
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s1, s3, s1
+; GFX11-FAKE16-NEXT: s_bfe_u32 s2, s0, 0x80010
+; GFX11-FAKE16-NEXT: s_and_b32 s0, s0, 0xff
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 8
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s1, 0xff
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s2
+; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s3, 16
+; GFX11-FAKE16-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s2
+; GFX11-FAKE16-NEXT: s_lshl_b32 s1, s1, 24
+; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, s1
; GFX11-FAKE16-NEXT: ; return to shader part epilog
%lhs = bitcast i32 %lhs.arg to <4 x i8>
%rhs = bitcast i32 %rhs.arg to <4 x i8>
@@ -996,8 +1077,8 @@ define amdgpu_ps i24 @s_usubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s0, v0 clamp
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_lshr_b32 s0, s0, 8
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_usubsat_i24:
@@ -1006,8 +1087,8 @@ define amdgpu_ps i24 @s_usubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX9-NEXT: s_lshl_b32 s0, s0, 8
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_lshr_b32 s0, s0, 8
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_usubsat_i24:
@@ -1015,8 +1096,8 @@ define amdgpu_ps i24 @s_usubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, s0, s1 clamp
-; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 8
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = call i24 @llvm.usub.sat.i24(i24 %lhs, i24 %rhs)
ret i24 %result
@@ -2059,15 +2140,18 @@ define amdgpu_ps i32 @s_usubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
;
; GFX8-LABEL: s_usubsat_v2i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s2, s0, 16
-; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
-; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_lshr_b32 s2, s0, 16
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_sub_u16_e64 v0, s2, v0 clamp
+; GFX8-NEXT: v_readfirstlane_b32 s1, v0
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshl_b32 s1, s1, 16
+; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_usubsat_v2i16:
@@ -2260,24 +2344,30 @@ define amdgpu_ps <2 x i32> @s_usubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
;
; GFX8-LABEL: s_usubsat_v4i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s4, s0, 16
-; GFX8-NEXT: s_lshr_b32 s6, s2, 16
-; GFX8-NEXT: s_lshr_b32 s5, s1, 16
-; GFX8-NEXT: s_lshr_b32 s7, s3, 16
-; GFX8-NEXT: v_mov_b32_e32 v1, s6
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: s_lshr_b32 s6, s2, 16
; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
-; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp
-; GFX8-NEXT: v_sub_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_or_b32_e32 v1, v2, v3
+; GFX8-NEXT: s_lshr_b32 s4, s0, 16
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_sub_u16_e64 v0, s4, v0 clamp
+; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: s_lshr_b32 s7, s3, 16
+; GFX8-NEXT: v_sub_u16_e64 v0, s1, v0 clamp
+; GFX8-NEXT: s_lshr_b32 s5, s1, 16
+; GFX8-NEXT: v_readfirstlane_b32 s1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s7
+; GFX8-NEXT: v_sub_u16_e64 v0, s5, v0 clamp
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: v_readfirstlane_b32 s3, v0
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshl_b32 s2, s2, 16
+; GFX8-NEXT: s_or_b32 s0, s0, s2
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s3
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_lshl_b32 s2, s2, 16
+; GFX8-NEXT: s_or_b32 s1, s1, s2
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_usubsat_v4i16:
@@ -2426,33 +2516,42 @@ define amdgpu_ps <3 x i32> @s_usubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
;
; GFX8-LABEL: s_usubsat_v6i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s6, s0, 16
-; GFX8-NEXT: s_lshr_b32 s7, s1, 16
-; GFX8-NEXT: s_lshr_b32 s9, s3, 16
-; GFX8-NEXT: s_lshr_b32 s10, s4, 16
-; GFX8-NEXT: s_lshr_b32 s8, s2, 16
-; GFX8-NEXT: s_lshr_b32 s11, s5, 16
-; GFX8-NEXT: v_mov_b32_e32 v1, s9
-; GFX8-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, s10
-; GFX8-NEXT: v_mov_b32_e32 v4, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_sub_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mov_b32_e32 v4, s5
-; GFX8-NEXT: v_mov_b32_e32 v5, s11
-; GFX8-NEXT: v_mov_b32_e32 v6, s8
+; GFX8-NEXT: s_lshr_b32 s9, s3, 16
; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
-; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp
-; GFX8-NEXT: v_sub_u16_e64 v4, s2, v4 clamp
-; GFX8-NEXT: v_sub_u16_sdwa v5, v6, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_or_b32_e32 v1, v2, v3
-; GFX8-NEXT: v_or_b32_e32 v2, v4, v5
+; GFX8-NEXT: s_lshr_b32 s6, s0, 16
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: v_readfirstlane_b32 s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, s9
+; GFX8-NEXT: v_sub_u16_e64 v0, s6, v0 clamp
+; GFX8-NEXT: v_readfirstlane_b32 s3, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_lshr_b32 s10, s4, 16
+; GFX8-NEXT: v_sub_u16_e64 v0, s1, v0 clamp
+; GFX8-NEXT: s_lshr_b32 s7, s1, 16
+; GFX8-NEXT: v_readfirstlane_b32 s1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_sub_u16_e64 v0, s7, v0 clamp
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s5
+; GFX8-NEXT: s_lshr_b32 s11, s5, 16
+; GFX8-NEXT: v_sub_u16_e64 v0, s2, v0 clamp
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT: s_lshr_b32 s8, s2, 16
+; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s11
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshl_b32 s3, s3, 16
+; GFX8-NEXT: v_sub_u16_e64 v0, s8, v0 clamp
+; GFX8-NEXT: s_or_b32 s0, s0, s3
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s4
+; GFX8-NEXT: v_readfirstlane_b32 s5, v0
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_lshl_b32 s3, s3, 16
+; GFX8-NEXT: s_or_b32 s1, s1, s3
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s5
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_lshl_b32 s3, s3, 16
+; GFX8-NEXT: s_or_b32 s2, s2, s3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_usubsat_v6i16:
@@ -2622,42 +2721,54 @@ define amdgpu_ps <4 x i32> @s_usubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
;
; GFX8-LABEL: s_usubsat_v8i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s8, s0, 16
-; GFX8-NEXT: s_lshr_b32 s9, s1, 16
-; GFX8-NEXT: s_lshr_b32 s10, s2, 16
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_lshr_b32 s12, s4, 16
+; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
+; GFX8-NEXT: s_lshr_b32 s8, s0, 16
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s12
+; GFX8-NEXT: v_sub_u16_e64 v0, s8, v0 clamp
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s5
; GFX8-NEXT: s_lshr_b32 s13, s5, 16
+; GFX8-NEXT: v_sub_u16_e64 v0, s1, v0 clamp
+; GFX8-NEXT: s_lshr_b32 s9, s1, 16
+; GFX8-NEXT: v_readfirstlane_b32 s1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s13
+; GFX8-NEXT: v_sub_u16_e64 v0, s9, v0 clamp
+; GFX8-NEXT: v_readfirstlane_b32 s5, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_lshr_b32 s14, s6, 16
-; GFX8-NEXT: s_lshr_b32 s11, s3, 16
+; GFX8-NEXT: v_sub_u16_e64 v0, s2, v0 clamp
+; GFX8-NEXT: s_lshr_b32 s10, s2, 16
+; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s14
+; GFX8-NEXT: v_sub_u16_e64 v0, s10, v0 clamp
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX8-NEXT: v_readfirstlane_b32 s6, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s7
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT: s_lshl_b32 s4, s4, 16
; GFX8-NEXT: s_lshr_b32 s15, s7, 16
-; GFX8-NEXT: v_mov_b32_e32 v1, s12
-; GFX8-NEXT: v_mov_b32_e32 v2, s8
-; GFX8-NEXT: v_mov_b32_e32 v3, s13
-; GFX8-NEXT: v_mov_b32_e32 v4, s9
-; GFX8-NEXT: v_mov_b32_e32 v5, s14
-; GFX8-NEXT: v_mov_b32_e32 v6, s10
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NEXT: v_sub_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NEXT: v_sub_u16_sdwa v5, v6, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_mov_b32_e32 v6, s7
-; GFX8-NEXT: v_mov_b32_e32 v7, s15
-; GFX8-NEXT: v_mov_b32_e32 v8, s11
-; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
-; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp
-; GFX8-NEXT: v_sub_u16_e64 v4, s2, v4 clamp
-; GFX8-NEXT: v_sub_u16_e64 v6, s3, v6 clamp
-; GFX8-NEXT: v_sub_u16_sdwa v7, v8, v7 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_or_b32_e32 v1, v2, v3
-; GFX8-NEXT: v_or_b32_e32 v2, v4, v5
-; GFX8-NEXT: v_or_b32_e32 v3, v6, v7
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: v_readfirstlane_b32 s2, v2
-; GFX8-NEXT: v_readfirstlane_b32 s3, v3
+; GFX8-NEXT: v_sub_u16_e64 v0, s3, v0 clamp
+; GFX8-NEXT: s_or_b32 s0, s0, s4
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s5
+; GFX8-NEXT: s_lshr_b32 s11, s3, 16
+; GFX8-NEXT: v_readfirstlane_b32 s3, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s15
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT: s_lshl_b32 s4, s4, 16
+; GFX8-NEXT: v_sub_u16_e64 v0, s11, v0 clamp
+; GFX8-NEXT: s_or_b32 s1, s1, s4
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s6
+; GFX8-NEXT: v_readfirstlane_b32 s7, v0
+; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT: s_lshl_b32 s4, s4, 16
+; GFX8-NEXT: s_or_b32 s2, s2, s4
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s7
+; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT: s_lshl_b32 s4, s4, 16
+; GFX8-NEXT: s_or_b32 s3, s3, s4
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_usubsat_v8i16:
More information about the llvm-branch-commits
mailing list