[llvm] [AMDGPU][MC] Allow opsel for v_max_i16 etc in GFX10 (PR #143982)
Jun Wang via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 12 14:56:50 PDT 2025
https://github.com/jwanggit86 created https://github.com/llvm/llvm-project/pull/143982
In GFX10, a number of VOP3 instructions should allow opsel, including V_MAX_I16, V_MAX_U16, V_MIN_I16, V_MIN_U16, V_MUL_LO_U16, V_LSHLREV_B16, V_LSHRREV_B16, and V_ASHRREV_I16.
From 037d397ad404fd17fcff763e4937ebdfe86f18a4 Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Thu, 12 Jun 2025 14:48:38 -0700
Subject: [PATCH] [AMDGPU][MC] Allow opsel for v_max_i16 etc in GFX10
In GFX10, a number of VOP3 instructions should allow opsel, including
V_MAX_I16, V_MAX_U16, V_MIN_I16, V_MIN_U16, V_MUL_LO_U16, V_LSHLREV_B16,
V_LSHRREV_B16, and V_ASHRREV_I16.
---
llvm/lib/Target/AMDGPU/VOP2Instructions.td | 4 +
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 18 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 20 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 22 +-
.../AMDGPU/GlobalISel/insertelement.i8.ll | 48 +--
.../GlobalISel/inst-select-ashr.s16.mir | 10 +-
.../GlobalISel/inst-select-lshr.s16.mir | 10 +-
.../inst-select-pattern-smed3.s16.mir | 42 ++-
.../inst-select-pattern-umed3.s16.mir | 42 ++-
.../AMDGPU/GlobalISel/inst-select-shl.s16.mir | 50 ++-
.../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 6 +-
.../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 17 +-
.../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 6 +-
.../test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll | 6 +-
.../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 49 +--
.../test/CodeGen/AMDGPU/GlobalISel/usubsat.ll | 6 +-
llvm/test/CodeGen/AMDGPU/freeze.ll | 17 +-
llvm/test/CodeGen/AMDGPU/permute_i8.ll | 317 +++++++++---------
.../AMDGPU/sdwa-peephole-cndmask-fail.ll | 4 +-
llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll | 4 +-
llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll | 4 +-
llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll | 4 +-
llvm/test/MC/AMDGPU/gfx10_asm_vop3.s | 24 ++
.../MC/Disassembler/AMDGPU/gfx10_vop3.txt | 24 ++
24 files changed, 471 insertions(+), 283 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 0c7e20fc1ebf3..67fb68a3eee83 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -211,6 +211,10 @@ multiclass VOP2Inst_e64_t16<string opName,
string revOp = opName> {
let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
defm NAME : VOP2Inst<opName, P, node, revOp>;
+ let SubtargetPredicate = isGFX10Only in {
+ def _vop3_e64 : VOP3InstBase <opName#"_vop3", VOP3_Profile<P, VOP3_OPSEL>, node, 1>,
+ Commutable_REV<revOp#"_vop3_e64", !eq(revOp, opName)>;
+ }
}
let SubtargetPredicate = UseRealTrue16Insts in {
defm _t16 : VOP2Inst_e64<opName#"_t16", VOPProfile_True16<P>, node, revOp#"_t16">;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 0252c4f1b0929..597202d47591d 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1932,16 +1932,14 @@ defm V_DIV_FIXUP_F16 :
defm V_ADD_NC_U16 : VOP3OpSel_Real_gfx10<0x303>;
defm V_SUB_NC_U16 : VOP3OpSel_Real_gfx10<0x304>;
-// FIXME-GFX10-OPSEL: Need to add "selective" opsel support to some of these
-// (they do not support SDWA or DPP).
-defm V_MUL_LO_U16 : VOP3_Real_gfx10_with_name<0x305, "V_MUL_LO_U16", "v_mul_lo_u16">;
-defm V_LSHRREV_B16 : VOP3_Real_gfx10_with_name<0x307, "V_LSHRREV_B16", "v_lshrrev_b16">;
-defm V_ASHRREV_I16 : VOP3_Real_gfx10_with_name<0x308, "V_ASHRREV_I16", "v_ashrrev_i16">;
-defm V_MAX_U16 : VOP3_Real_gfx10_with_name<0x309, "V_MAX_U16", "v_max_u16">;
-defm V_MAX_I16 : VOP3_Real_gfx10_with_name<0x30a, "V_MAX_I16", "v_max_i16">;
-defm V_MIN_U16 : VOP3_Real_gfx10_with_name<0x30b, "V_MIN_U16", "v_min_u16">;
-defm V_MIN_I16 : VOP3_Real_gfx10_with_name<0x30c, "V_MIN_I16", "v_min_i16">;
-defm V_LSHLREV_B16 : VOP3_Real_gfx10_with_name<0x314, "V_LSHLREV_B16", "v_lshlrev_b16">;
+defm V_MUL_LO_U16 : VOP3OpSel_Real_gfx10_with_name<0x305, "V_MUL_LO_U16_vop3", "v_mul_lo_u16">;
+defm V_LSHRREV_B16 : VOP3OpSel_Real_gfx10_with_name<0x307, "V_LSHRREV_B16_vop3", "v_lshrrev_b16">;
+defm V_ASHRREV_I16 : VOP3OpSel_Real_gfx10_with_name<0x308, "V_ASHRREV_I16_vop3", "v_ashrrev_i16">;
+defm V_MAX_U16 : VOP3OpSel_Real_gfx10_with_name<0x309, "V_MAX_U16_vop3", "v_max_u16">;
+defm V_MAX_I16 : VOP3OpSel_Real_gfx10_with_name<0x30a, "V_MAX_I16_vop3", "v_max_i16">;
+defm V_MIN_U16 : VOP3OpSel_Real_gfx10_with_name<0x30b, "V_MIN_U16_vop3", "v_min_u16">;
+defm V_MIN_I16 : VOP3OpSel_Real_gfx10_with_name<0x30c, "V_MIN_I16_vop3", "v_min_i16">;
+defm V_LSHLREV_B16 : VOP3OpSel_Real_gfx10_with_name<0x314, "V_LSHLREV_B16_vop3", "v_lshlrev_b16">;
defm V_PERMLANE16_B32 : VOP3OpSel_Real_gfx10<0x377>;
defm V_PERMLANEX16_B32 : VOP3OpSel_Real_gfx10<0x378>;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index fc81e16d68e98..81153dbefb360 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -864,25 +864,25 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2
; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX10-NEXT: v_xor_b32_e32 v6, -1, v4
+; GFX10-NEXT: v_xor_b32_e32 v5, -1, v4
; GFX10-NEXT: v_and_b32_e32 v4, 7, v4
; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1
; GFX10-NEXT: v_lshrrev_b16 v3, 1, v3
-; GFX10-NEXT: v_and_b32_e32 v6, 7, v6
-; GFX10-NEXT: v_and_b32_e32 v7, 7, v7
-; GFX10-NEXT: v_lshlrev_b16 v4, v4, v5
+; GFX10-NEXT: v_and_b32_e32 v5, 7, v5
+; GFX10-NEXT: v_lshlrev_b16 v4, v4, v6
; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0
-; GFX10-NEXT: v_lshrrev_b16 v3, v6, v3
-; GFX10-NEXT: v_lshrrev_b16 v1, v7, v1
-; GFX10-NEXT: v_or_b32_e32 v2, v4, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
+; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3
+; GFX10-NEXT: v_and_b32_e32 v5, 7, v7
+; GFX10-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX10-NEXT: v_lshrrev_b16 v1, v5, v1
+; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX10-NEXT: v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16 v1, 8, v2
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 238cc06fc7f7c..c5078c2283203 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -864,25 +864,25 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1
; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
-; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3
+; GFX10-NEXT: v_xor_b32_e32 v5, -1, v3
; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4
; GFX10-NEXT: v_and_b32_e32 v3, 7, v3
-; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
-; GFX10-NEXT: v_and_b32_e32 v6, 7, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 7, v5
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX10-NEXT: v_and_b32_e32 v7, 7, v7
-; GFX10-NEXT: v_lshrrev_b16 v3, v3, v5
-; GFX10-NEXT: v_lshlrev_b16 v4, v6, v4
+; GFX10-NEXT: v_lshrrev_b16 v3, v3, v6
+; GFX10-NEXT: v_lshlrev_b16 v4, v5, v4
+; GFX10-NEXT: v_and_b32_e32 v5, 7, v7
; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1
-; GFX10-NEXT: v_lshlrev_b16 v0, v7, v0
-; GFX10-NEXT: v_or_b32_e32 v2, v4, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
+; GFX10-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX10-NEXT: v_lshlrev_b16 v0, v5, v0
+; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX10-NEXT: v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16 v1, 8, v2
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
index 1701a9cc7f09b..5874cebe46a37 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -71,17 +71,17 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(ptr addrspace(4) inreg %ptr, i8
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 1
-; GFX10-NEXT: v_mov_b32_e32 v2, 0xff
; GFX10-NEXT: global_load_ushort v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, s0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 0
-; GFX10-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, s4, s0
+; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s4, s0
+; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: global_store_short v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
@@ -175,16 +175,16 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 1
-; GFX10-NEXT: v_mov_b32_e32 v2, 0xff
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, s0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 0
-; GFX10-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, s2, s0
+; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, s0
+; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: global_store_short v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
@@ -277,17 +277,17 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(ptr addrspace(4) inreg %ptr, i8
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1
-; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
; GFX10-NEXT: global_load_ushort v1, v1, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 0
-; GFX10-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2
+; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: global_store_short v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
@@ -383,17 +383,17 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(ptr addrspace(4) inreg %ptr, i8
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
; GFX10-NEXT: global_load_ushort v1, v1, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v1, s4, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, s4, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2
+; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: global_store_short v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
@@ -487,16 +487,16 @@ define amdgpu_ps void @insertelement_s_v2i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, 0xff
; GFX10-NEXT: global_load_ushort v2, v2, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT: v_and_b32_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: global_store_short v[0:1], v2, off
; GFX10-NEXT: s_endpgm
@@ -590,13 +590,13 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX10-NEXT: v_and_b32_sdwa v3, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, vcc_lo
+; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -689,13 +689,13 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1
-; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0
-; GFX10-NEXT: v_and_b32_sdwa v3, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo
+; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -788,13 +788,13 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, 0xff
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX10-NEXT: v_and_b32_sdwa v3, v1, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo
+; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir
index 4c3f4d9b06ed1..461021112cfef 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir
@@ -100,7 +100,7 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec
+ ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]]
; GFX11-LABEL: name: ashr_s16_s16_vs
; GFX11: liveins: $sgpr0, $vgpr0
@@ -193,7 +193,7 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec
+ ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]]
; GFX11-LABEL: name: ashr_s16_s16_vv
; GFX11: liveins: $vgpr0, $vgpr1
@@ -238,7 +238,7 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec
+ ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_ASHRREV_I16_e64_]], implicit $exec
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
@@ -292,7 +292,7 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec
+ ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_ASHRREV_I16_e64_]], implicit $exec
@@ -442,7 +442,7 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec
+ ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]]
; GFX11-LABEL: name: ashr_s16_s16_sv
; GFX11: liveins: $sgpr0, $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir
index 4769b5f77e3b2..c17b32d5c1676 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir
@@ -98,7 +98,7 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
+ ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]]
; GFX11-LABEL: name: lshr_s16_s16_vs
; GFX11: liveins: $sgpr0, $vgpr0
@@ -191,7 +191,7 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
+ ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]]
; GFX11-LABEL: name: lshr_s16_s16_vv
; GFX11: liveins: $vgpr0, $vgpr1
@@ -236,7 +236,7 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
+ ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_LSHRREV_B16_e64_]], implicit $exec
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
@@ -290,7 +290,7 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
+ ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHRREV_B16_e64_]], implicit $exec
@@ -440,7 +440,7 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
+ ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]]
; GFX11-LABEL: name: lshr_s16_s16_sv
; GFX11: liveins: $sgpr0, $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.s16.mir
index 19143c52b3f43..db5490ac7b90c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.s16.mir
@@ -1,7 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX8 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11 %s
---
@@ -34,6 +34,15 @@ body: |
; GFX9-NEXT: [[V_MED3_I16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GFX9-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_e64_]]
;
+ ; GFX10-LABEL: name: smed3_s16_vvv
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[V_MED3_I16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_e64_]]
+ ;
; GFX11-LABEL: name: smed3_s16_vvv
; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX11-NEXT: {{ $}}
@@ -88,6 +97,16 @@ body: |
; GFX9-NEXT: [[V_MED3_I16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GFX9-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_e64_]], implicit [[V_MAX_I16_e64_]]
;
+ ; GFX10-LABEL: name: smed3_s16_vvv_multiuse0
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[V_MAX_I16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I16_vop3_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+ ; GFX10-NEXT: [[V_MED3_I16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_e64_]], implicit [[V_MAX_I16_vop3_e64_]]
+ ;
; GFX11-LABEL: name: smed3_s16_vvv_multiuse0
; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX11-NEXT: {{ $}}
@@ -143,6 +162,16 @@ body: |
; GFX9-NEXT: [[V_MED3_I16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GFX9-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_e64_]], implicit [[V_MIN_I16_e64_]]
;
+ ; GFX10-LABEL: name: smed3_s16_vvv_multiuse1
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[V_MIN_I16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I16_vop3_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+ ; GFX10-NEXT: [[V_MED3_I16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_e64_]], implicit [[V_MIN_I16_vop3_e64_]]
+ ;
; GFX11-LABEL: name: smed3_s16_vvv_multiuse1
; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX11-NEXT: {{ $}}
@@ -199,6 +228,17 @@ body: |
; GFX9-NEXT: [[V_MED3_I16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GFX9-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_e64_]], implicit [[V_MAX_I16_e64_]]
;
+ ; GFX10-LABEL: name: smed3_s16_vvv_multiuse2
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[V_MIN_I16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I16_vop3_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+ ; GFX10-NEXT: [[V_MAX_I16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I16_vop3_e64 0, [[V_MIN_I16_vop3_e64_]], 0, [[COPY2]], 0, 0, implicit $exec
+ ; GFX10-NEXT: [[V_MED3_I16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_I16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MED3_I16_e64_]], implicit [[V_MAX_I16_vop3_e64_]]
+ ;
; GFX11-LABEL: name: smed3_s16_vvv_multiuse2
; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX11-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.s16.mir
index b7f48d34b8f96..c3dd6e8e521db 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.s16.mir
@@ -1,7 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX8 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11 %s
---
@@ -34,6 +34,15 @@ body: |
; GFX9-NEXT: [[V_MED3_U16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GFX9-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_e64_]]
;
+ ; GFX10-LABEL: name: umed3_s16_vvv
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[V_MED3_U16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_e64_]]
+ ;
; GFX11-LABEL: name: umed3_s16_vvv
; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX11-NEXT: {{ $}}
@@ -88,6 +97,16 @@ body: |
; GFX9-NEXT: [[V_MED3_U16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GFX9-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_e64_]], implicit [[V_MAX_U16_e64_]]
;
+ ; GFX10-LABEL: name: umed3_s16_vvv_multiuse0
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[V_MAX_U16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U16_vop3_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+ ; GFX10-NEXT: [[V_MED3_U16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_e64_]], implicit [[V_MAX_U16_vop3_e64_]]
+ ;
; GFX11-LABEL: name: umed3_s16_vvv_multiuse0
; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX11-NEXT: {{ $}}
@@ -143,6 +162,16 @@ body: |
; GFX9-NEXT: [[V_MED3_U16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GFX9-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_e64_]], implicit [[V_MIN_U16_e64_]]
;
+ ; GFX10-LABEL: name: umed3_s16_vvv_multiuse1
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[V_MIN_U16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U16_vop3_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+ ; GFX10-NEXT: [[V_MED3_U16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_e64_]], implicit [[V_MIN_U16_vop3_e64_]]
+ ;
; GFX11-LABEL: name: umed3_s16_vvv_multiuse1
; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX11-NEXT: {{ $}}
@@ -199,6 +228,17 @@ body: |
; GFX9-NEXT: [[V_MED3_U16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
; GFX9-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_e64_]], implicit [[V_MAX_U16_e64_]]
;
+ ; GFX10-LABEL: name: umed3_s16_vvv_multiuse2
+ ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[V_MIN_U16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U16_vop3_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+ ; GFX10-NEXT: [[V_MAX_U16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U16_vop3_e64 0, [[V_MIN_U16_vop3_e64_]], 0, [[COPY2]], 0, 0, implicit $exec
+ ; GFX10-NEXT: [[V_MED3_U16_e64_:%[0-9]+]]:vgpr_32 = V_MED3_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MED3_U16_e64_]], implicit [[V_MAX_U16_vop3_e64_]]
+ ;
; GFX11-LABEL: name: umed3_s16_vvv_multiuse2
; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX11-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir
index 73f164ed10df1..632b68fe80b2c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir
@@ -36,6 +36,7 @@ body: |
; GFX8-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32)
; GFX8-NEXT: [[SHL:%[0-9]+]]:sgpr(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16)
; GFX8-NEXT: S_ENDPGM 0, implicit [[SHL]](s16)
+ ;
; GFX9-LABEL: name: shl_s16_s16_ss
; GFX9: liveins: $sgpr0, $sgpr1
; GFX9-NEXT: {{ $}}
@@ -45,6 +46,7 @@ body: |
; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32)
; GFX9-NEXT: [[SHL:%[0-9]+]]:sgpr(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16)
; GFX9-NEXT: S_ENDPGM 0, implicit [[SHL]](s16)
+ ;
; GFX10-LABEL: name: shl_s16_s16_ss
; GFX10: liveins: $sgpr0, $sgpr1
; GFX10-NEXT: {{ $}}
@@ -54,6 +56,7 @@ body: |
; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY1]](s32)
; GFX10-NEXT: [[SHL:%[0-9]+]]:sgpr(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16)
; GFX10-NEXT: S_ENDPGM 0, implicit [[SHL]](s16)
+ ;
; GFX11-LABEL: name: shl_s16_s16_ss
; GFX11: liveins: $sgpr0, $sgpr1
; GFX11-NEXT: {{ $}}
@@ -86,6 +89,7 @@ body: |
; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GFX8-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
; GFX8-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]]
+ ;
; GFX9-LABEL: name: shl_s16_s16_vs
; GFX9: liveins: $sgpr0, $vgpr0
; GFX9-NEXT: {{ $}}
@@ -93,13 +97,15 @@ body: |
; GFX9-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GFX9-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
; GFX9-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]]
+ ;
; GFX10-LABEL: name: shl_s16_s16_vs
; GFX10: liveins: $sgpr0, $vgpr0
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX10-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
- ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]]
+ ; GFX10-NEXT: [[V_LSHLREV_B16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_vop3_e64_]]
+ ;
; GFX11-LABEL: name: shl_s16_s16_vs
; GFX11: liveins: $sgpr0, $vgpr0
; GFX11-NEXT: {{ $}}
@@ -132,6 +138,7 @@ body: |
; GFX8-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
; GFX8-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32)
; GFX8-NEXT: S_ENDPGM 0, implicit [[SHL]](s16)
+ ;
; GFX9-LABEL: name: shl_s16_s32_vv
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -140,6 +147,7 @@ body: |
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
; GFX9-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32)
; GFX9-NEXT: S_ENDPGM 0, implicit [[SHL]](s16)
+ ;
; GFX10-LABEL: name: shl_s16_s32_vv
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -148,6 +156,7 @@ body: |
; GFX10-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
; GFX10-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32)
; GFX10-NEXT: S_ENDPGM 0, implicit [[SHL]](s16)
+ ;
; GFX11-LABEL: name: shl_s16_s32_vv
; GFX11: liveins: $vgpr0, $vgpr1
; GFX11-NEXT: {{ $}}
@@ -179,6 +188,7 @@ body: |
; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX8-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
; GFX8-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]]
+ ;
; GFX9-LABEL: name: shl_s16_s16_vv
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -186,13 +196,15 @@ body: |
; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX9-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
; GFX9-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]]
+ ;
; GFX10-LABEL: name: shl_s16_s16_vv
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX10-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
- ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]]
+ ; GFX10-NEXT: [[V_LSHLREV_B16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_vop3_e64_]]
+ ;
; GFX11-LABEL: name: shl_s16_s16_vv
; GFX11: liveins: $vgpr0, $vgpr1
; GFX11-NEXT: {{ $}}
@@ -224,6 +236,7 @@ body: |
; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX8-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
; GFX8-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]]
+ ;
; GFX9-LABEL: name: shl_s16_s16_vv_zext_to_s32
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -231,15 +244,17 @@ body: |
; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX9-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
; GFX9-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]]
+ ;
; GFX10-LABEL: name: shl_s16_s16_vv_zext_to_s32
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX10-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
+ ; GFX10-NEXT: [[V_LSHLREV_B16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_LSHLREV_B16_e64_]], implicit $exec
+ ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_LSHLREV_B16_vop3_e64_]], implicit $exec
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
+ ;
; GFX11-LABEL: name: shl_s16_s16_vv_zext_to_s32
; GFX11: liveins: $vgpr0, $vgpr1
; GFX11-NEXT: {{ $}}
@@ -276,6 +291,7 @@ body: |
; GFX8-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_LSHLREV_B16_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
; GFX8-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
+ ;
; GFX9-LABEL: name: shl_s16_vv_zext_to_s64
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -285,18 +301,20 @@ body: |
; GFX9-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_LSHLREV_B16_e64_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
; GFX9-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
+ ;
; GFX10-LABEL: name: shl_s16_vv_zext_to_s64
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX10-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
+ ; GFX10-NEXT: [[V_LSHLREV_B16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
- ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHLREV_B16_e64_]], implicit $exec
+ ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHLREV_B16_vop3_e64_]], implicit $exec
; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
; GFX10-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
+ ;
; GFX11-LABEL: name: shl_s16_vv_zext_to_s64
; GFX11: liveins: $vgpr0, $vgpr1
; GFX11-NEXT: {{ $}}
@@ -335,6 +353,7 @@ body: |
; GFX8-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32)
; GFX8-NEXT: [[SHL:%[0-9]+]]:sgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32)
; GFX8-NEXT: S_ENDPGM 0, implicit [[SHL]](s16)
+ ;
; GFX9-LABEL: name: shl_s16_s32_ss
; GFX9: liveins: $sgpr0, $sgpr1
; GFX9-NEXT: {{ $}}
@@ -343,6 +362,7 @@ body: |
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32)
; GFX9-NEXT: [[SHL:%[0-9]+]]:sgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32)
; GFX9-NEXT: S_ENDPGM 0, implicit [[SHL]](s16)
+ ;
; GFX10-LABEL: name: shl_s16_s32_ss
; GFX10: liveins: $sgpr0, $sgpr1
; GFX10-NEXT: {{ $}}
@@ -351,6 +371,7 @@ body: |
; GFX10-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32)
; GFX10-NEXT: [[SHL:%[0-9]+]]:sgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32)
; GFX10-NEXT: S_ENDPGM 0, implicit [[SHL]](s16)
+ ;
; GFX11-LABEL: name: shl_s16_s32_ss
; GFX11: liveins: $sgpr0, $sgpr1
; GFX11-NEXT: {{ $}}
@@ -382,6 +403,7 @@ body: |
; GFX8-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32)
; GFX8-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32)
; GFX8-NEXT: S_ENDPGM 0, implicit [[SHL]](s16)
+ ;
; GFX9-LABEL: name: shl_s16_s32_sv
; GFX9: liveins: $sgpr0, $vgpr0
; GFX9-NEXT: {{ $}}
@@ -390,6 +412,7 @@ body: |
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32)
; GFX9-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32)
; GFX9-NEXT: S_ENDPGM 0, implicit [[SHL]](s16)
+ ;
; GFX10-LABEL: name: shl_s16_s32_sv
; GFX10: liveins: $sgpr0, $vgpr0
; GFX10-NEXT: {{ $}}
@@ -398,6 +421,7 @@ body: |
; GFX10-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32)
; GFX10-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32)
; GFX10-NEXT: S_ENDPGM 0, implicit [[SHL]](s16)
+ ;
; GFX11-LABEL: name: shl_s16_s32_sv
; GFX11: liveins: $sgpr0, $vgpr0
; GFX11-NEXT: {{ $}}
@@ -428,6 +452,7 @@ body: |
; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX8-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
; GFX8-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]]
+ ;
; GFX9-LABEL: name: shl_s16_s16_sv
; GFX9: liveins: $sgpr0, $vgpr0
; GFX9-NEXT: {{ $}}
@@ -435,13 +460,15 @@ body: |
; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX9-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
; GFX9-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]]
+ ;
; GFX10-LABEL: name: shl_s16_s16_sv
; GFX10: liveins: $sgpr0, $vgpr0
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX10-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
- ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]]
+ ; GFX10-NEXT: [[V_LSHLREV_B16_vop3_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
+ ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_LSHLREV_B16_vop3_e64_]]
+ ;
; GFX11-LABEL: name: shl_s16_s16_sv
; GFX11: liveins: $sgpr0, $vgpr0
; GFX11-NEXT: {{ $}}
@@ -473,6 +500,7 @@ body: |
; GFX8-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
; GFX8-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32)
; GFX8-NEXT: S_ENDPGM 0, implicit [[SHL]](s16)
+ ;
; GFX9-LABEL: name: shl_s16_s32_vs
; GFX9: liveins: $sgpr0, $vgpr0
; GFX9-NEXT: {{ $}}
@@ -481,6 +509,7 @@ body: |
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
; GFX9-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32)
; GFX9-NEXT: S_ENDPGM 0, implicit [[SHL]](s16)
+ ;
; GFX10-LABEL: name: shl_s16_s32_vs
; GFX10: liveins: $sgpr0, $vgpr0
; GFX10-NEXT: {{ $}}
@@ -489,6 +518,7 @@ body: |
; GFX10-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
; GFX10-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[COPY1]](s32)
; GFX10-NEXT: S_ENDPGM 0, implicit [[SHL]](s16)
+ ;
; GFX11-LABEL: name: shl_s16_s32_vs
; GFX11: liveins: $sgpr0, $vgpr0
; GFX11-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 832f066adaa84..a9397b3c33b88 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -315,7 +315,8 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp
; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -458,7 +459,8 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp
; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 02f8d0bf3c3df..e5ec9e48b9a63 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -2579,26 +2579,27 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s1, v2
+; GFX10-NEXT: s_xor_b32 s1, s11, s2
; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v3
-; GFX10-NEXT: s_xor_b32 s1, s11, s2
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s3, v3
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s3, v3
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0
; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0
; GFX10-NEXT: v_xor_b32_e32 v2, s11, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0
; GFX10-NEXT: s_xor_b32 s0, s12, s10
-; GFX10-NEXT: v_mov_b32_e32 v4, 0xff
-; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1
; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0
-; GFX10-NEXT: v_xor_b32_e32 v3, s12, v3
; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s11, v2
+; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1
+; GFX10-NEXT: v_xor_b32_e32 v3, s12, v3
+; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1
-; GFX10-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s12, v3
-; GFX10-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
+; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2
; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 2673ac4fb5bae..4a58a6ae62657 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -316,7 +316,8 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp
; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -459,7 +460,8 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 clamp
; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index d9158e3558395..18938b8afc353 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -315,7 +315,8 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp
; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -439,7 +440,8 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp
; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 1aaf3122cc00d..c5a3800b2d6ca 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -2032,7 +2032,9 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
;
; GFX10-LABEL: udivrem_v2i8:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s0, s[8:9], 0x10
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, s0
; GFX10-NEXT: s_bfe_u32 s1, s0, 0x80010
@@ -2040,17 +2042,16 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s1
; GFX10-NEXT: s_sub_i32 s3, 0, s2
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX10-NEXT: s_sub_i32 s4, 0, s1
; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0
+; GFX10-NEXT: s_sub_i32 s3, 0, s1
+; GFX10-NEXT: v_mul_lo_u32 v3, s3, v1
; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80008
; GFX10-NEXT: s_and_b32 s0, s0, 0xff
-; GFX10-NEXT: v_mul_lo_u32 v3, s4, v1
-; GFX10-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3
; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
@@ -2060,34 +2061,34 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1
-; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
; GFX10-NEXT: v_sub_nc_u32_e32 v2, s3, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2
-; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3
-; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0
-; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2
-; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2
-; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3
-; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3
+; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10-NEXT: v_mov_b32_e32 v4, 0xff
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0
-; GFX10-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v1
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3
+; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s1, v3
+; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v0
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2
+; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0
+; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v1
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3
+; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s1, v3
+; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
+; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2
; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_store_short v1, v0, s[4:5]
; GFX10-NEXT: global_store_short v1, v2, s[6:7]
; GFX10-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index 1fd139b06417f..c3c3eb3165167 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -309,7 +309,8 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -431,7 +432,8 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp
; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll
index 2040aedc250e6..98ff6214da3f8 100644
--- a/llvm/test/CodeGen/AMDGPU/freeze.ll
+++ b/llvm/test/CodeGen/AMDGPU/freeze.ll
@@ -12294,10 +12294,10 @@ define void @freeze_v2i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: global_load_ushort v0, v[0:1], off
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0xff
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; GFX10-GISEL-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX10-GISEL-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-GISEL-NEXT: global_store_short v[2:3], v0, off
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -12485,13 +12485,14 @@ define void @freeze_v3i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0xff
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0xff
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v4, 8, v0
-; GFX10-GISEL-NEXT: v_and_b32_sdwa v4, v4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-GISEL-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-GISEL-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; GFX10-GISEL-NEXT: v_and_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX10-GISEL-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX10-GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v4
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX10-GISEL-NEXT: global_store_short v[2:3], v0, off
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index b4e5fa088b533..7111ef037897e 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -536,7 +536,8 @@ define hidden void @insertUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: v_lshlrev_b16 v1, 8, v4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: global_store_dword v[5:6], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -993,17 +994,18 @@ define hidden void @and_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: global_load_dword v9, v[0:1], off
-; GFX10-NEXT: v_mov_b32_e32 v0, 2
-; GFX10-NEXT: v_mov_b32_e32 v1, 1
+; GFX10-NEXT: v_mov_b32_e32 v0, 1
+; GFX10-NEXT: v_mov_b32_e32 v1, 2
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_and_b32_sdwa v2, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_e32 v2, 2, v4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_and_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_and_b32_e32 v3, 0x100, v9
-; GFX10-NEXT: v_and_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX10-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2
+; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x5070006
; GFX10-NEXT: global_store_dword v[5:6], v0, off
; GFX10-NEXT: global_store_dword v[7:8], v1, off
@@ -1060,18 +1062,18 @@ define hidden void @ashr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: global_load_dword v9, v[0:1], off
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
-; GFX10-NEXT: v_mov_b32_e32 v2, 26
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_bfe_i32 v1, v9, 0, 8
-; GFX10-NEXT: v_ashrrev_i32_sdwa v2, v2, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_ashrrev_i32_e32 v2, 26, v9
; GFX10-NEXT: v_ashrrev_i32_e32 v3, 25, v9
; GFX10-NEXT: v_lshlrev_b16 v1, 7, v1
-; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_ashrrev_i16 v4, 10, v0
; GFX10-NEXT: v_perm_b32 v0, v9, v0, 0x4010707
; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff00, v1
+; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT: global_store_dword v[5:6], v1, off
@@ -1233,16 +1235,16 @@ define hidden void @ive_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: global_load_dword v9, v[0:1], off
; GFX10-NEXT: global_load_dword v10, v[2:3], off
-; GFX10-NEXT: v_mov_b32_e32 v0, 16
-; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xff
; GFX10-NEXT: v_lshlrev_b16 v2, 8, v4
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v9
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_and_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX10-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT: v_perm_b32 v1, v10, v9, 0x2000706
; GFX10-NEXT: global_store_dword v[5:6], v0, off
; GFX10-NEXT: global_store_dword v[7:8], v1, off
@@ -1294,24 +1296,24 @@ define hidden void @lhsr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
-; GFX10-NEXT: global_load_dword v9, v[2:3], off
-; GFX10-NEXT: v_mov_b32_e32 v0, 26
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: global_load_dword v4, v[2:3], off
+; GFX10-NEXT: global_load_dword v9, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_lshrrev_b16 v1, 1, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 26, v4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, 25, v9
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 26, v4
+; GFX10-NEXT: v_lshrrev_b16 v1, 1, v9
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 25, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 26, v9
+; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0x7f00, v1
; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x1030707
+; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x1030707
; GFX10-NEXT: global_store_dword v[5:6], v0, off
; GFX10-NEXT: global_store_dword v[7:8], v1, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1432,6 +1434,7 @@ define hidden void @or_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
+; GFX10-NEXT: v_bfrev_b32_e32 v10, 4.0
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
@@ -1439,16 +1442,17 @@ define hidden void @or_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: global_load_dword v9, v[0:1], off
-; GFX10-NEXT: v_mov_b32_e32 v0, 16
-; GFX10-NEXT: v_bfrev_b32_e32 v2, 4.0
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_lshlrev_b16 v1, 8, v4
-; GFX10-NEXT: v_lshrrev_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b16 v1, 8, v9
+; GFX10-NEXT: v_lshrrev_b16 v2, 8, v4
+; GFX10-NEXT: v_lshlrev_b16 v3, 8, v4
+; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
+; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
; GFX10-NEXT: v_or_b32_e32 v1, 0x201, v1
-; GFX10-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x2010005
; GFX10-NEXT: global_store_dword v[5:6], v0, off
@@ -1508,59 +1512,61 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: global_load_dword v9, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; GFX10-NEXT: v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-; GFX10-NEXT: v_rcp_iflag_f32_e32 v15, v1
; GFX10-NEXT: v_rcp_iflag_f32_e32 v16, v10
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v12
-; GFX10-NEXT: v_xor_b32_sdwa v0, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v14
+; GFX10-NEXT: v_rcp_iflag_f32_e32 v15, v1
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
; GFX10-NEXT: v_xor_b32_sdwa v3, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
-; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v12
; GFX10-NEXT: v_xor_b32_sdwa v13, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
-; GFX10-NEXT: v_ashrrev_i32_e32 v0, 30, v0
-; GFX10-NEXT: v_mul_f32_e32 v15, v2, v15
-; GFX10-NEXT: v_mul_f32_e32 v16, v19, v16
+; GFX10-NEXT: v_xor_b32_sdwa v0, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
+; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
; GFX10-NEXT: v_ashrrev_i32_e32 v3, 30, v3
+; GFX10-NEXT: v_mul_f32_e32 v16, v19, v16
+; GFX10-NEXT: v_mul_f32_e32 v18, v1, v18
+; GFX10-NEXT: v_mul_f32_e32 v15, v2, v15
+; GFX10-NEXT: v_ashrrev_i32_e32 v13, 30, v13
+; GFX10-NEXT: v_or_b32_e32 v3, 1, v3
+; GFX10-NEXT: v_trunc_f32_e32 v16, v16
+; GFX10-NEXT: v_trunc_f32_e32 v18, v18
; GFX10-NEXT: v_mul_f32_e32 v17, v2, v17
-; GFX10-NEXT: v_or_b32_e32 v0, 1, v0
; GFX10-NEXT: v_trunc_f32_e32 v15, v15
-; GFX10-NEXT: v_trunc_f32_e32 v16, v16
-; GFX10-NEXT: v_mul_f32_e32 v18, v1, v18
+; GFX10-NEXT: v_ashrrev_i32_e32 v0, 30, v0
+; GFX10-NEXT: v_mad_f32 v19, -v16, v10, v19
+; GFX10-NEXT: v_mad_f32 v21, -v18, v14, v1
+; GFX10-NEXT: v_or_b32_e32 v13, 1, v13
; GFX10-NEXT: v_trunc_f32_e32 v17, v17
-; GFX10-NEXT: v_ashrrev_i32_e32 v11, 30, v11
; GFX10-NEXT: v_mad_f32 v20, -v15, v1, v2
-; GFX10-NEXT: v_mad_f32 v19, -v16, v10, v19
-; GFX10-NEXT: v_or_b32_e32 v3, 1, v3
-; GFX10-NEXT: v_trunc_f32_e32 v18, v18
+; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v19|, |v10|
+; GFX10-NEXT: v_ashrrev_i32_e32 v11, 30, v11
+; GFX10-NEXT: v_or_b32_e32 v0, 1, v0
; GFX10-NEXT: v_mad_f32 v2, -v17, v12, v2
-; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, |v1|
-; GFX10-NEXT: v_ashrrev_i32_e32 v13, 30, v13
+; GFX10-NEXT: v_cvt_i32_f32_e32 v16, v16
+; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc_lo
+; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v14|
; GFX10-NEXT: v_or_b32_e32 v11, 1, v11
-; GFX10-NEXT: v_mad_f32 v21, -v18, v14, v1
+; GFX10-NEXT: v_cvt_i32_f32_e32 v18, v18
; GFX10-NEXT: v_cvt_i32_f32_e32 v15, v15
-; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
-; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v19|, |v10|
-; GFX10-NEXT: v_or_b32_e32 v13, 1, v13
-; GFX10-NEXT: v_cvt_i32_f32_e32 v16, v16
; GFX10-NEXT: v_cvt_i32_f32_e32 v17, v17
-; GFX10-NEXT: v_cvt_i32_f32_e32 v18, v18
-; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v13, vcc_lo
+; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, |v1|
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, |v12|
+; GFX10-NEXT: v_add_nc_u32_e32 v2, v16, v3
+; GFX10-NEXT: v_add_nc_u32_e32 v3, v18, v10
; GFX10-NEXT: v_add_nc_u32_e32 v0, v15, v0
-; GFX10-NEXT: v_add_nc_u32_sdwa v1, v16, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v11, vcc_lo
-; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v14|
-; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT: v_add_nc_u32_e32 v2, v17, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v13, vcc_lo
-; GFX10-NEXT: v_add_nc_u32_sdwa v3, v18, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v11, vcc_lo
+; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2
+; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3
+; GFX10-NEXT: v_add_nc_u32_e32 v1, v17, v1
+; GFX10-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x60706
; GFX10-NEXT: global_store_dword v[5:6], v0, off
@@ -1861,70 +1867,72 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: global_load_dword v9, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-; GFX10-NEXT: v_cvt_f32_i32_sdwa v15, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v2
-; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v13
-; GFX10-NEXT: v_rcp_iflag_f32_e32 v19, v3
-; GFX10-NEXT: v_xor_b32_sdwa v1, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
-; GFX10-NEXT: v_rcp_iflag_f32_e32 v20, v15
-; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v11, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v12
+; GFX10-NEXT: v_rcp_iflag_f32_e32 v20, v14
+; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v1
; GFX10-NEXT: v_cvt_f32_i32_sdwa v21, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-; GFX10-NEXT: v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2
-; GFX10-NEXT: v_ashrrev_i32_e32 v1, 30, v1
+; GFX10-NEXT: v_xor_b32_sdwa v10, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
+; GFX10-NEXT: v_rcp_iflag_f32_e32 v19, v2
; GFX10-NEXT: v_xor_b32_sdwa v16, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3
-; GFX10-NEXT: v_mul_f32_e32 v17, v3, v17
-; GFX10-NEXT: v_mul_f32_e32 v18, v12, v18
-; GFX10-NEXT: v_mul_f32_e32 v19, v15, v19
-; GFX10-NEXT: v_ashrrev_i32_e32 v11, 30, v11
-; GFX10-NEXT: v_or_b32_e32 v1, 1, v1
-; GFX10-NEXT: v_trunc_f32_e32 v17, v17
-; GFX10-NEXT: v_trunc_f32_e32 v18, v18
+; GFX10-NEXT: v_xor_b32_sdwa v0, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
+; GFX10-NEXT: v_xor_b32_sdwa v13, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2
+; GFX10-NEXT: v_ashrrev_i32_e32 v10, 30, v10
+; GFX10-NEXT: v_mul_f32_e32 v18, v11, v18
; GFX10-NEXT: v_mul_f32_e32 v20, v21, v20
-; GFX10-NEXT: v_trunc_f32_e32 v19, v19
-; GFX10-NEXT: v_ashrrev_i32_e32 v14, 30, v14
-; GFX10-NEXT: v_mad_f32 v22, -v17, v2, v3
-; GFX10-NEXT: v_mad_f32 v12, -v18, v13, v12
-; GFX10-NEXT: v_or_b32_e32 v11, 1, v11
-; GFX10-NEXT: v_trunc_f32_e32 v20, v20
-; GFX10-NEXT: v_mad_f32 v23, -v19, v3, v15
-; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v22|, |v2|
+; GFX10-NEXT: v_mul_f32_e32 v17, v2, v17
; GFX10-NEXT: v_ashrrev_i32_e32 v16, 30, v16
-; GFX10-NEXT: v_or_b32_e32 v14, 1, v14
-; GFX10-NEXT: v_mad_f32 v21, -v20, v15, v21
-; GFX10-NEXT: v_cvt_i32_f32_e32 v17, v17
-; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
-; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v12|, |v13|
+; GFX10-NEXT: v_or_b32_e32 v10, 1, v10
+; GFX10-NEXT: v_trunc_f32_e32 v18, v18
+; GFX10-NEXT: v_trunc_f32_e32 v20, v20
+; GFX10-NEXT: v_mul_f32_e32 v19, v14, v19
+; GFX10-NEXT: v_trunc_f32_e32 v17, v17
+; GFX10-NEXT: v_ashrrev_i32_e32 v0, 30, v0
+; GFX10-NEXT: v_mad_f32 v11, -v18, v12, v11
+; GFX10-NEXT: v_mad_f32 v21, -v20, v14, v21
; GFX10-NEXT: v_or_b32_e32 v16, 1, v16
+; GFX10-NEXT: v_trunc_f32_e32 v19, v19
+; GFX10-NEXT: v_mad_f32 v22, -v17, v1, v2
+; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v11|, |v12|
+; GFX10-NEXT: v_ashrrev_i32_e32 v13, 30, v13
+; GFX10-NEXT: v_or_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_mad_f32 v23, -v19, v2, v14
; GFX10-NEXT: v_cvt_i32_f32_e32 v18, v18
-; GFX10-NEXT: v_cvt_i32_f32_e32 v19, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc_lo
+; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v14|
+; GFX10-NEXT: v_or_b32_e32 v13, 1, v13
; GFX10-NEXT: v_cvt_i32_f32_e32 v20, v20
-; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v11, vcc_lo
-; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v23|, |v3|
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v10, 8, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v4
-; GFX10-NEXT: v_add_nc_u32_e32 v1, v17, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v14, vcc_lo
-; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v15|
-; GFX10-NEXT: v_add_nc_u32_e32 v2, v18, v2
-; GFX10-NEXT: v_mul_lo_u32 v1, v1, v4
-; GFX10-NEXT: v_add_nc_u32_e32 v3, v19, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 24, v4
; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v16, vcc_lo
-; GFX10-NEXT: v_mul_lo_u32 v2, v2, v10
-; GFX10-NEXT: v_mul_lo_u32 v3, v3, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v11, v20, v11
-; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1
-; GFX10-NEXT: v_sub_nc_u32_sdwa v1, v9, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX10-NEXT: v_mul_lo_u32 v10, v11, v12
-; GFX10-NEXT: v_sub_nc_u32_e32 v2, v12, v3
-; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT: v_sub_nc_u32_sdwa v3, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v22|, |v1|
+; GFX10-NEXT: v_cvt_i32_f32_e32 v17, v17
+; GFX10-NEXT: v_cvt_i32_f32_e32 v19, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
+; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v23|, |v2|
+; GFX10-NEXT: v_add_nc_u32_e32 v2, v18, v10
+; GFX10-NEXT: v_add_nc_u32_e32 v10, v20, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v0, v17, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v13, vcc_lo
+; GFX10-NEXT: v_mul_lo_u32 v2, v2, v3
+; GFX10-NEXT: v_mul_lo_u32 v3, v10, v15
+; GFX10-NEXT: v_mul_lo_u32 v0, v0, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v1, v19, v1
+; GFX10-NEXT: v_sub_nc_u32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-NEXT: v_mul_lo_u32 v1, v1, v11
+; GFX10-NEXT: v_sub_nc_u32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_sub_nc_u32_e32 v0, v11, v0
+; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2
+; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3
+; GFX10-NEXT: v_sub_nc_u32_e32 v1, v15, v1
+; GFX10-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x2070306
; GFX10-NEXT: global_store_dword v[5:6], v0, off
@@ -2149,24 +2157,25 @@ define hidden void @trunc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
-; GFX10-NEXT: global_load_dword v9, v[2:3], off
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: global_load_dword v4, v[2:3], off
+; GFX10-NEXT: global_load_dword v9, v[0:1], off
; GFX10-NEXT: v_mov_b32_e32 v0, 1
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_and_b32_sdwa v1, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b16 v1, 8, v4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_and_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1
-; GFX10-NEXT: v_lshlrev_b16 v2, 2, v0
+; GFX10-NEXT: v_and_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16 v2, 3, v9
+; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
+; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT: v_lshlrev_b16 v1, 2, v1
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX10-NEXT: v_lshlrev_b16 v1, 3, v4
+; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x50205
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x50205
; GFX10-NEXT: v_and_b32_e32 v0, 15, v0
; GFX10-NEXT: global_store_byte v[7:8], v0, off
; GFX10-NEXT: global_store_dword v[5:6], v1, off
@@ -2416,51 +2425,53 @@ define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v3, v2
-; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v4, v2
; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v9, v2
+; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v4, v2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v15, v0
; GFX10-NEXT: v_rcp_iflag_f32_e32 v10, v1
; GFX10-NEXT: v_rcp_iflag_f32_e32 v11, v3
-; GFX10-NEXT: v_rcp_iflag_f32_e32 v12, v4
; GFX10-NEXT: v_rcp_iflag_f32_e32 v13, v9
-; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GFX10-NEXT: v_rcp_iflag_f32_e32 v12, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v16, 8, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v17, 24, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v2
; GFX10-NEXT: v_mul_f32_e32 v10, v3, v10
; GFX10-NEXT: v_mul_f32_e32 v11, v3, v11
-; GFX10-NEXT: v_mul_f32_e32 v12, v3, v12
; GFX10-NEXT: v_mul_f32_e32 v13, v15, v13
+; GFX10-NEXT: v_mul_f32_e32 v12, v3, v12
; GFX10-NEXT: v_trunc_f32_e32 v10, v10
; GFX10-NEXT: v_trunc_f32_e32 v11, v11
-; GFX10-NEXT: v_trunc_f32_e32 v12, v12
; GFX10-NEXT: v_trunc_f32_e32 v13, v13
+; GFX10-NEXT: v_trunc_f32_e32 v12, v12
; GFX10-NEXT: v_mad_f32 v18, -v10, v1, v3
; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v10
; GFX10-NEXT: v_mad_f32 v19, -v11, v3, v3
; GFX10-NEXT: v_cvt_u32_f32_e32 v11, v11
-; GFX10-NEXT: v_mad_f32 v20, -v12, v4, v3
-; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v18|, v1
-; GFX10-NEXT: v_cvt_u32_f32_e32 v12, v12
; GFX10-NEXT: v_mad_f32 v15, -v13, v9, v15
+; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v18|, v1
+; GFX10-NEXT: v_mad_f32 v20, -v12, v4, v3
; GFX10-NEXT: v_cvt_u32_f32_e32 v13, v13
+; GFX10-NEXT: v_cvt_u32_f32_e32 v12, v12
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v10, vcc_lo
; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v19|, v3
; GFX10-NEXT: v_mul_lo_u32 v1, v1, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo
-; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, v4
+; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v15|, v9
; GFX10-NEXT: v_mul_lo_u32 v3, v3, v16
; GFX10-NEXT: v_sub_nc_u32_e32 v1, v16, v1
+; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v13, vcc_lo
+; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, v4
+; GFX10-NEXT: v_mul_lo_u32 v9, v9, v17
+; GFX10-NEXT: v_sub_nc_u32_e32 v3, v16, v3
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v12, vcc_lo
-; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v15|, v9
+; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3
; GFX10-NEXT: v_mul_lo_u32 v4, v4, v14
-; GFX10-NEXT: v_sub_nc_u32_sdwa v3, v16, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v13, vcc_lo
+; GFX10-NEXT: v_sub_nc_u32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x2050505
; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT: v_mul_lo_u32 v9, v9, v17
+; GFX10-NEXT: v_lshlrev_b16 v9, 8, v9
; GFX10-NEXT: v_sub_nc_u32_e32 v4, v16, v4
-; GFX10-NEXT: v_sub_nc_u32_sdwa v9, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x2050505
; GFX10-NEXT: v_or_b32_sdwa v3, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT: global_store_dword v[5:6], v1, off
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll
index 1c2d07c2f7af5..2d1c0af14ca37 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll
@@ -25,11 +25,11 @@ define void @quux(i32 %arg, i1 %arg1, i1 %arg2) {
; CHECK-NEXT: v_mov_b32_e32 v2, 0xffff
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_and_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; CHECK-NEXT: v_mov_b32_e32 v1, 24
; CHECK-NEXT: v_mov_b32_e32 v2, 0xff
; CHECK-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
-; CHECK-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v0
; CHECK-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; CHECK-NEXT: v_lshlrev_b16 v1, 8, v1
; CHECK-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; CHECK-NEXT: .LBB0_2: ; %bb9
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll
index 801324eec454e..03226d8df2d94 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll
@@ -63,8 +63,10 @@ define i8 @test_vector_reduce_and_v2i8(<2 x i8> %v) {
; GFX10-SDAG-LABEL: test_vector_reduce_and_v2i8:
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_lshrrev_b16 v2, 8, v1
; GFX10-SDAG-NEXT: v_and_b32_e32 v0, v0, v1
-; GFX10-SDAG-NEXT: v_and_b32_sdwa v1, v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-SDAG-NEXT: v_and_b32_e32 v2, v1, v2
+; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 8, v2
; GFX10-SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll
index bdb1c22ce7267..0830f6957f03f 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll
@@ -64,8 +64,10 @@ define i8 @test_vector_reduce_or_v2i8(<2 x i8> %v) {
; GFX10-SDAG-LABEL: test_vector_reduce_or_v2i8:
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_lshrrev_b16 v2, 8, v1
; GFX10-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX10-SDAG-NEXT: v_or_b32_sdwa v1, v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-SDAG-NEXT: v_or_b32_e32 v2, v1, v2
+; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 8, v2
; GFX10-SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll
index cf344ea9b92d4..a25138b53aa74 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll
@@ -63,8 +63,10 @@ define i8 @test_vector_reduce_xor_v2i8(<2 x i8> %v) {
; GFX10-SDAG-LABEL: test_vector_reduce_xor_v2i8:
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_lshrrev_b16 v2, 8, v1
; GFX10-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1
-; GFX10-SDAG-NEXT: v_xor_b32_sdwa v1, v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-SDAG-NEXT: v_xor_b32_e32 v2, v1, v2
+; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 8, v2
; GFX10-SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s
index c151bf99b76c5..6bb0f4b1dff2d 100644
--- a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s
@@ -8974,6 +8974,9 @@ v_mul_lo_u16 v5, v1, 0.5
v_mul_lo_u16 v5, v1, -4.0
// GFX10: encoding: [0x05,0x00,0x05,0xd7,0x01,0xef,0x01,0x00]
+v_mul_lo_u16 v5, v1, v2 op_sel:[1,1,1]
+// GFX10: encoding: [0x05,0x58,0x05,0xd7,0x01,0x05,0x02,0x00]
+
v_lshrrev_b16 v5, v1, v2
// GFX10: encoding: [0x05,0x00,0x07,0xd7,0x01,0x05,0x02,0x00]
@@ -9052,6 +9055,9 @@ v_lshrrev_b16 v5, v1, 0.5
v_lshrrev_b16 v5, v1, -4.0
// GFX10: encoding: [0x05,0x00,0x07,0xd7,0x01,0xef,0x01,0x00]
+v_lshrrev_b16 v5, v1, v2 op_sel:[1,1,1]
+// GFX10: encoding: [0x05,0x58,0x07,0xd7,0x01,0x05,0x02,0x00]
+
v_ashrrev_i16 v5, v1, v2
// GFX10: encoding: [0x05,0x00,0x08,0xd7,0x01,0x05,0x02,0x00]
@@ -9130,6 +9136,9 @@ v_ashrrev_i16 v5, v1, 0.5
v_ashrrev_i16 v5, v1, -4.0
// GFX10: encoding: [0x05,0x00,0x08,0xd7,0x01,0xef,0x01,0x00]
+v_ashrrev_i16 v5, v1, v2 op_sel:[1,1,1]
+// GFX10: encoding: [0x05,0x58,0x08,0xd7,0x01,0x05,0x02,0x00]
+
v_max_u16 v5, v1, v2
// GFX10: encoding: [0x05,0x00,0x09,0xd7,0x01,0x05,0x02,0x00]
@@ -9208,6 +9217,9 @@ v_max_u16 v5, v1, 0.5
v_max_u16 v5, v1, -4.0
// GFX10: encoding: [0x05,0x00,0x09,0xd7,0x01,0xef,0x01,0x00]
+v_max_u16 v5, v1, v2 op_sel:[1,1,1]
+// GFX10: encoding: [0x05,0x58,0x09,0xd7,0x01,0x05,0x02,0x00]
+
v_max_i16 v5, v1, v2
// GFX10: encoding: [0x05,0x00,0x0a,0xd7,0x01,0x05,0x02,0x00]
@@ -9286,6 +9298,9 @@ v_max_i16 v5, v1, 0.5
v_max_i16 v5, v1, -4.0
// GFX10: encoding: [0x05,0x00,0x0a,0xd7,0x01,0xef,0x01,0x00]
+v_max_i16 v5, v1, v2 op_sel:[1,1,1]
+// GFX10: encoding: [0x05,0x58,0x0a,0xd7,0x01,0x05,0x02,0x00]
+
v_min_u16 v5, v1, v2
// GFX10: encoding: [0x05,0x00,0x0b,0xd7,0x01,0x05,0x02,0x00]
@@ -9364,6 +9379,9 @@ v_min_u16 v5, v1, 0.5
v_min_u16 v5, v1, -4.0
// GFX10: encoding: [0x05,0x00,0x0b,0xd7,0x01,0xef,0x01,0x00]
+v_min_u16 v5, v1, v2 op_sel:[1,1,1]
+// GFX10: encoding: [0x05,0x58,0x0b,0xd7,0x01,0x05,0x02,0x00]
+
v_min_i16 v5, v1, v2
// GFX10: encoding: [0x05,0x00,0x0c,0xd7,0x01,0x05,0x02,0x00]
@@ -9442,6 +9460,9 @@ v_min_i16 v5, v1, 0.5
v_min_i16 v5, v1, -4.0
// GFX10: encoding: [0x05,0x00,0x0c,0xd7,0x01,0xef,0x01,0x00]
+v_min_i16 v5, v1, v2 op_sel:[1,1,1]
+// GFX10: encoding: [0x05,0x58,0x0c,0xd7,0x01,0x05,0x02,0x00]
+
v_add_nc_i16 v5, v1, v2
// GFX10: encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00]
@@ -10009,6 +10030,9 @@ v_lshlrev_b16 v5, v1, 0.5
v_lshlrev_b16 v5, v1, -4.0
// GFX10: encoding: [0x05,0x00,0x14,0xd7,0x01,0xef,0x01,0x00]
+v_lshlrev_b16 v5, v1, v2 op_sel:[1,1,1]
+// GFX10: encoding: [0x05,0x58,0x14,0xd7,0x01,0x05,0x02,0x00]
+
v_mad_u16 v5, 0, v2, v3
// GFX10: encoding: [0x05,0x00,0x40,0xd7,0x80,0x04,0x0e,0x04]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt
index 6da1423fe8278..721babdd64245 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt
@@ -1503,6 +1503,9 @@
# GFX10: v_ashrrev_i16 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0x08,0xd7,0x6a,0x04,0x02,0x00]
0x05,0x00,0x08,0xd7,0x6a,0x04,0x02,0x00
+# GFX10: v_ashrrev_i16 v5, v1, v2 op_sel:[1,1,1] ; encoding: [0x05,0x58,0x08,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x58,0x08,0xd7,0x01,0x05,0x02,0x00
+
# GFX10: v_ashrrev_i32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x18,0xd5,0x01,0x05,0x02,0x00]
0xff,0x00,0x18,0xd5,0x01,0x05,0x02,0x00
@@ -8309,6 +8312,9 @@
# GFX10: v_lshlrev_b16 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0x14,0xd7,0x6a,0x04,0x02,0x00]
0x05,0x00,0x14,0xd7,0x6a,0x04,0x02,0x00
+# GFX10: v_lshlrev_b16 v5, v1, v2 op_sel:[1,1,1] ; encoding: [0x05,0x58,0x14,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x58,0x14,0xd7,0x01,0x05,0x02,0x00
+
# GFX10: v_lshlrev_b32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x1a,0xd5,0x01,0x05,0x02,0x00]
0xff,0x00,0x1a,0xd5,0x01,0x05,0x02,0x00
@@ -8537,6 +8543,9 @@
# GFX10: v_lshrrev_b16 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0x07,0xd7,0x6a,0x04,0x02,0x00]
0x05,0x00,0x07,0xd7,0x6a,0x04,0x02,0x00
+# GFX10: v_lshrrev_b16 v5, v1, v2 op_sel:[1,1,1] ; encoding: [0x05,0x58,0x07,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x58,0x07,0xd7,0x01,0x05,0x02,0x00
+
# GFX10: v_lshrrev_b32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x16,0xd5,0x01,0x05,0x02,0x00]
0xff,0x00,0x16,0xd5,0x01,0x05,0x02,0x00
@@ -11292,6 +11301,9 @@
# GFX10: v_max_i16 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0x0a,0xd7,0x6a,0x04,0x02,0x00]
0x05,0x00,0x0a,0xd7,0x6a,0x04,0x02,0x00
+# GFX10: v_max_i16 v5, v1, v2 op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0a,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x58,0x0a,0xd7,0x01,0x05,0x02,0x00
+
# GFX10: v_max_i32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x12,0xd5,0x01,0x05,0x02,0x00]
0xff,0x00,0x12,0xd5,0x01,0x05,0x02,0x00
@@ -11448,6 +11460,9 @@
# GFX10: v_max_u16 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0x09,0xd7,0x6a,0x04,0x02,0x00]
0x05,0x00,0x09,0xd7,0x6a,0x04,0x02,0x00
+# GFX10: v_max_u16 v5, v1, v2 op_sel:[1,1,1] ; encoding: [0x05,0x58,0x09,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x58,0x09,0xd7,0x01,0x05,0x02,0x00
+
# GFX10: v_max_u32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x14,0xd5,0x01,0x05,0x02,0x00]
0xff,0x00,0x14,0xd5,0x01,0x05,0x02,0x00
@@ -13728,6 +13743,9 @@
# GFX10: v_min_i16 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0x0c,0xd7,0x6a,0x04,0x02,0x00]
0x05,0x00,0x0c,0xd7,0x6a,0x04,0x02,0x00
+# GFX10: v_min_i16 v5, v1, v2 op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0c,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x58,0x0c,0xd7,0x01,0x05,0x02,0x00
+
# GFX10: v_min_i32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x11,0xd5,0x01,0x05,0x02,0x00]
0xff,0x00,0x11,0xd5,0x01,0x05,0x02,0x00
@@ -13884,6 +13902,9 @@
# GFX10: v_min_u16 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0x0b,0xd7,0x6a,0x04,0x02,0x00]
0x05,0x00,0x0b,0xd7,0x6a,0x04,0x02,0x00
+# GFX10: v_min_u16 v5, v1, v2 op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0b,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x58,0x0b,0xd7,0x01,0x05,0x02,0x00
+
# GFX10: v_min_u32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x13,0xd5,0x01,0x05,0x02,0x00]
0xff,0x00,0x13,0xd5,0x01,0x05,0x02,0x00
@@ -15228,6 +15249,9 @@
# GFX10: v_mul_lo_u16 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0x05,0xd7,0x6a,0x04,0x02,0x00]
0x05,0x00,0x05,0xd7,0x6a,0x04,0x02,0x00
+# GFX10: v_mul_lo_u16 v5, v1, v2 op_sel:[1,1,1] ; encoding: [0x05,0x58,0x05,0xd7,0x01,0x05,0x02,0x00]
+0x05,0x58,0x05,0xd7,0x01,0x05,0x02,0x00
+
# GFX10: v_mul_lo_u32 v255, v1, v2 ; encoding: [0xff,0x00,0x69,0xd5,0x01,0x05,0x02,0x00]
0xff,0x00,0x69,0xd5,0x01,0x05,0x02,0x00
More information about the llvm-commits mailing list