[llvm] 99d450e - Revert "[AMDGPU] SIPeepholeSDWA: Disable on existing SDWA instructions (#123942)"
Nico Weber via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 23 06:20:01 PST 2025
Author: Nico Weber
Date: 2025-01-23T09:19:42-05:00
New Revision: 99d450e9f51683bad608bf801e1b29e5c54b8917
URL: https://github.com/llvm/llvm-project/commit/99d450e9f51683bad608bf801e1b29e5c54b8917
DIFF: https://github.com/llvm/llvm-project/commit/99d450e9f51683bad608bf801e1b29e5c54b8917.diff
LOG: Revert "[AMDGPU] SIPeepholeSDWA: Disable on existing SDWA instructions (#123942)"
This reverts commit 6fdaaafd89d7cbc15dafe3ebf1aa3235d148aaab.
Breaks check-llvm, see
https://github.com/llvm/llvm-project/pull/123942#issuecomment-2609861953
Added:
Modified:
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
llvm/test/CodeGen/AMDGPU/idot4u.ll
llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
llvm/test/CodeGen/AMDGPU/permute_i8.ll
llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir
llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
Removed:
llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll
llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.mir
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index bdd164a2f01312..467f042892cebe 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -962,11 +962,8 @@ bool isConvertibleToSDWA(MachineInstr &MI,
const SIInstrInfo* TII) {
// Check if this is already an SDWA instruction
unsigned Opc = MI.getOpcode();
- if (TII->isSDWA(Opc)) {
- // FIXME: Reenable after fixing selection handling.
- // Cf. llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll
- return false;
- }
+ if (TII->isSDWA(Opc))
+ return true;
// Check if this instruction has opcode that supports SDWA
if (AMDGPU::getSDWAOp(Opc) == -1)
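
[Editor's note: the following is not part of the commit; it is a minimal C++ sketch of how the check above reads after this revert, reconstructed from the hunk. The parameter list and the remaining checks are abbreviated rather than copied from SIPeepholeSDWA.cpp, and the fragment assumes that file's existing includes and helpers (MachineInstr, SIInstrInfo, AMDGPU::getSDWAOp). The net effect is that instructions that are already SDWA are once again reported as convertible, so the peephole may fold further operations into them; this is what the dst_sel/dst_unused:UNUSED_PRESERVE changes in the test diffs below reflect.]

    // Fragment inside SIPeepholeSDWA.cpp (sketch; parameters abbreviated).
    bool isConvertibleToSDWA(MachineInstr &MI, /* ... */ const SIInstrInfo *TII) {
      // Check if this is already an SDWA instruction. After the revert it is
      // treated as convertible again instead of being skipped.
      unsigned Opc = MI.getOpcode();
      if (TII->isSDWA(Opc))
        return true;

      // Check if this instruction has an opcode that supports SDWA at all.
      if (AMDGPU::getSDWAOp(Opc) == -1)
        return false;

      // ... remaining operand and selection checks elided in this sketch ...
      return true;
    }
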
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 2d9e8969fdbb52..e289ee759da158 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -280,9 +280,8 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX8-NEXT: v_min_i16_e32 v1, v2, v1
; GFX8-NEXT: v_add_u16_e32 v1, v3, v1
; GFX8-NEXT: v_mov_b32_e32 v2, 0xff
-; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -300,8 +299,7 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -441,8 +439,7 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
@@ -612,11 +609,9 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index a98b305c15f75c..43ebe156eb2a28 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -281,9 +281,8 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX8-NEXT: v_min_i16_e32 v1, v1, v4
; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1
; GFX8-NEXT: v_mov_b32_e32 v2, 0xff
-; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -301,8 +300,7 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -442,8 +440,7 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
@@ -613,11 +610,9 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 3d7fec9a5986cd..788692c94b0cfa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -224,8 +224,7 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -330,8 +329,7 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
@@ -453,11 +451,9 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -622,20 +618,18 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
; GFX8-NEXT: s_lshl_b32 s0, s3, 8
; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: s_lshl_b32 s1, s7, 8
; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_add_u16_e64 v2, s0, v2 clamp
-; GFX8-NEXT: s_lshl_b32 s1, s7, 8
-; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX8-NEXT: s_lshl_b32 s0, s4, 8
; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX8-NEXT: v_add_u16_e64 v3, s0, v3 clamp
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index 0ab16d95b191d9..0042d34e235d17 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -218,8 +218,7 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -322,8 +321,7 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
@@ -441,11 +439,9 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -606,20 +602,18 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
; GFX8-NEXT: s_lshl_b32 s0, s3, 8
; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: s_lshl_b32 s1, s7, 8
; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_sub_u16_e64 v2, s0, v2 clamp
-; GFX8-NEXT: s_lshl_b32 s1, s7, 8
-; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX8-NEXT: s_lshl_b32 s0, s4, 8
; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX8-NEXT: v_sub_u16_e64 v3, s0, v3 clamp
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index a969e3d4f4f79b..e8f1619c5d418c 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -6398,10 +6398,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5
-; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v1, v5, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
@@ -6627,10 +6625,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v2
; GFX8-NEXT: v_mov_b32_e32 v4, v1
@@ -7048,9 +7044,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB21_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v8
-; GFX8-NEXT: v_add_f16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_add_f16_sdwa v4, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v6, v8, v5
; GFX8-NEXT: v_or_b32_e32 v7, v6, v4
; GFX8-NEXT: v_mov_b32_e32 v6, v7
@@ -7396,10 +7390,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5
-; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v1, v5, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
@@ -7658,10 +7650,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v2
; GFX8-NEXT: v_mov_b32_e32 v4, v1
@@ -7925,10 +7915,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5
-; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v1, v5, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
@@ -8187,10 +8175,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v2
; GFX8-NEXT: v_mov_b32_e32 v4, v1
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
index 72f883928cffbc..ff48a3fc980187 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
@@ -14349,10 +14349,8 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14541,10 +14539,8 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v1, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14747,10 +14743,8 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v1, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14930,10 +14924,8 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p
; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15115,10 +15107,8 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15318,10 +15308,8 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15514,10 +15502,8 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v1, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15704,10 +15690,8 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_
; GFX8-NEXT: .LBB63_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15894,10 +15878,8 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -16077,10 +16059,8 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt
; GFX8-NEXT: .LBB65_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -16264,10 +16244,8 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -16447,10 +16425,8 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__
; GFX8-NEXT: .LBB67_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
index 9c2a76380d83dc..14f75814128f18 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
@@ -12094,10 +12094,8 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) #
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12318,10 +12316,8 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12560,10 +12556,8 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12772,10 +12766,8 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 {
; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12986,10 +12978,8 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha
; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13221,10 +13211,8 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha
; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13449,10 +13437,8 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13668,10 +13654,8 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h
; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index 2be6bf302d35f7..ec4ea232e661cf 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -15403,10 +15403,8 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -15637,10 +15635,8 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v1, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -15871,10 +15867,8 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v1, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -16089,10 +16083,8 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX8-NEXT: .LBB67_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -16301,10 +16293,8 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine
; GFX8-NEXT: .LBB68_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -16514,10 +16504,8 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine
; GFX8-NEXT: .LBB69_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -16756,10 +16744,8 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v1, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -16975,10 +16961,8 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin
; GFX8-NEXT: .LBB71_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -17218,10 +17202,8 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -17458,10 +17440,8 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a
; GFX8-NEXT: .LBB73_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -17686,10 +17666,8 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -17900,10 +17878,8 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -18142,10 +18118,8 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -18382,10 +18356,8 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1)
; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
index 24791b60bfc6d8..3dbf6477a7cb89 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
@@ -12433,10 +12433,8 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -12713,10 +12711,8 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -12993,10 +12989,8 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -13266,10 +13260,8 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -13533,10 +13525,8 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1
; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -13801,10 +13791,8 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1
; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -14089,10 +14077,8 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -14363,10 +14349,8 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(
; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
-; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 10fac09ef4ec07..8f82348d350e0a 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -2518,17 +2518,16 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v8, v4, v5
-; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 8, v6
-; GFX9-NODL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NODL-NEXT: v_or_b32_e32 v6, v7, v6
+; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v7, v4, v5
+; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 8, v6
+; GFX9-NODL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v6
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v6
; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1
-; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v9
+; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v8
; GFX9-NODL-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
;
@@ -2547,17 +2546,16 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v4, v5
-; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v6
-; GFX9-DL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT: v_or_b32_e32 v6, v7, v6
+; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v4, v5
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v6
+; GFX9-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v6
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1
-; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v9
+; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v8
; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index e4602f20f8a37c..23b57a7efa586c 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -5034,10 +5034,8 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_add_f16_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v4, v3, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -5259,10 +5257,8 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_add_f16_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v4, v3, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -5478,10 +5474,8 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_add_f16_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v4, v2, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -5694,10 +5688,8 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_add_f16_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v4, v2, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
index 967e972e53e290..1b08b64b046b48 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
@@ -5532,10 +5532,8 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_sub_f16_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v4, v3, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -5789,10 +5787,8 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_sub_f16_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v4, v3, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -6037,10 +6033,8 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v4, v2, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -6282,10 +6276,8 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_sub_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v4, v2, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index 4e8248d4be14ec..37bf8516403bf5 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -592,8 +592,7 @@ define hidden void @addUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %
; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX9-NEXT: v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_add_u16_sdwa v1, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:BYTE_1
; GFX9-NEXT: global_store_dword v[5:6], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll
deleted file mode 100644
index 6eae905278f3ed..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll
+++ /dev/null
@@ -1,87 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -o - < %s | FileCheck -check-prefix=CHECK %s
-
-; The si-peephole-sdwa pass has mishandled the selections of preexisting sdwa instructions
-; which led to an instruction of this shape:
-; v_lshlrev_b32_sdwa v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; instead of
-; v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-
-define amdgpu_kernel void @widget(ptr addrspace(1) %arg, i1 %arg1, ptr addrspace(3) %arg2, ptr addrspace(3) %arg3) {
-; CHECK-LABEL: widget:
-; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8
-; CHECK-NEXT: v_mov_b32_e32 v2, 8
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: global_load_ushort v1, v0, s[0:1]
-; CHECK-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2
-; CHECK-NEXT: s_bitcmp1_b32 s2, 0
-; CHECK-NEXT: s_cselect_b32 s0, -1, 0
-; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s0
-; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; CHECK-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; CHECK-NEXT: s_cbranch_vccz .LBB0_2
-; CHECK-NEXT: ; %bb.1: ; %bb19
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: ds_write_b32 v1, v1
-; CHECK-NEXT: .LBB0_2: ; %bb20
-; CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; CHECK-NEXT: s_mov_b32 s0, exec_lo
-; CHECK-NEXT: v_cmpx_ne_u16_e32 0, v0
-; CHECK-NEXT: s_xor_b32 s0, exec_lo, s0
-; CHECK-NEXT: s_cbranch_execz .LBB0_4
-; CHECK-NEXT: ; %bb.3: ; %bb11
-; CHECK-NEXT: v_mov_b32_e32 v1, 2
-; CHECK-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: ds_write_b32 v0, v1 offset:84
-; CHECK-NEXT: .LBB0_4: ; %bb14
-; CHECK-NEXT: s_endpgm
-bb:
- %call = tail call i32 @llvm.amdgcn.workitem.id.x()
- %zext = zext i32 %call to i64
- %getelementptr = getelementptr i8, ptr addrspace(1) %arg, i64 %zext
- %load = load i8, ptr addrspace(1) %getelementptr, align 1
- %or = or disjoint i32 %call, 1
- %zext4 = zext i32 %or to i64
- %getelementptr5 = getelementptr i8, ptr addrspace(1) %arg, i64 %zext4
- %load6 = load i8, ptr addrspace(1) %getelementptr5, align 1
- %or7 = or disjoint i32 %call, 2
- %zext8 = zext i32 %or7 to i64
- %getelementptr9 = getelementptr i8, ptr addrspace(1) %arg, i64 %zext8
- %load10 = load i8, ptr addrspace(1) %getelementptr9, align 1
- br i1 %arg1, label %bb19, label %bb20
-
-bb11: ; preds = %bb20
- %zext12 = zext i8 %load10 to i64
- %getelementptr13 = getelementptr nusw [14 x i32], ptr addrspace(3) inttoptr (i32 84 to ptr addrspace(3)), i64 0, i64 %zext12
- store i32 0, ptr addrspace(3) %getelementptr13, align 4
- br label %bb14
-
-bb14: ; preds = %bb20, %bb11
- %zext15 = zext i8 %load6 to i64
- %getelementptr16 = getelementptr [14 x i32], ptr addrspace(3) %arg2, i64 0, i64 %zext15
- %zext17 = zext i8 %load to i64
- %getelementptr18 = getelementptr [14 x i32], ptr addrspace(3) %arg3, i64 0, i64 %zext17
- ret void
-
-bb19: ; preds = %bb
- store i32 0, ptr addrspace(3) null, align 4
- br label %bb20
-
-bb20: ; preds = %bb19, %bb
- %icmp = icmp eq i8 %load10, 0
- br i1 %icmp, label %bb14, label %bb11
-}
-
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare noundef i32 @llvm.amdgcn.workitem.id.x() #0
-
-attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.mir
deleted file mode 100644
index cc2c8b3940d78b..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.mir
+++ /dev/null
@@ -1,56 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=CHECK %s
-
-# Currently the conversions in si-peephole-sdwa are disabled on preexisting sdwa instructions.
-# If they are reenabled, the code matches this pattern instead of the corresponding pattern
-# for V_LSHLREV_B32_sdwa further below:
-# [[V_LSHLREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_sdwa 0, %{{[0-9]+}}, 0, undef [[GLOBAL_LOAD_DWORD_SADDR]], 0, 6, 0, 6, 5, implicit $exec
-
-# TODO Implement a fix for the incorrect sdwa selection
-
----
-name: sdwa_opsel_hazard
-body: |
- ; CHECK-LABEL: name: sdwa_opsel_hazard
- ; CHECK: bb.0:
- ; CHECK-NEXT: successors: %bb.2(0x80000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[DEF1]], [[DEF2]], 0, 0, implicit $exec
- ; CHECK-NEXT: S_BRANCH %bb.2
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 undef %5, 255, implicit $exec
- ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
- ; CHECK-NEXT: [[V_LSHLREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef %5, 0, 6, 0, 6, 0, implicit $exec
- ; CHECK-NEXT: S_ENDPGM 0
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.2:
- ; CHECK-NEXT: successors: %bb.1(0x80000000)
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
- ; CHECK-NEXT: S_BRANCH %bb.1
- bb.0:
- successors: %bb.2(0x40000000)
- %0:sreg_32 = IMPLICIT_DEF
- %1:sreg_64_xexec_xnull = IMPLICIT_DEF
- %2:vgpr_32 = IMPLICIT_DEF
- %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed %1, %2, 0, 0, implicit $exec
- S_BRANCH %bb.2
-
- bb.1:
- %5:vgpr_32 = V_AND_B32_e64 undef %6, 255, implicit $exec
- %7:vgpr_32 = V_LSHLREV_B32_e64 2, killed undef %5, implicit $exec
- S_ENDPGM 0
-
- bb.2:
- successors: %bb.1(0x40000000)
-
- %6:vgpr_32 = V_LSHRREV_B32_e64 16, undef %3, implicit $exec
-
- S_BRANCH %bb.1
-
-...
-
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir
index aaa32d871148bf..62538120f84519 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir
@@ -138,8 +138,7 @@ body: |
---
# GCN-LABEL: {{^}}name: vop2_instructions
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit $exec
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_LSHLREV_B32_e64 16, %{{[0-9]+}}, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $mode, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F32_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $mode, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
index c027600a8af674..e2854df2468b39 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
@@ -147,15 +147,14 @@ body: |
---
# GCN-LABEL: {{^}}name: vop2_instructions
-# VI: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit $exec
-# VI: %{{[0-9]+}}:vgpr_32 = V_LSHLREV_B32_e64 16, %{{[0-9]+}}, implicit $exec
+
+# VI: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec
# VI: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $mode, implicit $exec
# VI: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec
# VI: %{{[0-9]+}}:vgpr_32 = V_MAC_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 6, 1, implicit $mode, implicit $exec
# VI: %{{[0-9]+}}:vgpr_32 = V_MAC_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec
-# GFX9: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit $exec
-# GFX9: %{{[0-9]+}}:vgpr_32 = V_LSHLREV_B32_e64 16, %{{[0-9]+}}, implicit $exec
+# GFX9: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec
# GFX9: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $mode, implicit $exec
# GFX9: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec
# GFX9: %{{[0-9]+}}:vgpr_32 = V_MAC_F32_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $mode, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
index 467bc77c185779..ffbd2d092b5d81 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
@@ -37,10 +37,9 @@ body: |
; SDWA-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[FLAT_LOAD_DWORD1]], implicit $exec
; SDWA-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[FLAT_LOAD_DWORD]], 8, 8, implicit $exec
; SDWA-NEXT: [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 24, [[FLAT_LOAD_DWORD1]], implicit $exec
- ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 0, 4, 5, implicit $mode, implicit $exec
; SDWA-NEXT: [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 5, 0, 1, 3, implicit $mode, implicit $exec
- ; SDWA-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F16_sdwa]], [[V_MUL_F32_sdwa]], implicit $exec
- ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_OR_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+ ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 2, 4, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F32_sdwa]](tied-def 0)
+ ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_ADD_F16_sdwa]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
; SDWA-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
; SDWA-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
%2 = COPY $sgpr30_sgpr31
@@ -146,7 +145,7 @@ body: |
; SDWA-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 8, [[FLAT_LOAD_DWORD]], implicit $exec
; SDWA-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 65535
; SDWA-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[FLAT_LOAD_DWORD]], killed [[S_MOV_B32_]], implicit $exec
- ; SDWA-NEXT: [[V_MOV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_MOV_B32_sdwa 0, [[FLAT_LOAD_DWORD1]], 0, 5, 2, 4, implicit $exec, implicit [[V_AND_B32_e64_]](tied-def 0)
+ ; SDWA-NEXT: [[V_MOV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_MOV_B32_sdwa 0, [[FLAT_LOAD_DWORD1]], 0, 5, 2, 4, implicit $exec, implicit [[FLAT_LOAD_DWORD]](tied-def 0)
; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_MOV_B32_sdwa]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
; SDWA-NEXT: S_ENDPGM 0
%2 = COPY $sgpr30_sgpr31
@@ -181,17 +180,15 @@ body: |
; SDWA-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[FLAT_LOAD_DWORD1]], implicit $exec
; SDWA-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[FLAT_LOAD_DWORD]], 8, 8, implicit $exec
; SDWA-NEXT: [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 24, [[FLAT_LOAD_DWORD1]], implicit $exec
- ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 0, 4, 5, implicit $mode, implicit $exec
; SDWA-NEXT: {{ $}}
; SDWA-NEXT: bb.1:
; SDWA-NEXT: successors: %bb.2(0x80000000)
; SDWA-NEXT: {{ $}}
- ; SDWA-NEXT: [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 6, 0, 1, 3, implicit $mode, implicit $exec
- ; SDWA-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[V_MUL_F32_sdwa]], implicit $exec
+ ; SDWA-NEXT: [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 5, 0, 1, 3, implicit $mode, implicit $exec
; SDWA-NEXT: {{ $}}
; SDWA-NEXT: bb.2:
- ; SDWA-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F16_sdwa]], [[V_LSHLREV_B32_e64_]], implicit $exec
- ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_OR_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+ ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 2, 4, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F32_sdwa]](tied-def 0)
+ ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_ADD_F16_sdwa]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
; SDWA-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
; SDWA-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
bb.0: