[llvm] [AMDGPU] SIPeepholeSDWA: Disable on existing SDWA instructions (PR #124131)
Frederik Harwath via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 23 07:22:18 PST 2025
https://github.com/frederik-h created https://github.com/llvm/llvm-project/pull/124131
This PR reapplies the changes from PR #123942, which had to be reverted because of a test failure. The failing test has been adjusted.
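As a rough illustration of the resulting codegen change (excerpted from the GFX9 saddsat check lines updated in this patch), the peephole no longer folds the byte placement into the dst_sel of an instruction that is already SDWA, so the shift remains a separate instruction:

  ; before
  v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
  ; after
  v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
  v_lshlrev_b16_e32 v1, 8, v1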
From 6e1e3f8a0a23f9a90b1a4b41441e1910c9b29e54 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <frederik.harwath at amd.com>
Date: Thu, 23 Jan 2025 14:32:01 +0100
Subject: [PATCH 1/2] [AMDGPU] SIPeepholeSDWA: Disable on existing SDWA
instructions (#123942)
This is meant as a short-term workaround for an invalid conversion in
this pass: the SDWA selections of instructions that are already SDWA
are not correctly taken into account during the conversion.
See the draft PR #123221 for an attempt to fix the actual issue.
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 7 +-
.../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 15 ++--
.../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 15 ++--
.../test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll | 26 +++---
.../test/CodeGen/AMDGPU/GlobalISel/usubsat.ll | 26 +++---
.../buffer-fat-pointer-atomicrmw-fadd.ll | 28 ++++--
.../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 48 +++++++---
.../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 32 +++++--
.../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 56 +++++++++---
.../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 32 +++++--
llvm/test/CodeGen/AMDGPU/idot4u.ll | 22 ++---
.../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 16 +++-
.../CodeGen/AMDGPU/local-atomicrmw-fsub.ll | 16 +++-
llvm/test/CodeGen/AMDGPU/permute_i8.ll | 3 +-
.../AMDGPU/sdwa-peephole-instr-combine-sel.ll | 87 +++++++++++++++++++
.../sdwa-peephole-instr-combine-sel.mir | 56 ++++++++++++
.../AMDGPU/sdwa-peephole-instr-gfx10.mir | 3 +-
.../CodeGen/AMDGPU/sdwa-peephole-instr.mir | 7 +-
llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir | 15 ++--
19 files changed, 400 insertions(+), 110 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.mir
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 467f042892cebe..bdd164a2f01312 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -962,8 +962,11 @@ bool isConvertibleToSDWA(MachineInstr &MI,
const SIInstrInfo* TII) {
// Check if this is already an SDWA instruction
unsigned Opc = MI.getOpcode();
- if (TII->isSDWA(Opc))
- return true;
+ if (TII->isSDWA(Opc)) {
+ // FIXME: Reenable after fixing selection handling.
+ // Cf. llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll
+ return false;
+ }
// Check if this instruction has opcode that supports SDWA
if (AMDGPU::getSDWAOp(Opc) == -1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index e289ee759da158..2d9e8969fdbb52 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -280,8 +280,9 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX8-NEXT: v_min_i16_e32 v1, v2, v1
; GFX8-NEXT: v_add_u16_e32 v1, v3, v1
; GFX8-NEXT: v_mov_b32_e32 v2, 0xff
+; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -299,7 +300,8 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -439,7 +441,8 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
@@ -609,9 +612,11 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 43ebe156eb2a28..a98b305c15f75c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -281,8 +281,9 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX8-NEXT: v_min_i16_e32 v1, v1, v4
; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1
; GFX8-NEXT: v_mov_b32_e32 v2, 0xff
+; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -300,7 +301,8 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -440,7 +442,8 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
@@ -610,9 +613,11 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 788692c94b0cfa..3d7fec9a5986cd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -224,7 +224,8 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -329,7 +330,8 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
@@ -451,9 +453,11 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -618,18 +622,20 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
; GFX8-NEXT: s_lshl_b32 s0, s3, 8
; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: s_lshl_b32 s1, s7, 8
; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_add_u16_e64 v2, s0, v2 clamp
-; GFX8-NEXT: s_lshl_b32 s0, s4, 8
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: s_lshl_b32 s1, s7, 8
; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX8-NEXT: v_add_u16_e64 v3, s0, v3 clamp
+; GFX8-NEXT: s_lshl_b32 s0, s4, 8
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_add_u16_e64 v3, s0, v3 clamp
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index 0042d34e235d17..0ab16d95b191d9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -218,7 +218,8 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -321,7 +322,8 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT: v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
@@ -439,9 +441,11 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -602,18 +606,20 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
; GFX8-NEXT: s_lshl_b32 s0, s3, 8
; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: s_lshl_b32 s1, s7, 8
; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_sub_u16_e64 v2, s0, v2 clamp
-; GFX8-NEXT: s_lshl_b32 s0, s4, 8
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: s_lshl_b32 s1, s7, 8
; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX8-NEXT: v_sub_u16_e64 v3, s0, v3 clamp
+; GFX8-NEXT: s_lshl_b32 s0, s4, 8
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_sub_u16_e64 v3, s0, v3 clamp
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index e8f1619c5d418c..a969e3d4f4f79b 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -6398,8 +6398,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v1, v5, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
@@ -6625,8 +6627,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v2
; GFX8-NEXT: v_mov_b32_e32 v4, v1
@@ -7044,7 +7048,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB21_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v4, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v8
+; GFX8-NEXT: v_add_f16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_add_f16_e32 v6, v8, v5
; GFX8-NEXT: v_or_b32_e32 v7, v6, v4
; GFX8-NEXT: v_mov_b32_e32 v6, v7
@@ -7390,8 +7396,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v1, v5, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
@@ -7650,8 +7658,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v2
; GFX8-NEXT: v_mov_b32_e32 v4, v1
@@ -7915,8 +7925,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v1, v5, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
@@ -8175,8 +8187,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_add_f16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v2
; GFX8-NEXT: v_mov_b32_e32 v4, v1
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
index ff48a3fc980187..72f883928cffbc 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
@@ -14349,8 +14349,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14539,8 +14541,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v1, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14743,8 +14747,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v1, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14924,8 +14930,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p
; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15107,8 +15115,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15308,8 +15318,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15502,8 +15514,10 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v1, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15690,8 +15704,10 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_
; GFX8-NEXT: .LBB63_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15878,8 +15894,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -16059,8 +16077,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt
; GFX8-NEXT: .LBB65_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -16244,8 +16264,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -16425,8 +16447,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__
; GFX8-NEXT: .LBB67_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
index 14f75814128f18..9c2a76380d83dc 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
@@ -12094,8 +12094,10 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) #
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12316,8 +12318,10 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12556,8 +12560,10 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12766,8 +12772,10 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 {
; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12978,8 +12986,10 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha
; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13211,8 +13221,10 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha
; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13437,8 +13449,10 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13654,8 +13668,10 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h
; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index ec4ea232e661cf..2be6bf302d35f7 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -15403,8 +15403,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -15635,8 +15637,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v1, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -15867,8 +15871,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v1, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -16083,8 +16089,10 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX8-NEXT: .LBB67_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -16293,8 +16301,10 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine
; GFX8-NEXT: .LBB68_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -16504,8 +16514,10 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine
; GFX8-NEXT: .LBB69_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -16744,8 +16756,10 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v1, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -16961,8 +16975,10 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin
; GFX8-NEXT: .LBB71_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -17202,8 +17218,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -17440,8 +17458,10 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a
; GFX8-NEXT: .LBB73_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -17666,8 +17686,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -17878,8 +17900,10 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -18118,8 +18142,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -18356,8 +18382,10 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1)
; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_add_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
index 3dbf6477a7cb89..24791b60bfc6d8 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
@@ -12433,8 +12433,10 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -12711,8 +12713,10 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -12989,8 +12993,10 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -13260,8 +13266,10 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -13525,8 +13533,10 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1
; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -13791,8 +13801,10 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1
; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -14077,8 +14089,10 @@ define <2 x half> @global_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrsp
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_sub_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v1, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -14349,8 +14363,10 @@ define void @global_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(
; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_sub_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 8f82348d350e0a..10fac09ef4ec07 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -2518,16 +2518,17 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v7, v4, v5
-; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 8, v6
-; GFX9-NODL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v8, v4, v5
+; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 8, v6
+; GFX9-NODL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NODL-NEXT: v_or_b32_e32 v6, v7, v6
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v6
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v6
; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1
-; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v8
+; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v9
; GFX9-NODL-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
;
@@ -2546,16 +2547,17 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
-; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v4, v5
-; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v6
-; GFX9-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v4, v5
+; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v6
+; GFX9-DL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-DL-NEXT: v_or_b32_e32 v6, v7, v6
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v6
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6
; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1
-; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v8
+; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v9
; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index 23b57a7efa586c..e4602f20f8a37c 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -5034,8 +5034,10 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, v2
-; GFX8-NEXT: v_add_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_add_f16_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v4, v3, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -5257,8 +5259,10 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, v2
-; GFX8-NEXT: v_add_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_add_f16_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v4, v3, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -5474,8 +5478,10 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_add_f16_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v4, v2, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -5688,8 +5694,10 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_add_f16_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_add_f16_e32 v4, v2, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
index 1b08b64b046b48..967e972e53e290 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
@@ -5532,8 +5532,10 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, v2
-; GFX8-NEXT: v_sub_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_sub_f16_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v4, v3, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -5787,8 +5789,10 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, v2
-; GFX8-NEXT: v_sub_f16_sdwa v2, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_sub_f16_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v4, v3, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -6033,8 +6037,10 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_sub_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v4, v2, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -6276,8 +6282,10 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_sub_f16_sdwa v3, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_sub_f16_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_sub_f16_e32 v4, v2, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX8-NEXT: v_or_b32_e32 v3, v4, v3
; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index 37bf8516403bf5..4e8248d4be14ec 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -592,7 +592,8 @@ define hidden void @addUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %
; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX9-NEXT: v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_add_u16_sdwa v1, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: global_store_dword v[5:6], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll
new file mode 100644
index 00000000000000..6eae905278f3ed
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -o - < %s | FileCheck -check-prefix=CHECK %s
+
+; The si-peephole-sdwa pass mishandled the selection operands of preexisting SDWA
+; instructions, which led to an instruction of this shape:
+; v_lshlrev_b32_sdwa v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; instead of
+; v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+
+define amdgpu_kernel void @widget(ptr addrspace(1) %arg, i1 %arg1, ptr addrspace(3) %arg2, ptr addrspace(3) %arg3) {
+; CHECK-LABEL: widget:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-NEXT: s_load_dword s2, s[8:9], 0x8
+; CHECK-NEXT: v_mov_b32_e32 v2, 8
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_ushort v1, v0, s[0:1]
+; CHECK-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2
+; CHECK-NEXT: s_bitcmp1_b32 s2, 0
+; CHECK-NEXT: s_cselect_b32 s0, -1, 0
+; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s0
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; CHECK-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; CHECK-NEXT: s_cbranch_vccz .LBB0_2
+; CHECK-NEXT: ; %bb.1: ; %bb19
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: ds_write_b32 v1, v1
+; CHECK-NEXT: .LBB0_2: ; %bb20
+; CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; CHECK-NEXT: s_mov_b32 s0, exec_lo
+; CHECK-NEXT: v_cmpx_ne_u16_e32 0, v0
+; CHECK-NEXT: s_xor_b32 s0, exec_lo, s0
+; CHECK-NEXT: s_cbranch_execz .LBB0_4
+; CHECK-NEXT: ; %bb.3: ; %bb11
+; CHECK-NEXT: v_mov_b32_e32 v1, 2
+; CHECK-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: ds_write_b32 v0, v1 offset:84
+; CHECK-NEXT: .LBB0_4: ; %bb14
+; CHECK-NEXT: s_endpgm
+bb:
+ %call = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %zext = zext i32 %call to i64
+ %getelementptr = getelementptr i8, ptr addrspace(1) %arg, i64 %zext
+ %load = load i8, ptr addrspace(1) %getelementptr, align 1
+ %or = or disjoint i32 %call, 1
+ %zext4 = zext i32 %or to i64
+ %getelementptr5 = getelementptr i8, ptr addrspace(1) %arg, i64 %zext4
+ %load6 = load i8, ptr addrspace(1) %getelementptr5, align 1
+ %or7 = or disjoint i32 %call, 2
+ %zext8 = zext i32 %or7 to i64
+ %getelementptr9 = getelementptr i8, ptr addrspace(1) %arg, i64 %zext8
+ %load10 = load i8, ptr addrspace(1) %getelementptr9, align 1
+ br i1 %arg1, label %bb19, label %bb20
+
+bb11: ; preds = %bb20
+ %zext12 = zext i8 %load10 to i64
+ %getelementptr13 = getelementptr nusw [14 x i32], ptr addrspace(3) inttoptr (i32 84 to ptr addrspace(3)), i64 0, i64 %zext12
+ store i32 0, ptr addrspace(3) %getelementptr13, align 4
+ br label %bb14
+
+bb14: ; preds = %bb20, %bb11
+ %zext15 = zext i8 %load6 to i64
+ %getelementptr16 = getelementptr [14 x i32], ptr addrspace(3) %arg2, i64 0, i64 %zext15
+ %zext17 = zext i8 %load to i64
+ %getelementptr18 = getelementptr [14 x i32], ptr addrspace(3) %arg3, i64 0, i64 %zext17
+ ret void
+
+bb19: ; preds = %bb
+ store i32 0, ptr addrspace(3) null, align 4
+ br label %bb20
+
+bb20: ; preds = %bb19, %bb
+ %icmp = icmp eq i8 %load10, 0
+ br i1 %icmp, label %bb14, label %bb11
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
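
As a side note for readers not fluent in SDWA syntax: the dst_sel/src0_sel/src1_sel
operands in the checks above pick which byte or word of a 32-bit VGPR the instruction
writes or reads, and dst_unused:UNUSED_PAD zero-fills the unselected destination bits
(UNUSED_PRESERVE keeps them via a tied operand). Below is a minimal C++ sketch of the
source-selection part; the SdwaSel enum and the applySrcSel helper are made up for
illustration, ignore sign-extension, and are not code from the pass:

#include <cstdint>

// Illustrative model of SDWA source selection (zero-extension case only).
enum SdwaSel { BYTE_0, BYTE_1, BYTE_2, BYTE_3, WORD_0, WORD_1, DWORD };

uint32_t applySrcSel(uint32_t Src, SdwaSel Sel) {
  switch (Sel) {
  case BYTE_0: return Src & 0xFFu;           // low byte
  case BYTE_1: return (Src >> 8) & 0xFFu;
  case BYTE_2: return (Src >> 16) & 0xFFu;
  case BYTE_3: return (Src >> 24) & 0xFFu;   // high byte
  case WORD_0: return Src & 0xFFFFu;         // low 16 bits
  case WORD_1: return (Src >> 16) & 0xFFFFu; // high 16 bits
  case DWORD:  return Src;                   // whole register
  }
  return Src;
}

Under that reading, the src1_sel:WORD_1 shape quoted in the comment at the top of this
test would make the shift consume the high half of its value operand (src1) rather than
the low half selected by src1_sel:WORD_0, which is the miscompile the workaround avoids.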
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.mir
new file mode 100644
index 00000000000000..cc2c8b3940d78b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-combine-sel.mir
@@ -0,0 +1,56 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=CHECK %s
+
+# Currently the conversions in si-peephole-sdwa are disabled on preexisting SDWA instructions.
+# If they are reenabled, the generated code matches this pattern instead of the corresponding
+# V_LSHLREV_B32_sdwa pattern further below:
+# [[V_LSHLREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_sdwa 0, %{{[0-9]+}}, 0, undef [[GLOBAL_LOAD_DWORD_SADDR]], 0, 6, 0, 6, 5, implicit $exec
+
+# TODO: Implement a fix for the incorrect SDWA selection.
+
+---
+name: sdwa_opsel_hazard
+body: |
+ ; CHECK-LABEL: name: sdwa_opsel_hazard
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[DEF1]], [[DEF2]], 0, 0, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 undef %5, 255, implicit $exec
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
+ ; CHECK-NEXT: [[V_LSHLREV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef %5, 0, 6, 0, 6, 0, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[GLOBAL_LOAD_DWORD_SADDR]], implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ bb.0:
+ successors: %bb.2(0x40000000)
+ %0:sreg_32 = IMPLICIT_DEF
+ %1:sreg_64_xexec_xnull = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed %1, %2, 0, 0, implicit $exec
+ S_BRANCH %bb.2
+
+ bb.1:
+ %5:vgpr_32 = V_AND_B32_e64 undef %6, 255, implicit $exec
+ %7:vgpr_32 = V_LSHLREV_B32_e64 2, killed undef %5, implicit $exec
+ S_ENDPGM 0
+
+ bb.2:
+ successors: %bb.1(0x40000000)
+
+ %6:vgpr_32 = V_LSHRREV_B32_e64 16, undef %3, implicit $exec
+
+ S_BRANCH %bb.1
+
+...
+
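
The MIR checks encode the same modifiers as trailing integer immediates. Here is a small,
self-contained sketch that decodes the immediates of the pattern this test guards against,
under the assumption that they follow the clamp bit in the order dst_sel, dst_unused,
src0_sel, src1_sel and that selectors are encoded BYTE_0=0 through DWORD=6; selName and
the decoding are illustrative, not quoted from the target definitions:

#include <array>
#include <cstdio>

// Assumed selector encoding: BYTE_0=0 .. BYTE_3=3, WORD_0=4, WORD_1=5, DWORD=6.
static const char *selName(unsigned S) {
  static const std::array<const char *, 7> Names = {
      "BYTE_0", "BYTE_1", "BYTE_2", "BYTE_3", "WORD_0", "WORD_1", "DWORD"};
  return S < Names.size() ? Names[S] : "?";
}

int main() {
  // Trailing immediates of the unwanted pattern quoted above:
  //   V_LSHLREV_B32_sdwa ..., 0, 6, 0, 6, 5
  //   (assumed order: clamp, dst_sel, dst_unused, src0_sel, src1_sel)
  unsigned DstSel = 6, DstUnused = 0, Src0Sel = 6, Src1Sel = 5;
  std::printf("dst_sel:%s dst_unused:%s src0_sel:%s src1_sel:%s\n",
              selName(DstSel), DstUnused == 0 ? "UNUSED_PAD" : "other",
              selName(Src0Sel), selName(Src1Sel));
  // -> dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1,
  //    i.e. the wrong-half src1 selection described in the .ll test above.
  return 0;
}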
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir
index 62538120f84519..aaa32d871148bf 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir
@@ -138,7 +138,8 @@ body: |
---
# GCN-LABEL: {{^}}name: vop2_instructions
-# GFX1010: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_LSHLREV_B32_e64 16, %{{[0-9]+}}, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $mode, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec
# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F32_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $mode, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
index e2854df2468b39..c027600a8af674 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir
@@ -147,14 +147,15 @@ body: |
---
# GCN-LABEL: {{^}}name: vop2_instructions
-
-# VI: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec
+# VI: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit $exec
+# VI: %{{[0-9]+}}:vgpr_32 = V_LSHLREV_B32_e64 16, %{{[0-9]+}}, implicit $exec
# VI: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $mode, implicit $exec
# VI: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec
# VI: %{{[0-9]+}}:vgpr_32 = V_MAC_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 6, 1, implicit $mode, implicit $exec
# VI: %{{[0-9]+}}:vgpr_32 = V_MAC_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec
-# GFX9: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec
+# GFX9: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit $exec
+# GFX9: %{{[0-9]+}}:vgpr_32 = V_LSHLREV_B32_e64 16, %{{[0-9]+}}, implicit $exec
# GFX9: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $mode, implicit $exec
# GFX9: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $mode, implicit $exec
# GFX9: %{{[0-9]+}}:vgpr_32 = V_MAC_F32_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $mode, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
index ffbd2d092b5d81..467bc77c185779 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
@@ -37,9 +37,10 @@ body: |
; SDWA-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[FLAT_LOAD_DWORD1]], implicit $exec
; SDWA-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[FLAT_LOAD_DWORD]], 8, 8, implicit $exec
; SDWA-NEXT: [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 24, [[FLAT_LOAD_DWORD1]], implicit $exec
+ ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 0, 4, 5, implicit $mode, implicit $exec
; SDWA-NEXT: [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 5, 0, 1, 3, implicit $mode, implicit $exec
- ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 2, 4, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F32_sdwa]](tied-def 0)
- ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_ADD_F16_sdwa]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+ ; SDWA-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F16_sdwa]], [[V_MUL_F32_sdwa]], implicit $exec
+ ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_OR_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
; SDWA-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
; SDWA-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
%2 = COPY $sgpr30_sgpr31
@@ -145,7 +146,7 @@ body: |
; SDWA-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 8, [[FLAT_LOAD_DWORD]], implicit $exec
; SDWA-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 65535
; SDWA-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[FLAT_LOAD_DWORD]], killed [[S_MOV_B32_]], implicit $exec
- ; SDWA-NEXT: [[V_MOV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_MOV_B32_sdwa 0, [[FLAT_LOAD_DWORD1]], 0, 5, 2, 4, implicit $exec, implicit [[FLAT_LOAD_DWORD]](tied-def 0)
+ ; SDWA-NEXT: [[V_MOV_B32_sdwa:%[0-9]+]]:vgpr_32 = V_MOV_B32_sdwa 0, [[FLAT_LOAD_DWORD1]], 0, 5, 2, 4, implicit $exec, implicit [[V_AND_B32_e64_]](tied-def 0)
; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_MOV_B32_sdwa]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
; SDWA-NEXT: S_ENDPGM 0
%2 = COPY $sgpr30_sgpr31
@@ -180,15 +181,17 @@ body: |
; SDWA-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[FLAT_LOAD_DWORD1]], implicit $exec
; SDWA-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[FLAT_LOAD_DWORD]], 8, 8, implicit $exec
; SDWA-NEXT: [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 24, [[FLAT_LOAD_DWORD1]], implicit $exec
+ ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 0, 4, 5, implicit $mode, implicit $exec
; SDWA-NEXT: {{ $}}
; SDWA-NEXT: bb.1:
; SDWA-NEXT: successors: %bb.2(0x80000000)
; SDWA-NEXT: {{ $}}
- ; SDWA-NEXT: [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 5, 0, 1, 3, implicit $mode, implicit $exec
+ ; SDWA-NEXT: [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 6, 0, 1, 3, implicit $mode, implicit $exec
+ ; SDWA-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[V_MUL_F32_sdwa]], implicit $exec
; SDWA-NEXT: {{ $}}
; SDWA-NEXT: bb.2:
- ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 2, 4, 5, implicit $mode, implicit $exec, implicit [[V_MUL_F32_sdwa]](tied-def 0)
- ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_ADD_F16_sdwa]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+ ; SDWA-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F16_sdwa]], [[V_LSHLREV_B32_e64_]], implicit $exec
+ ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_OR_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
; SDWA-NEXT: $sgpr30_sgpr31 = COPY [[COPY]]
; SDWA-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
bb.0:
>From 109746e5e3e0a4b8414db835ff4bbaeaa368ad23 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Thu, 23 Jan 2025 10:05:30 -0500
Subject: [PATCH 2/2] [AMDGPU] Adjust test to SIPeepholeSDWA changes
---
llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index 2d84e877229515..934d9efba46564 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -1230,7 +1230,8 @@ define i16 @basic_smax_smin_vec_input(<2 x i16> %src) {
; GISEL-GFX9-NEXT: v_pk_min_i16 v0, v1, v0
; GISEL-GFX9-NEXT: v_pk_max_i16 v0, 0, v0
; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0xff
-; GISEL-GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GISEL-GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GISEL-GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GISEL-GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1345,7 +1346,8 @@ define i16 @basic_smax_smin_vec_input_rev(<2 x i16> %src) {
; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0xff00ff
; GISEL-GFX9-NEXT: v_pk_min_i16 v0, v1, v0
; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0xff
-; GISEL-GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GISEL-GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GISEL-GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GISEL-GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
;