[llvm] [AMDGPU][MC] Allow opsel for v_max_i16 etc in GFX10 (PR #143982)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 12 14:57:11 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Jun Wang (jwanggit86)
<details>
<summary>Changes</summary>
In GFX10, a number of VOP3 instructions should allow opsel, including V_MAX_I16, V_MAX_U16, V_MIN_I16, V_MIN_U16, V_MUL_LO_U16, V_LSHLREV_B16, V_LSHRREV_B16, and V_ASHRREV_I16.
---
Patch is 99.58 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/143982.diff
24 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/VOP2Instructions.td (+4)
- (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+8-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.s16.mir (+41-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.s16.mir (+41-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir (+40-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll (+9-8)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll (+25-24)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/freeze.ll (+9-8)
- (modified) llvm/test/CodeGen/AMDGPU/permute_i8.ll (+164-153)
- (modified) llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-fail.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll (+3-1)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll (+3-1)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll (+3-1)
- (modified) llvm/test/MC/AMDGPU/gfx10_asm_vop3.s (+24)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt (+24)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 0c7e20fc1ebf3..67fb68a3eee83 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -211,6 +211,10 @@ multiclass VOP2Inst_e64_t16<string opName,
string revOp = opName> {
let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
defm NAME : VOP2Inst<opName, P, node, revOp>;
+ let SubtargetPredicate = isGFX10Only in {
+ def _vop3_e64 : VOP3InstBase <opName#"_vop3", VOP3_Profile<P, VOP3_OPSEL>, node, 1>,
+ Commutable_REV<revOp#"_vop3_e64", !eq(revOp, opName)>;
+ }
}
let SubtargetPredicate = UseRealTrue16Insts in {
defm _t16 : VOP2Inst_e64<opName#"_t16", VOPProfile_True16<P>, node, revOp#"_t16">;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 0252c4f1b0929..597202d47591d 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1932,16 +1932,14 @@ defm V_DIV_FIXUP_F16 :
defm V_ADD_NC_U16 : VOP3OpSel_Real_gfx10<0x303>;
defm V_SUB_NC_U16 : VOP3OpSel_Real_gfx10<0x304>;
-// FIXME-GFX10-OPSEL: Need to add "selective" opsel support to some of these
-// (they do not support SDWA or DPP).
-defm V_MUL_LO_U16 : VOP3_Real_gfx10_with_name<0x305, "V_MUL_LO_U16", "v_mul_lo_u16">;
-defm V_LSHRREV_B16 : VOP3_Real_gfx10_with_name<0x307, "V_LSHRREV_B16", "v_lshrrev_b16">;
-defm V_ASHRREV_I16 : VOP3_Real_gfx10_with_name<0x308, "V_ASHRREV_I16", "v_ashrrev_i16">;
-defm V_MAX_U16 : VOP3_Real_gfx10_with_name<0x309, "V_MAX_U16", "v_max_u16">;
-defm V_MAX_I16 : VOP3_Real_gfx10_with_name<0x30a, "V_MAX_I16", "v_max_i16">;
-defm V_MIN_U16 : VOP3_Real_gfx10_with_name<0x30b, "V_MIN_U16", "v_min_u16">;
-defm V_MIN_I16 : VOP3_Real_gfx10_with_name<0x30c, "V_MIN_I16", "v_min_i16">;
-defm V_LSHLREV_B16 : VOP3_Real_gfx10_with_name<0x314, "V_LSHLREV_B16", "v_lshlrev_b16">;
+defm V_MUL_LO_U16 : VOP3OpSel_Real_gfx10_with_name<0x305, "V_MUL_LO_U16_vop3", "v_mul_lo_u16">;
+defm V_LSHRREV_B16 : VOP3OpSel_Real_gfx10_with_name<0x307, "V_LSHRREV_B16_vop3", "v_lshrrev_b16">;
+defm V_ASHRREV_I16 : VOP3OpSel_Real_gfx10_with_name<0x308, "V_ASHRREV_I16_vop3", "v_ashrrev_i16">;
+defm V_MAX_U16 : VOP3OpSel_Real_gfx10_with_name<0x309, "V_MAX_U16_vop3", "v_max_u16">;
+defm V_MAX_I16 : VOP3OpSel_Real_gfx10_with_name<0x30a, "V_MAX_I16_vop3", "v_max_i16">;
+defm V_MIN_U16 : VOP3OpSel_Real_gfx10_with_name<0x30b, "V_MIN_U16_vop3", "v_min_u16">;
+defm V_MIN_I16 : VOP3OpSel_Real_gfx10_with_name<0x30c, "V_MIN_I16_vop3", "v_min_i16">;
+defm V_LSHLREV_B16 : VOP3OpSel_Real_gfx10_with_name<0x314, "V_LSHLREV_B16_vop3", "v_lshlrev_b16">;
defm V_PERMLANE16_B32 : VOP3OpSel_Real_gfx10<0x377>;
defm V_PERMLANEX16_B32 : VOP3OpSel_Real_gfx10<0x378>;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index fc81e16d68e98..81153dbefb360 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -864,25 +864,25 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2
; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v3
-; GFX10-NEXT: v_xor_b32_e32 v6, -1, v4
+; GFX10-NEXT: v_xor_b32_e32 v5, -1, v4
; GFX10-NEXT: v_and_b32_e32 v4, 7, v4
; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1
; GFX10-NEXT: v_lshrrev_b16 v3, 1, v3
-; GFX10-NEXT: v_and_b32_e32 v6, 7, v6
-; GFX10-NEXT: v_and_b32_e32 v7, 7, v7
-; GFX10-NEXT: v_lshlrev_b16 v4, v4, v5
+; GFX10-NEXT: v_and_b32_e32 v5, 7, v5
+; GFX10-NEXT: v_lshlrev_b16 v4, v4, v6
; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0
-; GFX10-NEXT: v_lshrrev_b16 v3, v6, v3
-; GFX10-NEXT: v_lshrrev_b16 v1, v7, v1
-; GFX10-NEXT: v_or_b32_e32 v2, v4, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
+; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3
+; GFX10-NEXT: v_and_b32_e32 v5, 7, v7
+; GFX10-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX10-NEXT: v_lshrrev_b16 v1, v5, v1
+; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX10-NEXT: v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16 v1, 8, v2
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 238cc06fc7f7c..c5078c2283203 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -864,25 +864,25 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1
; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
-; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3
+; GFX10-NEXT: v_xor_b32_e32 v5, -1, v3
; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4
; GFX10-NEXT: v_and_b32_e32 v3, 7, v3
-; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
-; GFX10-NEXT: v_and_b32_e32 v6, 7, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 7, v5
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
-; GFX10-NEXT: v_and_b32_e32 v7, 7, v7
-; GFX10-NEXT: v_lshrrev_b16 v3, v3, v5
-; GFX10-NEXT: v_lshlrev_b16 v4, v6, v4
+; GFX10-NEXT: v_lshrrev_b16 v3, v3, v6
+; GFX10-NEXT: v_lshlrev_b16 v4, v5, v4
+; GFX10-NEXT: v_and_b32_e32 v5, 7, v7
; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1
-; GFX10-NEXT: v_lshlrev_b16 v0, v7, v0
-; GFX10-NEXT: v_or_b32_e32 v2, v4, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
+; GFX10-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX10-NEXT: v_lshlrev_b16 v0, v5, v0
+; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX10-NEXT: v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16 v1, 8, v2
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
index 1701a9cc7f09b..5874cebe46a37 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -71,17 +71,17 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(ptr addrspace(4) inreg %ptr, i8
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 1
-; GFX10-NEXT: v_mov_b32_e32 v2, 0xff
; GFX10-NEXT: global_load_ushort v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, s0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 0
-; GFX10-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, s4, s0
+; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s4, s0
+; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: global_store_short v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
@@ -175,16 +175,16 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(ptr addrspace(1) %ptr, i8 inreg
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 1
-; GFX10-NEXT: v_mov_b32_e32 v2, 0xff
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, s0
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 0
-; GFX10-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, s2, s0
+; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, s0
+; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: global_store_short v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
@@ -277,17 +277,17 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(ptr addrspace(4) inreg %ptr, i8
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1
-; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
; GFX10-NEXT: global_load_ushort v1, v1, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 0
-; GFX10-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2
+; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: global_store_short v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
@@ -383,17 +383,17 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(ptr addrspace(4) inreg %ptr, i8
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
; GFX10-NEXT: global_load_ushort v1, v1, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v1, s4, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, s4, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2
+; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: global_store_short v[0:1], v2, off
; GFX10-NEXT: s_endpgm
;
@@ -487,16 +487,16 @@ define amdgpu_ps void @insertelement_s_v2i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, 0xff
; GFX10-NEXT: global_load_ushort v2, v2, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT: v_and_b32_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: global_store_short v[0:1], v2, off
; GFX10-NEXT: s_endpgm
@@ -590,13 +590,13 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX10-NEXT: v_and_b32_sdwa v3, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, vcc_lo
+; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -689,13 +689,13 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(ptr addrspace(1) %ptr, i8 %val,
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1
-; GFX10-NEXT: v_mov_b32_e32 v3, 0xff
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0
-; GFX10-NEXT: v_and_b32_sdwa v3, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo
+; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -788,13 +788,13 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX10-NEXT: v_mov_b32_e32 v4, 0xff
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX10-NEXT: v_and_b32_sdwa v3, v1, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo
+; GFX10-NEXT: v_lshlrev_b16 v3, 8, v1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir
index 4c3f4d9b06ed1..461021112cfef 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir
@@ -100,7 +100,7 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec
+ ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]]
; GFX11-LABEL: name: ashr_s16_s16_vs
; GFX11: liveins: $sgpr0, $vgpr0
@@ -193,7 +193,7 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec
+ ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]]
; GFX11-LABEL: name: ashr_s16_s16_vv
; GFX11: liveins: $vgpr0, $vgpr1
@@ -238,7 +238,7 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec
+ ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_ASHRREV_I16_e64_]], implicit $exec
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
@@ -292,7 +292,7 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec
+ ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_ASHRREV_I16_e64_]], implicit $exec
@@ -442,7 +442,7 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec
+ ; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]]
; GFX11-LABEL: name: ashr_s16_s16_sv
; GFX11: liveins: $sgpr0, $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir
index 4769b5f77e3b2..c17b32d5c1676 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir
@@ -98,7 +98,7 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
+ ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]]
; GFX11-LABEL: name: lshr_s16_s16_vs
; GFX11: liveins: $sgpr0, $vgpr0
@@ -191,7 +191,7 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
+ ; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_vop3_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]]
; GFX11-LABEL: name: lshr_s16_s16_vv
; GFX11: liveins: $vgpr0, $vgpr1
@@ -236,7 +236,7 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/143982
More information about the llvm-commits
mailing list