[llvm] use vgpr_16 for scalar_vector pattern (PR #154875)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 25 13:23:54 PDT 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/154875
From ef99f09e60da7656b108c89bab95bed007014ab5 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Thu, 21 Aug 2025 16:05:16 -0400
Subject: [PATCH] use vgpr_16 for scalar_vector and v_cmp pattern
---
llvm/lib/Target/AMDGPU/SIInstructions.td | 41 +++++++--
llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll | 5 +-
llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll | 4 +-
.../CodeGen/AMDGPU/flat-scratch-i8-i16.ll | 30 ++++---
llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 88 +++++++++++++------
llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll | 58 ++++++++----
6 files changed, 157 insertions(+), 69 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index e8b4501226732..af8749b51f69b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3096,6 +3096,11 @@ def : GCNPat<
(i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
(COPY VSrc_b16:$src)
>;
+
+def : GCNPat <
+ (i1 (DivergentUnaryFrag<trunc> i16:$a)),
+ (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
+>;
}
let True16Predicate = UseRealTrue16Insts in {
@@ -3115,6 +3120,11 @@ def : GCNPat<
(i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
(REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16)
>;
+
+def : GCNPat <
+ (i1 (DivergentUnaryFrag<trunc> i16:$a)),
+ (V_CMP_EQ_U16_t16_e64 (V_AND_B16_t16_e64 (i32 0), (i16 1), (i32 0), $a), (i16 1))
+>;
}
def : GCNPat <
@@ -3143,11 +3153,6 @@ def : GCNPat <
(V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
>;
-def : GCNPat <
- (i1 (DivergentUnaryFrag<trunc> i16:$a)),
- (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
->;
-
def IMMBitSelConst : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(1ULL << N->getZExtValue(), SDLoc(N),
MVT::i32);
@@ -3752,7 +3757,8 @@ def : GCNPat <
(v2i16 (S_PACK_HL_B32_B16 SReg_32:$src0, SReg_32:$src1))
>;
-
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
def : GCNPat <
(v2f16 (scalar_to_vector f16:$src0)),
(COPY $src0)
@@ -3772,6 +3778,29 @@ def : GCNPat <
(v4f16 (scalar_to_vector f16:$src0)),
(INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
>;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+ (v2f16 (scalar_to_vector f16:$src0)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16)
+>;
+
+def : GCNPat <
+ (v2i16 (scalar_to_vector i16:$src0)),
+ (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16)
+>;
+
+def : GCNPat <
+ (v4i16 (scalar_to_vector i16:$src0)), // 64-bit result: sub1 only exists in a 64-bit class, so build in VReg_64
+ (REG_SEQUENCE VReg_64, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1)
+>;
+
+def : GCNPat <
+ (v4f16 (scalar_to_vector f16:$src0)), // 64-bit result: sub1 only exists in a 64-bit class, so build in VReg_64
+ (REG_SEQUENCE VReg_64, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1)
+>;
+}
def : GCNPat <
(i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask,
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index d4581672dab39..b660a81ce264f 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -689,9 +689,10 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off offset:4 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, off offset:2
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v3, off, off offset:2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v3.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, off
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4
diff --git a/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll b/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll
index 6bb7cdd40a360..0a4edd142d8db 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll
@@ -166,7 +166,7 @@ define amdgpu_kernel void @load_i16_hi(ptr %arg, ptr %out) {
; GFX11-LABEL: load_i16_hi:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: v_mov_b16_e32 v2.l, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: flat_load_d16_hi_b16 v2, v[0:1] offset:8
@@ -272,7 +272,7 @@ define amdgpu_kernel void @load_half_hi(ptr %arg, ptr %out) {
; GFX11-LABEL: load_half_hi:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: v_mov_b16_e32 v2.l, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: flat_load_d16_hi_b16 v2, v[0:1] offset:8
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
index 47910f5280bfc..04b036cafd81f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
@@ -263,7 +263,8 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_v(ptr addrspace(5) %i
;
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_v:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 1, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0
+; GFX11-NEXT: v_mov_b16_e32 v3.l, -1
; GFX11-NEXT: scratch_load_d16_hi_u8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
@@ -301,7 +302,8 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_v(ptr addrspace(5) %i
;
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_v:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 1, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0
+; GFX11-NEXT: v_mov_b16_e32 v3.l, -1
; GFX11-NEXT: scratch_load_d16_hi_i8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
@@ -339,7 +341,8 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_v(ptr addrspace(5) %in, p
;
; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_v:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 2, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0
+; GFX11-NEXT: v_mov_b16_e32 v3.l, -1
; GFX11-NEXT: scratch_load_d16_hi_b16 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
@@ -696,7 +699,7 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_s(ptr addrspace(5) in
;
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_s:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: v_mov_b32_e32 v2, -1
+; GFX11-NEXT: v_mov_b16_e32 v2.l, -1
; GFX11-NEXT: s_add_i32 s0, s0, 1
; GFX11-NEXT: scratch_load_d16_hi_u8 v2, off, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -735,7 +738,7 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_s(ptr addrspace(5) in
;
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_s:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: v_mov_b32_e32 v2, -1
+; GFX11-NEXT: v_mov_b16_e32 v2.l, -1
; GFX11-NEXT: s_add_i32 s0, s0, 1
; GFX11-NEXT: scratch_load_d16_hi_i8 v2, off, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -774,7 +777,7 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_s(ptr addrspace(5) inreg
;
; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_s:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: v_mov_b32_e32 v2, -1
+; GFX11-NEXT: v_mov_b16_e32 v2.l, -1
; GFX11-NEXT: s_add_i32 s0, s0, 2
; GFX11-NEXT: scratch_load_d16_hi_b16 v2, off, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1165,8 +1168,9 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_svs(ptr addrspace(5)
;
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_svs:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: v_mov_b16_e32 v3.l, -1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1
; GFX11-NEXT: scratch_load_d16_hi_u8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1208,8 +1212,9 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_svs(ptr addrspace(5)
;
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_svs:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: v_mov_b16_e32 v3.l, -1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1
; GFX11-NEXT: scratch_load_d16_hi_i8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1251,8 +1256,9 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_svs(ptr addrspace(5) inre
;
; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_svs:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: v_mov_b16_e32 v3.l, -1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_add3_u32 v0, s0, v0, 2
; GFX11-NEXT: scratch_load_d16_hi_b16 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 1602e31d6147c..4aba073d70cff 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -4393,21 +4393,37 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(ptr addrspace(1
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: ; return to shader part epilog
;
-; GFX11-LABEL: global_load_saddr_i16_d16hi_zero_hi:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3]
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
-; GFX11-NEXT: ; return to shader part epilog
+; GFX11-TRUE16-LABEL: global_load_saddr_i16_d16hi_zero_hi:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
-; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_zero_hi:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-SDAG-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3]
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-SDAG-NEXT: ; return to shader part epilog
+; GFX11-FAKE16-LABEL: global_load_saddr_i16_d16hi_zero_hi:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX12-SDAG-TRUE16-LABEL: global_load_saddr_i16_d16hi_zero_hi:
+; GFX12-SDAG-TRUE16: ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX12-SDAG-TRUE16-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3]
+; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-SDAG-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX12-SDAG-FAKE16-LABEL: global_load_saddr_i16_d16hi_zero_hi:
+; GFX12-SDAG-FAKE16: ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-SDAG-FAKE16-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3]
+; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-SDAG-FAKE16-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16hi_zero_hi:
; GFX12-GISEL-TRUE16: ; %bb.0:
@@ -4439,21 +4455,37 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(ptr a
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: ; return to shader part epilog
;
-; GFX11-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
-; GFX11-NEXT: ; return to shader part epilog
+; GFX11-TRUE16-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
-; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-SDAG-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-SDAG-NEXT: ; return to shader part epilog
+; GFX11-FAKE16-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX12-SDAG-TRUE16-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
+; GFX12-SDAG-TRUE16: ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX12-SDAG-TRUE16-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
+; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-SDAG-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX12-SDAG-FAKE16-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
+; GFX12-SDAG-FAKE16: ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-SDAG-FAKE16-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
+; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-SDAG-FAKE16-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
; GFX12-GISEL-TRUE16: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index 7d98f7f1706b2..65a99d0d097f9 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -137,25 +137,45 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 {
; GFX10-NEXT: ds_write_b32 v1, v3
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: local_store_i55:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s3, s1, 0xffff
-; GFX11-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-NEXT: global_load_d16_hi_u8 v0, v0, s[4:5] offset:14
-; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_or_b32_e32 v0, s3, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffff, v0
-; GFX11-NEXT: ds_store_b8_d16_hi v1, v0 offset:6
-; GFX11-NEXT: ds_store_b16 v1, v2 offset:4
-; GFX11-NEXT: ds_store_b32 v1, v3
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: local_store_i55:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-TRUE16-NEXT: global_load_d16_hi_u8 v1, v0, s[4:5] offset:14
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_and_b32 s3, s1, 0xffff
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, s3, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x7fffff, v0
+; GFX11-TRUE16-NEXT: ds_store_b8_d16_hi v1, v0 offset:6
+; GFX11-TRUE16-NEXT: ds_store_b16 v1, v2 offset:4
+; GFX11-TRUE16-NEXT: ds_store_b32 v1, v3
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: local_store_i55:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_and_b32 s3, s1, 0xffff
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-FAKE16-NEXT: global_load_d16_hi_u8 v0, v0, s[4:5] offset:14
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, s3, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fffff, v0
+; GFX11-FAKE16-NEXT: ds_store_b8_d16_hi v1, v0 offset:6
+; GFX11-FAKE16-NEXT: ds_store_b16 v1, v2 offset:4
+; GFX11-FAKE16-NEXT: ds_store_b32 v1, v3
+; GFX11-FAKE16-NEXT: s_endpgm
store i55 %arg, ptr addrspace(3) %ptr, align 8
ret void
}
More information about the llvm-commits
mailing list