[llvm-branch-commits] [llvm] [AMDGPU][True16] Legalize extloads into 16-bit registers (PR #198670)
Domenic Nutile via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon May 25 07:58:00 PDT 2026
https://github.com/saxlungs updated https://github.com/llvm/llvm-project/pull/198670
>From 9a486ec8077cded34e3cb9d93628cb11eb848cff Mon Sep 17 00:00:00 2001
From: Domenic Nutile <domenic.nutile at gmail.com>
Date: Fri, 1 May 2026 12:36:31 -0400
Subject: [PATCH 1/3] [AMDGPU][True16] Legalize extloads into 16-bit registers
Signed-off-by: Domenic Nutile <domenic.nutile at gmail.com>
---
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 4 +-
llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll | 118 ++++++++++++------
2 files changed, 82 insertions(+), 40 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 155294fae1781..b665421c69371 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -464,8 +464,8 @@ static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
MemSize = std::max(MemSize, Align);
#endif
- // Only 1-byte and 2-byte to 32-bit extloads are valid.
- if (MemSize != RegSize && RegSize != 32)
+ // Extending loads are only valid into registers of at most 32 bits.
+ if (MemSize != RegSize && RegSize > 32)
return false;
if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
index 2a07c3aa776d6..6ac69cc7e2f51 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
@@ -2279,15 +2279,15 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi(ptr inreg %
; GFX1250-SDAG-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog
;
-; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi:
-; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
-; GFX1250-GISEL-NEXT: flat_load_i8 v0, v0, s[2:3]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
-; GFX1250-GISEL-NEXT: ; return to shader part epilog
+; GFX1250-GISEL-FAKE16-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi:
+; GFX1250-GISEL-FAKE16: ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-GISEL-FAKE16-NEXT: flat_load_i8 v0, v0, s[2:3]
+; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-FAKE16-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog
;
; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi:
; GFX1250-SDAG-TRUE16: ; %bb.0:
@@ -2297,6 +2297,16 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi(ptr inreg %
; GFX1250-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.h
; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog
;
+; GFX1250-GISEL-TRUE16-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi:
+; GFX1250-GISEL-TRUE16: ; %bb.0:
+; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-GISEL-TRUE16-NEXT: flat_load_i8 v0, v0, s[2:3]
+; GFX1250-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
+; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-TRUE16-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX1250-GISEL-TRUE16-NEXT: ; return to shader part epilog
+;
; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi:
; GFX1250-NOECC: ; %bb.0:
; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
@@ -2322,15 +2332,15 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(p
; GFX1250-SDAG-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog
;
-; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
-; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
-; GFX1250-GISEL-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
-; GFX1250-GISEL-NEXT: ; return to shader part epilog
+; GFX1250-GISEL-FAKE16-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
+; GFX1250-GISEL-FAKE16: ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-GISEL-FAKE16-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128
+; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-FAKE16-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog
;
; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
; GFX1250-SDAG-TRUE16: ; %bb.0:
@@ -2340,6 +2350,16 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(p
; GFX1250-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.h
; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog
;
+; GFX1250-GISEL-TRUE16-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
+; GFX1250-GISEL-TRUE16: ; %bb.0:
+; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-GISEL-TRUE16-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128
+; GFX1250-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
+; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-TRUE16-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX1250-GISEL-TRUE16-NEXT: ; return to shader part epilog
+;
; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
; GFX1250-NOECC: ; %bb.0:
; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
@@ -2754,16 +2774,16 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi(ptr inreg %
; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog
;
-; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi:
-; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
-; GFX1250-GISEL-NEXT: flat_load_i8 v0, v0, s[2:3]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
-; GFX1250-GISEL-NEXT: ; return to shader part epilog
+; GFX1250-GISEL-FAKE16-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi:
+; GFX1250-GISEL-FAKE16: ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-GISEL-FAKE16-NEXT: flat_load_i8 v0, v0, s[2:3]
+; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-GISEL-FAKE16-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
+; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog
;
; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi:
; GFX1250-SDAG-TRUE16: ; %bb.0:
@@ -2775,6 +2795,17 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi(ptr inreg %
; GFX1250-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog
;
+; GFX1250-GISEL-TRUE16-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi:
+; GFX1250-GISEL-TRUE16: ; %bb.0:
+; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-GISEL-TRUE16-NEXT: flat_load_i8 v0, v0, s[2:3]
+; GFX1250-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
+; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-GISEL-TRUE16-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
+; GFX1250-GISEL-TRUE16-NEXT: ; return to shader part epilog
+;
; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi:
; GFX1250-NOECC: ; %bb.0:
; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
@@ -2800,16 +2831,16 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(p
; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog
;
-; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
-; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
-; GFX1250-GISEL-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
-; GFX1250-GISEL-NEXT: ; return to shader part epilog
+; GFX1250-GISEL-FAKE16-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
+; GFX1250-GISEL-FAKE16: ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-GISEL-FAKE16-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128
+; GFX1250-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-GISEL-FAKE16-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
+; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog
;
; GFX1250-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
; GFX1250-SDAG-TRUE16: ; %bb.0:
@@ -2821,6 +2852,17 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(p
; GFX1250-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, v1
; GFX1250-SDAG-TRUE16-NEXT: ; return to shader part epilog
;
+; GFX1250-GISEL-TRUE16-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
+; GFX1250-GISEL-TRUE16: ; %bb.0:
+; GFX1250-GISEL-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-GISEL-TRUE16-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128
+; GFX1250-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
+; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-GISEL-TRUE16-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
+; GFX1250-GISEL-TRUE16-NEXT: ; return to shader part epilog
+;
; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
; GFX1250-NOECC: ; %bb.0:
; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
>From aa095dbeb51fa74705233606753ef4ed341fc544 Mon Sep 17 00:00:00 2001
From: Domenic Nutile <domenic.nutile at gmail.com>
Date: Tue, 19 May 2026 16:24:25 -0400
Subject: [PATCH 2/3] Add legalize rules and fix tests
---
.../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 9 +-
.../legalize-sextload-s16-true16.mir | 15 +-
.../CodeGen/AMDGPU/GlobalISel/load-d16.ll | 228 +++++++++++++-----
llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 114 +++++++--
4 files changed, 268 insertions(+), 98 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index a5f0facadadce..e16c013ef27b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -1121,29 +1121,34 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{DivB64, P5}, {{VgprB64}, {VgprP5}}})
.Any({{DivB96, P5}, {{VgprB96}, {VgprP5}}})
.Any({{DivB128, P5}, {{VgprB128}, {VgprP5}}})
-
+
.Any({{DivS32, Ptr128}, {{Vgpr32}, {VgprPtr128}}});
addRulesForGOpcs({G_ZEXTLOAD, G_SEXTLOAD}) // i8 and i16 zeroextending loads
.Any({{DivS32, P0}, {{Vgpr32}, {VgprP0}}})
+ .Any({{DivS16, P0}, {{Vgpr16}, {VgprP0}}}, usesTrue16)
.Any({{DivS32, P1}, {{Vgpr32}, {VgprP1}}})
+ .Any({{DivS16, P1}, {{Vgpr16}, {VgprP1}}}, usesTrue16)
.Any({{{UniS32, P1}, isAlign4 && isUL}, {{Sgpr32}, {SgprP1}, WidenMMOToS32}}, !hasSMRDSmall)
.Any({{{UniS32, P1}, isNaturalAligned && isUL}, {{Sgpr32}, {SgprP1}}}, hasSMRDSmall)
.Any({{{UniS32, P1}, !isAlign4 || !isUL}, {{UniInVgprS32}, {SgprP1}}}, !hasSMRDSmall)
.Any({{{UniS32, P1}, !isNaturalAligned || !isUL}, {{UniInVgprS32}, {SgprP1}}}, hasSMRDSmall)
.Any({{DivS32, P3}, {{Vgpr32}, {VgprP3}}})
+ .Any({{DivS16, P3}, {{Vgpr16}, {VgprP3}}}, usesTrue16)
.Any({{UniS32, P3}, {{UniInVgprS32}, {VgprP3}}})
.Any({{DivS32, P4}, {{Vgpr32}, {VgprP4}}})
+ .Any({{DivS16, P4}, {{Vgpr16}, {VgprP4}}}, usesTrue16)
.Any({{{UniS32, P4}, isAlign4 && isUL}, {{Sgpr32}, {SgprP4}, WidenMMOToS32}}, !hasSMRDSmall)
.Any({{{UniS32, P4}, isNaturalAligned && isUL}, {{Sgpr32}, {SgprP4}}}, hasSMRDSmall)
.Any({{{UniS32, P4}, !isAlign4 || !isUL}, {{UniInVgprS32}, {SgprP4}}}, !hasSMRDSmall)
.Any({{{UniS32, P4}, !isNaturalAligned || !isUL}, {{UniInVgprS32}, {SgprP4}}}, hasSMRDSmall)
- .Any({{DivS32, P5}, {{Vgpr32}, {VgprP5}}});
+ .Any({{DivS32, P5}, {{Vgpr32}, {VgprP5}}})
+ .Any({{DivS16, P5}, {{Vgpr16}, {VgprP5}}}, usesTrue16);
addRulesForGOpcs({G_STORE})
// addrspace(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-s16-true16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-s16-true16.mir
index 99f4418e7a978..399bf4ab9b764 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-s16-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-s16-true16.mir
@@ -13,9 +13,8 @@ body: |
; TRUE16-NEXT: {{ $}}
; TRUE16-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
; TRUE16-NEXT: [[COPY1:%[0-9]+]]:_(p5) = COPY $vgpr2
- ; TRUE16-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY1]](p5) :: (load (s8), addrspace 5)
- ; TRUE16-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXTLOAD]](s32)
- ; TRUE16-NEXT: G_STORE [[TRUNC]](s16), [[COPY]](p1) :: (store (s16), addrspace 1)
+ ; TRUE16-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s16) = G_SEXTLOAD [[COPY1]](p5) :: (load (s8), addrspace 5)
+ ; TRUE16-NEXT: G_STORE [[SEXTLOAD]](s16), [[COPY]](p1) :: (store (s16), addrspace 1)
;
; FAKE16-LABEL: name: test_sextload_global_s16_from_s8
; FAKE16: liveins: $vgpr0_vgpr1, $vgpr2
@@ -41,9 +40,8 @@ body: |
; TRUE16-NEXT: {{ $}}
; TRUE16-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
; TRUE16-NEXT: [[COPY1:%[0-9]+]]:_(p5) = COPY $vgpr1
- ; TRUE16-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY1]](p5) :: (load (s8), addrspace 5)
- ; TRUE16-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXTLOAD]](s32)
- ; TRUE16-NEXT: G_STORE [[TRUNC]](s16), [[COPY]](p3) :: (store (s16), addrspace 3)
+ ; TRUE16-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s16) = G_SEXTLOAD [[COPY1]](p5) :: (load (s8), addrspace 5)
+ ; TRUE16-NEXT: G_STORE [[SEXTLOAD]](s16), [[COPY]](p3) :: (store (s16), addrspace 3)
;
; FAKE16-LABEL: name: test_sextload_local_s16_from_s8
; FAKE16: liveins: $vgpr0, $vgpr1
@@ -69,9 +67,8 @@ body: |
; TRUE16-NEXT: {{ $}}
; TRUE16-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
; TRUE16-NEXT: [[COPY1:%[0-9]+]]:_(p5) = COPY $vgpr1
- ; TRUE16-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY1]](p5) :: (load (s8), addrspace 5)
- ; TRUE16-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SEXTLOAD]](s32)
- ; TRUE16-NEXT: G_STORE [[TRUNC]](s16), [[COPY]](p5) :: (store (s16), addrspace 5)
+ ; TRUE16-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s16) = G_SEXTLOAD [[COPY1]](p5) :: (load (s8), addrspace 5)
+ ; TRUE16-NEXT: G_STORE [[SEXTLOAD]](s16), [[COPY]](p5) :: (store (s16), addrspace 5)
;
; FAKE16-LABEL: name: test_sextload_private_s16_from_s8
; FAKE16: liveins: $vgpr0, $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll
index 166f439a61430..387944cf7811a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-d16.ll
@@ -29,12 +29,22 @@ define amdgpu_ps void @load_P0_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(0) %ptra
}
define amdgpu_ps void @sextload_P0_i8_D16(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
-; GFX12-LABEL: sextload_P0_i8_D16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: flat_load_d16_i8 v0, v[1:2]
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: flat_store_b32 v[3:4], v0
-; GFX12-NEXT: s_endpgm
+; GFX12-TRUE16-LABEL: sextload_P0_i8_D16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: flat_load_d16_i8 v1, v[1:2]
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v1
+; GFX12-TRUE16-NEXT: flat_store_b32 v[3:4], v0
+; GFX12-TRUE16-NEXT: s_endpgm
+;
+; GFX12-FAKE16-LABEL: sextload_P0_i8_D16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: flat_load_d16_i8 v0, v[1:2]
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: flat_store_b32 v[3:4], v0
+; GFX12-FAKE16-NEXT: s_endpgm
%a = load i8, ptr addrspace(0) %ptra
%a16 = sext i8 %a to i16
%res = insertelement <2 x i16> %vec, i16 %a16, i32 0
@@ -43,12 +53,23 @@ define amdgpu_ps void @sextload_P0_i8_D16(<2 x i16> %vec, ptr addrspace(0) %ptra
}
define amdgpu_ps void @sextload_P0_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(0) %ptra, ptr addrspace(0) %out) {
-; GFX12-LABEL: sextload_P0_i8_D16_Hi:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: flat_load_d16_hi_i8 v0, v[1:2]
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: flat_store_b32 v[3:4], v0
-; GFX12-NEXT: s_endpgm
+; GFX12-TRUE16-LABEL: sextload_P0_i8_D16_Hi:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: flat_load_d16_i8 v1, v[1:2]
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX12-TRUE16-NEXT: flat_store_b32 v[3:4], v0
+; GFX12-TRUE16-NEXT: s_endpgm
+;
+; GFX12-FAKE16-LABEL: sextload_P0_i8_D16_Hi:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: flat_load_d16_hi_i8 v0, v[1:2]
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: flat_store_b32 v[3:4], v0
+; GFX12-FAKE16-NEXT: s_endpgm
%a = load i8, ptr addrspace(0) %ptra
%a16 = sext i8 %a to i16
%res = insertelement <2 x i16> %vec, i16 %a16, i32 1
@@ -111,12 +132,22 @@ define amdgpu_ps void @load_P1_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(1) %ptra
}
define amdgpu_ps void @sextload_P1_i8_D16(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
-; GFX12-LABEL: sextload_P1_i8_D16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: global_load_d16_i8 v0, v[1:2], off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v[3:4], v0, off
-; GFX12-NEXT: s_endpgm
+; GFX12-TRUE16-LABEL: sextload_P1_i8_D16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: global_load_d16_i8 v1, v[1:2], off
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v1
+; GFX12-TRUE16-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-TRUE16-NEXT: s_endpgm
+;
+; GFX12-FAKE16-LABEL: sextload_P1_i8_D16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: global_load_d16_i8 v0, v[1:2], off
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-FAKE16-NEXT: s_endpgm
%a = load i8, ptr addrspace(1) %ptra
%a16 = sext i8 %a to i16
%res = insertelement <2 x i16> %vec, i16 %a16, i32 0
@@ -125,12 +156,23 @@ define amdgpu_ps void @sextload_P1_i8_D16(<2 x i16> %vec, ptr addrspace(1) %ptra
}
define amdgpu_ps void @sextload_P1_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(1) %ptra, ptr addrspace(1) %out) {
-; GFX12-LABEL: sextload_P1_i8_D16_Hi:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: global_load_d16_hi_i8 v0, v[1:2], off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v[3:4], v0, off
-; GFX12-NEXT: s_endpgm
+; GFX12-TRUE16-LABEL: sextload_P1_i8_D16_Hi:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: global_load_d16_i8 v1, v[1:2], off
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX12-TRUE16-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-TRUE16-NEXT: s_endpgm
+;
+; GFX12-FAKE16-LABEL: sextload_P1_i8_D16_Hi:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: global_load_d16_hi_i8 v0, v[1:2], off
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-FAKE16-NEXT: s_endpgm
%a = load i8, ptr addrspace(1) %ptra
%a16 = sext i8 %a to i16
%res = insertelement <2 x i16> %vec, i16 %a16, i32 1
@@ -193,12 +235,22 @@ define amdgpu_ps void @load_P3_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(3) %ptra
}
define amdgpu_ps void @sextload_P3_i8_D16(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
-; GFX12-LABEL: sextload_P3_i8_D16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: ds_load_i8_d16 v0, v1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: ds_store_b32 v2, v0
-; GFX12-NEXT: s_endpgm
+; GFX12-TRUE16-LABEL: sextload_P3_i8_D16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: ds_load_i8_d16 v1, v1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v1
+; GFX12-TRUE16-NEXT: ds_store_b32 v2, v0
+; GFX12-TRUE16-NEXT: s_endpgm
+;
+; GFX12-FAKE16-LABEL: sextload_P3_i8_D16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: ds_load_i8_d16 v0, v1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: ds_store_b32 v2, v0
+; GFX12-FAKE16-NEXT: s_endpgm
%a = load i8, ptr addrspace(3) %ptra
%a16 = sext i8 %a to i16
%res = insertelement <2 x i16> %vec, i16 %a16, i32 0
@@ -207,12 +259,23 @@ define amdgpu_ps void @sextload_P3_i8_D16(<2 x i16> %vec, ptr addrspace(3) %ptra
}
define amdgpu_ps void @sextload_P3_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(3) %ptra, ptr addrspace(3) %out) {
-; GFX12-LABEL: sextload_P3_i8_D16_Hi:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: ds_load_i8_d16_hi v0, v1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: ds_store_b32 v2, v0
-; GFX12-NEXT: s_endpgm
+; GFX12-TRUE16-LABEL: sextload_P3_i8_D16_Hi:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: ds_load_i8_d16 v1, v1
+; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX12-TRUE16-NEXT: ds_store_b32 v2, v0
+; GFX12-TRUE16-NEXT: s_endpgm
+;
+; GFX12-FAKE16-LABEL: sextload_P3_i8_D16_Hi:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: ds_load_i8_d16_hi v0, v1
+; GFX12-FAKE16-NEXT: s_wait_dscnt 0x0
+; GFX12-FAKE16-NEXT: ds_store_b32 v2, v0
+; GFX12-FAKE16-NEXT: s_endpgm
%a = load i8, ptr addrspace(3) %ptra
%a16 = sext i8 %a to i16
%res = insertelement <2 x i16> %vec, i16 %a16, i32 1
@@ -275,12 +338,22 @@ define amdgpu_ps void @load_P4_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(4) %ptra
}
define amdgpu_ps void @sextload_P4_i8_D16(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
-; GFX12-LABEL: sextload_P4_i8_D16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: global_load_d16_i8 v0, v[1:2], off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v[3:4], v0, off
-; GFX12-NEXT: s_endpgm
+; GFX12-TRUE16-LABEL: sextload_P4_i8_D16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: global_load_d16_i8 v1, v[1:2], off
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v1
+; GFX12-TRUE16-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-TRUE16-NEXT: s_endpgm
+;
+; GFX12-FAKE16-LABEL: sextload_P4_i8_D16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: global_load_d16_i8 v0, v[1:2], off
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-FAKE16-NEXT: s_endpgm
%a = load i8, ptr addrspace(4) %ptra
%a16 = sext i8 %a to i16
%res = insertelement <2 x i16> %vec, i16 %a16, i32 0
@@ -289,12 +362,23 @@ define amdgpu_ps void @sextload_P4_i8_D16(<2 x i16> %vec, ptr addrspace(4) %ptra
}
define amdgpu_ps void @sextload_P4_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(4) %ptra, ptr addrspace(1) %out) {
-; GFX12-LABEL: sextload_P4_i8_D16_Hi:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: global_load_d16_hi_i8 v0, v[1:2], off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v[3:4], v0, off
-; GFX12-NEXT: s_endpgm
+; GFX12-TRUE16-LABEL: sextload_P4_i8_D16_Hi:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: global_load_d16_i8 v1, v[1:2], off
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX12-TRUE16-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-TRUE16-NEXT: s_endpgm
+;
+; GFX12-FAKE16-LABEL: sextload_P4_i8_D16_Hi:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: global_load_d16_hi_i8 v0, v[1:2], off
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-FAKE16-NEXT: s_endpgm
%a = load i8, ptr addrspace(4) %ptra
%a16 = sext i8 %a to i16
%res = insertelement <2 x i16> %vec, i16 %a16, i32 1
@@ -357,12 +441,22 @@ define amdgpu_ps void @load_P5_B16_D16_Hi(<2 x i16> %vec, ptr addrspace(5) %ptra
}
define amdgpu_ps void @sextload_P5_i8_D16(<2 x i16> %vec, ptr addrspace(5) %ptra, ptr addrspace(5) %out) {
-; GFX12-LABEL: sextload_P5_i8_D16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: scratch_load_d16_i8 v0, v1, off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: scratch_store_b32 v2, v0, off
-; GFX12-NEXT: s_endpgm
+; GFX12-TRUE16-LABEL: sextload_P5_i8_D16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: scratch_load_d16_i8 v1, v1, off
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v1
+; GFX12-TRUE16-NEXT: scratch_store_b32 v2, v0, off
+; GFX12-TRUE16-NEXT: s_endpgm
+;
+; GFX12-FAKE16-LABEL: sextload_P5_i8_D16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: scratch_load_d16_i8 v0, v1, off
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: scratch_store_b32 v2, v0, off
+; GFX12-FAKE16-NEXT: s_endpgm
%a = load i8, ptr addrspace(5) %ptra
%a16 = sext i8 %a to i16
%res = insertelement <2 x i16> %vec, i16 %a16, i32 0
@@ -371,12 +465,23 @@ define amdgpu_ps void @sextload_P5_i8_D16(<2 x i16> %vec, ptr addrspace(5) %ptra
}
define amdgpu_ps void @sextload_P5_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(5) %ptra, ptr addrspace(5) %out) {
-; GFX12-LABEL: sextload_P5_i8_D16_Hi:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: scratch_load_d16_hi_i8 v0, v1, off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: scratch_store_b32 v2, v0, off
-; GFX12-NEXT: s_endpgm
+; GFX12-TRUE16-LABEL: sextload_P5_i8_D16_Hi:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: scratch_load_d16_i8 v1, v1, off
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX12-TRUE16-NEXT: scratch_store_b32 v2, v0, off
+; GFX12-TRUE16-NEXT: s_endpgm
+;
+; GFX12-FAKE16-LABEL: sextload_P5_i8_D16_Hi:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: scratch_load_d16_hi_i8 v0, v1, off
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: scratch_store_b32 v2, v0, off
+; GFX12-FAKE16-NEXT: s_endpgm
%a = load i8, ptr addrspace(5) %ptra
%a16 = sext i8 %a to i16
%res = insertelement <2 x i16> %vec, i16 %a16, i32 1
@@ -411,6 +516,3 @@ define amdgpu_ps void @zextload_P5_i8_D16_Hi(<2 x i16> %vec, ptr addrspace(5) %p
store <2 x i16> %res, ptr addrspace(5) %out
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX12-FAKE16: {{.*}}
-; GFX12-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index 9e4c6e6935596..2c16351a9bb4d 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -4203,12 +4203,28 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi(ptr addrs
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: global_load_d16_i8 v1, v0, s[2:3]
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-NEXT: ; return to shader part epilog
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: global_load_d16_i8 v1, v0, s[2:3]
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
+; GFX12-GISEL-TRUE16: ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT: global_load_d16_i8 v0, v0, s[2:3]
+; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
+; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
+; GFX12-GISEL-FAKE16: ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT: global_load_d16_i8 v1, v0, s[2:3]
+; GFX12-GISEL-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-GISEL-FAKE16-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i8, ptr addrspace(1) %gep0
@@ -4233,12 +4249,28 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: global_load_d16_i8 v1, v0, s[2:3] offset:-128
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-NEXT: ; return to shader part epilog
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: global_load_d16_i8 v1, v0, s[2:3] offset:-128
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
+; GFX12-GISEL-TRUE16: ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT: global_load_d16_i8 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
+; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
+; GFX12-GISEL-FAKE16: ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT: global_load_d16_i8 v1, v0, s[2:3] offset:-128
+; GFX12-GISEL-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-GISEL-FAKE16-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -4596,12 +4628,29 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi(ptr addrs
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3]
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-NEXT: ; return to shader part epilog
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3]
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi:
+; GFX12-GISEL-TRUE16: ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT: global_load_d16_i8 v0, v0, s[2:3]
+; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
+; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-TRUE16-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
+; GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi:
+; GFX12-GISEL-FAKE16: ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3]
+; GFX12-GISEL-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-GISEL-FAKE16-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i8, ptr addrspace(1) %gep0
@@ -4626,12 +4675,29 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
;
-; GFX12-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] offset:-128
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, v1
-; GFX12-NEXT: ; return to shader part epilog
+; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] offset:-128
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
+; GFX12-GISEL-TRUE16: ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT: global_load_d16_i8 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
+; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-TRUE16-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
+; GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
+; GFX12-GISEL-FAKE16: ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] offset:-128
+; GFX12-GISEL-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-GISEL-FAKE16-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
>From b075400f070477d63df1cfaa320f8c7c39478aab Mon Sep 17 00:00:00 2001
From: Domenic Nutile <domenic.nutile at amd.com>
Date: Thu, 21 May 2026 11:14:47 -0400
Subject: [PATCH 3/3] Update comment around destination reg size for clarity
---
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index b665421c69371..5ed7255a97c6f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -464,7 +464,11 @@ static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
MemSize = std::max(MemSize, Align);
#endif
- // Only allow extloads to up to 32 bits.
+ // Allow extending loads only when the destination register is at most 32
+ // bits wide. On its own, this check would also accept 16-bit destinations on
+ // subtargets without True16. That is safe because isLoadStoreLegal, which
+ // calls this function, also calls isRegisterType on the destination type,
+ // and isRegisterType rejects 16-bit types when True16 is not available.
if (MemSize != RegSize && RegSize > 32)
return false;
More information about the llvm-branch-commits
mailing list