[llvm] 7a41639 - [AMDGPU][GlobalISel] Widen 1 and 2 byte scalar loads
Vang Thao via llvm-commits
llvm-commits at lists.llvm.org
Wed May 5 15:21:41 PDT 2021
Author: Vang Thao
Date: 2021-05-05T15:18:19-07:00
New Revision: 7a41639c60ab1bd3712302e2588d5c7d6d8b57dc
URL: https://github.com/llvm/llvm-project/commit/7a41639c60ab1bd3712302e2588d5c7d6d8b57dc
DIFF: https://github.com/llvm/llvm-project/commit/7a41639c60ab1bd3712302e2588d5c7d6d8b57dc.diff
LOG: [AMDGPU][GlobalISel] Widen 1 and 2 byte scalar loads
Widen 1- and 2-byte scalar loads to 4 bytes when they are sufficiently
aligned, to avoid using a global load.
Reviewed By: arsenm
Differential Revision: https://reviews.llvm.org/D100430
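
For illustration, here is a minimal standalone C++ sketch of the widening
decision this patch adds. It deliberately does not use the LLVM API; the
names LoadInfo and shouldWidenScalarLoad are made up here purely to model
the checks described above.

  #include <cstdio>

  struct LoadInfo {
    unsigned LoadSizeBits; // size of the result register (32 for the widened case)
    unsigned MemSizeBits;  // size actually read from memory (8 or 16 here)
    unsigned AlignBytes;   // alignment of the memory operand
    bool IsVector;         // vector results are handled by the 96-bit path instead
    bool IsAtomic;         // atomic accesses can never use scalar loads
  };

  // An 8- or 16-bit access that is at least 4-byte aligned and produces a
  // 32-bit scalar can be executed as a full 32-bit SMEM load instead of a
  // VMEM (global/flat) load.
  static bool shouldWidenScalarLoad(const LoadInfo &L) {
    if (L.LoadSizeBits != 32 || L.IsVector || L.IsAtomic)
      return false;
    if (L.MemSizeBits >= 32) // already a full dword, nothing to widen
      return false;
    return L.AlignBytes >= 4; // SMEM requires 4-byte alignment
  }

  int main() {
    std::printf("%d\n", shouldWidenScalarLoad({32, 8, 4, false, false}));  // 1: widened
    std::printf("%d\n", shouldWidenScalarLoad({32, 16, 2, false, false})); // 0: under-aligned
  }
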
Added:
llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-widen-scalar-loads.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 86686ce6660a..482aef524f6f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -443,9 +443,8 @@ static bool isScalarLoadLegal(const MachineInstr &MI) {
const unsigned AS = MMO->getAddrSpace();
const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
-
- // There are no extending SMRD/SMEM loads, and they require 4-byte alignment.
- return MMO->getSize() >= 4 && MMO->getAlign() >= Align(4) &&
+ // Require 4-byte alignment.
+ return MMO->getAlign() >= Align(4) &&
// Can't do a scalar atomic load.
!MMO->isAtomic() &&
// Don't use scalar loads for volatile accesses to non-constant address
@@ -1148,31 +1147,58 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
const RegisterBank *PtrBank =
OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
if (PtrBank == &AMDGPU::SGPRRegBank) {
- // If the pointer is an SGPR, we ordinarily have nothing to do.
- if (LoadSize != 96)
+ // There are some special cases that we need to look at for 32-bit and
+ // 96-bit SGPR loads; otherwise we have nothing to do.
+ if (LoadSize != 32 && LoadSize != 96)
return false;
MachineMemOperand *MMO = *MI.memoperands_begin();
+ const unsigned MemSize = 8 * MMO->getSize();
+ // 8- and 16-bit scalar loads with sufficient alignment may be widened to
+ // 32 bits. Check whether we need to widen the memory access: such
+ // scalar loads should have a load size of 32 but a memory access size of
+ // less than 32.
+ if (LoadSize == 32 &&
+ (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
+ return false;
+
Register PtrReg = MI.getOperand(1).getReg();
- // 96-bit loads are only available for vector loads. We need to split this
- // into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
MachineIRBuilder B(MI, O);
- if (MMO->getAlign() < Align(16)) {
- LLT Part64, Part32;
- std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
- auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
- auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);
-
- auto Undef = B.buildUndef(LoadTy);
- auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
- B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
+ if (LoadSize == 32) {
+ // This is an extending load from a sub-dword size. Widen the memory
+ // access size to 4 bytes and clear the extra high bits appropriately.
+ const LLT S32 = LLT::scalar(32);
+ if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
+ // Must extend the sign bit into higher bits for a G_SEXTLOAD
+ auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
+ B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
+ } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
+ // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
+ auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
+ B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
+ } else
+ // We do not need to touch the higher bits for regular loads.
+ B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
} else {
- LLT WiderTy = widen96To128(LoadTy);
- auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
- B.buildExtract(MI.getOperand(0), WideLoad, 0);
+ // 96-bit loads are only available for vector loads. We need to split this
+ // into a 64-bit part and a 32-bit part (unless we can widen to a 128-bit load).
+ if (MMO->getAlign() < Align(16)) {
+ LLT Part64, Part32;
+ std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
+ auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
+ auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);
+
+ auto Undef = B.buildUndef(LoadTy);
+ auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
+ B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
+ } else {
+ LLT WiderTy = widen96To128(LoadTy);
+ auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
+ B.buildExtract(MI.getOperand(0), WideLoad, 0);
+ }
}
MI.eraseFromParent();
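
For the extending-load cases above, the widened 32-bit value still has to
respect the original extension kind; the G_SEXT_INREG / G_AND the patch
emits amount to the arithmetic below. This is a hedged standalone sketch,
not LLVM code, and fixupWidenedValue is a made-up helper name.

  #include <cstdint>
  #include <cstdio>

  // Models the fix-up applied to the widened dword: sign-extend from MemSize
  // bits for G_SEXTLOAD, mask with (1 << MemSize) - 1 for G_ZEXTLOAD, and do
  // nothing for a plain G_LOAD (callers simply skip the fix-up then).
  static uint32_t fixupWidenedValue(uint32_t Wide, unsigned MemSizeBits, bool IsSExt) {
    unsigned Shift = 32 - MemSizeBits;
    if (IsSExt) // replicate the sign bit of the narrow value into the high bits
      return (uint32_t)((int32_t)(Wide << Shift) >> Shift);
    return Wide & (0xFFFFFFFFu >> Shift); // clear the bits that were not requested
  }

  int main() {
    std::printf("0x%x\n", fixupWidenedValue(0x1234FF80u, 8, true));  // 0xffffff80
    std::printf("0x%x\n", fixupWidenedValue(0x1234FF80u, 8, false)); // 0x80
  }
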
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index bdc8ac7b6bbe..1e10b2390ed9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -6,18 +6,13 @@ define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)*
; CI-LABEL: frem_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
-; CI-NEXT: s_mov_b32 s10, -1
-; CI-NEXT: s_mov_b32 s11, 0xf000
-; CI-NEXT: s_mov_b64 s[2:3], s[10:11]
+; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_load_dword s0, s[6:7], 0x0
+; CI-NEXT: s_load_dword s1, s[8:9], 0x2
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_mov_b64 s[8:9], s[6:7]
-; CI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
-; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
+; CI-NEXT: v_cvt_f32_f16_e32 v1, s1
; CI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
; CI-NEXT: v_rcp_f32_e32 v4, v2
@@ -30,7 +25,8 @@ define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)*
; CI-NEXT: v_fma_f32 v2, -v2, v5, v3
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
-; CI-NEXT: s_mov_b64 s[6:7], s[10:11]
+; CI-NEXT: s_mov_b32 s6, -1
+; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0
; CI-NEXT: v_trunc_f32_e32 v2, v2
; CI-NEXT: v_fma_f32 v0, -v2, v1, v0
@@ -44,24 +40,18 @@ define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)*
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_add_u32 s0, s8, 8
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: s_addc_u32 s1, s9, 0
-; VI-NEXT: flat_load_ushort v2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[8:9], 0x8
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
+; VI-NEXT: v_cvt_f32_f16_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: flat_load_ushort v0, v[0:1]
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_cvt_f32_f16_e32 v1, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v3, v0
-; VI-NEXT: v_rcp_f32_e32 v3, v3
-; VI-NEXT: v_mul_f32_e32 v1, v1, v3
-; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; VI-NEXT: v_div_fixup_f16 v1, v1, v0, v2
-; VI-NEXT: v_trunc_f16_e32 v1, v1
-; VI-NEXT: v_fma_f16 v2, -v1, v0, v2
+; VI-NEXT: v_rcp_f32_e32 v2, v2
+; VI-NEXT: v_mul_f32_e32 v0, v0, v2
+; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s0
+; VI-NEXT: v_trunc_f16_e32 v0, v0
+; VI-NEXT: v_fma_f16 v2, -v0, v1, s0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_short v[0:1], v2
@@ -78,19 +68,15 @@ define amdgpu_kernel void @fast_frem_f16(half addrspace(1)* %out, half addrspace
; CI-LABEL: fast_frem_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
-; CI-NEXT: s_mov_b32 s10, -1
-; CI-NEXT: s_mov_b32 s11, 0xf000
-; CI-NEXT: s_mov_b64 s[2:3], s[10:11]
+; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_mov_b64 s[8:9], s[6:7]
-; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
-; CI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; CI-NEXT: s_mov_b64 s[6:7], s[10:11]
-; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT: s_load_dword s0, s[6:7], 0x0
+; CI-NEXT: s_load_dword s1, s[8:9], 0x2
+; CI-NEXT: s_mov_b32 s6, -1
+; CI-NEXT: s_mov_b32 s7, 0xf000
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
+; CI-NEXT: v_cvt_f32_f16_e32 v1, s1
; CI-NEXT: v_rcp_f32_e32 v2, v1
; CI-NEXT: v_mul_f32_e32 v2, v0, v2
; CI-NEXT: v_trunc_f32_e32 v2, v2
@@ -104,19 +90,14 @@ define amdgpu_kernel void @fast_frem_f16(half addrspace(1)* %out, half addrspace
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_add_u32 s0, s8, 8
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: s_addc_u32 s1, s9, 0
-; VI-NEXT: flat_load_ushort v2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: flat_load_ushort v0, v[0:1]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_rcp_f16_e32 v1, v0
-; VI-NEXT: v_mul_f16_e32 v1, v2, v1
-; VI-NEXT: v_trunc_f16_e32 v1, v1
-; VI-NEXT: v_fma_f16 v2, -v1, v0, v2
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[8:9], 0x8
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_rcp_f16_e32 v0, s1
+; VI-NEXT: v_mul_f16_e32 v0, s0, v0
+; VI-NEXT: v_trunc_f16_e32 v0, v0
+; VI-NEXT: v_fma_f16 v2, -v0, s1, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_short v[0:1], v2
@@ -133,19 +114,15 @@ define amdgpu_kernel void @unsafe_frem_f16(half addrspace(1)* %out, half addrspa
; CI-LABEL: unsafe_frem_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
-; CI-NEXT: s_mov_b32 s10, -1
-; CI-NEXT: s_mov_b32 s11, 0xf000
-; CI-NEXT: s_mov_b64 s[2:3], s[10:11]
+; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_load_dword s0, s[6:7], 0x0
+; CI-NEXT: s_load_dword s1, s[8:9], 0x2
+; CI-NEXT: s_mov_b32 s6, -1
+; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_mov_b64 s[8:9], s[6:7]
-; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8
-; CI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; CI-NEXT: s_mov_b64 s[6:7], s[10:11]
-; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
+; CI-NEXT: v_cvt_f32_f16_e32 v1, s1
; CI-NEXT: v_rcp_f32_e32 v2, v1
; CI-NEXT: v_mul_f32_e32 v2, v0, v2
; CI-NEXT: v_trunc_f32_e32 v2, v2
@@ -159,19 +136,14 @@ define amdgpu_kernel void @unsafe_frem_f16(half addrspace(1)* %out, half addrspa
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_add_u32 s0, s8, 8
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: s_addc_u32 s1, s9, 0
-; VI-NEXT: flat_load_ushort v2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: flat_load_ushort v0, v[0:1]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_rcp_f16_e32 v1, v0
-; VI-NEXT: v_mul_f16_e32 v1, v2, v1
-; VI-NEXT: v_trunc_f16_e32 v1, v1
-; VI-NEXT: v_fma_f16 v2, -v1, v0, v2
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[8:9], 0x8
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_rcp_f16_e32 v0, s1
+; VI-NEXT: v_mul_f16_e32 v0, s0, v0
+; VI-NEXT: v_trunc_f16_e32 v0, v0
+; VI-NEXT: v_fma_f16 v2, -v0, s1, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_short v[0:1], v2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-widen-scalar-loads.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-widen-scalar-loads.mir
new file mode 100644
index 000000000000..39c8fda387e0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-widen-scalar-loads.mir
@@ -0,0 +1,493 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -global-isel -march=amdgcn -mcpu=fiji -run-pass=regbankselect -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s
+# RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -run-pass=regbankselect -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
+---
+name: constant_load_i8_align8
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; GFX8-LABEL: name: constant_load_i8_align8
+ ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX8: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, align 8, addrspace 4)
+ ; GFX8: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX9-LABEL: name: constant_load_i8_align8
+ ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX9: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, align 8, addrspace 4)
+ ; GFX9: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX10-LABEL: name: constant_load_i8_align8
+ ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX10: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, align 8, addrspace 4)
+ ; GFX10: S_ENDPGM 0, implicit [[LOAD]](s32)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(s32) = G_LOAD %0 :: (invariant load 1, align 8, addrspace 4)
+ S_ENDPGM 0, implicit %1
+...
+---
+name: constant_load_i8_align4
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; GFX8-LABEL: name: constant_load_i8_align4
+ ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX8: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
+ ; GFX8: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX9-LABEL: name: constant_load_i8_align4
+ ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX9: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
+ ; GFX9: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX10-LABEL: name: constant_load_i8_align4
+ ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX10: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
+ ; GFX10: S_ENDPGM 0, implicit [[LOAD]](s32)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(s32) = G_LOAD %0 :: (invariant load 1, align 4, addrspace 4)
+ S_ENDPGM 0, implicit %1
+...
+---
+name: constant_load_i16_align4
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; GFX8-LABEL: name: constant_load_i16_align4
+ ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX8: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
+ ; GFX8: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX9-LABEL: name: constant_load_i16_align4
+ ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX9: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
+ ; GFX9: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX10-LABEL: name: constant_load_i16_align4
+ ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX10: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
+ ; GFX10: S_ENDPGM 0, implicit [[LOAD]](s32)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(s32) = G_LOAD %0 :: (invariant load 2, align 4, addrspace 4)
+ S_ENDPGM 0, implicit %1
+...
+---
+name: constant_sextload_i8_align4
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; GFX8-LABEL: name: constant_sextload_i8_align4
+ ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX8: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
+ ; GFX8: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8
+ ; GFX8: S_ENDPGM 0, implicit [[SEXT_INREG]](s32)
+ ; GFX9-LABEL: name: constant_sextload_i8_align4
+ ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX9: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
+ ; GFX9: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8
+ ; GFX9: S_ENDPGM 0, implicit [[SEXT_INREG]](s32)
+ ; GFX10-LABEL: name: constant_sextload_i8_align4
+ ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX10: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
+ ; GFX10: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8
+ ; GFX10: S_ENDPGM 0, implicit [[SEXT_INREG]](s32)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(s32) = G_SEXTLOAD %0 :: (invariant load 1, align 4, addrspace 4)
+ S_ENDPGM 0, implicit %1
+...
+---
+name: constant_sextload_i16_align4
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; GFX8-LABEL: name: constant_sextload_i16_align4
+ ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX8: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
+ ; GFX8: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 16
+ ; GFX8: S_ENDPGM 0, implicit [[SEXT_INREG]](s32)
+ ; GFX9-LABEL: name: constant_sextload_i16_align4
+ ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX9: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
+ ; GFX9: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 16
+ ; GFX9: S_ENDPGM 0, implicit [[SEXT_INREG]](s32)
+ ; GFX10-LABEL: name: constant_sextload_i16_align4
+ ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX10: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
+ ; GFX10: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 16
+ ; GFX10: S_ENDPGM 0, implicit [[SEXT_INREG]](s32)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(s32) = G_SEXTLOAD %0 :: (invariant load 2, align 4, addrspace 4)
+ S_ENDPGM 0, implicit %1
+...
+
+---
+name: constant_zextload_i8_align4
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; GFX8-LABEL: name: constant_zextload_i8_align4
+ ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX8: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
+ ; GFX8: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255
+ ; GFX8: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
+ ; GFX8: S_ENDPGM 0, implicit [[AND]](s32)
+ ; GFX9-LABEL: name: constant_zextload_i8_align4
+ ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX9: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
+ ; GFX9: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255
+ ; GFX9: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
+ ; GFX9: S_ENDPGM 0, implicit [[AND]](s32)
+ ; GFX10-LABEL: name: constant_zextload_i8_align4
+ ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX10: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
+ ; GFX10: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255
+ ; GFX10: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
+ ; GFX10: S_ENDPGM 0, implicit [[AND]](s32)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(s32) = G_ZEXTLOAD %0 :: (invariant load 1, align 4, addrspace 4)
+ S_ENDPGM 0, implicit %1
+...
+---
+name: constant_zextload_i16_align4
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; GFX8-LABEL: name: constant_zextload_i16_align4
+ ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX8: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
+ ; GFX8: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
+ ; GFX8: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
+ ; GFX8: S_ENDPGM 0, implicit [[AND]](s32)
+ ; GFX9-LABEL: name: constant_zextload_i16_align4
+ ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX9: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
+ ; GFX9: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
+ ; GFX9: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
+ ; GFX9: S_ENDPGM 0, implicit [[AND]](s32)
+ ; GFX10-LABEL: name: constant_zextload_i16_align4
+ ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX10: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 4)
+ ; GFX10: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
+ ; GFX10: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
+ ; GFX10: S_ENDPGM 0, implicit [[AND]](s32)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(s32) = G_ZEXTLOAD %0 :: (invariant load 2, align 4, addrspace 4)
+ S_ENDPGM 0, implicit %1
+...
+---
+name: global_load_i8_align4
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; GFX8-LABEL: name: global_load_i8_align4
+ ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX8: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 1)
+ ; GFX8: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX9-LABEL: name: global_load_i8_align4
+ ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX9: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 1)
+ ; GFX9: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX10-LABEL: name: global_load_i8_align4
+ ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX10: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 1)
+ ; GFX10: S_ENDPGM 0, implicit [[LOAD]](s32)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(s32) = G_LOAD %0 :: (invariant load 1, align 4, addrspace 1)
+ S_ENDPGM 0, implicit %1
+...
+---
+name: global_load_i16_align4
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; GFX8-LABEL: name: global_load_i16_align4
+ ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX8: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 1)
+ ; GFX8: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX9-LABEL: name: global_load_i16_align4
+ ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX9: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 1)
+ ; GFX9: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX10-LABEL: name: global_load_i16_align4
+ ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX10: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 1)
+ ; GFX10: S_ENDPGM 0, implicit [[LOAD]](s32)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(s32) = G_LOAD %0 :: (invariant load 2, align 4, addrspace 1)
+ S_ENDPGM 0, implicit %1
+...
+---
+name: global_sextload_i8_align4
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; GFX8-LABEL: name: global_sextload_i8_align4
+ ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX8: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 1)
+ ; GFX8: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8
+ ; GFX8: S_ENDPGM 0, implicit [[SEXT_INREG]](s32)
+ ; GFX9-LABEL: name: global_sextload_i8_align4
+ ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX9: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 1)
+ ; GFX9: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8
+ ; GFX9: S_ENDPGM 0, implicit [[SEXT_INREG]](s32)
+ ; GFX10-LABEL: name: global_sextload_i8_align4
+ ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX10: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 1)
+ ; GFX10: [[SEXT_INREG:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[LOAD]], 8
+ ; GFX10: S_ENDPGM 0, implicit [[SEXT_INREG]](s32)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(s32) = G_SEXTLOAD %0 :: (invariant load 1, align 4, addrspace 1)
+ S_ENDPGM 0, implicit %1
+...
+---
+name: global_zextload_i16_align4
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; GFX8-LABEL: name: global_zextload_i16_align4
+ ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX8: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 1)
+ ; GFX8: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
+ ; GFX8: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
+ ; GFX8: S_ENDPGM 0, implicit [[AND]](s32)
+ ; GFX9-LABEL: name: global_zextload_i16_align4
+ ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX9: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 1)
+ ; GFX9: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
+ ; GFX9: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
+ ; GFX9: S_ENDPGM 0, implicit [[AND]](s32)
+ ; GFX10-LABEL: name: global_zextload_i16_align4
+ ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX10: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4, addrspace 1)
+ ; GFX10: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
+ ; GFX10: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[LOAD]], [[C]]
+ ; GFX10: S_ENDPGM 0, implicit [[AND]](s32)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(s32) = G_ZEXTLOAD %0 :: (invariant load 2, align 4, addrspace 1)
+ S_ENDPGM 0, implicit %1
+...
+# Some negative test cases
+---
+name: constant_load_i8_align2
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; GFX8-LABEL: name: constant_load_i8_align2
+ ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX8: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GFX8: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4)
+ ; GFX8: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX9-LABEL: name: constant_load_i8_align2
+ ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX9: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GFX9: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4)
+ ; GFX9: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX10-LABEL: name: constant_load_i8_align2
+ ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX10: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GFX10: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4)
+ ; GFX10: S_ENDPGM 0, implicit [[LOAD]](s32)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(s32) = G_LOAD %0 :: (invariant load 1, align 2, addrspace 4)
+ S_ENDPGM 0, implicit %1
+...
+---
+name: constant_load_i16_align2
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; GFX8-LABEL: name: constant_load_i16_align2
+ ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX8: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GFX8: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4)
+ ; GFX8: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX9-LABEL: name: constant_load_i16_align2
+ ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX9: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GFX9: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4)
+ ; GFX9: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX10-LABEL: name: constant_load_i16_align2
+ ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX10: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GFX10: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4)
+ ; GFX10: S_ENDPGM 0, implicit [[LOAD]](s32)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(s32) = G_LOAD %0 :: (invariant load 2, align 2, addrspace 4)
+ S_ENDPGM 0, implicit %1
+...
+---
+name: constant_sextload_i8_align2
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; GFX8-LABEL: name: constant_sextload_i8_align2
+ ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX8: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GFX8: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4)
+ ; GFX8: S_ENDPGM 0, implicit [[SEXTLOAD]](s32)
+ ; GFX9-LABEL: name: constant_sextload_i8_align2
+ ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX9: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GFX9: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4)
+ ; GFX9: S_ENDPGM 0, implicit [[SEXTLOAD]](s32)
+ ; GFX10-LABEL: name: constant_sextload_i8_align2
+ ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX10: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GFX10: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4)
+ ; GFX10: S_ENDPGM 0, implicit [[SEXTLOAD]](s32)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(s32) = G_SEXTLOAD %0 :: (invariant load 1, align 2, addrspace 4)
+ S_ENDPGM 0, implicit %1
+...
+---
+name: constant_sextload_i16_align2
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; GFX8-LABEL: name: constant_sextload_i16_align2
+ ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX8: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GFX8: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4)
+ ; GFX8: S_ENDPGM 0, implicit [[SEXTLOAD]](s32)
+ ; GFX9-LABEL: name: constant_sextload_i16_align2
+ ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX9: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GFX9: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4)
+ ; GFX9: S_ENDPGM 0, implicit [[SEXTLOAD]](s32)
+ ; GFX10-LABEL: name: constant_sextload_i16_align2
+ ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX10: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GFX10: [[SEXTLOAD:%[0-9]+]]:vgpr(s32) = G_SEXTLOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4)
+ ; GFX10: S_ENDPGM 0, implicit [[SEXTLOAD]](s32)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(s32) = G_SEXTLOAD %0 :: (invariant load 2, align 2, addrspace 4)
+ S_ENDPGM 0, implicit %1
+...
+---
+name: constant_zextload_i8_align2
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; GFX8-LABEL: name: constant_zextload_i8_align2
+ ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX8: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GFX8: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4)
+ ; GFX8: S_ENDPGM 0, implicit [[ZEXTLOAD]](s32)
+ ; GFX9-LABEL: name: constant_zextload_i8_align2
+ ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX9: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GFX9: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4)
+ ; GFX9: S_ENDPGM 0, implicit [[ZEXTLOAD]](s32)
+ ; GFX10-LABEL: name: constant_zextload_i8_align2
+ ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX10: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GFX10: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load 1, align 2, addrspace 4)
+ ; GFX10: S_ENDPGM 0, implicit [[ZEXTLOAD]](s32)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(s32) = G_ZEXTLOAD %0 :: (invariant load 1, align 2, addrspace 4)
+ S_ENDPGM 0, implicit %1
+...
+---
+name: constant_zextload_i16_align2
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; GFX8-LABEL: name: constant_zextload_i16_align2
+ ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX8: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GFX8: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4)
+ ; GFX8: S_ENDPGM 0, implicit [[ZEXTLOAD]](s32)
+ ; GFX9-LABEL: name: constant_zextload_i16_align2
+ ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX9: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GFX9: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4)
+ ; GFX9: S_ENDPGM 0, implicit [[ZEXTLOAD]](s32)
+ ; GFX10-LABEL: name: constant_zextload_i16_align2
+ ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX10: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GFX10: [[ZEXTLOAD:%[0-9]+]]:vgpr(s32) = G_ZEXTLOAD [[COPY1]](p1) :: (invariant load 2, addrspace 4)
+ ; GFX10: S_ENDPGM 0, implicit [[ZEXTLOAD]](s32)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(s32) = G_ZEXTLOAD %0 :: (invariant load 2, align 2, addrspace 4)
+ S_ENDPGM 0, implicit %1
+...
+---
+name: local_load_i8_align4
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; GFX8-LABEL: name: local_load_i8_align4
+ ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX8: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GFX8: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load 1, align 4, addrspace 3)
+ ; GFX8: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX9-LABEL: name: local_load_i8_align4
+ ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX9: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GFX9: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load 1, align 4, addrspace 3)
+ ; GFX9: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX10-LABEL: name: local_load_i8_align4
+ ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX10: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GFX10: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load 1, align 4, addrspace 3)
+ ; GFX10: S_ENDPGM 0, implicit [[LOAD]](s32)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(s32) = G_LOAD %0 :: (load 1, align 4, addrspace 3)
+ S_ENDPGM 0, implicit %1
+...
+---
+name: private_load_i8_align4
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+ ; GFX8-LABEL: name: private_load_i8_align4
+ ; GFX8: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX8: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GFX8: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load 1, align 4, addrspace 5)
+ ; GFX8: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX9-LABEL: name: private_load_i8_align4
+ ; GFX9: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX9: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GFX9: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load 1, align 4, addrspace 5)
+ ; GFX9: S_ENDPGM 0, implicit [[LOAD]](s32)
+ ; GFX10-LABEL: name: private_load_i8_align4
+ ; GFX10: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+ ; GFX10: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+ ; GFX10: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load 1, align 4, addrspace 5)
+ ; GFX10: S_ENDPGM 0, implicit [[LOAD]](s32)
+ %0:_(p1) = COPY $sgpr0_sgpr1
+ %1:_(s32) = G_LOAD %0 :: (load 1, align 4, addrspace 5)
+ S_ENDPGM 0, implicit %1
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
new file mode 100644
index 000000000000..7d286503b4a2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
@@ -0,0 +1,430 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
+
+define amdgpu_kernel void @constant_load_i8_align4(i8 addrspace(1)* %out, i8 addrspace(4)* %in) #0 {
+; GFX8-LABEL: constant_load_i8_align4:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: flat_store_byte v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: constant_load_i8_align4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: constant_load_i8_align4:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: global_store_byte v1, v0, s[0:1]
+; GFX10-NEXT: s_endpgm
+ %ld = load i8, i8 addrspace(4)* %in, align 4
+ store i8 %ld, i8 addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @constant_load_i16_align4(i16 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
+; GFX8-LABEL: constant_load_i16_align4:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: flat_store_short v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: constant_load_i16_align4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: global_store_short v1, v0, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: constant_load_i16_align4:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: global_store_short v1, v0, s[0:1]
+; GFX10-NEXT: s_endpgm
+ %ld = load i16, i16 addrspace(4)* %in, align 4
+ store i16 %ld, i16 addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @sextload_i8_to_i32_align4(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+; GFX8-LABEL: sextload_i8_to_i32_align4:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_sext_i32_i8 s2, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: sextload_i8_to_i32_align4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_sext_i32_i8 s2, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: sextload_i8_to_i32_align4:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_sext_i32_i8 s2, s2
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: s_endpgm
+ %load = load i8, i8 addrspace(1)* %in, align 4
+ %sext = sext i8 %load to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @sextload_i16_to_i32_align4(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+; GFX8-LABEL: sextload_i16_to_i32_align4:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_sext_i32_i16 s2, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: sextload_i16_to_i32_align4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_sext_i32_i16 s2, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: sextload_i16_to_i32_align4:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_sext_i32_i16 s2, s2
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: s_endpgm
+ %load = load i16, i16 addrspace(1)* %in, align 4
+ %sext = sext i16 %load to i32
+ store i32 %sext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @zextload_i8_to_i32_align4(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+; GFX8-LABEL: zextload_i8_to_i32_align4:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_and_b32 s2, s2, 0xff
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: zextload_i8_to_i32_align4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_and_b32 s2, s2, 0xff
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: zextload_i8_to_i32_align4:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_and_b32 s2, s2, 0xff
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: s_endpgm
+ %load = load i8, i8 addrspace(1)* %in, align 4
+ %zext = zext i8 %load to i32
+ store i32 %zext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @zextload_i16_to_i32_align4(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+; GFX8-LABEL: zextload_i16_to_i32_align4:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: zextload_i16_to_i32_align4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: zextload_i16_to_i32_align4:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: s_endpgm
+ %load = load i16, i16 addrspace(1)* %in, align 4
+ %zext = zext i16 %load to i32
+ store i32 %zext, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @constant_load_i8_align2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+; GFX8-LABEL: constant_load_i8_align2:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_store_byte v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: constant_load_i8_align2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: constant_load_i8_align2:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
+; GFX10-NEXT: s_endpgm
+ %load = load i8, i8 addrspace(1)* %in, align 2
+ store i8 %load, i8 addrspace(1)* %out, align 2
+ ret void
+}
+
+define amdgpu_kernel void @constant_load_i16_align2(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+; GFX8-LABEL: constant_load_i16_align2:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_load_ushort v2, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_store_short v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: constant_load_i16_align2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: constant_load_i16_align2:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: s_endpgm
+ %load = load i16, i16 addrspace(1)* %in, align 2
+ store i16 %load, i16 addrspace(1)* %out, align 2
+ ret void
+}
+
+define amdgpu_kernel void @constant_sextload_i8_align2(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+; GFX8-LABEL: constant_sextload_i8_align2:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_load_sbyte v2, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s0, 2
+; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_store_short v[0:1], v2
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_store_short v[0:1], v3
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: constant_sextload_i8_align2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_sbyte v1, v0, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v2, s[0:1] offset:2
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: constant_sextload_i8_align2:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_load_sbyte v1, v0, s[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v2, s[0:1] offset:2
+; GFX10-NEXT: s_endpgm
+ %load = load i8, i8 addrspace(1)* %in, align 2
+ %sextload = sext i8 %load to i32
+ store i32 %sextload, i32 addrspace(1)* %out, align 2
+ ret void
+}
+
+define amdgpu_kernel void @constant_zextload_i8_align2(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+; GFX8-LABEL: constant_zextload_i8_align2:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s0, 2
+; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_store_short v[0:1], v2
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_store_short v[0:1], v3
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: constant_zextload_i8_align2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v2, s[0:1] offset:2
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: constant_zextload_i8_align2:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v2, s[0:1] offset:2
+; GFX10-NEXT: s_endpgm
+ %load = load i8, i8 addrspace(1)* %in, align 2
+ %zextload = zext i8 %load to i32
+ store i32 %zextload, i32 addrspace(1)* %out, align 2
+ ret void
+}
+
+attributes #0 = { nounwind }