[llvm] [AMDGPU][GlobalISel] Use amdhsa target for flat/private tests (PR #110672)

via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 1 06:31:30 PDT 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Fabian Ritter (ritter-x2a)

<details>
<summary>Changes</summary>

As a proxy criterion, mesa targets do not have the unaligned-access-mode feature set (it determines whether the hardware allows unaligned memory accesses), whereas amdhsa targets do. This PR changes the tests to use amdhsa instead of mesa and inserts additional checks with unaligned-access-mode explicitly unset.

This is in preparation for PR #<!-- -->110219, which will generate different code depending on the unaligned-access-mode.

---

Patch is 1.58 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/110672.diff


3 Files Affected:

- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll (+1566-25) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir (+10775-1718) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir (+6905-7) 


``````````diff
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index f2ff022308cc61..688146a6000e73 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -1,14 +1,20 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=UNALIGNED_GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=UNALIGNED_GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=UNALIGNED_GFX940 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=UNALIGNED_GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel -mattr=-unaligned-access-mode -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=UNALIGNED_GFX12 %s
 
 define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX9-LABEL: store_load_sindex_kernel:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x24
+; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s6, s11
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 15
@@ -28,7 +34,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX10-NEXT:    s_addc_u32 s7, s7, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x24
+; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_and_b32 s1, s0, 15
@@ -42,7 +48,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ;
 ; GFX940-LABEL: store_load_sindex_kernel:
 ; GFX940:       ; %bb.0: ; %bb
-; GFX940-NEXT:    s_load_dword s0, s[2:3], 0x24
+; GFX940-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX940-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
@@ -56,7 +62,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ;
 ; GFX11-LABEL: store_load_sindex_kernel:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x24
+; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_and_b32 s1, s0, 15
@@ -70,7 +76,7 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ;
 ; GFX12-LABEL: store_load_sindex_kernel:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_load_b32 s0, s[2:3], 0x24
+; GFX12-NEXT:    s_load_b32 s0, s[2:3], 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_and_b32 s1, s0, 15
@@ -81,6 +87,82 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX12-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
+;
+; UNALIGNED_GFX9-LABEL: store_load_sindex_kernel:
+; UNALIGNED_GFX9:       ; %bb.0: ; %bb
+; UNALIGNED_GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
+; UNALIGNED_GFX9-NEXT:    s_add_u32 flat_scratch_lo, s6, s11
+; UNALIGNED_GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v0, 15
+; UNALIGNED_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; UNALIGNED_GFX9-NEXT:    s_lshl_b32 s1, s0, 2
+; UNALIGNED_GFX9-NEXT:    s_and_b32 s0, s0, 15
+; UNALIGNED_GFX9-NEXT:    s_lshl_b32 s0, s0, 2
+; UNALIGNED_GFX9-NEXT:    scratch_store_dword off, v0, s1
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    s_endpgm
+;
+; UNALIGNED_GFX10-LABEL: store_load_sindex_kernel:
+; UNALIGNED_GFX10:       ; %bb.0: ; %bb
+; UNALIGNED_GFX10-NEXT:    s_add_u32 s6, s6, s11
+; UNALIGNED_GFX10-NEXT:    s_addc_u32 s7, s7, 0
+; UNALIGNED_GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
+; UNALIGNED_GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
+; UNALIGNED_GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
+; UNALIGNED_GFX10-NEXT:    v_mov_b32_e32 v0, 15
+; UNALIGNED_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; UNALIGNED_GFX10-NEXT:    s_and_b32 s1, s0, 15
+; UNALIGNED_GFX10-NEXT:    s_lshl_b32 s0, s0, 2
+; UNALIGNED_GFX10-NEXT:    s_lshl_b32 s1, s1, 2
+; UNALIGNED_GFX10-NEXT:    scratch_store_dword off, v0, s0
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    s_endpgm
+;
+; UNALIGNED_GFX940-LABEL: store_load_sindex_kernel:
+; UNALIGNED_GFX940:       ; %bb.0: ; %bb
+; UNALIGNED_GFX940-NEXT:    s_load_dword s0, s[2:3], 0x0
+; UNALIGNED_GFX940-NEXT:    v_mov_b32_e32 v0, 15
+; UNALIGNED_GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; UNALIGNED_GFX940-NEXT:    s_lshl_b32 s1, s0, 2
+; UNALIGNED_GFX940-NEXT:    s_and_b32 s0, s0, 15
+; UNALIGNED_GFX940-NEXT:    s_lshl_b32 s0, s0, 2
+; UNALIGNED_GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    s_endpgm
+;
+; UNALIGNED_GFX11-LABEL: store_load_sindex_kernel:
+; UNALIGNED_GFX11:       ; %bb.0: ; %bb
+; UNALIGNED_GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; UNALIGNED_GFX11-NEXT:    v_mov_b32_e32 v0, 15
+; UNALIGNED_GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; UNALIGNED_GFX11-NEXT:    s_and_b32 s1, s0, 15
+; UNALIGNED_GFX11-NEXT:    s_lshl_b32 s0, s0, 2
+; UNALIGNED_GFX11-NEXT:    s_lshl_b32 s1, s1, 2
+; UNALIGNED_GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    s_endpgm
+;
+; UNALIGNED_GFX12-LABEL: store_load_sindex_kernel:
+; UNALIGNED_GFX12:       ; %bb.0: ; %bb
+; UNALIGNED_GFX12-NEXT:    s_load_b32 s0, s[2:3], 0x0
+; UNALIGNED_GFX12-NEXT:    v_mov_b32_e32 v0, 15
+; UNALIGNED_GFX12-NEXT:    s_wait_kmcnt 0x0
+; UNALIGNED_GFX12-NEXT:    s_and_b32 s1, s0, 15
+; UNALIGNED_GFX12-NEXT:    s_lshl_b32 s0, s0, 2
+; UNALIGNED_GFX12-NEXT:    s_lshl_b32 s1, s1, 2
+; UNALIGNED_GFX12-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    s_endpgm
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
   %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
@@ -167,6 +249,82 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
+;
+; UNALIGNED_GFX9-LABEL: store_load_vindex_kernel:
+; UNALIGNED_GFX9:       ; %bb.0: ; %bb
+; UNALIGNED_GFX9-NEXT:    s_add_u32 flat_scratch_lo, s6, s11
+; UNALIGNED_GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; UNALIGNED_GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
+; UNALIGNED_GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v1, 0, v1
+; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v2, 15
+; UNALIGNED_GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; UNALIGNED_GFX9-NEXT:    scratch_store_dword v1, v2, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v0, 0, v0
+; UNALIGNED_GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    s_endpgm
+;
+; UNALIGNED_GFX10-LABEL: store_load_vindex_kernel:
+; UNALIGNED_GFX10:       ; %bb.0: ; %bb
+; UNALIGNED_GFX10-NEXT:    s_add_u32 s6, s6, s11
+; UNALIGNED_GFX10-NEXT:    s_addc_u32 s7, s7, 0
+; UNALIGNED_GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
+; UNALIGNED_GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
+; UNALIGNED_GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
+; UNALIGNED_GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; UNALIGNED_GFX10-NEXT:    v_mov_b32_e32 v2, 15
+; UNALIGNED_GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v0, 0, v0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v1, 0, v1
+; UNALIGNED_GFX10-NEXT:    scratch_store_dword v0, v2, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_load_dword v0, v1, off offset:124 glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    s_endpgm
+;
+; UNALIGNED_GFX940-LABEL: store_load_vindex_kernel:
+; UNALIGNED_GFX940:       ; %bb.0: ; %bb
+; UNALIGNED_GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; UNALIGNED_GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; UNALIGNED_GFX940-NEXT:    v_sub_u32_e32 v0, 0, v0
+; UNALIGNED_GFX940-NEXT:    v_mov_b32_e32 v2, 15
+; UNALIGNED_GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; UNALIGNED_GFX940-NEXT:    scratch_store_dword v1, v2, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v0, 0, v0
+; UNALIGNED_GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    s_endpgm
+;
+; UNALIGNED_GFX11-LABEL: store_load_vindex_kernel:
+; UNALIGNED_GFX11:       ; %bb.0: ; %bb
+; UNALIGNED_GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; UNALIGNED_GFX11-NEXT:    v_mov_b32_e32 v2, 15
+; UNALIGNED_GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; UNALIGNED_GFX11-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
+; UNALIGNED_GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; UNALIGNED_GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; UNALIGNED_GFX11-NEXT:    scratch_store_b32 v0, v2, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v1, 0, v1
+; UNALIGNED_GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:124 glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    s_endpgm
+;
+; UNALIGNED_GFX12-LABEL: store_load_vindex_kernel:
+; UNALIGNED_GFX12:       ; %bb.0: ; %bb
+; UNALIGNED_GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; UNALIGNED_GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; UNALIGNED_GFX12-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
+; UNALIGNED_GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; UNALIGNED_GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
+; UNALIGNED_GFX12-NEXT:    scratch_store_b32 v0, v2, off scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    s_endpgm
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
   %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -255,6 +413,82 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; UNALIGNED_GFX9-LABEL: store_load_vindex_foo:
+; UNALIGNED_GFX9:       ; %bb.0: ; %bb
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNALIGNED_GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; UNALIGNED_GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v1, s32, v1
+; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v2, 15
+; UNALIGNED_GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; UNALIGNED_GFX9-NEXT:    scratch_store_dword v1, v2, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v0, s32, v0
+; UNALIGNED_GFX9-NEXT:    scratch_load_dword v0, v0, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; UNALIGNED_GFX10-LABEL: store_load_vindex_foo:
+; UNALIGNED_GFX10:       ; %bb.0: ; %bb
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNALIGNED_GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
+; UNALIGNED_GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; UNALIGNED_GFX10-NEXT:    v_mov_b32_e32 v2, 15
+; UNALIGNED_GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v0, s32, v0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v1, s32, v1
+; UNALIGNED_GFX10-NEXT:    scratch_store_dword v0, v2, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; UNALIGNED_GFX940-LABEL: store_load_vindex_foo:
+; UNALIGNED_GFX940:       ; %bb.0: ; %bb
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNALIGNED_GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v1, s32, v1
+; UNALIGNED_GFX940-NEXT:    v_mov_b32_e32 v2, 15
+; UNALIGNED_GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
+; UNALIGNED_GFX940-NEXT:    scratch_store_dword v1, v2, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; UNALIGNED_GFX940-NEXT:    scratch_load_dword v0, v0, s32 sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; UNALIGNED_GFX11-LABEL: store_load_vindex_foo:
+; UNALIGNED_GFX11:       ; %bb.0: ; %bb
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNALIGNED_GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
+; UNALIGNED_GFX11-NEXT:    v_and_b32_e32 v0, 15, v0
+; UNALIGNED_GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v1, s32, v1
+; UNALIGNED_GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b32 v1, v2, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_load_b32 v0, v0, s32 glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; UNALIGNED_GFX12-LABEL: store_load_vindex_foo:
+; UNALIGNED_GFX12:       ; %bb.0: ; %bb
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; UNALIGNED_GFX12-NEXT:    s_wait_expcnt 0x0
+; UNALIGNED_GFX12-NEXT:    s_wait_samplecnt 0x0
+; UNALIGNED_GFX12-NEXT:    s_wait_bvhcnt 0x0
+; UNALIGNED_GFX12-NEXT:    s_wait_kmcnt 0x0
+; UNALIGNED_GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
+; UNALIGNED_GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; UNALIGNED_GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; UNALIGNED_GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b32 v0, v2, s32 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
   %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
@@ -306,6 +540,47 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0x41200000
 ; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:4
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
+; UNALIGNED_GFX9-LABEL: private_ptr_foo:
+; UNALIGNED_GFX9:       ; %bb.0:
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
+; UNALIGNED_GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; UNALIGNED_GFX10-LABEL: private_ptr_foo:
+; UNALIGNED_GFX10:       ; %bb.0:
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNALIGNED_GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
+; UNALIGNED_GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
+; UNALIGNED_GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; UNALIGNED_GFX940-LABEL: private_ptr_foo:
+; UNALIGNED_GFX940:       ; %bb.0:
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNALIGNED_GFX940-NEXT:    v_mov_b32_e32 v1, 0x41200000
+; UNALIGNED_GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; UNALIGNED_GFX11-LABEL: private_ptr_foo:
+; UNALIGNED_GFX11:       ; %bb.0:
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNALIGNED_GFX11-NEXT:    v_mov_b32_e32 v1, 0x41200000
+; UNALIGNED_GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:4
+; UNALIGNED_GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; UNALIGNED_GFX12-LABEL: private_ptr_foo:
+; UNALIGNED_GFX12:       ; %bb.0:
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; UNALIGNED_GFX12-NEXT:    s_wait_expcnt 0x0
+; UNALIGNED_GFX12-NEXT:    s_wait_samplecnt 0x0
+; UNALIGNED_GFX12-NEXT:    s_wait_bvhcnt 0x0
+; UNALIGNED_GFX12-NEXT:    s_wait_kmcnt 0x0
+; UNALIGNED_GFX12-NEXT:    v_mov_b32_e32 v1, 0x41200000
+; UNALIGNED_GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:4
+; UNALIGNED_GFX12-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr inbounds float, ptr addrspace(5) %arg, i32 1
   store float 1.000000e+01, ptr addrspace(5) %gep, align 4
   ret void
@@ -314,7 +589,7 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
 define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX9-LABEL: store_load_sindex_small_offset_kernel:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x24
+; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s6, s11
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
 ; GFX9-NEXT:    s_mov_b32 s1, 0
@@ -338,7 +613,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX10-NEXT:    s_addc_u32 s7, s7, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; GFX...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/110672


More information about the llvm-commits mailing list