[llvm] [SeparateConstOffsetFromGEP] Don't set unsound inbounds flag (PR #130616)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 13 20:56:32 PDT 2025
================
@@ -0,0 +1,438 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-FSOFF %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-FSON %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-FSOFF %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-FSON %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12 %s
+
+; This test documents when memory addresses with constant offset components can
+; be folded into memory accesses with immediate offsets.
+
+define i32 @flat(ptr addrspace(0) %p, i32 %i) {
+; GFX90A-LABEL: flat:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX90A-NEXT: flat_load_dword v0, v[0:1] offset:12
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: flat_load_dword v0, v[0:1] offset:12
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: flat:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GFX942-NEXT: flat_load_dword v0, v[0:1] offset:12
+; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: flat_load_b32 v0, v[0:1] offset:12
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT: flat_load_b32 v0, v[0:1] offset:12
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %idx = add nuw nsw i32 %i, 3
+ %arrayidx = getelementptr inbounds i32, ptr addrspace(0) %p, i32 %idx
+ %l = load i32, ptr addrspace(0) %arrayidx
+ ret i32 %l
+}
+
+define i32 @global(ptr addrspace(1) %p, i32 %i) {
+; GFX90A-LABEL: global:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX90A-NEXT: global_load_dword v0, v[0:1], off offset:12
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:12
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: global:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GFX942-NEXT: global_load_dword v0, v[0:1], off offset:12
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:12
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT: global_load_b32 v0, v[0:1], off offset:12
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %idx = add nuw nsw i32 %i, 3
+ %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %p, i32 %idx
+ %l = load i32, ptr addrspace(1) %arrayidx
+ ret i32 %l
+}
+
+define i32 @lds(ptr addrspace(3) %p, i32 %i) {
----------------
arsenm wrote:
These other address spaces aren't relevant to this issue? The different address spaces should have different tests (which we also surely already have, just scattered around inconsistently)
https://github.com/llvm/llvm-project/pull/130616
More information about the llvm-commits
mailing list