[llvm] [AMDGPU] Allow casts between the Global and Constant Addr Spaces in isValidAddrSpaceCast (PR #112493)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 16 00:55:14 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Fabian Ritter (ritter-x2a)
Changes:
So far, `isValidAddrSpaceCast` only allows casts to the flat address space and casts between the constant and constant-32-bit address spaces. It does not allow casts between the global and constant address spaces, even though they alias. This affects, for example, the lowering of memmoves from the constant to the global address space in LowerMemIntrinsics, which requires aliasing address spaces to be castable into each other.
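For illustration, here is the kind of cast that is newly considered valid (a minimal sketch, not taken from the patch; on AMDGPU, address space 1 is global and address space 4 is constant):

```llvm
; Minimal sketch: a cast between the constant (4) and global (1) address
; spaces. With this patch, isValidAddrSpaceCast reports such casts as valid,
; since both address spaces access the same underlying memory.
define ptr addrspace(1) @cast_constant_to_global(ptr addrspace(4) %p) {
  %q = addrspacecast ptr addrspace(4) %p to ptr addrspace(1)
  ret ptr addrspace(1) %q
}
```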
This patch allows such casts. It also adds a memmove test that would crash the compiler with the previous implementation, because the IR lowering of memmove was not applicable to a move from the constant to the global address space.
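A reduced sketch of the previously-crashing pattern (the function name and exact IR here are illustrative, not copied from the new test, whose content is truncated below):

```llvm
; Sketch: memmove from the constant AS (4) to the global AS (1) with a size
; that is unknown at compile time, so LowerMemIntrinsics must emit copy loops.
; Lowering it requires casting between the two aliasing address spaces.
declare void @llvm.memmove.p1.p4.i64(ptr addrspace(1), ptr addrspace(4), i64, i1)

define void @move_const_to_global(ptr addrspace(1) %dst, ptr addrspace(4) readonly %src, i64 %sz) {
entry:
  tail call void @llvm.memmove.p1.p4.i64(ptr addrspace(1) %dst, ptr addrspace(4) %src, i64 %sz, i1 false)
  ret void
}
```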
---
Patch is 110.01 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/112493.diff
2 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h (+6-3)
- (added) llvm/test/CodeGen/AMDGPU/memmove-var-size.ll (+2333)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 022af501289af8..b8723e3f04b51c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -187,9 +187,12 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
}
return false;
}
- if ((FromAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
- ToAS == AMDGPUAS::CONSTANT_ADDRESS) ||
- (FromAS == AMDGPUAS::CONSTANT_ADDRESS &&
+ if (FromAS != ToAS &&
+ (FromAS == AMDGPUAS::GLOBAL_ADDRESS ||
+ FromAS == AMDGPUAS::CONSTANT_ADDRESS ||
+ FromAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
+ (ToAS == AMDGPUAS::GLOBAL_ADDRESS ||
+ ToAS == AMDGPUAS::CONSTANT_ADDRESS ||
ToAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT))
return true;
return false;
diff --git a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
new file mode 100644
index 00000000000000..2b89f760709097
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
@@ -0,0 +1,2333 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 %s -o - | FileCheck %s
+
+; Check code generation for memmoves with statically unknown size and all
+; combinations of the following address spaces:
+; destination address space: 0, 1, 3, 5
+; source address space: 0, 1, 3, 4, 5
+
+define void @memmove_p0_p0(ptr addrspace(0) align 1 %dst, ptr addrspace(0) align 1 readonly %src, i64 %sz) {
+; CHECK-LABEL: memmove_p0_p0:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v9, 0
+; CHECK-NEXT: v_and_b32_e32 v8, 15, v4
+; CHECK-NEXT: v_lshrrev_b64 v[6:7], 4, v[4:5]
+; CHECK-NEXT: s_mov_b32 s6, exec_lo
+; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9]
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1]
+; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6
+; CHECK-NEXT: s_cbranch_execnz .LBB0_3
+; CHECK-NEXT: ; %bb.1: ; %Flow37
+; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB0_10
+; CHECK-NEXT: .LBB0_2: ; %Flow38
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+; CHECK-NEXT: .LBB0_3: ; %memmove_copy_forward
+; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo
+; CHECK-NEXT: s_cbranch_execz .LBB0_6
+; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
+; CHECK-NEXT: v_mov_b32_e32 v11, v3
+; CHECK-NEXT: v_mov_b32_e32 v13, v1
+; CHECK-NEXT: v_mov_b32_e32 v10, v2
+; CHECK-NEXT: v_mov_b32_e32 v12, v0
+; CHECK-NEXT: s_mov_b32 s9, 0
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB0_5: ; %memmove_fwd_main_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: flat_load_dwordx4 v[14:17], v[10:11]
+; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v7, s5, -1, v7, s5
+; CHECK-NEXT: v_add_co_u32 v10, s5, v10, 16
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s5, 0, v11, s5
+; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7]
+; CHECK-NEXT: s_or_b32 s9, s5, s9
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[14:17]
+; CHECK-NEXT: v_add_co_u32 v12, s6, v12, 16
+; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s6, 0, v13, s6
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; CHECK-NEXT: s_cbranch_execnz .LBB0_5
+; CHECK-NEXT: .LBB0_6: ; %Flow32
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: s_and_saveexec_b32 s8, s4
+; CHECK-NEXT: s_cbranch_execz .LBB0_9
+; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
+; CHECK-NEXT: v_sub_co_u32 v4, s5, v4, v8
+; CHECK-NEXT: v_subrev_co_ci_u32_e64 v5, s5, 0, v5, s5
+; CHECK-NEXT: s_mov_b32 s9, 0
+; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v5, s5
+; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v5, s5
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB0_8: ; %memmove_fwd_residual_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: flat_load_ubyte v4, v[2:3]
+; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5
+; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5
+; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9]
+; CHECK-NEXT: s_or_b32 s9, s5, s9
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v4
+; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; CHECK-NEXT: s_cbranch_execnz .LBB0_8
+; CHECK-NEXT: .LBB0_9: ; %Flow30
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
+; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
+; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
+; CHECK-NEXT: s_cbranch_execz .LBB0_2
+; CHECK-NEXT: .LBB0_10: ; %memmove_copy_backwards
+; CHECK-NEXT: s_and_saveexec_b32 s7, s4
+; CHECK-NEXT: s_cbranch_execz .LBB0_13
+; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
+; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4
+; CHECK-NEXT: s_mov_b32 s8, 0
+; CHECK-NEXT: v_add_co_u32 v4, s4, v0, v10
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v1, v11, s4
+; CHECK-NEXT: v_add_co_u32 v10, s4, v2, v10
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v3, v11, s4
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB0_12: ; %memmove_bwd_residual_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: flat_load_ubyte v12, v[10:11]
+; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4
+; CHECK-NEXT: v_add_co_u32 v10, s4, v10, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v11, s4
+; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9]
+; CHECK-NEXT: s_or_b32 s8, s4, s8
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[4:5], v12
+; CHECK-NEXT: v_add_co_u32 v4, s5, v4, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s5, -1, v5, s5
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: s_cbranch_execnz .LBB0_12
+; CHECK-NEXT: .LBB0_13: ; %Flow36
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo
+; CHECK-NEXT: s_cbranch_execz .LBB0_16
+; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
+; CHECK-NEXT: v_lshlrev_b64 v[4:5], 4, v[6:7]
+; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v4, -16
+; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v5, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
+; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
+; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB0_15: ; %memmove_bwd_main_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT: v_add_co_u32 v6, vcc_lo, v6, -1
+; CHECK-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v7, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16
+; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[6:7]
+; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
+; CHECK-NEXT: v_add_co_u32 v0, s4, v0, -16
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s4, -1, v1, s4
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB0_15
+; CHECK-NEXT: .LBB0_16: ; %Flow34
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p0.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 %sz, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p1(ptr addrspace(0) align 1 %dst, ptr addrspace(1) align 1 readonly %src, i64 %sz) {
+; CHECK-LABEL: memmove_p0_p1:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v9, 0
+; CHECK-NEXT: v_and_b32_e32 v8, 15, v4
+; CHECK-NEXT: v_lshrrev_b64 v[6:7], 4, v[4:5]
+; CHECK-NEXT: s_mov_b32 s6, exec_lo
+; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[8:9]
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
+; CHECK-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1]
+; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6
+; CHECK-NEXT: s_cbranch_execnz .LBB1_3
+; CHECK-NEXT: ; %bb.1: ; %Flow41
+; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB1_10
+; CHECK-NEXT: .LBB1_2: ; %Flow42
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+; CHECK-NEXT: .LBB1_3: ; %memmove_copy_forward
+; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo
+; CHECK-NEXT: s_cbranch_execz .LBB1_6
+; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
+; CHECK-NEXT: v_mov_b32_e32 v11, v3
+; CHECK-NEXT: v_mov_b32_e32 v13, v1
+; CHECK-NEXT: v_mov_b32_e32 v10, v2
+; CHECK-NEXT: v_mov_b32_e32 v12, v0
+; CHECK-NEXT: s_mov_b32 s9, 0
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB1_5: ; %memmove_fwd_main_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: global_load_dwordx4 v[14:17], v[10:11], off
+; CHECK-NEXT: v_add_co_u32 v6, s5, v6, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v7, s5, -1, v7, s5
+; CHECK-NEXT: v_add_co_u32 v10, s5, v10, 16
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s5, 0, v11, s5
+; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[6:7]
+; CHECK-NEXT: s_or_b32 s9, s5, s9
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[14:17]
+; CHECK-NEXT: v_add_co_u32 v12, s6, v12, 16
+; CHECK-NEXT: v_add_co_ci_u32_e64 v13, s6, 0, v13, s6
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; CHECK-NEXT: s_cbranch_execnz .LBB1_5
+; CHECK-NEXT: .LBB1_6: ; %Flow36
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: s_and_saveexec_b32 s8, s4
+; CHECK-NEXT: s_cbranch_execz .LBB1_9
+; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
+; CHECK-NEXT: v_sub_co_u32 v4, s5, v4, v8
+; CHECK-NEXT: v_subrev_co_ci_u32_e64 v5, s5, 0, v5, s5
+; CHECK-NEXT: s_mov_b32 s9, 0
+; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v5, s5
+; CHECK-NEXT: v_add_co_u32 v2, s5, v2, v4
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, v3, v5, s5
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB1_8: ; %memmove_fwd_residual_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: global_load_ubyte v4, v[2:3], off
+; CHECK-NEXT: v_add_co_u32 v8, s5, v8, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s5, -1, v9, s5
+; CHECK-NEXT: v_add_co_u32 v2, s5, v2, 1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, s5, 0, v3, s5
+; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[8:9]
+; CHECK-NEXT: s_or_b32 s9, s5, s9
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v4
+; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; CHECK-NEXT: s_cbranch_execnz .LBB1_8
+; CHECK-NEXT: .LBB1_9: ; %Flow34
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: ; implicit-def: $vgpr6_vgpr7
+; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
+; CHECK-NEXT: ; implicit-def: $vgpr8_vgpr9
+; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
+; CHECK-NEXT: s_cbranch_execz .LBB1_2
+; CHECK-NEXT: .LBB1_10: ; %memmove_copy_backwards
+; CHECK-NEXT: s_and_saveexec_b32 s7, s4
+; CHECK-NEXT: s_cbranch_execz .LBB1_13
+; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
+; CHECK-NEXT: v_add_co_u32 v10, s4, v4, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, -1, v5, s4
+; CHECK-NEXT: s_mov_b32 s8, 0
+; CHECK-NEXT: v_add_co_u32 v4, s4, v2, v10
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, v3, v11, s4
+; CHECK-NEXT: v_add_co_u32 v10, s4, v0, v10
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s4, v1, v11, s4
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB1_12: ; %memmove_bwd_residual_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: global_load_ubyte v12, v[4:5], off
+; CHECK-NEXT: v_add_co_u32 v8, s4, v8, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v9, s4, -1, v9, s4
+; CHECK-NEXT: v_add_co_u32 v4, s4, v4, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v5, s4, -1, v5, s4
+; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[8:9]
+; CHECK-NEXT: s_or_b32 s8, s4, s8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[10:11], v12
+; CHECK-NEXT: v_add_co_u32 v10, s5, v10, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v11, s5, -1, v11, s5
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: s_cbranch_execnz .LBB1_12
+; CHECK-NEXT: .LBB1_13: ; %Flow40
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo
+; CHECK-NEXT: s_cbranch_execz .LBB1_16
+; CHECK-NEXT: ; %bb.14: ; %memmove_bwd_main_loop.preheader
+; CHECK-NEXT: v_lshlrev_b64 v[4:5], 4, v[6:7]
+; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: v_add_co_u32 v4, vcc_lo, v4, -16
+; CHECK-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v5, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
+; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
+; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB1_15: ; %memmove_bwd_main_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
+; CHECK-NEXT: v_add_co_u32 v6, vcc_lo, v6, -1
+; CHECK-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v7, vcc_lo
+; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, -16
+; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
+; CHECK-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[6:7]
+; CHECK-NEXT: s_or_b32 s7, vcc_lo, s7
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
+; CHECK-NEXT: v_add_co_u32 v0, s4, v0, -16
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s4, -1, v1, s4
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB1_15
+; CHECK-NEXT: .LBB1_16: ; %Flow38
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memmove.p0.p1.i64(ptr addrspace(0) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 %sz, i1 false)
+ ret void
+}
+
+define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align 1 readonly %src, i64 %sz) {
+; CHECK-LABEL: memmove_p0_p3:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v7, 7, v3
+; CHECK-NEXT: v_mov_b32_e32 v8, 0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v2
+; CHECK-NEXT: s_mov_b64 s[4:5], src_shared_base
+; CHECK-NEXT: v_lshrrev_b64 v[5:6], 3, v[3:4]
+; CHECK-NEXT: s_mov_b32 s6, exec_lo
+; CHECK-NEXT: v_cmp_ne_u64_e64 s4, 0, v[7:8]
+; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, s5, vcc_lo
+; CHECK-NEXT: v_cndmask_b32_e32 v9, 0, v2, vcc_lo
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[5:6]
+; CHECK-NEXT: v_cmpx_ge_u64_e64 v[9:10], v[0:1]
+; CHECK-NEXT: s_xor_b32 s7, exec_lo, s6
+; CHECK-NEXT: s_cbranch_execnz .LBB2_3
+; CHECK-NEXT: ; %bb.1: ; %Flow40
+; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
+; CHECK-NEXT: s_cbranch_execnz .LBB2_10
+; CHECK-NEXT: .LBB2_2: ; %Flow41
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+; CHECK-NEXT: .LBB2_3: ; %memmove_copy_forward
+; CHECK-NEXT: s_and_saveexec_b32 s8, vcc_lo
+; CHECK-NEXT: s_cbranch_execz .LBB2_6
+; CHECK-NEXT: ; %bb.4: ; %memmove_fwd_main_loop.preheader
+; CHECK-NEXT: v_mov_b32_e32 v10, v1
+; CHECK-NEXT: v_mov_b32_e32 v9, v0
+; CHECK-NEXT: v_mov_b32_e32 v11, v2
+; CHECK-NEXT: s_mov_b32 s9, 0
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB2_5: ; %memmove_fwd_main_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ds_read_b64 v[12:13], v11
+; CHECK-NEXT: v_add_co_u32 v5, s5, v5, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v6, s5, -1, v6, s5
+; CHECK-NEXT: v_add_nc_u32_e32 v11, 8, v11
+; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[5:6]
+; CHECK-NEXT: s_or_b32 s9, s5, s9
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_dwordx2 v[9:10], v[12:13]
+; CHECK-NEXT: v_add_co_u32 v9, s6, v9, 8
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s6, 0, v10, s6
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; CHECK-NEXT: s_cbranch_execnz .LBB2_5
+; CHECK-NEXT: .LBB2_6: ; %Flow35
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: s_and_saveexec_b32 s8, s4
+; CHECK-NEXT: s_cbranch_execz .LBB2_9
+; CHECK-NEXT: ; %bb.7: ; %memmove_fwd_residual_loop.preheader
+; CHECK-NEXT: v_and_b32_e32 v5, -8, v3
+; CHECK-NEXT: v_sub_co_u32 v3, s5, v3, v7
+; CHECK-NEXT: v_subrev_co_ci_u32_e64 v4, s5, 0, v4, s5
+; CHECK-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; CHECK-NEXT: v_add_co_u32 v0, s5, v0, v3
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s5, v1, v4, s5
+; CHECK-NEXT: s_mov_b32 s9, 0
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB2_8: ; %memmove_fwd_residual_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ds_read_u8 v3, v2
+; CHECK-NEXT: v_add_co_u32 v7, s5, v7, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v8, s5, -1, v8, s5
+; CHECK-NEXT: v_add_nc_u32_e32 v2, 1, v2
+; CHECK-NEXT: v_cmp_eq_u64_e64 s5, 0, v[7:8]
+; CHECK-NEXT: s_or_b32 s9, s5, s9
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3
+; CHECK-NEXT: v_add_co_u32 v0, s6, v0, 1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v1, s6
+; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; CHECK-NEXT: s_cbranch_execnz .LBB2_8
+; CHECK-NEXT: .LBB2_9: ; %Flow33
+; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6
+; CHECK-NEXT: ; implicit-def: $vgpr3_vgpr4
+; CHECK-NEXT: ; implicit-def: $vgpr2
+; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
+; CHECK-NEXT: ; implicit-def: $vgpr7_vgpr8
+; CHECK-NEXT: s_andn2_saveexec_b32 s6, s7
+; CHECK-NEXT: s_cbranch_execz .LBB2_2
+; CHECK-NEXT: .LBB2_10: ; %memmove_copy_backwards
+; CHECK-NEXT: s_and_saveexec_b32 s7, s4
+; CHECK-NEXT: s_cbranch_execz .LBB2_13
+; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
+; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v0
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, v4, v1, s4
+; CHECK-NEXT: v_add3_u32 v4, v3, v2, -1
+; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, s4, -1, v10, s4
+; CHECK-NEXT: s_mov_b32 s8, 0
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB2_12: ; %memmove_bwd_residual_loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ds_read_u8 v11, v4
+; CHECK-NEXT: v_add_co_u32 v7, s4, v7, -1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v8, s4, -1, v8, s4
+; CHECK-NEXT: v_add_nc_u32_e32 v4, -1, v4
+; CHECK-NEXT: v_cmp_eq_u64_e64 s4, 0, v[7:8]
+; CHECK-NEXT: s_or_b32 s8, s4, s8
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[9:1...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/112493
More information about the llvm-commits mailing list