[llvm] b89236a - [AMDGPU] Vectorize misaligned global loads & stores
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 3 13:19:05 PST 2023
Author: Jeffrey Byrnes
Date: 2023-03-03T13:18:25-08:00
New Revision: b89236a96f2f2f3e9b88d198585a8eda7fb2c443
URL: https://github.com/llvm/llvm-project/commit/b89236a96f2f2f3e9b88d198585a8eda7fb2c443
DIFF: https://github.com/llvm/llvm-project/commit/b89236a96f2f2f3e9b88d198585a8eda7fb2c443.diff
LOG: [AMDGPU] Vectorize misaligned global loads & stores
Based on experimentation on gfx906, 908, 90a, and 1030, wider global loads / stores are more performant than multiple narrower ones, independent of alignment. This is especially true when combining 8-bit loads / stores, in which case the speedup was usually 2x across all alignments.
Differential Revision: https://reviews.llvm.org/D145170
Change-Id: I6ee6c76e6ace7fc373cc1b2aac3818fc1425a0c1
Added:
llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPU.h
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
llvm/test/CodeGen/AMDGPU/load-global-i16.ll
llvm/test/CodeGen/AMDGPU/udiv.ll
llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index a345b9dc043d8..50c7acdd82189 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -427,6 +427,12 @@ inline bool isFlatGlobalAddrSpace(unsigned AS) {
AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
}
+
+inline bool isExtendedGlobalAddrSpace(unsigned AS) {
+ return AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS ||
+ AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
+ AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
+}
}
} // End namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 73d1c2ee36f79..e8a79ac5174d9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1546,18 +1546,14 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
return AlignedBy4;
}
- if (Subtarget->hasUnalignedBufferAccessEnabled()) {
- // If we have a uniform constant load, it still requires using a slow
- // buffer instruction if unaligned.
- if (IsFast) {
- // Accesses can really be issued as 1-byte aligned or 4-byte aligned, so
- // 2-byte alignment is worse than 1 unless doing a 2-byte access.
- *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
- AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
- Alignment >= Align(4) : Alignment != Align(2);
- }
+ // So long as they are correct, wide global memory operations perform better
+ // than multiple smaller memory ops -- even when misaligned
+ if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
+ if (IsFast)
+ *IsFast = Size;
- return true;
+ return Alignment >= Align(4) ||
+ Subtarget->hasUnalignedBufferAccessEnabled();
}
// Smaller than dword value must be aligned.
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
index f1542f53461e8..8a38e088e7a70 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -23,45 +23,31 @@ define i32 @global_load_2xi16_align2(ptr addrspace(1) %p) #0 {
; GFX7-UNALIGNED-LABEL: global_load_2xi16_align2:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-UNALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v0
-; GFX7-UNALIGNED-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX7-UNALIGNED-NEXT: flat_load_ushort v2, v[2:3]
-; GFX7-UNALIGNED-NEXT: flat_load_ushort v0, v[0:1]
-; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(1)
-; GFX7-UNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX7-UNALIGNED-NEXT: flat_load_dword v0, v[0:1]
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX7-UNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: global_load_2xi16_align2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-NEXT: global_load_ushort v3, v[0:1], off offset:2
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_load_2xi16_align2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_ushort v2, v[0:1], off
-; GFX10-NEXT: global_load_ushort v3, v[0:1], off offset:2
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_load_2xi16_align2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_u16 v2, v[0:1], off
-; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:2
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1
%p.0 = load i16, ptr addrspace(1) %p, align 2
@@ -94,50 +80,37 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad
; GFX7-UNALIGNED-LABEL: global_store_2xi16_align2:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 1
+; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001
; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; GFX7-UNALIGNED-NEXT: s_add_u32 s2, s0, 2
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-UNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX7-UNALIGNED-NEXT: flat_store_short v[0:1], v2
-; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
-; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 2
-; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v1, s3
-; GFX7-UNALIGNED-NEXT: flat_store_short v[0:1], v2
+; GFX7-UNALIGNED-NEXT: flat_store_dword v[0:1], v2
; GFX7-UNALIGNED-NEXT: s_endpgm
;
; GFX9-LABEL: global_store_2xi16_align2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, 1
-; GFX9-NEXT: v_mov_b32_e32 v2, 2
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x20001
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
-; GFX9-NEXT: global_store_short v0, v2, s[0:1] offset:2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_store_2xi16_align2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: v_mov_b32_e32 v1, 1
-; GFX10-NEXT: v_mov_b32_e32 v2, 2
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x20001
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
-; GFX10-NEXT: global_store_short v0, v2, s[0:1] offset:2
+; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_store_2xi16_align2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1
-; GFX11-NEXT: v_mov_b32_e32 v2, 2
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: global_store_b16 v0, v2, s[0:1] offset:2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%gep.r = getelementptr i16, ptr addrspace(1) %r, i64 1
diff --git a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
new file mode 100644
index 0000000000000..b8ecbae3d3114
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
@@ -0,0 +1,229 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX908 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1030 %s
+
+
+; Function Attrs: mustprogress nounwind willreturn
+define amdgpu_kernel void @half8(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 {
+; GFX908-LABEL: half8:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: half8:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX1030-LABEL: half8:
+; GFX1030: ; %bb.0:
+; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1030-NEXT: v_mov_b32_e32 v4, 0
+; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
+; GFX1030-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX1030-NEXT: s_endpgm
+ %gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
+ %gep1 = getelementptr half, ptr addrspace(1) %0, i64 1
+ %gep2 = getelementptr half, ptr addrspace(1) %0, i64 2
+ %gep3 = getelementptr half, ptr addrspace(1) %0, i64 3
+ %gep4 = getelementptr half, ptr addrspace(1) %0, i64 4
+ %gep5 = getelementptr half, ptr addrspace(1) %0, i64 5
+ %gep6 = getelementptr half, ptr addrspace(1) %0, i64 6
+ %gep7 = getelementptr half, ptr addrspace(1) %0, i64 7
+ %l0 = load half, ptr addrspace(1) %gep0, align 2
+ %l1 = load half, ptr addrspace(1) %gep1, align 2
+ %l2 = load half, ptr addrspace(1) %gep2, align 2
+ %l3 = load half, ptr addrspace(1) %gep3, align 2
+ %l4 = load half, ptr addrspace(1) %gep4, align 2
+ %l5 = load half, ptr addrspace(1) %gep5, align 2
+ %l6 = load half, ptr addrspace(1) %gep6, align 2
+ %l7 = load half, ptr addrspace(1) %gep7, align 2
+ %sgep0 = getelementptr half, ptr addrspace(1) %1, i64 0
+ %sgep1 = getelementptr half, ptr addrspace(1) %1, i64 1
+ %sgep2 = getelementptr half, ptr addrspace(1) %1, i64 2
+ %sgep3 = getelementptr half, ptr addrspace(1) %1, i64 3
+ %sgep4 = getelementptr half, ptr addrspace(1) %1, i64 4
+ %sgep5 = getelementptr half, ptr addrspace(1) %1, i64 5
+ %sgep6 = getelementptr half, ptr addrspace(1) %1, i64 6
+ %sgep7 = getelementptr half, ptr addrspace(1) %1, i64 7
+ store half %l0, ptr addrspace(1) %sgep0, align 2
+ store half %l1, ptr addrspace(1) %sgep1, align 2
+ store half %l2, ptr addrspace(1) %sgep2, align 2
+ store half %l3, ptr addrspace(1) %sgep3, align 2
+ store half %l4, ptr addrspace(1) %sgep4, align 2
+ store half %l5, ptr addrspace(1) %sgep5, align 2
+ store half %l6, ptr addrspace(1) %sgep6, align 2
+ store half %l7, ptr addrspace(1) %sgep7, align 2
+ ret void
+}
+
+; Function Attrs: mustprogress nounwind willreturn
+define amdgpu_kernel void @half6(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 {
+; GFX908-LABEL: half6:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX908-NEXT: v_mov_b32_e32 v3, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: half6:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX1030-LABEL: half6:
+; GFX1030: ; %bb.0:
+; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1030-NEXT: v_mov_b32_e32 v3, 0
+; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
+; GFX1030-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
+; GFX1030-NEXT: s_endpgm
+ %gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
+ %gep1 = getelementptr half, ptr addrspace(1) %0, i64 1
+ %gep2 = getelementptr half, ptr addrspace(1) %0, i64 2
+ %gep3 = getelementptr half, ptr addrspace(1) %0, i64 3
+ %gep4 = getelementptr half, ptr addrspace(1) %0, i64 4
+ %gep5 = getelementptr half, ptr addrspace(1) %0, i64 5
+ %l0 = load half, ptr addrspace(1) %gep0, align 1
+ %l1 = load half, ptr addrspace(1) %gep1, align 1
+ %l2 = load half, ptr addrspace(1) %gep2, align 1
+ %l3 = load half, ptr addrspace(1) %gep3, align 1
+ %l4 = load half, ptr addrspace(1) %gep4, align 1
+ %l5 = load half, ptr addrspace(1) %gep5, align 1
+ %sgep0 = getelementptr half, ptr addrspace(1) %1, i64 0
+ %sgep1 = getelementptr half, ptr addrspace(1) %1, i64 1
+ %sgep2 = getelementptr half, ptr addrspace(1) %1, i64 2
+ %sgep3 = getelementptr half, ptr addrspace(1) %1, i64 3
+ %sgep4 = getelementptr half, ptr addrspace(1) %1, i64 4
+ %sgep5 = getelementptr half, ptr addrspace(1) %1, i64 5
+ store half %l0, ptr addrspace(1) %sgep0, align 1
+ store half %l1, ptr addrspace(1) %sgep1, align 1
+ store half %l2, ptr addrspace(1) %sgep2, align 1
+ store half %l3, ptr addrspace(1) %sgep3, align 1
+ store half %l4, ptr addrspace(1) %sgep4, align 1
+ store half %l5, ptr addrspace(1) %sgep5, align 1
+ ret void
+}
+
+; Function Attrs: mustprogress nounwind willreturn
+define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 {
+; GFX908-LABEL: half4:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX908-NEXT: v_mov_b32_e32 v2, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v0, s0
+; GFX908-NEXT: v_mov_b32_e32 v1, s1
+; GFX908-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: half4:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX1030-LABEL: half4:
+; GFX1030: ; %bb.0:
+; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1030-NEXT: v_mov_b32_e32 v2, 0
+; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX1030-NEXT: s_endpgm
+ %gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
+ %gep1 = getelementptr half, ptr addrspace(1) %0, i64 1
+ %gep2 = getelementptr half, ptr addrspace(1) %0, i64 2
+ %gep3 = getelementptr half, ptr addrspace(1) %0, i64 3
+ %l0 = load half, ptr addrspace(1) %gep0, align 4
+ %l1 = load half, ptr addrspace(1) %gep1, align 4
+ %l2 = load half, ptr addrspace(1) %gep2, align 4
+ %l3 = load half, ptr addrspace(1) %gep3, align 4
+ %sgep0 = getelementptr half, ptr addrspace(1) %1, i64 0
+ %sgep1 = getelementptr half, ptr addrspace(1) %1, i64 1
+ %sgep2 = getelementptr half, ptr addrspace(1) %1, i64 2
+ %sgep3 = getelementptr half, ptr addrspace(1) %1, i64 3
+ store half %l0, ptr addrspace(1) %sgep0, align 4
+ store half %l1, ptr addrspace(1) %sgep1, align 4
+ store half %l2, ptr addrspace(1) %sgep2, align 4
+ store half %l3, ptr addrspace(1) %sgep3, align 4
+ ret void
+}
+
+
+; Function Attrs: mustprogress nounwind willreturn
+define amdgpu_kernel void @half2(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture writeonly %1) local_unnamed_addr #0 {
+; GFX908-LABEL: half2:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: half2:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX1030-LABEL: half2:
+; GFX1030: ; %bb.0:
+; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX1030-NEXT: v_mov_b32_e32 v0, 0
+; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX1030-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX1030-NEXT: s_endpgm
+ %gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
+ %gep1 = getelementptr half, ptr addrspace(1) %0, i64 1
+ %l0 = load half, ptr addrspace(1) %gep0
+ %l1 = load half, ptr addrspace(1) %gep1
+ %sgep0 = getelementptr half, ptr addrspace(1) %1, i64 0
+ %sgep1 = getelementptr half, ptr addrspace(1) %1, i64 1
+ store half %l0, ptr addrspace(1) %sgep0
+ store half %l1, ptr addrspace(1) %sgep1
+ ret void
+}
+
+
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 971ae8ea46d75..adfe7c49dd6fa 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -503,12 +503,12 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
+; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 0c433240f5f95..227c2f50ca218 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -674,7 +674,9 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
@@ -682,11 +684,9 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
-; GCN-HSA-NEXT: s_add_u32 s0, s2, 16
-; GCN-HSA-NEXT: s_addc_u32 s1, s3, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index 92fc6efa45eaa..a5b1fa844e0c2 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -1571,21 +1571,15 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_add_u32 s4, s2, 2
-; GCN-NEXT: s_addc_u32 s5, s3, 0
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: flat_load_ushort v2, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: flat_load_ushort v0, v[0:1]
+; GCN-NEXT: flat_load_dword v0, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_u32_e32 v3, v0
-; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2
+; GCN-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GCN-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2
; GCN-NEXT: v_mul_f32_e32 v4, v3, v4
; GCN-NEXT: v_trunc_f32_e32 v4, v4
; GCN-NEXT: v_cvt_u32_f32_e32 v5, v4
@@ -1601,19 +1595,16 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT: v_mov_b32_e32 v0, 0
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: s_clause 0x1
-; GFX1030-NEXT: global_load_ushort v1, v0, s[2:3] offset:2
-; GFX1030-NEXT: global_load_ushort v2, v0, s[2:3]
-; GFX1030-NEXT: s_waitcnt vmcnt(1)
-; GFX1030-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX1030-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
-; GFX1030-NEXT: v_cvt_f32_u32_e32 v2, v2
-; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v1
-; GFX1030-NEXT: v_mul_f32_e32 v3, v2, v3
+; GFX1030-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX1030-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v2
+; GFX1030-NEXT: v_mul_f32_e32 v3, v1, v3
; GFX1030-NEXT: v_trunc_f32_e32 v3, v3
-; GFX1030-NEXT: v_fma_f32 v2, -v3, v1, v2
+; GFX1030-NEXT: v_fma_f32 v1, -v3, v2, v1
; GFX1030-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GFX1030-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, v1
+; GFX1030-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v1|, v2
; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX1030-NEXT: global_store_dword v0, v1, s[0:1]
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
index 38cb6c9bc3ed2..3e93555f29d0a 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=amdgcn-amd-amdhsa --mcpu=hawaii -passes=load-store-vectorizer -S -o - %s | FileCheck %s
; Copy of test/CodeGen/AMDGPU/merge-stores.ll with some additions
@@ -59,9 +58,7 @@ define amdgpu_kernel void @merge_global_store_2_constants_0_i16(ptr addrspace(1)
define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(ptr addrspace(1) %out) #0 {
; CHECK-LABEL: @merge_global_store_2_constants_i16_natural_align(
-; CHECK-NEXT: [[OUT_GEP_1:%.*]] = getelementptr i16, ptr addrspace(1) [[OUT:%.*]], i32 1
-; CHECK-NEXT: store i16 123, ptr addrspace(1) [[OUT_GEP_1]], align 2
-; CHECK-NEXT: store i16 456, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT: store <2 x i16> <i16 456, i16 123>, ptr addrspace(1) [[OUT:%.*]], align 2
; CHECK-NEXT: ret void
;
%out.gep.1 = getelementptr i16, ptr addrspace(1) %out, i32 1
@@ -85,9 +82,7 @@ define amdgpu_kernel void @merge_global_store_2_constants_i16_align_1(ptr addrsp
define amdgpu_kernel void @merge_global_store_2_constants_half_natural_align(ptr addrspace(1) %out) #0 {
; CHECK-LABEL: @merge_global_store_2_constants_half_natural_align(
-; CHECK-NEXT: [[OUT_GEP_1:%.*]] = getelementptr half, ptr addrspace(1) [[OUT:%.*]], i32 1
-; CHECK-NEXT: store half 0xH4000, ptr addrspace(1) [[OUT_GEP_1]], align 2
-; CHECK-NEXT: store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT: store <2 x half> <half 0xH3C00, half 0xH4000>, ptr addrspace(1) [[OUT:%.*]], align 2
; CHECK-NEXT: ret void
;
%out.gep.1 = getelementptr half, ptr addrspace(1) %out, i32 1
More information about the llvm-commits
mailing list