[llvm] 2b43209 - [AMDGPU] Propagate LDS align to instructions
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 23 00:57:51 PDT 2021
Author: Stanislav Mekhanoshin
Date: 2021-06-23T00:57:16-07:00
New Revision: 2b43209ee37ce27cede17cf5424e7655adfe3ac2
URL: https://github.com/llvm/llvm-project/commit/2b43209ee37ce27cede17cf5424e7655adfe3ac2
DIFF: https://github.com/llvm/llvm-project/commit/2b43209ee37ce27cede17cf5424e7655adfe3ac2.diff
LOG: [AMDGPU] Propagate LDS align to instructions
Differential Revision: https://reviews.llvm.org/D104316
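
In short: once LDS globals are packed into the kernel- or module-scope struct, the
alignment implied by the struct's alignment and each element's byte offset is now
propagated into the loads, stores, atomics, GEPs and address casts that use those
globals. A minimal, hypothetical IR sketch of the effect (names and layout are
illustrative, not taken from the committed tests):

  @arr = internal addrspace(3) global [8 x i32] undef, align 4

  define amdgpu_kernel void @k() {
    %p = getelementptr inbounds [8 x i32], [8 x i32] addrspace(3)* @arr, i32 0, i32 4
    ; Before lowering, only the variable's own align 4 is known for this store.
    store i32 0, i32 addrspace(3)* %p, align 4
    ret void
  }

  ; After lowering (with --amdgpu-super-align-lds-globals), @arr becomes element 0 of
  ; a 16-byte-aligned struct and %p sits at constant offset 16 from it, so the store
  ; can be refined to align 16. The higher alignment is what lets codegen select the
  ; wider DS instructions seen in the ds_read2.ll / ds_write2.ll diffs below.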
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
llvm/test/CodeGen/AMDGPU/ds_read2.ll
llvm/test/CodeGen/AMDGPU/ds_write2.ll
llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
llvm/test/CodeGen/AMDGPU/lower-kernel-lds-global-uses.ll
llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll
llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll
llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-lds-offsets.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 30acbc9110d5..a3a43bd8d407 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -309,6 +309,10 @@ class AMDGPULowerModuleLDS : public ModulePass {
UsedList.erase(GV);
GV->eraseFromParent();
}
+
+ uint64_t Off = DL.getStructLayout(LDSTy)->getElementOffset(I);
+ Align A = commonAlignment(StructAlign, Off);
+ refineUsesAlignment(GEP, A, DL);
}
// Mark kernels with asm that reads the address of the allocated structure
@@ -328,6 +332,46 @@ class AMDGPULowerModuleLDS : public ModulePass {
}
return true;
}
+
+ void refineUsesAlignment(Value *Ptr, Align A, const DataLayout &DL,
+ unsigned MaxDepth = 5) {
+ if (!MaxDepth)
+ return;
+
+ for (User *U : Ptr->users()) {
+ if (auto *LI = dyn_cast<LoadInst>(U)) {
+ LI->setAlignment(std::max(A, LI->getAlign()));
+ continue;
+ }
+ if (auto *SI = dyn_cast<StoreInst>(U)) {
+ SI->setAlignment(std::max(A, SI->getAlign()));
+ continue;
+ }
+ if (auto *AI = dyn_cast<AtomicRMWInst>(U)) {
+ AI->setAlignment(std::max(A, AI->getAlign()));
+ continue;
+ }
+ if (auto *AI = dyn_cast<AtomicCmpXchgInst>(U)) {
+ AI->setAlignment(std::max(A, AI->getAlign()));
+ continue;
+ }
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
+ unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
+ APInt Off(BitWidth, 0);
+ if (GEP->getPointerOperand() == Ptr &&
+ GEP->accumulateConstantOffset(DL, Off)) {
+ Align GA = commonAlignment(A, Off.getLimitedValue());
+ refineUsesAlignment(GEP, GA, DL, MaxDepth - 1);
+ }
+ continue;
+ }
+ if (auto *I = dyn_cast<Instruction>(U)) {
+ if (I->getOpcode() == Instruction::BitCast ||
+ I->getOpcode() == Instruction::AddrSpaceCast)
+ refineUsesAlignment(I, A, DL, MaxDepth - 1);
+ }
+ }
+ }
};
} // namespace
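
The refinement itself is simple arithmetic: for each use reached from the struct GEP,
take the common alignment of the struct's alignment and the constant byte offset
accumulated along GEP chains, and raise the instruction's alignment to the maximum of
that and its current value. Bitcasts and addrspacecasts are looked through, and the
walk stops at MaxDepth = 5. For a struct aligned to 16 bytes this gives, for example:

  commonAlignment(16, 0)  = 16
  commonAlignment(16, 4)  = 4
  commonAlignment(16, 8)  = 8
  commonAlignment(16, 24) = 8

i.e. the largest power of two dividing both values (the full struct alignment when the
offset is zero). That is how the align-4 accesses in the @k3 test of
lower-kernel-lds-super-align.ll below end up at align 4, 8 or 16, depending on their
offset and the super-align setting.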
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index e1270ba4f72e..6927b7d448a6 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -1009,7 +1009,7 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)*
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
+; CI-NEXT: ds_read_b128 v[0:3], v0
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
@@ -1019,27 +1019,16 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)*
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; CI-NEXT: s_endpgm
;
-; GFX9-ALIGNED-LABEL: load_misaligned64_constant_offsets:
-; GFX9-ALIGNED: ; %bb.0:
-; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-ALIGNED-NEXT: ds_read2_b64 v[0:3], v4 offset1:1
-; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-ALIGNED-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-ALIGNED-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-ALIGNED-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
-; GFX9-ALIGNED-NEXT: s_endpgm
-;
-; GFX9-UNALIGNED-LABEL: load_misaligned64_constant_offsets:
-; GFX9-UNALIGNED: ; %bb.0:
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-UNALIGNED-NEXT: ds_read_b128 v[0:3], v4
-; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-UNALIGNED-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
-; GFX9-UNALIGNED-NEXT: s_endpgm
+; GFX9-LABEL: load_misaligned64_constant_offsets:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: ds_read_b128 v[0:3], v4
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
%val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
%val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
%sum = add i64 %val0, %val1
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index 71dc7f2ff44f..0630e1043575 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -818,33 +818,22 @@ define amdgpu_kernel void @store_constant_disjoint_offsets() {
define amdgpu_kernel void @store_misaligned64_constant_offsets() {
; CI-LABEL: store_misaligned64_constant_offsets:
; CI: ; %bb.0:
-; CI-NEXT: s_movk_i32 s0, 0x7b
-; CI-NEXT: s_mov_b32 s1, 0
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v2, 0
-; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_mov_b32_e32 v0, 0x7b
+; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: v_mov_b32_e32 v2, v0
+; CI-NEXT: v_mov_b32_e32 v3, v1
; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:1
+; CI-NEXT: ds_write_b128 v1, v[0:3]
; CI-NEXT: s_endpgm
;
-; GFX9-ALIGNED-LABEL: store_misaligned64_constant_offsets:
-; GFX9-ALIGNED: ; %bb.0:
-; GFX9-ALIGNED-NEXT: s_movk_i32 s0, 0x7b
-; GFX9-ALIGNED-NEXT: s_mov_b32 s1, 0
-; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-ALIGNED-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:1
-; GFX9-ALIGNED-NEXT: s_endpgm
-;
-; GFX9-UNALIGNED-LABEL: store_misaligned64_constant_offsets:
-; GFX9-UNALIGNED: ; %bb.0:
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-UNALIGNED-NEXT: ds_write_b128 v1, v[0:3]
-; GFX9-UNALIGNED-NEXT: s_endpgm
+; GFX9-LABEL: store_misaligned64_constant_offsets:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: ds_write_b128 v1, v[0:3]
+; GFX9-NEXT: s_endpgm
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
index ee16e88293d7..4ff9aa3a9bec 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
@@ -25,11 +25,11 @@ define amdgpu_kernel void @k0() {
; CHECK-LABEL: @k0(
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i8 addrspace(3)*
-; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
+; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 8
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2) to i8 addrspace(3)*
-; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
+; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 4
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1) to i8 addrspace(3)*
-; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
+; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 16
; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
; CHECK-NEXT: ret void
@@ -53,9 +53,9 @@ define amdgpu_kernel void @k1() {
; CHECK-LABEL: @k1(
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2) to i8 addrspace(3)*
-; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
+; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 4
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i8 addrspace(3)*
-; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
+; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 16
; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
; CHECK-NEXT: ret void
@@ -101,9 +101,9 @@ define amdgpu_kernel void @1() {
define void @f0() {
; CHECK-LABEL: @f0(
; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i8 addrspace(3)*
-; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
+; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 8
; CHECK-NEXT: %lds.size.8.align.8.bc = bitcast [8 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
-; CHECK-NEXT: store i8 8, i8 addrspace(3)* %lds.size.8.align.8.bc, align 4
+; CHECK-NEXT: store i8 8, i8 addrspace(3)* %lds.size.8.align.8.bc, align 8
; CHECK-NEXT: ret void
;
%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
index 402a75f958a4..b50d75aa682a 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
@@ -76,7 +76,7 @@ define amdgpu_kernel void @k3(i64 %x) {
; CHECK-NEXT: %3 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k3.lds.t, %llvm.amdgcn.kernel.k3.lds.t addrspace(3)* @llvm.amdgcn.kernel.k3.lds, i32 0, i32 0), i32 0, i32 24
; CHECK-NEXT: %4 = bitcast i8 addrspace(3)* %3 to i64 addrspace(3)*
; CHECK-NEXT: %ptr2 = addrspacecast i64 addrspace(3)* %4 to i64*
-; CHECK-NEXT: store i64 2, i64* %ptr2, align 1
+; CHECK-NEXT: store i64 2, i64* %ptr2, align 8
; CHECK-NEXT: ret void
;
%ptr1 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds ([32 x i8], [32 x i8] addrspace(3)* @lds.3, i32 0, i32 16) to i64 addrspace(3)*) to i64*
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-global-uses.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-global-uses.ll
index 336dc0c95784..3ea52f9309f6 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-global-uses.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-global-uses.ll
@@ -41,7 +41,7 @@
@llvm.compiler.used = appending global [3 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.2 to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.4 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.6 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
; CHECK-LABEL: @k0()
-; CHECK: %ld.lds.1 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1), align 2
+; CHECK: %ld.lds.1 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1), align 4
; CHECK: %ld.lds.2 = load i32, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0), align 4
; CHECK: %ld.lds.3 = load i64, i64 addrspace(3)* @lds.3, align 4
; CHECK: %ld.lds.4 = load float, float addrspace(3)* @lds.4, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll
index 82b5d4f1e110..470177ba392a 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll
@@ -4,6 +4,8 @@
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-super-align-lds-globals=false < %s | FileCheck --check-prefixes=CHECK,SUPER-ALIGN_OFF %s
; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [32 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { i16, [2 x i8], i16 }
+; CHECK: %llvm.amdgcn.kernel.k3.lds.t = type { [32 x i64], [32 x i32] }
; CHECK-NOT: @lds.1
@lds.1 = internal unnamed_addr addrspace(3) global [32 x i8] undef, align 1
@@ -11,6 +13,10 @@
; SUPER-ALIGN_ON: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 16
; SUPER-ALIGN_OFF: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 1
+; CHECK: @llvm.amdgcn.kernel.k2.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k2.lds.t undef, align 4
+; SUPER-ALIGN_ON: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t undef, align 16
+; SUPER-ALIGN_OFF: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t undef, align 8
+
; CHECK-LABEL: @k1
; CHECK: %1 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0), i32 0, i32 0
; CHECK: %2 = addrspacecast i8 addrspace(3)* %1 to i8*
@@ -21,3 +27,103 @@ define amdgpu_kernel void @k1(i64 %x) {
store i8 1, i8 addrspace(0)* %ptr, align 1
ret void
}
+
+@lds.2 = internal unnamed_addr addrspace(3) global i16 undef, align 4
+@lds.3 = internal unnamed_addr addrspace(3) global i16 undef, align 4
+
+; Check that alignment is propagated to uses for scalar variables.
+
+; CHECK-LABEL: @k2
+; CHECK: store i16 1, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k2.lds.t, %llvm.amdgcn.kernel.k2.lds.t addrspace(3)* @llvm.amdgcn.kernel.k2.lds, i32 0, i32 0), align 4
+; CHECK: store i16 2, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k2.lds.t, %llvm.amdgcn.kernel.k2.lds.t addrspace(3)* @llvm.amdgcn.kernel.k2.lds, i32 0, i32 2), align 4
+define amdgpu_kernel void @k2() {
+ store i16 1, i16 addrspace(3)* @lds.2, align 2
+ store i16 2, i16 addrspace(3)* @lds.3, align 2
+ ret void
+}
+
+@lds.4 = internal unnamed_addr addrspace(3) global [32 x i64] undef, align 8
+@lds.5 = internal unnamed_addr addrspace(3) global [32 x i32] undef, align 4
+
+; Check that alignment is propagated to uses for arrays.
+
+; CHECK-LABEL: @k3
+; CHECK: store i32 1, i32 addrspace(3)* %ptr1, align 8
+; CHECK: store i32 2, i32 addrspace(3)* %ptr2, align 4
+; SUPER-ALIGN_ON: store i32 3, i32 addrspace(3)* %ptr3, align 16
+; SUPER-ALIGN_OFF: store i32 3, i32 addrspace(3)* %ptr3, align 8
+; CHECK: store i32 4, i32 addrspace(3)* %ptr4, align 4
+; CHECK: store i32 5, i32 addrspace(3)* %ptr5, align 4
+; CHECK: %load1 = load i32, i32 addrspace(3)* %ptr1, align 8
+; CHECK: %load2 = load i32, i32 addrspace(3)* %ptr2, align 4
+; SUPER-ALIGN_ON: %load3 = load i32, i32 addrspace(3)* %ptr3, align 16
+; SUPER-ALIGN_OFF: %load3 = load i32, i32 addrspace(3)* %ptr3, align 8
+; CHECK: %load4 = load i32, i32 addrspace(3)* %ptr4, align 4
+; CHECK: %load5 = load i32, i32 addrspace(3)* %ptr5, align 4
+; CHECK: %val1 = atomicrmw volatile add i32 addrspace(3)* %ptr1, i32 1 monotonic, align 8
+; CHECK: %val2 = cmpxchg volatile i32 addrspace(3)* %ptr1, i32 1, i32 2 monotonic monotonic, align 8
+; CHECK: %ptr1.bc = bitcast i32 addrspace(3)* %ptr1 to i16 addrspace(3)*
+; CHECK: %ptr2.bc = bitcast i32 addrspace(3)* %ptr2 to i16 addrspace(3)*
+; CHECK: %ptr3.bc = bitcast i32 addrspace(3)* %ptr3 to i16 addrspace(3)*
+; CHECK: %ptr4.bc = bitcast i32 addrspace(3)* %ptr4 to i16 addrspace(3)*
+; CHECK: store i16 11, i16 addrspace(3)* %ptr1.bc, align 8
+; CHECK: store i16 12, i16 addrspace(3)* %ptr2.bc, align 4
+; SUPER-ALIGN_ON: store i16 13, i16 addrspace(3)* %ptr3.bc, align 16
+; SUPER-ALIGN_OFF: store i16 13, i16 addrspace(3)* %ptr3.bc, align 8
+; CHECK: store i16 14, i16 addrspace(3)* %ptr4.bc, align 4
+; CHECK: %ptr1.ac = addrspacecast i32 addrspace(3)* %ptr1 to i32*
+; CHECK: %ptr2.ac = addrspacecast i32 addrspace(3)* %ptr2 to i32*
+; CHECK: %ptr3.ac = addrspacecast i32 addrspace(3)* %ptr3 to i32*
+; CHECK: %ptr4.ac = addrspacecast i32 addrspace(3)* %ptr4 to i32*
+; CHECK: store i32 21, i32* %ptr1.ac, align 8
+; CHECK: store i32 22, i32* %ptr2.ac, align 4
+; SUPER-ALIGN_ON: store i32 23, i32* %ptr3.ac, align 16
+; SUPER-ALIGN_OFF: store i32 23, i32* %ptr3.ac, align 8
+; CHECK: store i32 24, i32* %ptr4.ac, align 4
+define amdgpu_kernel void @k3(i64 %x) {
+ %ptr0 = getelementptr inbounds i64, i64 addrspace(3)* bitcast ([32 x i64] addrspace(3)* @lds.4 to i64 addrspace(3)*), i64 0
+ store i64 0, i64 addrspace(3)* %ptr0, align 8
+
+ %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @lds.5 to i32 addrspace(3)*), i64 2
+ %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @lds.5 to i32 addrspace(3)*), i64 3
+ %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @lds.5 to i32 addrspace(3)*), i64 4
+ %ptr4 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @lds.5 to i32 addrspace(3)*), i64 5
+ %ptr5 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @lds.5 to i32 addrspace(3)*), i64 %x
+
+ store i32 1, i32 addrspace(3)* %ptr1, align 4
+ store i32 2, i32 addrspace(3)* %ptr2, align 4
+ store i32 3, i32 addrspace(3)* %ptr3, align 4
+ store i32 4, i32 addrspace(3)* %ptr4, align 4
+ store i32 5, i32 addrspace(3)* %ptr5, align 4
+
+ %load1 = load i32, i32 addrspace(3)* %ptr1, align 4
+ %load2 = load i32, i32 addrspace(3)* %ptr2, align 4
+ %load3 = load i32, i32 addrspace(3)* %ptr3, align 4
+ %load4 = load i32, i32 addrspace(3)* %ptr4, align 4
+ %load5 = load i32, i32 addrspace(3)* %ptr5, align 4
+
+ %val1 = atomicrmw volatile add i32 addrspace(3)* %ptr1, i32 1 monotonic, align 4
+ %val2 = cmpxchg volatile i32 addrspace(3)* %ptr1, i32 1, i32 2 monotonic monotonic, align 4
+
+ %ptr1.bc = bitcast i32 addrspace(3)* %ptr1 to i16 addrspace(3)*
+ %ptr2.bc = bitcast i32 addrspace(3)* %ptr2 to i16 addrspace(3)*
+ %ptr3.bc = bitcast i32 addrspace(3)* %ptr3 to i16 addrspace(3)*
+ %ptr4.bc = bitcast i32 addrspace(3)* %ptr4 to i16 addrspace(3)*
+
+ store i16 11, i16 addrspace(3)* %ptr1.bc, align 2
+ store i16 12, i16 addrspace(3)* %ptr2.bc, align 2
+ store i16 13, i16 addrspace(3)* %ptr3.bc, align 2
+ store i16 14, i16 addrspace(3)* %ptr4.bc, align 2
+
+ %ptr1.ac = addrspacecast i32 addrspace(3)* %ptr1 to i32*
+ %ptr2.ac = addrspacecast i32 addrspace(3)* %ptr2 to i32*
+ %ptr3.ac = addrspacecast i32 addrspace(3)* %ptr3 to i32*
+ %ptr4.ac = addrspacecast i32 addrspace(3)* %ptr4 to i32*
+
+ store i32 21, i32* %ptr1.ac, align 4
+ store i32 22, i32* %ptr2.ac, align 4
+ store i32 23, i32* %ptr3.ac, align 4
+ store i32 24, i32* %ptr4.ac, align 4
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll
index 49b912c83978..d22e417c90bf 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll
@@ -18,11 +18,11 @@
define amdgpu_kernel void @k0() {
; CHECK-LABEL: @k0(
; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 3) to i8 addrspace(3)*
-; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
+; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 2
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2) to i8 addrspace(3)*
-; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
+; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 4
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1) to i8 addrspace(3)*
-; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
+; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 16
; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
; CHECK-NEXT: ret void
@@ -45,9 +45,9 @@ define amdgpu_kernel void @k0() {
define amdgpu_kernel void @k1() {
; CHECK-LABEL: @k1(
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2) to i8 addrspace(3)*
-; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
+; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 4
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i8 addrspace(3)*
-; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
+; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 16
; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
; CHECK-NEXT: ret void
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
index ccd31c3c2263..20c84422757e 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
@@ -29,7 +29,7 @@
@llvm.compiler.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @tolower to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
; CHECK-LABEL: @func()
-; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 1.000000e+00 monotonic, align 4
+; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 1.000000e+00 monotonic, align 8
define void @func() {
%dec = atomicrmw fsub float addrspace(3)* @tolower, float 1.0 monotonic
%unused0 = atomicrmw add i64 addrspace(1)* @ignored, i64 1 monotonic
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
index 84265edf4194..e92d8624a4bd 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
@@ -24,9 +24,9 @@
; Use in func rewritten to access struct at address zero
; CHECK-LABEL: @func()
; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 1.0
-; CHECK: %val0 = load i32, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 4
+; CHECK: %val0 = load i32, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 8
; CHECK: %val1 = add i32 %val0, 4
-; CHECK: store i32 %val1, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 4
+; CHECK: store i32 %val1, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 8
; CHECK: %unused0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 monotonic
define void @func() {
%dec = atomicrmw fsub float addrspace(3)* @var0, float 1.0 monotonic
@@ -41,7 +41,7 @@ define void @func() {
; CHECK-LABEL: @kern_call()
; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK: call void @func()
-; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 2.000000e+00 monotonic, align 4
+; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 2.000000e+00 monotonic, align 8
define amdgpu_kernel void @kern_call() {
call void @func()
%dec = atomicrmw fsub float addrspace(3)* @var0, float 2.0 monotonic
diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-lds-offsets.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-lds-offsets.ll
index de439da28e77..e571fd8587e4 100644
--- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-lds-offsets.ll
+++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-lds-offsets.ll
@@ -73,7 +73,7 @@
; LOWER_LDS-LABEL: @f1
-; LOWER_LDS: %1 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 2
+; LOWER_LDS: %1 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 16
; LOWER_LDS: %2 = getelementptr i8, i8 addrspace(3)* null, i16 %1
; LOWER_LDS: %3 = bitcast i8 addrspace(3)* %2 to i32 addrspace(3)*
; LOWER_LDS: store i32 7, i32 addrspace(3)* %3, align 4
@@ -153,7 +153,7 @@ define void @f2() {
; LOWER_LDS: %4 = ptrtoint i64 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0) to i16
; LOWER_LDS: store i16 %4, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 2
; LOWER_LDS: %5 = ptrtoint i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i16
-; LOWER_LDS: store i16 %5, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 2
+; LOWER_LDS: store i16 %5, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 16
; LOWER_LDS: br label %6
;
; LOWER_LDS-LABEL: 6:
@@ -177,11 +177,9 @@ define void @f2() {
; GCN: s_mov_b32 s32, 0
; GCN: s_and_saveexec_b64 s[0:1], vcc
; GCN: s_cbranch_execz BB2_2
-; GCN: v_mov_b32_e32 v0, 24
-; GCN: v_mov_b32_e32 v1, 0
-; GCN: ds_write_b16 v1, v0 offset:18
-; GCN: v_mov_b32_e32 v0, 32
-; GCN: ds_write_b16 v1, v0 offset:16
+; GCN: v_mov_b32_e32 v0, 0
+; GCN: v_mov_b32_e32 v1, 0x180020
+; GCN: ds_write_b32 v0, v1 offset:16
; GCN-LABEL: BB2_2:
; GCN: s_or_b64 exec, exec, s[0:1]
; GCN: s_getpc_b64 s[0:1]