[llvm] 170c0da - [AMDGPU] Fix edge case of buffer OOB handling (#115479)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 6 23:56:47 PST 2025
Author: Piotr Sobczak
Date: 2025-03-07T08:56:44+01:00
New Revision: 170c0dac4488f9cfbc67e9593ebe6ad01cfa8f32
URL: https://github.com/llvm/llvm-project/commit/170c0dac4488f9cfbc67e9593ebe6ad01cfa8f32
DIFF: https://github.com/llvm/llvm-project/commit/170c0dac4488f9cfbc67e9593ebe6ad01cfa8f32.diff
LOG: [AMDGPU] Fix edge case of buffer OOB handling (#115479)
Strengthen out-of-bounds guarantees for buffer accesses by disallowing
buffer accesses with alignment lower than natural alignment.
This is needed to specifically address the edge case where an access
starts out-of-bounds and then enters in-bounds, as the hardware would
treat the entire access as being out-of-bounds. This is normally not
needed for most users, but at least one graphics device extension
(VK_EXT_robustness2) has very strict requirements - in-bounds accesses
must return the correct value, and out-of-bounds accesses must return zero.
The direct consequence of the patch is that a buffer access at negative
address is not merged by load-store-vectorizer with one at a positive
address, which fixes a CTS test.
Targets that do not care about the new behavior are advised to use the
new target feature relaxed-buffer-oob-mode that maintains the state from
before the patch.
Added:
llvm/test/CodeGen/AMDGPU/unaligned-buffer.ll
llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/unaligned-buffer.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPU.td
llvm/lib/Target/AMDGPU/GCNSubtarget.h
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/vectorize-buffer-fat-pointer.ll
llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index effc8d2ed6b49..22b519898f6bd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -119,6 +119,12 @@ def FeatureUnalignedDSAccess : SubtargetFeature<"unaligned-ds-access",
"Hardware supports unaligned local and region loads and stores"
>;
+def FeatureRelaxedBufferOOBMode : SubtargetFeature<"relaxed-buffer-oob-mode",
+ "RelaxedBufferOOBMode",
+ "true",
+ "Disable strict out-of-bounds buffer guarantees. An OOB access may potentially cause an adjacent access to be treated as if it were also OOB"
+>;
+
def FeatureApertureRegs : SubtargetFeature<"aperture-regs",
"HasApertureRegs",
"true",
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 6664a70572ded..e6f02a4eeaac8 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -78,6 +78,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool BackOffBarrier = false;
bool UnalignedScratchAccess = false;
bool UnalignedAccessMode = false;
+ bool RelaxedBufferOOBMode = false;
bool HasApertureRegs = false;
bool SupportsXNACK = false;
bool KernargPreload = false;
@@ -607,6 +608,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return UnalignedAccessMode;
}
+ bool hasRelaxedBufferOOBMode() const { return RelaxedBufferOOBMode; }
+
bool hasApertureRegs() const {
return HasApertureRegs;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index fe095414e5172..f930b5eac6953 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1883,6 +1883,20 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
Subtarget->hasUnalignedBufferAccessEnabled();
}
+ // Ensure robust out-of-bounds guarantees for buffer accesses are met if
+ // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
+ // out-of-bounds behavior, but in the edge case where an access starts
+ // out-of-bounds and then enters in-bounds, the entire access would be treated
+ // as out-of-bounds. Prevent misaligned memory accesses by requiring the
+ // natural alignment of buffer accesses.
+ if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
+ AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
+ AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
+ if (!Subtarget->hasRelaxedBufferOOBMode() &&
+ Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
+ return false;
+ }
+
// Smaller than dword value must be aligned.
if (Size < 32)
return false;
diff --git a/llvm/test/CodeGen/AMDGPU/unaligned-buffer.ll b/llvm/test/CodeGen/AMDGPU/unaligned-buffer.ll
new file mode 100644
index 0000000000000..72c6010a5a80b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/unaligned-buffer.ll
@@ -0,0 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GISEL %s
+
+; Check that in strict OOB mode for buffers (relaxed-buffer-oob-mode attribute not set) the underaligned loads and stores get split.
+; FIXME: The loads/stores do not get split (extend amdgpu-lower-buffer-fat-pointers?).
+
+define amdgpu_ps void @split_underaligned_load(ptr addrspace(7) inreg %p, ptr addrspace(7) inreg %p2) #0 {
+; CHECK-LABEL: split_underaligned_load:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: v_mov_b32_e32 v2, s9
+; CHECK-NEXT: s_mov_b32 s15, s8
+; CHECK-NEXT: s_mov_b32 s14, s7
+; CHECK-NEXT: s_mov_b32 s13, s6
+; CHECK-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen
+; CHECK-NEXT: s_mov_b32 s12, s5
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_b64 v[0:1], v2, s[12:15], 0 offen
+; CHECK-NEXT: s_endpgm
+; SDAG-LABEL: split_underaligned_load:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: v_mov_b32_e32 v0, s4
+; SDAG-NEXT: v_mov_b32_e32 v2, s9
+; SDAG-NEXT: s_mov_b32 s15, s8
+; SDAG-NEXT: s_mov_b32 s14, s7
+; SDAG-NEXT: s_mov_b32 s13, s6
+; SDAG-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen
+; SDAG-NEXT: s_mov_b32 s12, s5
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: buffer_store_b64 v[0:1], v2, s[12:15], 0 offen
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: split_underaligned_load:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GISEL-NEXT: v_mov_b32_e32 v2, s9
+; GISEL-NEXT: s_mov_b32 s12, s5
+; GISEL-NEXT: s_mov_b32 s13, s6
+; GISEL-NEXT: s_mov_b32 s14, s7
+; GISEL-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen
+; GISEL-NEXT: s_mov_b32 s15, s8
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: buffer_store_b64 v[0:1], v2, s[12:15], 0 offen
+; GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(7) %p, i32 0
+ %ld = load i64, ptr addrspace(7) %gep, align 4
+
+ %gep2 = getelementptr i8, ptr addrspace(7) %p2, i32 0
+ store i64 %ld, ptr addrspace(7) %gep2, align 4
+ ret void
+}
+
+; Check that in strict OOB mode for buffers (relaxed-buffer-oob-mode attribute not set) the naturally aligned loads and stores do not get split.
+
+define amdgpu_ps void @do_not_split_aligned_load(ptr addrspace(7) inreg %p, ptr addrspace(7) inreg %p2) #0 {
+; CHECK-LABEL: do_not_split_aligned_load:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: v_mov_b32_e32 v2, s9
+; CHECK-NEXT: s_mov_b32 s15, s8
+; CHECK-NEXT: s_mov_b32 s14, s7
+; CHECK-NEXT: s_mov_b32 s13, s6
+; CHECK-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen
+; CHECK-NEXT: s_mov_b32 s12, s5
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_store_b64 v[0:1], v2, s[12:15], 0 offen
+; CHECK-NEXT: s_endpgm
+; SDAG-LABEL: do_not_split_aligned_load:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: v_mov_b32_e32 v0, s4
+; SDAG-NEXT: v_mov_b32_e32 v2, s9
+; SDAG-NEXT: s_mov_b32 s15, s8
+; SDAG-NEXT: s_mov_b32 s14, s7
+; SDAG-NEXT: s_mov_b32 s13, s6
+; SDAG-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen
+; SDAG-NEXT: s_mov_b32 s12, s5
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: buffer_store_b64 v[0:1], v2, s[12:15], 0 offen
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: do_not_split_aligned_load:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GISEL-NEXT: v_mov_b32_e32 v2, s9
+; GISEL-NEXT: s_mov_b32 s12, s5
+; GISEL-NEXT: s_mov_b32 s13, s6
+; GISEL-NEXT: s_mov_b32 s14, s7
+; GISEL-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen
+; GISEL-NEXT: s_mov_b32 s15, s8
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: buffer_store_b64 v[0:1], v2, s[12:15], 0 offen
+; GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i8, ptr addrspace(7) %p, i32 0
+ %ld = load i64, ptr addrspace(7) %gep, align 8
+
+ %gep2 = getelementptr i8, ptr addrspace(7) %p2, i32 0
+ store i64 %ld, ptr addrspace(7) %gep2, align 8
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vectorize-buffer-fat-pointer.ll b/llvm/test/CodeGen/AMDGPU/vectorize-buffer-fat-pointer.ll
index 4aab097229a47..f34617e6efc55 100644
--- a/llvm/test/CodeGen/AMDGPU/vectorize-buffer-fat-pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/vectorize-buffer-fat-pointer.ll
@@ -7,11 +7,11 @@ entry:
%a2 = getelementptr i32, ptr addrspace(7) %out, i32 2
%a3 = getelementptr i32, ptr addrspace(7) %out, i32 3
-; OPT: store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, ptr addrspace(7) %out, align 4
- store i32 0, ptr addrspace(7) %out
- store i32 1, ptr addrspace(7) %a1
- store i32 2, ptr addrspace(7) %a2
- store i32 3, ptr addrspace(7) %a3
+; OPT: store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, ptr addrspace(7) %out, align 16
+ store i32 0, ptr addrspace(7) %out, align 16
+ store i32 1, ptr addrspace(7) %a1, align 4
+ store i32 2, ptr addrspace(7) %a2, align 8
+ store i32 3, ptr addrspace(7) %a3, align 4
ret void
}
@@ -22,10 +22,10 @@ entry:
%a2 = getelementptr i32, ptr addrspace(9) %out, i32 2
%a3 = getelementptr i32, ptr addrspace(9) %out, i32 3
-; OPT: store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, ptr addrspace(9) %out, align 4
- store i32 0, ptr addrspace(9) %out
- store i32 1, ptr addrspace(9) %a1
- store i32 2, ptr addrspace(9) %a2
- store i32 3, ptr addrspace(9) %a3
+; OPT: store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, ptr addrspace(9) %out, align 16
+ store i32 0, ptr addrspace(9) %out, align 16
+ store i32 1, ptr addrspace(9) %a1, align 4
+ store i32 2, ptr addrspace(9) %a2, align 8
+ store i32 3, ptr addrspace(9) %a3, align 4
ret void
}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
index 07958f1c1a296..ede2e4066c263 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
@@ -1,4 +1,5 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -mattr=+relaxed-buffer-oob-mode -S -o - %s | FileCheck --check-prefixes=CHECK,CHECK-OOB-RELAXED %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck --check-prefixes=CHECK,CHECK-OOB-STRICT %s
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7"
@@ -72,9 +73,14 @@ entry:
ret void
}
-; CHECK-LABEL: @merge_fat_ptrs(
-; CHECK: load <4 x i16>
-; CHECK: store <4 x i16> zeroinitializer
+; CHECK-OOB-RELAXED-LABEL: @merge_fat_ptrs(
+; CHECK-OOB-RELAXED: load <4 x i16>
+; CHECK-OOB-RELAXED: store <4 x i16> zeroinitializer
+; CHECK-OOB-STRICT-LABEL: @merge_fat_ptrs(
+; CHECK-OOB-STRICT: load <2 x i16>
+; CHECK-OOB-STRICT: load <2 x i16>
+; CHECK-OOB-STRICT: store <2 x i16> zeroinitializer
+; CHECK-OOB-STRICT: store <2 x i16> zeroinitializer
define amdgpu_kernel void @merge_fat_ptrs(ptr addrspace(7) nocapture %a, ptr addrspace(7) nocapture readonly %b) #0 {
entry:
%a.1 = getelementptr inbounds <2 x i16>, ptr addrspace(7) %a, i32 1
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/unaligned-buffer.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/unaligned-buffer.ll
new file mode 100644
index 0000000000000..d590a4a403fb7
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/unaligned-buffer.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn--amdpal -passes=load-store-vectorizer -S -o - %s | FileCheck --check-prefix=OOB-STRICT %s
+; RUN: opt -mtriple=amdgcn--amdpal -passes=load-store-vectorizer -mattr=+relaxed-buffer-oob-mode -S -o - %s | FileCheck --check-prefixes=OOB-RELAXED %s
+
+; The test checks that relaxed-buffer-oob-mode allows merging loads even if the target load is not naturally aligned.
+
+define amdgpu_kernel void @merge_align_4(ptr addrspace(7) captures(none) %p) #0 {
+;
+; OOB-STRICT-LABEL: define amdgpu_kernel void @merge_align_4(
+; OOB-STRICT-SAME: ptr addrspace(7) captures(none) [[P:%.*]]) {
+; OOB-STRICT-NEXT: [[ENTRY:.*:]]
+; OOB-STRICT-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
+; OOB-STRICT-NEXT: [[LD_M8:%.*]] = load i32, ptr addrspace(7) [[GEP_M8]], align 4
+; OOB-STRICT-NEXT: [[GEP_M4:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -4
+; OOB-STRICT-NEXT: [[LD_M4:%.*]] = load i32, ptr addrspace(7) [[GEP_M4]], align 4
+; OOB-STRICT-NEXT: [[GEP_0:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 0
+; OOB-STRICT-NEXT: [[LD_0:%.*]] = load i32, ptr addrspace(7) [[GEP_0]], align 4
+; OOB-STRICT-NEXT: [[GEP_4:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i64 4
+; OOB-STRICT-NEXT: [[LD_4:%.*]] = load i32, ptr addrspace(7) [[GEP_4]], align 4
+; OOB-STRICT-NEXT: ret void
+;
+; OOB-RELAXED-LABEL: define amdgpu_kernel void @merge_align_4(
+; OOB-RELAXED-SAME: ptr addrspace(7) captures(none) [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; OOB-RELAXED-NEXT: [[ENTRY:.*:]]
+; OOB-RELAXED-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
+; OOB-RELAXED-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(7) [[GEP_M8]], align 4
+; OOB-RELAXED-NEXT: [[LD_M81:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
+; OOB-RELAXED-NEXT: [[LD_M42:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
+; OOB-RELAXED-NEXT: [[LD_03:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
+; OOB-RELAXED-NEXT: [[LD_44:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+; OOB-RELAXED-NEXT: ret void
+;
+entry:
+ %gep_m8 = getelementptr i8, ptr addrspace(7) %p, i32 -8
+ %ld_m8 = load i32, ptr addrspace(7) %gep_m8, align 4
+ %gep_m4 = getelementptr i8, ptr addrspace(7) %p, i32 -4
+ %ld_m4 = load i32, ptr addrspace(7) %gep_m4, align 4
+ %gep_0 = getelementptr i8, ptr addrspace(7) %p, i32 0
+ %ld_0 = load i32, ptr addrspace(7) %gep_0, align 4
+ %gep_4 = getelementptr i8, ptr addrspace(7) %p, i64 4
+ %ld_4 = load i32, ptr addrspace(7) %gep_4, align 4
+ ret void
+}
+
+; The test checks that strict OOB mode (relaxed-buffer-oob-mode not set) allows merging loads if the target load is naturally aligned.
+
+define amdgpu_kernel void @merge_align_16(ptr addrspace(7) captures(none) %p) #0 {
+; OOB-STRICT-LABEL: define amdgpu_kernel void @merge_align_16(
+; OOB-STRICT-SAME: ptr addrspace(7) captures(none) [[P:%.*]]) {
+; OOB-STRICT-NEXT: [[ENTRY:.*:]]
+; OOB-STRICT-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
+; OOB-STRICT-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(7) [[GEP_M8]], align 16
+; OOB-STRICT-NEXT: [[LD_M81:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
+; OOB-STRICT-NEXT: [[LD_M42:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
+; OOB-STRICT-NEXT: [[LD_03:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
+; OOB-STRICT-NEXT: [[LD_44:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+; OOB-STRICT-NEXT: ret void
+;
+; OOB-RELAXED-LABEL: define amdgpu_kernel void @merge_align_16(
+; OOB-RELAXED-SAME: ptr addrspace(7) captures(none) [[P:%.*]]) #[[ATTR0]] {
+; OOB-RELAXED-NEXT: [[ENTRY:.*:]]
+; OOB-RELAXED-NEXT: [[GEP_M8:%.*]] = getelementptr i8, ptr addrspace(7) [[P]], i32 -8
+; OOB-RELAXED-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(7) [[GEP_M8]], align 16
+; OOB-RELAXED-NEXT: [[LD_M81:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
+; OOB-RELAXED-NEXT: [[LD_M42:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
+; OOB-RELAXED-NEXT: [[LD_03:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
+; OOB-RELAXED-NEXT: [[LD_44:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+; OOB-RELAXED-NEXT: ret void
+;
+entry:
+ %gep_m8 = getelementptr i8, ptr addrspace(7) %p, i32 -8
+ %ld_m8 = load i32, ptr addrspace(7) %gep_m8, align 16
+ %gep_m4 = getelementptr i8, ptr addrspace(7) %p, i32 -4
+ %ld_m4 = load i32, ptr addrspace(7) %gep_m4, align 4
+ %gep_0 = getelementptr i8, ptr addrspace(7) %p, i32 0
+ %ld_0 = load i32, ptr addrspace(7) %gep_0, align 8
+ %gep_4 = getelementptr i8, ptr addrspace(7) %p, i64 4
+ %ld_4 = load i32, ptr addrspace(7) %gep_4, align 4
+ ret void
+}
More information about the llvm-commits
mailing list