[llvm] AtomicExpand: Fix creating invalid ptrmask for fat pointers (PR #94955)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 10 03:51:03 PDT 2024
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/94955
From 8f757504c4fcbbe8a80c1a81bf59bf10b03b873c Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 10 Jun 2024 12:29:34 +0200
Subject: [PATCH 1/2] AtomicExpand: Fix creating invalid ptrmask for fat
pointers
The ptrmask intrinsic requires the integer mask to match the index
size of the pointer's address space, not the pointer size.
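
For context: with the AMDGPU data layout, addrspace(7) buffer fat pointers
are 160 bits wide but use a 32-bit index type, so the mask operand of the
ptrmask call has to be i32. A minimal standalone sketch of the call the pass
now emits (the function name here is made up for illustration; the real uses
are in the updated tests below):

    ; The mask width must match the index size (i32 here), not the 160-bit
    ; pointer width that getIntPtrType would have produced.
    declare ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7), i32)

    define ptr addrspace(7) @align_down_to_word(ptr addrspace(7) %p) {
      %aligned = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) %p, i32 -4)
      ret ptr addrspace(7) %aligned
    }

With the old code, the mask was created with the pointer-sized integer type,
which no longer matches the index width for these fat pointers and so produces
an invalid ptrmask call.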
---
llvm/lib/CodeGen/AtomicExpandPass.cpp | 2 +-
.../AtomicExpand/AMDGPU/expand-atomic-i16.ll | 104 ++++++++++++++
.../AtomicExpand/AMDGPU/expand-atomic-i8.ll | 104 ++++++++++++++
.../AMDGPU/expand-atomic-rmw-fadd.ll | 130 ++++++++++++++++++
4 files changed, 339 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index d2b756e82964e..7728cc50fc9f9 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -765,7 +765,7 @@ static PartwordMaskValues createMaskInstrs(IRBuilderBase &Builder,
assert(ValueSize < MinWordSize);
PointerType *PtrTy = cast<PointerType>(Addr->getType());
- IntegerType *IntTy = DL.getIntPtrType(Ctx, PtrTy->getAddressSpace());
+ IntegerType *IntTy = DL.getIndexType(Ctx, PtrTy->getAddressSpace());
Value *PtrLSB;
if (AddrAlign < MinWordSize) {
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
index 0acb8f8d0fcf6..b8196cfcc3510 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
@@ -1262,6 +1262,110 @@ define bfloat @test_atomicrmw_xchg_bf16_global_agent_align4(ptr addrspace(1) %pt
ret bfloat %res
}
+define i16 @test_atomicrmw_xchg_i16_buffer_fat_agent(ptr addrspace(7) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_xchg_i16_buffer_fat_agent(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) [[PTR:%.*]], i32 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(7) [[PTR]] to i32
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(7) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[TMP6:%.*]] = or i32 [[TMP5]], [[VALOPERAND_SHIFTED]]
+; CHECK-NEXT: [[TMP7:%.*]] = cmpxchg ptr addrspace(7) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP6]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP7]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED]]
+;
+ %res = atomicrmw xchg ptr addrspace(7) %ptr, i16 %value syncscope("agent") seq_cst
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_xchg_i16_buffer_fat_agent_align4(ptr addrspace(7) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_xchg_i16_buffer_fat_agent_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(7) [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[LOADED]], -65536
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(7) [[PTR]], i32 [[LOADED]], i32 [[TMP4]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED]]
+;
+ %res = atomicrmw xchg ptr addrspace(7) %ptr, i16 %value syncscope("agent") seq_cst, align 4
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_add_i16_buffer_fat_agent(ptr addrspace(7) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_add_i16_buffer_fat_agent(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) [[PTR:%.*]], i32 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(7) [[PTR]] to i32
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(7) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = add i32 [[LOADED]], [[VALOPERAND_SHIFTED]]
+; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]]
+; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(7) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED]]
+;
+ %res = atomicrmw add ptr addrspace(7) %ptr, i16 %value syncscope("agent") seq_cst
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_add_i16_buffer_fat_agent_align4(ptr addrspace(7) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_add_i16_buffer_fat_agent_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(7) [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = add i32 [[LOADED]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[NEW]], 65535
+; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[LOADED]], -65536
+; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(7) [[PTR]], i32 [[LOADED]], i32 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED]]
+;
+ %res = atomicrmw add ptr addrspace(7) %ptr, i16 %value syncscope("agent") seq_cst, align 4
+ ret i16 %res
+}
+
!0 = !{}
!1 = !{!"foo", !"bar"}
!2 = !{!3}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll
index 97651c8d23a1e..590ee63001615 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll
@@ -1608,3 +1608,107 @@ define i8 @test_atomicrmw_dec_i8_flat_agent_align4(ptr %ptr, i8 %value) {
%res = atomicrmw udec_wrap ptr %ptr, i8 %value syncscope("agent") seq_cst, align 4
ret i8 %res
}
+
+define i8 @test_atomicrmw_xchg_i8_buffer_fat_agent(ptr addrspace(7) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_xchg_i8_buffer_fat_agent(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) [[PTR:%.*]], i32 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(7) [[PTR]] to i32
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(7) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[TMP6:%.*]] = or i32 [[TMP5]], [[VALOPERAND_SHIFTED]]
+; CHECK-NEXT: [[TMP7:%.*]] = cmpxchg ptr addrspace(7) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP6]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP7]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED]]
+;
+ %res = atomicrmw xchg ptr addrspace(7) %ptr, i8 %value syncscope("agent") seq_cst
+ ret i8 %res
+}
+
+define i8 @test_atomicrmw_xchg_i8_buffer_fat_agent_align4(ptr addrspace(7) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_xchg_i8_buffer_fat_agent_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(7) [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[LOADED]], -256
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(7) [[PTR]], i32 [[LOADED]], i32 [[TMP4]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[NEWLOADED]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED]]
+;
+ %res = atomicrmw xchg ptr addrspace(7) %ptr, i8 %value syncscope("agent") seq_cst, align 4
+ ret i8 %res
+}
+
+define i8 @test_atomicrmw_add_i8_buffer_fat_agent(ptr addrspace(7) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_add_i8_buffer_fat_agent(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) [[PTR:%.*]], i32 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(7) [[PTR]] to i32
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(7) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = add i32 [[LOADED]], [[VALOPERAND_SHIFTED]]
+; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]]
+; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(7) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED]]
+;
+ %res = atomicrmw add ptr addrspace(7) %ptr, i8 %value syncscope("agent") seq_cst
+ ret i8 %res
+}
+
+define i8 @test_atomicrmw_add_i8_buffer_fat_agent_align4(ptr addrspace(7) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_add_i8_buffer_fat_agent_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(7) [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[NEW:%.*]] = add i32 [[LOADED]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[NEW]], 255
+; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[LOADED]], -256
+; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(7) [[PTR]], i32 [[LOADED]], i32 [[TMP5]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[NEWLOADED]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED]]
+;
+ %res = atomicrmw add ptr addrspace(7) %ptr, i8 %value syncscope("agent") seq_cst, align 4
+ ret i8 %res
+}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
index 17318b2c62ca8..34c6cdfc8d9c1 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
@@ -4669,6 +4669,136 @@ define void @test_atomicrmw_fadd_v2bf16_flat_local_noret(ptr addrspace(3) %ptr,
ret void
}
+define half @buffer_atomicrmw_fadd_f16_agent(ptr addrspace(7) %ptr, half %f) {
+; ALL-LABEL: @buffer_atomicrmw_fadd_f16_agent(
+; ALL-NEXT: [[P:%.*]] = getelementptr half, ptr addrspace(7) [[PTR:%.*]], i32 4
+; ALL-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) [[P]], i32 -4)
+; ALL-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(7) [[P]] to i32
+; ALL-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; ALL-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; ALL-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
+; ALL-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; ALL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(7) [[ALIGNEDADDR]], align 4
+; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
+; ALL: atomicrmw.start:
+; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; ALL-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; ALL-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
+; ALL-NEXT: [[NEW:%.*]] = fadd half [[TMP4]], [[F:%.*]]
+; ALL-NEXT: [[TMP5:%.*]] = bitcast half [[NEW]] to i16
+; ALL-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
+; ALL-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; ALL-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; ALL-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; ALL-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(7) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; ALL-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL: atomicrmw.end:
+; ALL-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; ALL-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; ALL-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
+; ALL-NEXT: ret half [[TMP7]]
+;
+ %p = getelementptr half, ptr addrspace(7) %ptr, i32 4
+ %fadd = atomicrmw fadd ptr addrspace(7) %p, half %f syncscope("agent") seq_cst
+ ret half %fadd
+}
+
+define half @buffer_atomicrmw_fadd_f16_align4_agent(ptr addrspace(7) %ptr, half %f) {
+; ALL-LABEL: @buffer_atomicrmw_fadd_f16_align4_agent(
+; ALL-NEXT: [[P:%.*]] = getelementptr half, ptr addrspace(7) [[PTR:%.*]], i32 4
+; ALL-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(7) [[P]], align 4
+; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
+; ALL: atomicrmw.start:
+; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to half
+; ALL-NEXT: [[NEW:%.*]] = fadd half [[TMP2]], [[F:%.*]]
+; ALL-NEXT: [[TMP3:%.*]] = bitcast half [[NEW]] to i16
+; ALL-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
+; ALL-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; ALL-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(7) [[P]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; ALL-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL: atomicrmw.end:
+; ALL-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; ALL-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to half
+; ALL-NEXT: ret half [[TMP5]]
+;
+ %p = getelementptr half, ptr addrspace(7) %ptr, i32 4
+ %fadd = atomicrmw fadd ptr addrspace(7) %p, half %f syncscope("agent") seq_cst, align 4
+ ret half %fadd
+}
+
+define bfloat @buffer_atomicrmw_fadd_bf16_agent(ptr addrspace(7) %ptr, bfloat %f) {
+; ALL-LABEL: @buffer_atomicrmw_fadd_bf16_agent(
+; ALL-NEXT: [[P:%.*]] = getelementptr bfloat, ptr addrspace(7) [[PTR:%.*]], i32 4
+; ALL-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) [[P]], i32 -4)
+; ALL-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(7) [[P]] to i32
+; ALL-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; ALL-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; ALL-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
+; ALL-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; ALL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(7) [[ALIGNEDADDR]], align 4
+; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
+; ALL: atomicrmw.start:
+; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; ALL-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; ALL-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; ALL-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[F:%.*]]
+; ALL-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
+; ALL-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
+; ALL-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; ALL-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; ALL-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; ALL-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(7) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; ALL-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL: atomicrmw.end:
+; ALL-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; ALL-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; ALL-NEXT: [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
+; ALL-NEXT: ret bfloat [[TMP7]]
+;
+ %p = getelementptr bfloat, ptr addrspace(7) %ptr, i32 4
+ %fadd = atomicrmw fadd ptr addrspace(7) %p, bfloat %f syncscope("agent") seq_cst
+ ret bfloat %fadd
+}
+
+define bfloat @buffer_atomicrmw_fadd_bf16_align4_agent(ptr addrspace(7) %ptr, bfloat %f) {
+; ALL-LABEL: @buffer_atomicrmw_fadd_bf16_align4_agent(
+; ALL-NEXT: [[P:%.*]] = getelementptr bfloat, ptr addrspace(7) [[PTR:%.*]], i32 4
+; ALL-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(7) [[P]], align 4
+; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
+; ALL: atomicrmw.start:
+; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; ALL-NEXT: [[TMP2:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
+; ALL-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP2]], [[F:%.*]]
+; ALL-NEXT: [[TMP3:%.*]] = bitcast bfloat [[NEW]] to i16
+; ALL-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP3]] to i32
+; ALL-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; ALL-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(7) [[P]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; ALL-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL: atomicrmw.end:
+; ALL-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; ALL-NEXT: [[TMP5:%.*]] = bitcast i16 [[EXTRACTED1]] to bfloat
+; ALL-NEXT: ret bfloat [[TMP5]]
+;
+ %p = getelementptr bfloat, ptr addrspace(7) %ptr, i32 4
+ %fadd = atomicrmw fadd ptr addrspace(7) %p, bfloat %f syncscope("agent") seq_cst, align 4
+ ret bfloat %fadd
+}
+
attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
attributes #2 = { strictfp }
From f9472f4b84ba08d7ce966b5018f46533243fb2b9 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 10 Jun 2024 12:48:06 +0200
Subject: [PATCH 2/2] AMDGPU: Add some codegen tests for ptrmask with fat
pointers
---
llvm/test/CodeGen/AMDGPU/ptrmask.ll | 80 +++++++++++++++++++++++++++++
1 file changed, 80 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/ptrmask.ll b/llvm/test/CodeGen/AMDGPU/ptrmask.ll
index 7062270678933..ff0b95fe9ad41 100644
--- a/llvm/test/CodeGen/AMDGPU/ptrmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptrmask.ll
@@ -65,6 +65,86 @@ define amdgpu_ps ptr addrspace(3) @s_ptrmask_local_variable_i32(ptr addrspace(3)
ret ptr addrspace(3) %masked
}
+define ptr addrspace(7) @v_ptrmask_buffer_fat_ptr_variable_i32(ptr addrspace(7) %ptr, i32 %mask) {
+; GCN-LABEL: v_ptrmask_buffer_fat_ptr_variable_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v4, v4, v5
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10PLUS-LABEL: v_ptrmask_buffer_fat_ptr_variable_i32:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT: v_and_b32_e32 v4, v4, v5
+; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+ %masked = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) %ptr, i32 %mask)
+ ret ptr addrspace(7) %masked
+}
+
+define ptr addrspace(7) @v_ptrmask_buffer_fat_ptr_i32_neg8(ptr addrspace(7) %ptr) {
+; GCN-LABEL: v_ptrmask_buffer_fat_ptr_i32_neg8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v4, -8, v4
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10PLUS-LABEL: v_ptrmask_buffer_fat_ptr_i32_neg8:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT: v_and_b32_e32 v4, -8, v4
+; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+ %masked = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) %ptr, i32 -8)
+ ret ptr addrspace(7) %masked
+}
+
+define amdgpu_ps ptr addrspace(7) @s_ptrmask_buffer_fat_ptr_variable_i32(ptr addrspace(7) inreg %ptr, i32 inreg %mask) {
+; GCN-LABEL: s_ptrmask_buffer_fat_ptr_variable_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s8, s4
+; GCN-NEXT: s_mov_b32 s1, s3
+; GCN-NEXT: s_mov_b32 s0, s2
+; GCN-NEXT: s_and_b32 s4, s6, s7
+; GCN-NEXT: s_mov_b32 s2, s8
+; GCN-NEXT: s_mov_b32 s3, s5
+; GCN-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: s_ptrmask_buffer_fat_ptr_variable_i32:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_mov_b32 s8, s4
+; GFX10PLUS-NEXT: s_mov_b32 s1, s3
+; GFX10PLUS-NEXT: s_mov_b32 s0, s2
+; GFX10PLUS-NEXT: s_and_b32 s4, s6, s7
+; GFX10PLUS-NEXT: s_mov_b32 s2, s8
+; GFX10PLUS-NEXT: s_mov_b32 s3, s5
+; GFX10PLUS-NEXT: ; return to shader part epilog
+ %masked = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) %ptr, i32 %mask)
+ ret ptr addrspace(7) %masked
+}
+
+define amdgpu_ps ptr addrspace(7) @s_ptrmask_buffer_fat_ptr_i32_neg8(ptr addrspace(7) inreg %ptr) {
+; GCN-LABEL: s_ptrmask_buffer_fat_ptr_i32_neg8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s7, s4
+; GCN-NEXT: s_mov_b32 s1, s3
+; GCN-NEXT: s_mov_b32 s0, s2
+; GCN-NEXT: s_and_b32 s4, s6, -8
+; GCN-NEXT: s_mov_b32 s2, s7
+; GCN-NEXT: s_mov_b32 s3, s5
+; GCN-NEXT: ; return to shader part epilog
+;
+; GFX10PLUS-LABEL: s_ptrmask_buffer_fat_ptr_i32_neg8:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_mov_b32 s7, s4
+; GFX10PLUS-NEXT: s_mov_b32 s1, s3
+; GFX10PLUS-NEXT: s_mov_b32 s0, s2
+; GFX10PLUS-NEXT: s_and_b32 s4, s6, -8
+; GFX10PLUS-NEXT: s_mov_b32 s2, s7
+; GFX10PLUS-NEXT: s_mov_b32 s3, s5
+; GFX10PLUS-NEXT: ; return to shader part epilog
+ %masked = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) %ptr, i32 -8)
+ ret ptr addrspace(7) %masked
+}
+
declare ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3), i32) #0
declare ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1), i64) #0