[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
Jun Wang via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 30 17:22:43 PST 2024
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/79236
From 9c40b1151b0673430ff53eb121784724a5b090e5 Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Tue, 23 Jan 2024 19:19:00 -0600
Subject: [PATCH 1/2] [AMDGPU] Emit a waitcnt instruction after each memory
instruction
This patch introduces a new command-line option for clang, namely
-mamdgpu-precise-memory-op. When this option is specified, a waitcnt
instruction is emitted after each memory load/store instruction. The counter
values are always 0, but which counters are involved depends on the memory
instruction.
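In outline, the pass walks every machine instruction and, after anything that
may load or store, inserts an s_waitcnt that zeroes the counters relevant to
that access. The following is a condensed sketch of the GFX9 path added to
SIMemoryLegalizer.cpp below (MF and ST are the machine function and GCN
subtarget the pass already has; the full hunk spells out the load and store
cases separately, but they zero the same counters):

  const SIInstrInfo *TII = ST.getInstrInfo();
  IsaVersion IV = getIsaVersion(ST.getCPU());
  for (MachineBasicBlock &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end();) {
      MachineInstr &Inst = *MI;
      ++MI; // advance first so the wait is inserted after Inst
      if (!Inst.mayLoadOrStore())
        continue;

      AMDGPU::Waitcnt Wait; // all counters default to "no wait"
      if (Inst.getDesc().TSFlags & SIInstrFlags::maybeAtomic) {
        Wait.LoadCnt = 0; // atomic: vmcnt(0) lgkmcnt(0)
        Wait.DsCnt = 0;
      } else if (TII->isSMRD(Inst)) {
        Wait.DsCnt = 0; // scalar access: lgkmcnt(0)
      } else if (TII->isFLAT(Inst)) {
        Wait.LoadCnt = 0; // flat access: vmcnt(0) lgkmcnt(0)
        Wait.DsCnt = 0;
      } else if (TII->isVMEM(Inst)) {
        Wait.LoadCnt = 0; // VMEM access: vmcnt(0)
      } else {
        Wait.DsCnt = 0; // LDS access: lgkmcnt(0)
      }

      BuildMI(MBB, MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
          .addImm(AMDGPU::encodeWaitcnt(IV, Wait));
    }
  }

For GFX10/11, the second patch in this series additionally emits
"s_waitcnt_vscnt null, 0x0" after stores and non-returning atomics, since
those targets track VMEM stores with a separate counter.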
---
clang/include/clang/Driver/Options.td | 4 +
clang/test/Driver/amdgpu-features.c | 6 +
llvm/lib/Target/AMDGPU/AMDGPU.td | 4 +
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 +
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 79 +++++++
.../CodeGen/AMDGPU/insert_waitcnt_for_all.ll | 199 ++++++++++++++++++
6 files changed, 295 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 7f4fa33748fac..d570786534b36 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4796,6 +4796,10 @@ defm tgsplit : SimpleMFlag<"tgsplit", "Enable", "Disable",
defm wavefrontsize64 : SimpleMFlag<"wavefrontsize64",
"Specify wavefront size 64", "Specify wavefront size 32",
" mode (AMDGPU only)">;
+defm amdgpu_precise_memory_op
+ : SimpleMFlag<"amdgpu-precise-memory-op", "Enable", "Disable",
+ " precise memory mode (AMDGPU only)",
+ m_amdgpu_Features_Group>;
defm unsafe_fp_atomics : BoolOption<"m", "unsafe-fp-atomics",
TargetOpts<"AllowAMDGPUUnsafeFPAtomics">, DefaultFalse,
diff --git a/clang/test/Driver/amdgpu-features.c b/clang/test/Driver/amdgpu-features.c
index a516bc6b7ff20..57d31ccedd878 100644
--- a/clang/test/Driver/amdgpu-features.c
+++ b/clang/test/Driver/amdgpu-features.c
@@ -32,3 +32,9 @@
// RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mno-cumode %s 2>&1 | FileCheck --check-prefix=NO-CUMODE %s
// NO-CUMODE: "-target-feature" "-cumode"
+
+// RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mamdgpu-precise-memory-op %s 2>&1 | FileCheck --check-prefix=PREC-MEM %s
+// PREC-MEM: "-target-feature" "+amdgpu-precise-memory-op"
+
+// RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mno-amdgpu-precise-memory-op %s 2>&1 | FileCheck --check-prefix=NO-PREC-MEM %s
+// NO-PREC-MEM: "-target-feature" "-amdgpu-precise-memory-op"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index cb29d5d947598..c39cc94770235 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -167,6 +167,10 @@ def FeatureCuMode : SubtargetFeature<"cumode",
"Enable CU wavefront execution mode"
>;
+def FeaturePreciseMemory
+ : SubtargetFeature<"amdgpu-precise-memory-op", "EnablePreciseMemory",
+ "true", "Enable precise memory mode">;
+
def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug",
"SGPRInitBug",
"true",
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 8019b98b1c68d..b69df21f78598 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -87,6 +87,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool EnableTgSplit = false;
bool EnableCuMode = false;
bool TrapHandler = false;
+ bool EnablePreciseMemory = false;
// Used as options.
bool EnableLoadStoreOpt = false;
@@ -592,6 +593,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return EnableCuMode;
}
+ bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
+
bool hasFlatAddressSpace() const {
return FlatAddressSpace;
}
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 84b9330ef9633..93cdceb37bd50 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -17,6 +17,7 @@
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -24,6 +25,8 @@
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/TargetParser/TargetParser.h"
+#include <iostream>
+
using namespace llvm;
using namespace llvm::AMDGPU;
@@ -641,6 +644,9 @@ class SIMemoryLegalizer final : public MachineFunctionPass {
bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI);
+ bool GFX9InsertWaitcntForPreciseMem(MachineFunction &MF);
+ bool GFX10And11InsertWaitcntForPreciseMem(MachineFunction &MF);
+
public:
static char ID;
@@ -2561,6 +2567,70 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
return Changed;
}
+bool SIMemoryLegalizer::GFX9InsertWaitcntForPreciseMem(MachineFunction &MF) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ IsaVersion IV = getIsaVersion(ST.getCPU());
+
+ bool Changed = false;
+
+ for (auto &MBB : MF) {
+ for (auto MI = MBB.begin(); MI != MBB.end();) {
+ MachineInstr &Inst = *MI;
+ ++MI;
+ if (Inst.mayLoadOrStore() == false)
+ continue;
+
+ // Todo: if next insn is an s_waitcnt
+ AMDGPU::Waitcnt Wait;
+
+ if (!(Inst.getDesc().TSFlags & SIInstrFlags::maybeAtomic)) {
+ if (TII->isSMRD(Inst)) { // scalar
+ Wait.DsCnt = 0; // LgkmCnt
+ } else { // vector
+ if (Inst.mayLoad()) { // vector load
+ if (TII->isVMEM(Inst)) // VMEM load
+ Wait.LoadCnt = 0; // VmCnt
+ else if (TII->isFLAT(Inst)) { // Flat load
+ Wait.LoadCnt = 0; // VmCnt
+ Wait.DsCnt = 0; // LgkmCnt
+ } else // LDS load ?
+ Wait.DsCnt = 0; // LgkmCnt
+ } else { // vector store
+ if (TII->isVMEM(Inst)) // VMEM store
+ Wait.LoadCnt = 0; // VmCnt
+ else if (TII->isFLAT(Inst)) { // Flat store
+ Wait.LoadCnt = 0; // VmCnt
+ Wait.DsCnt = 0; // LgkmCnt
+ } else
+ Wait.DsCnt = 0; // LDS store? LgkmCnt
+ }
+ } // vector
+ } else { // atomic
+ Wait.DsCnt = 0; // LgkmCnt
+ Wait.LoadCnt = 0; // VmCnt
+ }
+
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ BuildMI(MBB, MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
+bool SIMemoryLegalizer::GFX10And11InsertWaitcntForPreciseMem(
+ MachineFunction &MF) {
+ for (auto &MBB : MF) {
+ for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
+ MachineInstr &Inst = *MI;
+ if (Inst.mayLoadOrStore() == false)
+ continue;
+ }
+ }
+ return true;
+}
+
bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
@@ -2601,6 +2671,15 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
}
Changed |= removeAtomicPseudoMIs();
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (ST.isPreciseMemoryEnabled()) {
+ if (AMDGPU::isGFX10Plus(ST))
+ Changed |= GFX10And11InsertWaitcntForPreciseMem(MF);
+ else
+ Changed |= GFX9InsertWaitcntForPreciseMem(MF);
+ }
+
return Changed;
}
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll
new file mode 100644
index 0000000000000..abb9b9071227f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll
@@ -0,0 +1,199 @@
+; Testing the -amdgpu-precise-memory-op option
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+amdgpu-precise-memory-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+amdgpu-precise-memory-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A
+; COM: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+amdgpu-precise-memory-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch,+amdgpu-precise-memory-op -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s
+
+; from atomicrmw-expand.ll
+; covers flat_load, flat_atomic
+define void @syncscope_workgroup_nortn(ptr %addr, float %val) {
+; GFX90A-LABEL: syncscope_workgroup_nortn:
+; GFX90A: ; %bb.0:
+; GFX90A: flat_load_dword v5, v[0:1]
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A: .LBB0_1: ; %atomicrmw.start
+; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+ %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
+ ret void
+}
+
+; from atomicrmw-nand.ll
+; covers global_atomic, global_load
+define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
+; GFX9-LABEL: atomic_nand_i32_global:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v2, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NOT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, v2
+; GFX9-NEXT: v_not_b32_e32 v2, v3
+; GFX9-NEXT: v_or_b32_e32 v2, -5, v2
+; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB1_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %result = atomicrmw nand ptr addrspace(1) %ptr, i32 4 seq_cst
+ ret i32 %result
+}
+
+; from bf16.ll
+; covers buffer_load, buffer_store, flat_load, flat_store, global_load, global_store
+define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+;
+; GFX9-LABEL: test_load_store:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_store_short v[2:3], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_load_store:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_store_short v[2:3], v0, off
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %val = load bfloat, ptr addrspace(1) %in
+ store bfloat %val, ptr addrspace(1) %out
+ ret void
+}
+
+; from scratch-simple.ll
+; covers scratch_load, scratch_store
+;
+; GFX9-FLATSCR-LABEL: {{^}}vs_main:
+; GFX9-FLATSCR: scratch_store_dwordx4 off, v[{{[0-9:]+}}],
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+define amdgpu_vs float @vs_main(i32 %idx) {
+ %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
+ %v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
+ %r = fadd float %v1, %v2
+ ret float %r
+}
+
+; from udiv.ll
+; covers s_load
+define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
+; GFX9-LABEL: udiv_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NOT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
+ %r = udiv i32 %x, %y
+ store i32 %r, ptr addrspace(1) %out
+ ret void
+}
+
+declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32)
+
+; from smrd.ll
+; covers s_buffer_load
+; GFX9-LABEL: {{^}}smrd_sgpr_offset:
+; GFX9: s_buffer_load_dword s{{[0-9]}}, s[0:3], s4
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+define amdgpu_ps float @smrd_sgpr_offset(<4 x i32> inreg %desc, i32 inreg %offset) #0 {
+main_body:
+ %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0)
+ ret float %r
+}
+
+; from atomic_load_add.ll
+; covers s_load, ds_add
+; GFX9-LABEL: atomic_add_local:
+; GFX9: ; %bb.1:
+; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9: ds_add_u32 v0, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) {
+ %unused = atomicrmw volatile add ptr addrspace(3) %local, i32 5 seq_cst
+ ret void
+}
+
+declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg)
+
+; from atomic_optimizations_buffer.ll
+; covers buffer_atomic
+; GFX9-LABEL: add_i32_constant:
+; GFX9: ; %bb.1:
+; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9: buffer_atomic_add v1, off, s[8:11], 0 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
+entry:
+ %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
+ store i32 %old, ptr addrspace(1) %out
+ ret void
+}
+
+declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32, i16, <8 x i32>, i32, i32)
+
+; from llvm.amdgcn.image.load.a16.ll
+; covers image_load
+; GFX9-LABEL: {{^}}load.f32.1d:
+; GFX9: image_load v0, v0, s[0:7] dmask:0x1 unorm a16
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+define amdgpu_ps <4 x float> @load.f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+ %x = extractelement <2 x i16> %coords, i32 0
+ %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+ ret <4 x float> %v
+}
+
+declare void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float>, i32, i16, <8 x i32>, i32, i32)
+
+; from llvm.amdgcn.image.store.a16.ll
+; covers image_store
+define amdgpu_ps void @store_f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+; GFX9-LABEL: store_f32_1d:
+; GFX9: ; %bb.0: ; %main_body
+; GFX9-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 unorm a16
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+main_body:
+ %x = extractelement <2 x i16> %coords, i32 0
+ call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %val, i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32, i32)
+
+; from llvm.amdgcn.image.atomic.dim.ll
+; covers image_atomic
+; GFX90A-LABEL: {{^}}atomic_swap_1d:
+; GFX90A: image_atomic_swap v0, v{{[02468]}}, s[0:7] dmask:0x1 unorm glc{{$}}
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+main_body:
+ %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ %out = bitcast i32 %v to float
+ ret float %out
+}
+
+
+
+
From 09f10764afdf0cd16938cc3098f7b86585973edd Mon Sep 17 00:00:00 2001
From: Jun Wang <jun.wang7 at amd.com>
Date: Tue, 30 Jan 2024 19:21:01 -0600
Subject: [PATCH 2/2] Combine waitcnt insertion with the existing
 SIMemoryLegalizer code.
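In outline, this revision moves the waitcnt insertion out of the per-function
loops of the first patch and into a small per-generation helper that the
legalizer consults at each expansion point. A condensed view of the shape
(names match the hunks below; the virtual destructor is not in the diff and
is added here only so that deleting through the base pointer in this sketch
is well defined):

  class SIPreciseMemorySupport {
  public:
    virtual ~SIPreciseMemorySupport() = default; // not in the diff; see note above
    // Returns the GFX9 or the GFX10/11 implementation, chosen by ST.getGeneration().
    static std::unique_ptr<SIPreciseMemorySupport> create(const GCNSubtarget &ST);
    virtual bool handleNonAtomic(MachineBasicBlock::iterator &MI) = 0;
    // \p ret indicates whether the atomic instruction returns a result.
    virtual bool handleAtomic(MachineBasicBlock::iterator &MI, bool ret) = 0;
  };

  // SIMemoryLegalizer::runOnMachineFunction instantiates it once per function:
  //   if (ST.isPreciseMemoryEnabled())
  //     PM = SIPreciseMemorySupport::create(ST);
  // expandLoad/expandStore/expandAtomicCmpxchgOrRmw, plus the main loop for
  // plain loads and stores, then call PM->handleNonAtomic(MI) or
  // PM->handleAtomic(MI, isAtomicRet(*MI)) right where the other
  // memory-model waits are inserted.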
---
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 294 +++++++++++++-----
...l => insert_waitcnt_for_precise_memory.ll} | 187 ++++++++++-
2 files changed, 385 insertions(+), 96 deletions(-)
rename llvm/test/CodeGen/AMDGPU/{insert_waitcnt_for_all.ll => insert_waitcnt_for_precise_memory.ll} (62%)
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 93cdceb37bd50..233e719a85564 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -25,8 +25,6 @@
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/TargetParser/TargetParser.h"
-#include <iostream>
-
using namespace llvm;
using namespace llvm::AMDGPU;
@@ -608,12 +606,197 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
bool IsNonTemporal) const override;
};
+class SIPreciseMemorySupport {
+protected:
+ const GCNSubtarget &ST;
+ const SIInstrInfo *TII = nullptr;
+
+ IsaVersion IV;
+
+ SIPreciseMemorySupport(const GCNSubtarget &ST) : ST(ST) {
+ TII = ST.getInstrInfo();
+ IV = getIsaVersion(ST.getCPU());
+ }
+
+public:
+ static std::unique_ptr<SIPreciseMemorySupport> create(const GCNSubtarget &ST);
+
+ virtual bool handleNonAtomic(MachineBasicBlock::iterator &MI) = 0;
+ /// Handles atomic instruction \p MI with \p ret indicating whether \p MI
+ /// returns a result.
+ virtual bool handleAtomic(MachineBasicBlock::iterator &MI, bool ret) = 0;
+};
+
+class SIGfx9PreciseMemorySupport : public SIPreciseMemorySupport {
+public:
+ SIGfx9PreciseMemorySupport(const GCNSubtarget &ST)
+ : SIPreciseMemorySupport(ST) {}
+ bool handleNonAtomic(MachineBasicBlock::iterator &MI) override;
+ bool handleAtomic(MachineBasicBlock::iterator &MI, bool ret) override;
+};
+
+class SIGfx10And11PreciseMemorySupport : public SIPreciseMemorySupport {
+public:
+ SIGfx10And11PreciseMemorySupport(const GCNSubtarget &ST)
+ : SIPreciseMemorySupport(ST) {}
+ bool handleNonAtomic(MachineBasicBlock::iterator &MI) override;
+ bool handleAtomic(MachineBasicBlock::iterator &MI, bool ret) override;
+};
+
+std::unique_ptr<SIPreciseMemorySupport>
+SIPreciseMemorySupport::create(const GCNSubtarget &ST) {
+ GCNSubtarget::Generation Generation = ST.getGeneration();
+ if (Generation < AMDGPUSubtarget::GFX10)
+ return std::make_unique<SIGfx9PreciseMemorySupport>(ST);
+ return std::make_unique<SIGfx10And11PreciseMemorySupport>(ST);
+}
+
+bool SIGfx9PreciseMemorySupport ::handleNonAtomic(
+ MachineBasicBlock::iterator &MI) {
+ assert(MI->mayLoadOrStore());
+
+ MachineInstr &Inst = *MI;
+ AMDGPU::Waitcnt Wait;
+
+ if (TII->isSMRD(Inst)) { // scalar
+ if (Inst.mayStore())
+ return false;
+ Wait.DsCnt = 0; // LgkmCnt
+ } else { // vector
+ if (Inst.mayLoad()) { // vector load
+ if (TII->isVMEM(Inst)) { // VMEM load
+ Wait.LoadCnt = 0; // VmCnt
+ } else if (TII->isFLAT(Inst)) { // Flat load
+ Wait.LoadCnt = 0; // VmCnt
+ Wait.DsCnt = 0; // LgkmCnt
+ } else { // LDS load
+ Wait.DsCnt = 0; // LgkmCnt
+ }
+ } else { // vector store
+ if (TII->isVMEM(Inst)) { // VMEM store
+ Wait.LoadCnt = 0; // VmCnt
+ } else if (TII->isFLAT(Inst)) { // Flat store
+ Wait.LoadCnt = 0; // VmCnt
+ Wait.DsCnt = 0; // LgkmCnt
+ } else {
+ Wait.DsCnt = 0; // LDS store; LgkmCnt
+ }
+ }
+ }
+
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ MachineBasicBlock &MBB = *MI->getParent();
+ BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ --MI;
+ return true;
+}
+
+bool SIGfx9PreciseMemorySupport ::handleAtomic(MachineBasicBlock::iterator &MI,
+ bool ret) {
+ assert(MI->mayLoadOrStore());
+
+ AMDGPU::Waitcnt Wait;
+
+ Wait.LoadCnt = 0; // VmCnt
+ Wait.DsCnt = 0; // LgkmCnt
+
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ MachineBasicBlock &MBB = *MI->getParent();
+ BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ --MI;
+ return true;
+}
+
+bool SIGfx10And11PreciseMemorySupport ::handleNonAtomic(
+ MachineBasicBlock::iterator &MI) {
+ assert(MI->mayLoadOrStore());
+
+ MachineInstr &Inst = *MI;
+ AMDGPU::Waitcnt Wait;
+
+ bool BuildWaitCnt = true;
+ bool BuildVsCnt = false;
+
+ if (TII->isSMRD(Inst)) { // scalar
+ if (Inst.mayStore())
+ return false;
+ Wait.DsCnt = 0; // LgkmCnt
+ } else { // vector
+ if (Inst.mayLoad()) { // vector load
+ if (TII->isVMEM(Inst)) { // VMEM load
+ Wait.LoadCnt = 0; // VmCnt
+ } else if (TII->isFLAT(Inst)) { // Flat load
+ Wait.LoadCnt = 0; // VmCnt
+ Wait.DsCnt = 0; // LgkmCnt
+ } else { // LDS load
+ Wait.DsCnt = 0; // LgkmCnt
+ }
+ }
+
+ // For some instructions, mayLoad() and mayStore() can be both true.
+ if (Inst.mayStore()) { // vector store; an instruction can be both
+ // load/store
+ if (TII->isVMEM(Inst)) { // VMEM store
+ if (!Inst.mayLoad())
+ BuildWaitCnt = false;
+ BuildVsCnt = true;
+ } else if (TII->isFLAT(Inst)) { // Flat store
+ Wait.DsCnt = 0; // LgkmCnt
+ BuildVsCnt = true;
+ } else {
+ Wait.DsCnt = 0; // LDS store; LgkmCnt
+ }
+ }
+ }
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ if (BuildWaitCnt) {
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ --MI;
+ }
+
+ if (BuildVsCnt) {
+ BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(0);
+ --MI;
+ }
+ return true;
+}
+
+bool SIGfx10And11PreciseMemorySupport ::handleAtomic(
+ MachineBasicBlock::iterator &MI, bool ret) {
+ assert(MI->mayLoadOrStore());
+
+ AMDGPU::Waitcnt Wait;
+
+ Wait.DsCnt = 0; // LgkmCnt
+ if (ret)
+ Wait.LoadCnt = 0; // VmCnt
+
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ MachineBasicBlock &MBB = *MI->getParent();
+ BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ --MI;
+ if (!ret) {
+ BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(0);
+ --MI;
+ }
+ return true;
+}
+
class SIMemoryLegalizer final : public MachineFunctionPass {
private:
/// Cache Control.
std::unique_ptr<SICacheControl> CC = nullptr;
+ /// Precise Memory support.
+ std::unique_ptr<SIPreciseMemorySupport> PM = nullptr;
+
/// List of atomic pseudo instructions.
std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
@@ -644,9 +827,6 @@ class SIMemoryLegalizer final : public MachineFunctionPass {
bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
MachineBasicBlock::iterator &MI);
- bool GFX9InsertWaitcntForPreciseMem(MachineFunction &MF);
- bool GFX10And11InsertWaitcntForPreciseMem(MachineFunction &MF);
-
public:
static char ID;
@@ -2437,6 +2617,9 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
SIMemOp::LOAD, MOI.isVolatile(),
MOI.isNonTemporal());
+ if (PM)
+ Changed |= PM->handleNonAtomic(MI);
+
return Changed;
}
@@ -2470,6 +2653,10 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
Changed |= CC->enableVolatileAndOrNonTemporal(
MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
MOI.isNonTemporal());
+
+ if (PM)
+ Changed |= PM->handleNonAtomic(MI);
+
return Changed;
}
@@ -2550,12 +2737,13 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
- Changed |= CC->insertWait(MI, MOI.getScope(),
- MOI.getInstrAddrSpace(),
- isAtomicRet(*MI) ? SIMemOp::LOAD :
- SIMemOp::STORE,
- MOI.getIsCrossAddressSpaceOrdering(),
- Position::AFTER);
+ if (PM)
+ Changed |= PM->handleAtomic(MI, isAtomicRet(*MI));
+ else
+ Changed |= CC->insertWait(
+ MI, MOI.getScope(), MOI.getInstrAddrSpace(),
+ isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
+ MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER);
Changed |= CC->insertAcquire(MI, MOI.getScope(),
MOI.getOrderingAddrSpace(),
Position::AFTER);
@@ -2564,79 +2752,22 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
return Changed;
}
- return Changed;
-}
+ if (PM)
+ Changed |= PM->handleNonAtomic(MI);
-bool SIMemoryLegalizer::GFX9InsertWaitcntForPreciseMem(MachineFunction &MF) {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = ST.getInstrInfo();
- IsaVersion IV = getIsaVersion(ST.getCPU());
-
- bool Changed = false;
-
- for (auto &MBB : MF) {
- for (auto MI = MBB.begin(); MI != MBB.end();) {
- MachineInstr &Inst = *MI;
- ++MI;
- if (Inst.mayLoadOrStore() == false)
- continue;
-
- // Todo: if next insn is an s_waitcnt
- AMDGPU::Waitcnt Wait;
-
- if (!(Inst.getDesc().TSFlags & SIInstrFlags::maybeAtomic)) {
- if (TII->isSMRD(Inst)) { // scalar
- Wait.DsCnt = 0; // LgkmCnt
- } else { // vector
- if (Inst.mayLoad()) { // vector load
- if (TII->isVMEM(Inst)) // VMEM load
- Wait.LoadCnt = 0; // VmCnt
- else if (TII->isFLAT(Inst)) { // Flat load
- Wait.LoadCnt = 0; // VmCnt
- Wait.DsCnt = 0; // LgkmCnt
- } else // LDS load ?
- Wait.DsCnt = 0; // LgkmCnt
- } else { // vector store
- if (TII->isVMEM(Inst)) // VMEM store
- Wait.LoadCnt = 0; // VmCnt
- else if (TII->isFLAT(Inst)) { // Flat store
- Wait.LoadCnt = 0; // VmCnt
- Wait.DsCnt = 0; // LgkmCnt
- } else
- Wait.DsCnt = 0; // LDS store? LgkmCnt
- }
- } // vector
- } else { // atomic
- Wait.DsCnt = 0; // LgkmCnt
- Wait.LoadCnt = 0; // VmCnt
- }
-
- unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
- BuildMI(MBB, MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
- Changed = true;
- }
- }
return Changed;
}
-bool SIMemoryLegalizer::GFX10And11InsertWaitcntForPreciseMem(
- MachineFunction &MF) {
- for (auto &MBB : MF) {
- for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
- MachineInstr &Inst = *MI;
- if (Inst.mayLoadOrStore() == false)
- continue;
- }
- }
- return true;
-}
-
bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
SIMemOpAccess MOA(MF);
CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (ST.isPreciseMemoryEnabled())
+ PM = SIPreciseMemorySupport::create(ST);
+
for (auto &MBB : MF) {
for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
@@ -2655,8 +2786,12 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
MI = II->getIterator();
}
- if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
+ if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) {
+ if (PM && MI->mayLoadOrStore()) {
+ Changed |= PM->handleNonAtomic(MI);
+ }
continue;
+ }
if (const auto &MOI = MOA.getLoadInfo(MI))
Changed |= expandLoad(*MOI, MI);
@@ -2671,15 +2806,6 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
}
Changed |= removeAtomicPseudoMIs();
-
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- if (ST.isPreciseMemoryEnabled()) {
- if (AMDGPU::isGFX10Plus(ST))
- Changed |= GFX10And11InsertWaitcntForPreciseMem(MF);
- else
- Changed |= GFX9InsertWaitcntForPreciseMem(MF);
- }
-
return Changed;
}
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
similarity index 62%
rename from llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll
rename to llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
index abb9b9071227f..8d2c0c73aa152 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
@@ -1,8 +1,7 @@
-; Testing the -amdgpu-precise-memory-op option
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+amdgpu-precise-memory-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+amdgpu-precise-memory-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A
-; COM: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+amdgpu-precise-memory-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch,+amdgpu-precise-memory-op -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+amdgpu-precise-memory-op < %s | FileCheck %s -check-prefixes=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+amdgpu-precise-memory-op < %s | FileCheck %s -check-prefixes=GFX90A
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+amdgpu-precise-memory-op < %s | FileCheck %s -check-prefixes=GFX10
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch,+amdgpu-precise-memory-op -amdgpu-use-divergent-register-indexing < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s
; from atomicrmw-expand.ll
; covers flat_load, flat_atomic
@@ -14,6 +13,14 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) {
; GFX90A: .LBB0_1: ; %atomicrmw.start
; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+;
+; GFX10-LABEL: syncscope_workgroup_nortn:
+; GFX10: ; %bb.0:
+; GFX10: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10: .LBB0_1: ; %atomicrmw.start
+; GFX10: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
%res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
ret void
}
@@ -44,6 +51,28 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: atomic_nand_i32_global:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v2, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NOT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: v_not_b32_e32 v2, v3
+; GFX10-NEXT: v_or_b32_e32 v2, -5, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB1_1
%result = atomicrmw nand ptr addrspace(1) %ptr, i32 4 seq_cst
ret i32 %result
}
@@ -65,10 +94,9 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_store_short v[2:3], v0, off
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%val = load bfloat, ptr addrspace(1) %in
@@ -101,6 +129,16 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NOT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
+;
+; GFX10-LABEL: udiv_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3
+; GFX10: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_endpgm
%r = udiv i32 %x, %y
store i32 %r, ptr addrspace(1) %out
ret void
@@ -113,6 +151,10 @@ declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32)
; GFX9-LABEL: {{^}}smrd_sgpr_offset:
; GFX9: s_buffer_load_dword s{{[0-9]}}, s[0:3], s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+;
+; GFX10-LABEL: {{^}}smrd_sgpr_offset:
+; GFX10: s_buffer_load_dword s0, s[0:3], s4 offset:0x0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
define amdgpu_ps float @smrd_sgpr_offset(<4 x i32> inreg %desc, i32 inreg %offset) #0 {
main_body:
%r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0)
@@ -127,6 +169,15 @@ main_body:
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9: ds_add_u32 v0, v1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+;
+; GFX10-LABEL: atomic_add_local:
+; GFX10: ; %bb.1:
+; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10: ds_add_u32 v0, v1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+
define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) {
%unused = atomicrmw volatile add ptr addrspace(3) %local, i32 5 seq_cst
ret void
@@ -141,7 +192,16 @@ declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9: buffer_atomic_add v1, off, s[8:11], 0 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+;
+; GFX10-LABEL: add_i32_constant:
+; GFX10: ; %bb.1:
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10: buffer_atomic_add v1, off, s[4:7], 0 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+
define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
entry:
%old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
@@ -155,7 +215,13 @@ declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32, i16, <8 x i32>, i3
; covers image_load
; GFX9-LABEL: {{^}}load.f32.1d:
; GFX9: image_load v0, v0, s[0:7] dmask:0x1 unorm a16
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+;
+; GFX10-LABEL: {{^}}load.f32.1d:
+; GFX10: %bb.0: ; %main_body
+; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+
define amdgpu_ps <4 x float> @load.f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
@@ -171,9 +237,14 @@ define amdgpu_ps void @store_f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4
; GFX9-LABEL: store_f32_1d:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 unorm a16
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
+; GFX10-LABEL: store_f32_1d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_endpgm
main_body:
%x = extractelement <2 x i16> %coords, i32 0
call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %val, i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
@@ -186,7 +257,14 @@ declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32,
; covers image_atomic
; GFX90A-LABEL: {{^}}atomic_swap_1d:
; GFX90A: image_atomic_swap v0, v{{[02468]}}, s[0:7] dmask:0x1 unorm glc{{$}}
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+;
+; GFX10-LABEL: {{^}}atomic_swap_1d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+
define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
main_body:
%v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -194,6 +272,91 @@ main_body:
ret float %out
}
+; from lds-bounds.ll
+; covers ds_write_b64
+@compute_lds = external addrspace(3) global [512 x i32], align 16
+; GFX9-LABEL: {{^}}store_aligned:
+; GFX9: ds_write_b64
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+;
+; GFX10-LABEL: {{^}}store_aligned:
+; GFX10: ds_write_b64 v0, v[1:2]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+
+
+define amdgpu_cs void @store_aligned(ptr addrspace(3) %ptr) #0 {
+entry:
+ %ptr.gep.1 = getelementptr i32, ptr addrspace(3) %ptr, i32 1
+
+ store i32 42, ptr addrspace(3) %ptr, align 8
+ store i32 43, ptr addrspace(3) %ptr.gep.1
+ ret void
+}
+
+; from lds-bounds.ll
+; covers ds_read_b64
+; GFX9-LABEL: {{^}}load_aligned:
+; GFX9: ds_read_b64
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+;
+; GFX10-LABEL: {{^}}load_aligned:
+; GFX10: ds_read_b64 v[0:1], v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+define amdgpu_cs <2 x float> @load_aligned(ptr addrspace(3) %ptr) #0 {
+entry:
+ %ptr.gep.1 = getelementptr i32, ptr addrspace(3) %ptr, i32 1
+
+ %v.0 = load i32, ptr addrspace(3) %ptr, align 8
+ %v.1 = load i32, ptr addrspace(3) %ptr.gep.1
+
+ %r.0 = insertelement <2 x i32> undef, i32 %v.0, i32 0
+ %r.1 = insertelement <2 x i32> %r.0, i32 %v.1, i32 1
+ %bc = bitcast <2 x i32> %r.1 to <2 x float>
+ ret <2 x float> %bc
+}
+
+; from lds-bounds.ll
+; covers ds_write2_b32
+; GFX9-LABEL: {{^}}store_global_const_idx:
+; GFX9: ds_write2_b32
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+;
+; GFX10-LABEL: {{^}}store_global_const_idx:
+; GFX10: ds_write2_b32 v0, v1, v2 offset0:3 offset1:4
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+
+define amdgpu_cs void @store_global_const_idx() #0 {
+entry:
+ %ptr.a = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 3
+ %ptr.b = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 4
+
+ store i32 42, ptr addrspace(3) %ptr.a
+ store i32 43, ptr addrspace(3) %ptr.b
+ ret void
+}
+
+; from lds-bounds.ll
+; covers ds_read2_b32
+; GFX9-LABEL: {{^}}load_global_const_idx:
+; GFX9: ds_read2_b32
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+;
+; GFX10-LABEL: {{^}}load_global_const_idx:
+; GFX10: ds_read2_b32 v[0:1], v0 offset0:3 offset1:4
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+define amdgpu_cs <2 x float> @load_global_const_idx() #0 {
+entry:
+ %ptr.a = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 3
+ %ptr.b = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 4
+
+ %v.0 = load i32, ptr addrspace(3) %ptr.a
+ %v.1 = load i32, ptr addrspace(3) %ptr.b
+
+ %r.0 = insertelement <2 x i32> undef, i32 %v.0, i32 0
+ %r.1 = insertelement <2 x i32> %r.0, i32 %v.1, i32 1
+ %bc = bitcast <2 x i32> %r.1 to <2 x float>
+ ret <2 x float> %bc
+}