[llvm] 266b5db - [AMDGPU] Add MIMG NSA threshold configuration attribute
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 28 04:04:06 PDT 2022
Author: Carl Ritson
Date: 2022-09-28T20:03:18+09:00
New Revision: 266b5dbc5dd4888d03388a6a23533da37c0577c5
URL: https://github.com/llvm/llvm-project/commit/266b5dbc5dd4888d03388a6a23533da37c0577c5
DIFF: https://github.com/llvm/llvm-project/commit/266b5dbc5dd4888d03388a6a23533da37c0577c5.diff
LOG: [AMDGPU] Add MIMG NSA threshold configuration attribute
Make the MIMG NSA minimum-addresses threshold an attribute that can
be set on a function or configured via the command line.
This enables frontend tuning, which allows increased NSA usage
where beneficial.
Reviewed By: foad
Differential Revision: https://reviews.llvm.org/D134780
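
As an illustration, here is a minimal sketch (in LLVM IR, with a hypothetical
function name) of how a frontend could opt a single function into NSA for
two-address MIMG operations using the attribute added by this patch:

  ; Request NSA from 2 addresses for this function only. Attribute values
  ; below 2 are clamped to 2; an absent or non-positive value falls back to
  ; the default threshold of 3.
  define amdgpu_ps <4 x float> @sample_example(<8 x i32> inreg %rsrc,
      <4 x i32> inreg %samp, float %s, float %t) #0 {
    %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15,
        float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
    ret <4 x float> %v
  }

  declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)

  attributes #0 = { nounwind readonly "amdgpu-nsa-threshold"="2" }

When -amdgpu-nsa-threshold=N is passed on the command line (e.g. to llc), it
takes precedence over the function attribute and is likewise clamped to a
minimum of 2.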
Added:
llvm/test/CodeGen/AMDGPU/amdgpu-nsa-threshold.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
llvm/lib/Target/AMDGPU/GCNSubtarget.h
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 1ebdff4ff2227..e7ec24a930f52 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4863,6 +4863,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
+ const MachineFunction &MF = *MI.getMF();
const unsigned NumDefs = MI.getNumExplicitDefs();
const unsigned ArgOffset = NumDefs + 1;
bool IsTFE = NumDefs == 2;
@@ -4966,7 +4967,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
IsG16);
// See also below in the non-a16 branch
- const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
+ const bool UseNSA = ST.hasNSAEncoding() &&
+ PackedRegs.size() >= ST.getNSAThreshold(MF) &&
PackedRegs.size() <= ST.getNSAMaxSize();
if (!UseNSA && PackedRegs.size() > 1) {
@@ -5008,7 +5010,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
// TODO: we can actually allow partial NSA where the final register is a
// contiguous set of the remaining addresses.
// This could help where there are more addresses than supported.
- const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
+ const bool UseNSA = ST.hasNSAEncoding() &&
+ CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
CorrectedNumVAddrs <= ST.getNSAMaxSize();
if (!UseNSA && Intr->NumVAddrs > 1)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index a15f3a26a3f7e..e33f5d079915b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -54,6 +54,10 @@ static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
cl::desc("Enable the use of AA during codegen."),
cl::init(true));
+static cl::opt<unsigned> NSAThreshold("amdgpu-nsa-threshold",
+ cl::desc("Number of addresses from which to enable MIMG NSA."),
+ cl::init(3), cl::Hidden);
+
GCNSubtarget::~GCNSubtarget() = default;
GCNSubtarget &
@@ -950,6 +954,17 @@ GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
: nullptr;
}
+unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
+ if (NSAThreshold.getNumOccurrences() > 0)
+ return std::max(NSAThreshold.getValue(), 2u);
+
+ int Value = AMDGPU::getIntegerAttribute(MF.getFunction(), "amdgpu-nsa-threshold", -1);
+ if (Value > 0)
+ return std::max(Value, 2);
+
+ return 3;
+}
+
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index cd0564f1e74d0..4eb1cf8300f47 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1305,6 +1305,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
// \returns true if it's beneficial on this subtarget for the scheduler to
// cluster stores as well as loads.
bool shouldClusterStores() const { return getGeneration() >= GFX11; }
+
+ // \returns the number of address arguments from which to enable MIMG NSA
+ // on supported architectures.
+ unsigned getNSAThreshold(const MachineFunction &MF) const;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4caaf4d3c17e2..781039689efe6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6522,7 +6522,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
// contiguous set of the remaining addresses.
// This could help where there are more addresses than supported.
bool UseNSA = ST->hasFeature(AMDGPU::FeatureNSAEncoding) &&
- VAddrs.size() >= 3 &&
+ VAddrs.size() >= (unsigned)ST->getNSAThreshold(MF) &&
VAddrs.size() <= (unsigned)ST->getNSAMaxSize();
SDValue VAddr;
if (!UseNSA)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-nsa-threshold.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-nsa-threshold.ll
new file mode 100644
index 0000000000000..36adc9675eb8d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-nsa-threshold.ll
@@ -0,0 +1,285 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=ATTRIB %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=2 -verify-machineinstrs < %s | FileCheck -check-prefix=FORCE-2 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=3 -verify-machineinstrs < %s | FileCheck -check-prefix=FORCE-3 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=4 -verify-machineinstrs < %s | FileCheck -check-prefix=FORCE-4 %s
+
+; Note: command line argument should override function attribute.
+
+define amdgpu_ps <4 x float> @sample_2d_nsa2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %t, float %s) #2 {
+; ATTRIB-LABEL: sample_2d_nsa2:
+; ATTRIB: ; %bb.0: ; %main_body
+; ATTRIB-NEXT: s_mov_b32 s12, exec_lo
+; ATTRIB-NEXT: s_wqm_b32 exec_lo, exec_lo
+; ATTRIB-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; ATTRIB-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; ATTRIB-NEXT: image_sample v[0:3], [v1, v0], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; ATTRIB-NEXT: s_waitcnt vmcnt(0)
+; ATTRIB-NEXT: ; return to shader part epilog
+;
+; FORCE-2-LABEL: sample_2d_nsa2:
+; FORCE-2: ; %bb.0: ; %main_body
+; FORCE-2-NEXT: s_mov_b32 s12, exec_lo
+; FORCE-2-NEXT: s_wqm_b32 exec_lo, exec_lo
+; FORCE-2-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; FORCE-2-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; FORCE-2-NEXT: image_sample v[0:3], [v1, v0], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; FORCE-2-NEXT: s_waitcnt vmcnt(0)
+; FORCE-2-NEXT: ; return to shader part epilog
+;
+; FORCE-3-LABEL: sample_2d_nsa2:
+; FORCE-3: ; %bb.0: ; %main_body
+; FORCE-3-NEXT: s_mov_b32 s12, exec_lo
+; FORCE-3-NEXT: s_wqm_b32 exec_lo, exec_lo
+; FORCE-3-NEXT: v_mov_b32_e32 v2, v0
+; FORCE-3-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; FORCE-3-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; FORCE-3-NEXT: s_waitcnt vmcnt(0)
+; FORCE-3-NEXT: ; return to shader part epilog
+;
+; FORCE-4-LABEL: sample_2d_nsa2:
+; FORCE-4: ; %bb.0: ; %main_body
+; FORCE-4-NEXT: s_mov_b32 s12, exec_lo
+; FORCE-4-NEXT: s_wqm_b32 exec_lo, exec_lo
+; FORCE-4-NEXT: v_mov_b32_e32 v2, v0
+; FORCE-4-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; FORCE-4-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; FORCE-4-NEXT: s_waitcnt vmcnt(0)
+; FORCE-4-NEXT: ; return to shader part epilog
+main_body:
+ %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_3d_nsa2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %r, float %s, float %t) #2 {
+; ATTRIB-LABEL: sample_3d_nsa2:
+; ATTRIB: ; %bb.0: ; %main_body
+; ATTRIB-NEXT: s_mov_b32 s12, exec_lo
+; ATTRIB-NEXT: s_wqm_b32 exec_lo, exec_lo
+; ATTRIB-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; ATTRIB-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; ATTRIB-NEXT: image_sample v[0:3], [v1, v2, v0], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; ATTRIB-NEXT: s_waitcnt vmcnt(0)
+; ATTRIB-NEXT: ; return to shader part epilog
+;
+; FORCE-2-LABEL: sample_3d_nsa2:
+; FORCE-2: ; %bb.0: ; %main_body
+; FORCE-2-NEXT: s_mov_b32 s12, exec_lo
+; FORCE-2-NEXT: s_wqm_b32 exec_lo, exec_lo
+; FORCE-2-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; FORCE-2-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; FORCE-2-NEXT: image_sample v[0:3], [v1, v2, v0], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; FORCE-2-NEXT: s_waitcnt vmcnt(0)
+; FORCE-2-NEXT: ; return to shader part epilog
+;
+; FORCE-3-LABEL: sample_3d_nsa2:
+; FORCE-3: ; %bb.0: ; %main_body
+; FORCE-3-NEXT: s_mov_b32 s12, exec_lo
+; FORCE-3-NEXT: s_wqm_b32 exec_lo, exec_lo
+; FORCE-3-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; FORCE-3-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; FORCE-3-NEXT: image_sample v[0:3], [v1, v2, v0], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; FORCE-3-NEXT: s_waitcnt vmcnt(0)
+; FORCE-3-NEXT: ; return to shader part epilog
+;
+; FORCE-4-LABEL: sample_3d_nsa2:
+; FORCE-4: ; %bb.0: ; %main_body
+; FORCE-4-NEXT: s_mov_b32 s12, exec_lo
+; FORCE-4-NEXT: s_wqm_b32 exec_lo, exec_lo
+; FORCE-4-NEXT: v_mov_b32_e32 v3, v0
+; FORCE-4-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; FORCE-4-NEXT: image_sample v[0:3], v[1:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; FORCE-4-NEXT: s_waitcnt vmcnt(0)
+; FORCE-4-NEXT: ; return to shader part epilog
+main_body:
+ %v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_2d_nsa3(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %t, float %s) #3 {
+; ATTRIB-LABEL: sample_2d_nsa3:
+; ATTRIB: ; %bb.0: ; %main_body
+; ATTRIB-NEXT: s_mov_b32 s12, exec_lo
+; ATTRIB-NEXT: s_wqm_b32 exec_lo, exec_lo
+; ATTRIB-NEXT: v_mov_b32_e32 v2, v0
+; ATTRIB-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; ATTRIB-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; ATTRIB-NEXT: s_waitcnt vmcnt(0)
+; ATTRIB-NEXT: ; return to shader part epilog
+;
+; FORCE-2-LABEL: sample_2d_nsa3:
+; FORCE-2: ; %bb.0: ; %main_body
+; FORCE-2-NEXT: s_mov_b32 s12, exec_lo
+; FORCE-2-NEXT: s_wqm_b32 exec_lo, exec_lo
+; FORCE-2-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; FORCE-2-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; FORCE-2-NEXT: image_sample v[0:3], [v1, v0], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; FORCE-2-NEXT: s_waitcnt vmcnt(0)
+; FORCE-2-NEXT: ; return to shader part epilog
+;
+; FORCE-3-LABEL: sample_2d_nsa3:
+; FORCE-3: ; %bb.0: ; %main_body
+; FORCE-3-NEXT: s_mov_b32 s12, exec_lo
+; FORCE-3-NEXT: s_wqm_b32 exec_lo, exec_lo
+; FORCE-3-NEXT: v_mov_b32_e32 v2, v0
+; FORCE-3-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; FORCE-3-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; FORCE-3-NEXT: s_waitcnt vmcnt(0)
+; FORCE-3-NEXT: ; return to shader part epilog
+;
+; FORCE-4-LABEL: sample_2d_nsa3:
+; FORCE-4: ; %bb.0: ; %main_body
+; FORCE-4-NEXT: s_mov_b32 s12, exec_lo
+; FORCE-4-NEXT: s_wqm_b32 exec_lo, exec_lo
+; FORCE-4-NEXT: v_mov_b32_e32 v2, v0
+; FORCE-4-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; FORCE-4-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; FORCE-4-NEXT: s_waitcnt vmcnt(0)
+; FORCE-4-NEXT: ; return to shader part epilog
+main_body:
+ %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_3d_nsa3(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %r, float %s, float %t) #3 {
+; ATTRIB-LABEL: sample_3d_nsa3:
+; ATTRIB: ; %bb.0: ; %main_body
+; ATTRIB-NEXT: s_mov_b32 s12, exec_lo
+; ATTRIB-NEXT: s_wqm_b32 exec_lo, exec_lo
+; ATTRIB-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; ATTRIB-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; ATTRIB-NEXT: image_sample v[0:3], [v1, v2, v0], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; ATTRIB-NEXT: s_waitcnt vmcnt(0)
+; ATTRIB-NEXT: ; return to shader part epilog
+;
+; FORCE-2-LABEL: sample_3d_nsa3:
+; FORCE-2: ; %bb.0: ; %main_body
+; FORCE-2-NEXT: s_mov_b32 s12, exec_lo
+; FORCE-2-NEXT: s_wqm_b32 exec_lo, exec_lo
+; FORCE-2-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; FORCE-2-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; FORCE-2-NEXT: image_sample v[0:3], [v1, v2, v0], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; FORCE-2-NEXT: s_waitcnt vmcnt(0)
+; FORCE-2-NEXT: ; return to shader part epilog
+;
+; FORCE-3-LABEL: sample_3d_nsa3:
+; FORCE-3: ; %bb.0: ; %main_body
+; FORCE-3-NEXT: s_mov_b32 s12, exec_lo
+; FORCE-3-NEXT: s_wqm_b32 exec_lo, exec_lo
+; FORCE-3-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; FORCE-3-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; FORCE-3-NEXT: image_sample v[0:3], [v1, v2, v0], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; FORCE-3-NEXT: s_waitcnt vmcnt(0)
+; FORCE-3-NEXT: ; return to shader part epilog
+;
+; FORCE-4-LABEL: sample_3d_nsa3:
+; FORCE-4: ; %bb.0: ; %main_body
+; FORCE-4-NEXT: s_mov_b32 s12, exec_lo
+; FORCE-4-NEXT: s_wqm_b32 exec_lo, exec_lo
+; FORCE-4-NEXT: v_mov_b32_e32 v3, v0
+; FORCE-4-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; FORCE-4-NEXT: image_sample v[0:3], v[1:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; FORCE-4-NEXT: s_waitcnt vmcnt(0)
+; FORCE-4-NEXT: ; return to shader part epilog
+main_body:
+ %v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_2d_nsa4(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %t, float %s) #4 {
+; ATTRIB-LABEL: sample_2d_nsa4:
+; ATTRIB: ; %bb.0: ; %main_body
+; ATTRIB-NEXT: s_mov_b32 s12, exec_lo
+; ATTRIB-NEXT: s_wqm_b32 exec_lo, exec_lo
+; ATTRIB-NEXT: v_mov_b32_e32 v2, v0
+; ATTRIB-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; ATTRIB-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; ATTRIB-NEXT: s_waitcnt vmcnt(0)
+; ATTRIB-NEXT: ; return to shader part epilog
+;
+; FORCE-2-LABEL: sample_2d_nsa4:
+; FORCE-2: ; %bb.0: ; %main_body
+; FORCE-2-NEXT: s_mov_b32 s12, exec_lo
+; FORCE-2-NEXT: s_wqm_b32 exec_lo, exec_lo
+; FORCE-2-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; FORCE-2-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; FORCE-2-NEXT: image_sample v[0:3], [v1, v0], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; FORCE-2-NEXT: s_waitcnt vmcnt(0)
+; FORCE-2-NEXT: ; return to shader part epilog
+;
+; FORCE-3-LABEL: sample_2d_nsa4:
+; FORCE-3: ; %bb.0: ; %main_body
+; FORCE-3-NEXT: s_mov_b32 s12, exec_lo
+; FORCE-3-NEXT: s_wqm_b32 exec_lo, exec_lo
+; FORCE-3-NEXT: v_mov_b32_e32 v2, v0
+; FORCE-3-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; FORCE-3-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; FORCE-3-NEXT: s_waitcnt vmcnt(0)
+; FORCE-3-NEXT: ; return to shader part epilog
+;
+; FORCE-4-LABEL: sample_2d_nsa4:
+; FORCE-4: ; %bb.0: ; %main_body
+; FORCE-4-NEXT: s_mov_b32 s12, exec_lo
+; FORCE-4-NEXT: s_wqm_b32 exec_lo, exec_lo
+; FORCE-4-NEXT: v_mov_b32_e32 v2, v0
+; FORCE-4-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; FORCE-4-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; FORCE-4-NEXT: s_waitcnt vmcnt(0)
+; FORCE-4-NEXT: ; return to shader part epilog
+main_body:
+ %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_3d_nsa4(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %r, float %s, float %t) #4 {
+; ATTRIB-LABEL: sample_3d_nsa4:
+; ATTRIB: ; %bb.0: ; %main_body
+; ATTRIB-NEXT: s_mov_b32 s12, exec_lo
+; ATTRIB-NEXT: s_wqm_b32 exec_lo, exec_lo
+; ATTRIB-NEXT: v_mov_b32_e32 v3, v0
+; ATTRIB-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; ATTRIB-NEXT: image_sample v[0:3], v[1:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; ATTRIB-NEXT: s_waitcnt vmcnt(0)
+; ATTRIB-NEXT: ; return to shader part epilog
+;
+; FORCE-2-LABEL: sample_3d_nsa4:
+; FORCE-2: ; %bb.0: ; %main_body
+; FORCE-2-NEXT: s_mov_b32 s12, exec_lo
+; FORCE-2-NEXT: s_wqm_b32 exec_lo, exec_lo
+; FORCE-2-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; FORCE-2-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; FORCE-2-NEXT: image_sample v[0:3], [v1, v2, v0], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; FORCE-2-NEXT: s_waitcnt vmcnt(0)
+; FORCE-2-NEXT: ; return to shader part epilog
+;
+; FORCE-3-LABEL: sample_3d_nsa4:
+; FORCE-3: ; %bb.0: ; %main_body
+; FORCE-3-NEXT: s_mov_b32 s12, exec_lo
+; FORCE-3-NEXT: s_wqm_b32 exec_lo, exec_lo
+; FORCE-3-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; FORCE-3-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; FORCE-3-NEXT: image_sample v[0:3], [v1, v2, v0], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; FORCE-3-NEXT: s_waitcnt vmcnt(0)
+; FORCE-3-NEXT: ; return to shader part epilog
+;
+; FORCE-4-LABEL: sample_3d_nsa4:
+; FORCE-4: ; %bb.0: ; %main_body
+; FORCE-4-NEXT: s_mov_b32 s12, exec_lo
+; FORCE-4-NEXT: s_wqm_b32 exec_lo, exec_lo
+; FORCE-4-NEXT: v_mov_b32_e32 v3, v0
+; FORCE-4-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; FORCE-4-NEXT: image_sample v[0:3], v[1:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
+; FORCE-4-NEXT: s_waitcnt vmcnt(0)
+; FORCE-4-NEXT: ; return to shader part epilog
+main_body:
+ %v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+ ret <4 x float> %v
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readonly "amdgpu-nsa-threshold"="2" }
+attributes #3 = { nounwind readonly "amdgpu-nsa-threshold"="3" }
+attributes #4 = { nounwind readonly "amdgpu-nsa-threshold"="4" }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
index 7ca731d334d1f..018d0388f5bfb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll
@@ -1,15 +1,20 @@
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-nsa-encoding -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX10-NONSA %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,GFX1010-NSA %s
-; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,GFX1030-NSA %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -amdgpu-nsa-threshold=32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX10-NONSA %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -amdgpu-nsa-threshold=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T2 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX1010-NSA %s
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX1030-NSA %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-nsa-encoding -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX11-NONSA %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,GFX11-NSA %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX11-NONSA %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T2 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX11-NSA %s
+; Default NSA threshold is 3 addresses
; GCN-LABEL: {{^}}sample_2d:
-;
-; TODO: use NSA here
-; GCN: v_mov_b32_e32 v2, v0
-;
-; GCN: image_sample v[0:3], v[1:2],
+; NONSA: v_mov_b32_e32 v2, v0
+; NONSA: image_sample v[0:3], v[1:2],
+; NSA-T2: image_sample v[0:3], [v1, v0],
+; NSA-T3: v_mov_b32_e32 v2, v0
+; NSA-T3: image_sample v[0:3], v[1:2],
define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %t, float %s) {
main_body:
%v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)