[llvm] ade0750 - [AMDGPU] Fix some cache policy checks for GFX12+ (#116396)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 21 00:23:04 PST 2024
Author: Jay Foad
Date: 2024-11-21T08:22:59Z
New Revision: ade0750e3529ee251cbfb60ce66904a8553381e4
URL: https://github.com/llvm/llvm-project/commit/ade0750e3529ee251cbfb60ce66904a8553381e4
DIFF: https://github.com/llvm/llvm-project/commit/ade0750e3529ee251cbfb60ce66904a8553381e4.diff
LOG: [AMDGPU] Fix some cache policy checks for GFX12+ (#116396)
Fix coding errors found by inspection and check that the swz bit still
serves to prevent merging of buffer loads/stores on GFX12+.
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 3522ece24f1c45..f2fe3befd04d55 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3285,9 +3285,14 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
MIB.add(MI.getOperand(1)); // rsrc
MIB.add(MI.getOperand(5 + OpOffset)); // soffset
MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
+ bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
- MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol
- MIB.addImm(Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0); // swz
+ MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
+ : AMDGPU::CPol::ALL_pregfx12)); // cpol
+ MIB.addImm(
+ Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
+ ? 1
+ : 0); // swz
MachineMemOperand *LoadMMO = *MI.memoperands_begin();
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 428a19c391374f..344028c4b48689 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -160,7 +160,7 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo,
O << " dlc";
if ((Imm & CPol::SCC) && AMDGPU::isGFX90A(STI))
O << (AMDGPU::isGFX940(STI) ? " sc1" : " scc");
- if (Imm & ~CPol::ALL)
+ if (Imm & ~CPol::ALL_pregfx12)
O << " /* unexpected cache policy bit */";
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5b02f9bf80d3fc..8c8d55bbc01316 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9860,11 +9860,16 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Ops.push_back(Rsrc);
Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
+ bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
- Ops.push_back(
- DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
Ops.push_back(DAG.getTargetConstant(
- Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0, DL, MVT::i8)); // swz
+ Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
+ DL, MVT::i8)); // cpol
+ Ops.push_back(DAG.getTargetConstant(
+ Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
+ ? 1
+ : 0,
+ DL, MVT::i8)); // swz
Ops.push_back(M0Val.getValue(0)); // Chain
Ops.push_back(M0Val.getValue(1)); // Glue
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
index 0c6bba2426947e..b42ba7d75094a6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
@@ -1312,8 +1312,8 @@ main_body:
ret void
}
-define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc) {
-; PREGFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
+define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged_pregfx12(<4 x i32> inreg %rsrc) {
+; PREGFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged_pregfx12:
; PREGFX10: ; %bb.0: ; %main_body
; PREGFX10-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
; PREGFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8
@@ -1327,7 +1327,7 @@ define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> i
; PREGFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm
; PREGFX10-NEXT: s_endpgm
;
-; GFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
+; GFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged_pregfx12:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_clause 0x5
; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
@@ -1342,7 +1342,7 @@ define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> i
; GFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
+; GFX11-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged_pregfx12:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_clause 0x5
; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 offset:4
@@ -1357,7 +1357,7 @@ define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> i
; GFX11-NEXT: exp mrt0 v4, v5, v0, v0 done
; GFX11-NEXT: s_endpgm
;
-; GFX12-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
+; GFX12-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged_pregfx12:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[0:3], null offset:4 scope:SCOPE_SE
@@ -1379,6 +1379,65 @@ main_body:
ret void
}
+define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc) {
+; PREGFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
+; PREGFX10: ; %bb.0: ; %main_body
+; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4
+; PREGFX10-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 offset:28
+; PREGFX10-NEXT: s_waitcnt vmcnt(1)
+; PREGFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm
+; PREGFX10-NEXT: s_waitcnt vmcnt(0)
+; PREGFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm
+; PREGFX10-NEXT: s_endpgm
+;
+; GFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4
+; GFX10-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 offset:28
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
+; GFX11: ; %bb.0: ; %main_body
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 offset:4
+; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[0:3], 0 offset:28
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: exp mrt0 v4, v5, v0, v0 done
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: buffer_load_b32 v0, off, s[0:3], null offset:4
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null offset:8
+; GFX12-NEXT: buffer_load_b32 v2, off, s[0:3], null offset:12
+; GFX12-NEXT: buffer_load_b32 v3, off, s[0:3], null offset:16
+; GFX12-NEXT: buffer_load_b32 v4, off, s[0:3], null offset:28
+; GFX12-NEXT: buffer_load_b32 v5, off, s[0:3], null offset:32
+; GFX12-NEXT: s_wait_loadcnt 0x2
+; GFX12-NEXT: export mrt0 v0, v1, v2, v3 done
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: export mrt0 v4, v5, v0, v0 done
+; GFX12-NEXT: s_endpgm
+main_body:
+ %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 64)
+ %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 64)
+ %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 12, i32 0, i32 64)
+ %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 16, i32 0, i32 64)
+ %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 28, i32 0, i32 64)
+ %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 32, i32 0, i32 64)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
+ ret void
+}
+
declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #0
declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) #0
declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
index fd6e354b274a42..3493de1497d11f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
@@ -2,6 +2,7 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GFX68,VERDE %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GFX68,GFX8 %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck -check-prefixes=GFX12 %s
define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
; GFX68-LABEL: buffer_store:
@@ -497,8 +498,8 @@ define amdgpu_ps void @raw_buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc,
ret void
}
-define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
-; GFX68-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged:
+define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+; GFX68-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12:
; GFX68: ; %bb.0:
; GFX68-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX68-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:8
@@ -508,7 +509,7 @@ define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32>
; GFX68-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:32
; GFX68-NEXT: s_endpgm
;
-; GFX11-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged:
+; GFX11-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x5
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 offset:4
@@ -527,6 +528,26 @@ define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32>
ret void
}
+define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+; GFX12-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null offset:4
+; GFX12-NEXT: buffer_store_b32 v1, off, s[0:3], null offset:8
+; GFX12-NEXT: buffer_store_b32 v2, off, s[0:3], null offset:12
+; GFX12-NEXT: buffer_store_b32 v3, off, s[0:3], null offset:16
+; GFX12-NEXT: buffer_store_b32 v4, off, s[0:3], null offset:28
+; GFX12-NEXT: buffer_store_b32 v5, off, s[0:3], null offset:32
+; GFX12-NEXT: s_endpgm
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 64)
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 64)
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 64)
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 16, i32 0, i32 64)
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 28, i32 0, i32 64)
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 32, i32 0, i32 64)
+ ret void
+}
+
declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0
More information about the llvm-commits
mailing list