[llvm] AMDGPU: Fix buffer intrinsic handling for various 16-bit elements. (PR #95376)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 13 02:17:15 PDT 2024
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/95376
Mostly fixes handling of bfloat vectors, but also some missing
i16 cases.
>From d83b78299c907f5a7cb3f52a612356ff84dea467 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 8 Jun 2024 11:00:59 +0200
Subject: [PATCH] AMDGPU: Fix buffer intrinsic handling for various 16-bit
elements.
Mostly fixes handling of bfloat vectors, but also some missing
i16 cases.
---
llvm/lib/Target/AMDGPU/BUFInstructions.td | 8 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 17 +-
.../llvm.amdgcn.raw.ptr.buffer.load.bf16.ll | 183 ++++++++++++++++++
.../AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll | 66 ++++++-
.../llvm.amdgcn.raw.ptr.buffer.store.bf16.ll | 139 +++++++++++++
.../llvm.amdgcn.raw.ptr.buffer.store.ll | 41 ++++
6 files changed, 439 insertions(+), 15 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 496548382d528..94dd45f1333b0 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1425,19 +1425,23 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i32, "BUFFER_LOAD_DWORD">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i16, "BUFFER_LOAD_DWORD">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f16, "BUFFER_LOAD_DWORD">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2bf16, "BUFFER_LOAD_DWORD">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i32, "BUFFER_LOAD_DWORDX2">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i16, "BUFFER_LOAD_DWORDX2">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f16, "BUFFER_LOAD_DWORDX2">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i64, "BUFFER_LOAD_DWORDX2">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f64, "BUFFER_LOAD_DWORDX2">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4bf16, "BUFFER_LOAD_DWORDX2">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3f32, "BUFFER_LOAD_DWORDX3">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3i32, "BUFFER_LOAD_DWORDX3">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i32, "BUFFER_LOAD_DWORDX4">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i64, "BUFFER_LOAD_DWORDX4">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f64, "BUFFER_LOAD_DWORDX4">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v8i16, "BUFFER_LOAD_DWORDX4">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v8f16, "BUFFER_LOAD_DWORDX4">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v8bf16, "BUFFER_LOAD_DWORDX4">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_byte, i32, "BUFFER_LOAD_SBYTE">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_short, i32, "BUFFER_LOAD_SSHORT">;
@@ -1532,12 +1536,14 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, i32, "BUFFER_STORE_DWORD">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i16, "BUFFER_STORE_DWORD">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f16, "BUFFER_STORE_DWORD">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2bf16, "BUFFER_STORE_DWORD">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i32, "BUFFER_STORE_DWORDX2">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, i64, "BUFFER_STORE_DWORDX2">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f64, "BUFFER_STORE_DWORDX2">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i16, "BUFFER_STORE_DWORDX2">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f16, "BUFFER_STORE_DWORDX2">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4bf16, "BUFFER_STORE_DWORDX2">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3f32, "BUFFER_STORE_DWORDX3">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3i32, "BUFFER_STORE_DWORDX3">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
@@ -1545,6 +1551,8 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i32, "BUFFER_STORE_DWORDX4">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i64, "BUFFER_STORE_DWORDX4">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f64, "BUFFER_STORE_DWORDX4">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v8f16, "BUFFER_STORE_DWORDX4">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v8i16, "BUFFER_STORE_DWORDX4">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v8bf16, "BUFFER_STORE_DWORDX4">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_byte, i32, "BUFFER_STORE_BYTE">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6f9c88e617617..4946129c65a95 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -859,19 +859,22 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
- MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8},
+ MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
+ MVT::i8},
Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN,
- {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
- MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
- MVT::i16, MVT::i8, MVT::i128},
+ {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
+ MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
+ MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
+ MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
Custom);
setOperationAction(ISD::INTRINSIC_VOID,
- {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
- MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
- MVT::i8, MVT::i128},
+ {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
+ MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
+ MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
+ MVT::f16, MVT::i16, MVT::i8, MVT::i128},
Custom);
setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll
new file mode 100644
index 0000000000000..3c800d0369e70
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll
@@ -0,0 +1,183 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11 %s
+
+define bfloat @raw_ptr_buffer_load_bf16(ptr addrspace(8) inreg %rsrc) {
+; GFX7-LABEL: raw_ptr_buffer_load_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: raw_ptr_buffer_load_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: raw_ptr_buffer_load_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: raw_ptr_buffer_load_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: raw_ptr_buffer_load_bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val = call bfloat @llvm.amdgcn.raw.ptr.buffer.load.v2bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+ ret bfloat %val
+}
+
+define <2 x bfloat> @raw_ptr_buffer_load_v2bf16(ptr addrspace(8) inreg %rsrc) {
+; GFX7-LABEL: raw_ptr_buffer_load_v2bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: raw_ptr_buffer_load_v2bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: raw_ptr_buffer_load_v2bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: raw_ptr_buffer_load_v2bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: raw_ptr_buffer_load_v2bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v2bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+ ret <2 x bfloat> %val
+}
+
+define <4 x bfloat> @raw_ptr_buffer_load_v4bf16(ptr addrspace(8) inreg %rsrc) {
+; GFX7-LABEL: raw_ptr_buffer_load_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: raw_ptr_buffer_load_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: raw_ptr_buffer_load_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: raw_ptr_buffer_load_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: raw_ptr_buffer_load_v4bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val = call <4 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v4bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+ ret <4 x bfloat> %val
+}
+
+; FIXME
+; define <6 x bfloat> @raw_ptr_buffer_load_v6bf16(ptr addrspace(8) inreg %rsrc) {
+; %val = call <6 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v6bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+; ret <6 x bfloat> %val
+; }
+
+define <8 x bfloat> @raw_ptr_buffer_load_v8bf16(ptr addrspace(8) inreg %rsrc) {
+; GFX7-LABEL: raw_ptr_buffer_load_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: raw_ptr_buffer_load_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: raw_ptr_buffer_load_v8bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: raw_ptr_buffer_load_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: raw_ptr_buffer_load_v8bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val = call <8 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v8bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+ ret <8 x bfloat> %val
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
index 07a5b511f2dc8..3e3371091ef72 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
@@ -944,7 +944,7 @@ main_body:
define amdgpu_ps void @raw_ptr_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
; PREGFX10-LABEL: raw_ptr_buffer_load_v4f16:
-; PREGFX10: ; %bb.0: ; %main_body
+; PREGFX10: ; %bb.0:
; PREGFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0
; PREGFX10-NEXT: s_mov_b32 m0, -1
; PREGFX10-NEXT: s_waitcnt vmcnt(0)
@@ -952,24 +952,49 @@ define amdgpu_ps void @raw_ptr_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, p
; PREGFX10-NEXT: s_endpgm
;
; GFX10-LABEL: raw_ptr_buffer_load_v4f16:
-; GFX10: ; %bb.0: ; %main_body
+; GFX10: ; %bb.0:
; GFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ds_write_b64 v0, v[1:2]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: raw_ptr_buffer_load_v4f16:
-; GFX11: ; %bb.0: ; %main_body
+; GFX11: ; %bb.0:
; GFX11-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ds_store_b64 v0, v[1:2]
; GFX11-NEXT: s_endpgm
-main_body:
%val = call <4 x half> @llvm.amdgcn.raw.ptr.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
store <4 x half> %val, ptr addrspace(3) %ptr
ret void
}
+; FIXME
+; define amdgpu_ps void @raw_ptr_buffer_load_v6f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
+; %val = call <6 x half> @llvm.amdgcn.raw.ptr.buffer.load.v6f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+; store <6 x half> %val, ptr addrspace(3) %ptr
+; ret void
+; }
+
+define amdgpu_ps void @raw_ptr_buffer_load_v8f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
+; GFX10-LABEL: raw_ptr_buffer_load_v8f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: buffer_load_dwordx4 v[1:4], off, s[0:3], 0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ds_write_b128 v0, v[1:4]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: raw_ptr_buffer_load_v8f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: ds_store_b128 v0, v[1:4]
+; GFX11-NEXT: s_endpgm
+ %val = call <8 x half> @llvm.amdgcn.raw.ptr.buffer.load.v8f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+ store <8 x half> %val, ptr addrspace(3) %ptr
+ ret void
+}
+
define amdgpu_ps void @raw_ptr_buffer_load_v2i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
; PREGFX10-LABEL: raw_ptr_buffer_load_v2i16:
; PREGFX10: ; %bb.0: ; %main_body
@@ -1000,7 +1025,7 @@ main_body:
define amdgpu_ps void @raw_ptr_buffer_load_v4i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
; PREGFX10-LABEL: raw_ptr_buffer_load_v4i16:
-; PREGFX10: ; %bb.0: ; %main_body
+; PREGFX10: ; %bb.0:
; PREGFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0
; PREGFX10-NEXT: s_mov_b32 m0, -1
; PREGFX10-NEXT: s_waitcnt vmcnt(0)
@@ -1008,24 +1033,49 @@ define amdgpu_ps void @raw_ptr_buffer_load_v4i16(ptr addrspace(8) inreg %rsrc, p
; PREGFX10-NEXT: s_endpgm
;
; GFX10-LABEL: raw_ptr_buffer_load_v4i16:
-; GFX10: ; %bb.0: ; %main_body
+; GFX10: ; %bb.0:
; GFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ds_write_b64 v0, v[1:2]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: raw_ptr_buffer_load_v4i16:
-; GFX11: ; %bb.0: ; %main_body
+; GFX11: ; %bb.0:
; GFX11-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ds_store_b64 v0, v[1:2]
; GFX11-NEXT: s_endpgm
-main_body:
%val = call <4 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v4i16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
store <4 x i16> %val, ptr addrspace(3) %ptr
ret void
}
+; FIXME
+; define amdgpu_ps void @raw_ptr_buffer_load_v6i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
+; %val = call <6 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v6i16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+; store <6 x i16> %val, ptr addrspace(3) %ptr
+; ret void
+; }
+
+define amdgpu_ps void @raw_ptr_buffer_load_v8i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
+; GFX10-LABEL: raw_ptr_buffer_load_v8i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: buffer_load_dwordx4 v[1:4], off, s[0:3], 0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ds_write_b128 v0, v[1:4]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: raw_ptr_buffer_load_v8i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: ds_store_b128 v0, v[1:4]
+; GFX11-NEXT: s_endpgm
+ %val = call <8 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v8i16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+ store <8 x i16> %val, ptr addrspace(3) %ptr
+ ret void
+}
+
define amdgpu_ps void @raw_ptr_buffer_load_x1_offset_merged(ptr addrspace(8) inreg %rsrc) {
; PREGFX10-LABEL: raw_ptr_buffer_load_x1_offset_merged:
; PREGFX10: ; %bb.0: ; %main_body
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
new file mode 100644
index 0000000000000..f7f3742a90633
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11 %s
+
+; FIXME
+; define amdgpu_ps void @buffer_store_bf16(ptr addrspace(8) inreg %rsrc, bfloat %data, i32 %offset) {
+; call void @llvm.amdgcn.raw.ptr.buffer.store.bf16(bfloat %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+; ret void
+; }
+
+define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bfloat> %data, i32 %offset) {
+; GFX7-LABEL: buffer_store_v2bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: buffer_store_v2bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: buffer_store_v2bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: buffer_store_v2bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: buffer_store_v2bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 offen
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ call void @llvm.amdgcn.raw.ptr.buffer.store.v2bf16(<2 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @buffer_store_v4bf16(ptr addrspace(8) inreg %rsrc, <4 x bfloat> %data, i32 %offset) #0 {
+; GFX7-LABEL: buffer_store_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: buffer_store_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: buffer_store_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: buffer_store_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: buffer_store_v4bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 offen
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ call void @llvm.amdgcn.raw.ptr.buffer.store.v4bf16(<4 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+ ret void
+}
+
+; FIXME
+; define amdgpu_ps void @buffer_store_v6bf16(ptr addrspace(8) inreg %rsrc, <6 x bfloat> %data, i32 %offset) #0 {
+; call void @llvm.amdgcn.raw.ptr.buffer.store.v6bf16(<6 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+; ret void
+; }
+
+define amdgpu_ps void @buffer_store_v8bf16(ptr addrspace(8) inreg %rsrc, <8 x bfloat> %data, i32 %offset) #0 {
+; GFX7-LABEL: buffer_store_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16
+; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16
+; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v8, s[0:3], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX8-LABEL: buffer_store_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: buffer_store_v8bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: buffer_store_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: buffer_store_v8bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 offen
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ call void @llvm.amdgcn.raw.ptr.buffer.store.v8bf16(<8 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll
index b66cccd0b7e8a..8dd9b4ab61d4f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll
@@ -246,6 +246,31 @@ main_body:
ret void
}
+;CHECK-LABEL: {{^}}buffer_store_v8f16:
+;CHECK-NOT: s_waitcnt
+;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
+define amdgpu_ps void @buffer_store_v8f16(ptr addrspace(8) inreg %rsrc, <8 x half> %data, i32 %offset) #0 {
+main_body:
+ call void @llvm.amdgcn.raw.ptr.buffer.store.v8f16(<8 x half> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_v2bf16:
+;CHECK-NOT: s_waitcnt
+;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen
+define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bfloat> %data, i32 %offset) {
+ call void @llvm.amdgcn.raw.ptr.buffer.store.v2bf16(<2 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+ ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_v4bf16:
+;CHECK-NOT: s_waitcnt
+;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
+define amdgpu_ps void @buffer_store_v4bf16(ptr addrspace(8) inreg %rsrc, <4 x bfloat> %data, i32 %offset) #0 {
+ call void @llvm.amdgcn.raw.ptr.buffer.store.v4bf16(<4 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+ ret void
+}
+
;CHECK-LABEL: {{^}}raw_ptr_buffer_store_i16:
;CHECK-NEXT: %bb.
;CHECK-NOT: v0
@@ -276,6 +301,22 @@ main_body:
ret void
}
+; FIXME:
+; define amdgpu_ps void @buffer_store_v6i16(ptr addrspace(8) inreg %rsrc, <6 x i16> %data, i32 %offset) #0 {
+; main_body:
+; call void @llvm.amdgcn.raw.ptr.buffer.store.v6i16(<6 x i16> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+; ret void
+; }
+
+;CHECK-LABEL: {{^}}buffer_store_v8i16:
+;CHECK-NOT: s_waitcnt
+;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
+define amdgpu_ps void @buffer_store_v8i16(ptr addrspace(8) inreg %rsrc, <8 x i16> %data, i32 %offset) #0 {
+main_body:
+ call void @llvm.amdgcn.raw.ptr.buffer.store.v8i16(<8 x i16> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+ ret void
+}
+
;CHECK-LABEL: {{^}}raw_ptr_buffer_store_x1_offset_merged:
;CHECK-NOT: s_waitcnt
;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
More information about the llvm-commits
mailing list