[llvm] AMDGPU: Fix buffer intrinsic handling for various 16-bit elements. (PR #95376)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 13 02:17:15 PDT 2024


https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/95376

Mostly fixes handling of bfloat vectors, but also some missing
i16 cases.

>From d83b78299c907f5a7cb3f52a612356ff84dea467 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 8 Jun 2024 11:00:59 +0200
Subject: [PATCH] AMDGPU: Fix buffer intrinsic handling for various 16-bit
 elements.

Mostly fixes handling of bfloat vectors, but also some missing
i16 cases.
---
 llvm/lib/Target/AMDGPU/BUFInstructions.td     |   8 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  17 +-
 .../llvm.amdgcn.raw.ptr.buffer.load.bf16.ll   | 183 ++++++++++++++++++
 .../AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll |  66 ++++++-
 .../llvm.amdgcn.raw.ptr.buffer.store.bf16.ll  | 139 +++++++++++++
 .../llvm.amdgcn.raw.ptr.buffer.store.ll       |  41 ++++
 6 files changed, 439 insertions(+), 15 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll

diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 496548382d528..94dd45f1333b0 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1425,19 +1425,23 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i32, "BUFFER_LOAD_DWORD">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i16, "BUFFER_LOAD_DWORD">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f16, "BUFFER_LOAD_DWORD">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2bf16, "BUFFER_LOAD_DWORD">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i32, "BUFFER_LOAD_DWORDX2">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i16, "BUFFER_LOAD_DWORDX2">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f16, "BUFFER_LOAD_DWORDX2">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i64, "BUFFER_LOAD_DWORDX2">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f64, "BUFFER_LOAD_DWORDX2">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4bf16, "BUFFER_LOAD_DWORDX2">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3f32, "BUFFER_LOAD_DWORDX3">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3i32, "BUFFER_LOAD_DWORDX3">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i32, "BUFFER_LOAD_DWORDX4">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i64, "BUFFER_LOAD_DWORDX4">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f64, "BUFFER_LOAD_DWORDX4">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v8i16, "BUFFER_LOAD_DWORDX4">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v8f16, "BUFFER_LOAD_DWORDX4">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v8bf16, "BUFFER_LOAD_DWORDX4">;
 
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_byte, i32, "BUFFER_LOAD_SBYTE">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_short, i32, "BUFFER_LOAD_SSHORT">;
@@ -1532,12 +1536,14 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, i32, "BUFFER_STORE_DWORD">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i16, "BUFFER_STORE_DWORD">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f16, "BUFFER_STORE_DWORD">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2bf16, "BUFFER_STORE_DWORD">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i32, "BUFFER_STORE_DWORDX2">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, i64, "BUFFER_STORE_DWORDX2">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f64, "BUFFER_STORE_DWORDX2">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i16, "BUFFER_STORE_DWORDX2">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f16, "BUFFER_STORE_DWORDX2">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4bf16, "BUFFER_STORE_DWORDX2">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3f32, "BUFFER_STORE_DWORDX3">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3i32, "BUFFER_STORE_DWORDX3">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
@@ -1545,6 +1551,8 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i32, "BUFFER_STORE_DWORDX4">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i64, "BUFFER_STORE_DWORDX4">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f64, "BUFFER_STORE_DWORDX4">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v8f16, "BUFFER_STORE_DWORDX4">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v8i16, "BUFFER_STORE_DWORDX4">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v8bf16, "BUFFER_STORE_DWORDX4">;
 
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_byte, i32, "BUFFER_STORE_BYTE">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6f9c88e617617..4946129c65a95 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -859,19 +859,22 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::INTRINSIC_WO_CHAIN,
                      {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
-                      MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8},
+                      MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
+                      MVT::i8},
                      Custom);
 
   setOperationAction(ISD::INTRINSIC_W_CHAIN,
-                     {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
-                      MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
-                      MVT::i16, MVT::i8, MVT::i128},
+                     {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
+                      MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
+                      MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
+                      MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
                      Custom);
 
   setOperationAction(ISD::INTRINSIC_VOID,
-                     {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
-                      MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
-                      MVT::i8, MVT::i128},
+                     {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
+                      MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
+                      MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
+                      MVT::f16, MVT::i16, MVT::i8, MVT::i128},
                      Custom);
 
   setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll
new file mode 100644
index 0000000000000..3c800d0369e70
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll
@@ -0,0 +1,183 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11 %s
+
+define bfloat @raw_ptr_buffer_load_bf16(ptr addrspace(8) inreg %rsrc) {
+; GFX7-LABEL: raw_ptr_buffer_load_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: raw_ptr_buffer_load_bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: raw_ptr_buffer_load_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: raw_ptr_buffer_load_bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: raw_ptr_buffer_load_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_load_u16 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = call bfloat @llvm.amdgcn.raw.ptr.buffer.load.v2bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+  ret bfloat %val
+}
+
+define <2 x bfloat> @raw_ptr_buffer_load_v2bf16(ptr addrspace(8) inreg %rsrc) {
+; GFX7-LABEL: raw_ptr_buffer_load_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_load_dword v1, off, s[4:7], 0
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: raw_ptr_buffer_load_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: raw_ptr_buffer_load_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: raw_ptr_buffer_load_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: raw_ptr_buffer_load_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = call <2 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v2bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+  ret <2 x bfloat> %val
+}
+
+define <4 x bfloat> @raw_ptr_buffer_load_v4bf16(ptr addrspace(8) inreg %rsrc) {
+; GFX7-LABEL: raw_ptr_buffer_load_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: raw_ptr_buffer_load_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: raw_ptr_buffer_load_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: raw_ptr_buffer_load_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: raw_ptr_buffer_load_v4bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = call <4 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v4bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+  ret <4 x bfloat> %val
+}
+
+; FIXME
+; define <6 x bfloat> @raw_ptr_buffer_load_v6bf16(ptr addrspace(8) inreg %rsrc) {
+;   %val = call <6 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v6bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+;   ret <6 x bfloat> %val
+; }
+
+define <8 x bfloat> @raw_ptr_buffer_load_v8bf16(ptr addrspace(8) inreg %rsrc) {
+; GFX7-LABEL: raw_ptr_buffer_load_v8bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: raw_ptr_buffer_load_v8bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: raw_ptr_buffer_load_v8bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: raw_ptr_buffer_load_v8bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: raw_ptr_buffer_load_v8bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %val = call <8 x bfloat> @llvm.amdgcn.raw.ptr.buffer.load.v8bf16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+  ret <8 x bfloat> %val
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
index 07a5b511f2dc8..3e3371091ef72 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll
@@ -944,7 +944,7 @@ main_body:
 
 define amdgpu_ps void @raw_ptr_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
 ; PREGFX10-LABEL: raw_ptr_buffer_load_v4f16:
-; PREGFX10:       ; %bb.0: ; %main_body
+; PREGFX10:       ; %bb.0:
 ; PREGFX10-NEXT:    buffer_load_dwordx2 v[1:2], off, s[0:3], 0
 ; PREGFX10-NEXT:    s_mov_b32 m0, -1
 ; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -952,24 +952,49 @@ define amdgpu_ps void @raw_ptr_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, p
 ; PREGFX10-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: raw_ptr_buffer_load_v4f16:
-; GFX10:       ; %bb.0: ; %main_body
+; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    buffer_load_dwordx2 v[1:2], off, s[0:3], 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ds_write_b64 v0, v[1:2]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: raw_ptr_buffer_load_v4f16:
-; GFX11:       ; %bb.0: ; %main_body
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ds_store_b64 v0, v[1:2]
 ; GFX11-NEXT:    s_endpgm
-main_body:
   %val = call <4 x half> @llvm.amdgcn.raw.ptr.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
   store <4 x half> %val, ptr addrspace(3) %ptr
   ret void
 }
 
+; FIXME
+; define amdgpu_ps void @raw_ptr_buffer_load_v6f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
+;   %val = call <6 x half> @llvm.amdgcn.raw.ptr.buffer.load.v6f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+;   store <6 x half> %val, ptr addrspace(3) %ptr
+;   ret void
+; }
+
+define amdgpu_ps void @raw_ptr_buffer_load_v8f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
+; GFX10-LABEL: raw_ptr_buffer_load_v8f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    buffer_load_dwordx4 v[1:4], off, s[0:3], 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    ds_write_b128 v0, v[1:4]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: raw_ptr_buffer_load_v8f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    buffer_load_b128 v[1:4], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    ds_store_b128 v0, v[1:4]
+; GFX11-NEXT:    s_endpgm
+  %val = call <8 x half> @llvm.amdgcn.raw.ptr.buffer.load.v8f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+  store <8 x half> %val, ptr addrspace(3) %ptr
+  ret void
+}
+
 define amdgpu_ps void @raw_ptr_buffer_load_v2i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
 ; PREGFX10-LABEL: raw_ptr_buffer_load_v2i16:
 ; PREGFX10:       ; %bb.0: ; %main_body
@@ -1000,7 +1025,7 @@ main_body:
 
 define amdgpu_ps void @raw_ptr_buffer_load_v4i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
 ; PREGFX10-LABEL: raw_ptr_buffer_load_v4i16:
-; PREGFX10:       ; %bb.0: ; %main_body
+; PREGFX10:       ; %bb.0:
 ; PREGFX10-NEXT:    buffer_load_dwordx2 v[1:2], off, s[0:3], 0
 ; PREGFX10-NEXT:    s_mov_b32 m0, -1
 ; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
@@ -1008,24 +1033,49 @@ define amdgpu_ps void @raw_ptr_buffer_load_v4i16(ptr addrspace(8) inreg %rsrc, p
 ; PREGFX10-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: raw_ptr_buffer_load_v4i16:
-; GFX10:       ; %bb.0: ; %main_body
+; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    buffer_load_dwordx2 v[1:2], off, s[0:3], 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ds_write_b64 v0, v[1:2]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: raw_ptr_buffer_load_v4i16:
-; GFX11:       ; %bb.0: ; %main_body
+; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    ds_store_b64 v0, v[1:2]
 ; GFX11-NEXT:    s_endpgm
-main_body:
   %val = call <4 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v4i16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
   store <4 x i16> %val, ptr addrspace(3) %ptr
   ret void
 }
 
+; FIXME
+; define amdgpu_ps void @raw_ptr_buffer_load_v6i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
+;   %val = call <6 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v6i16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+;   store <6 x i16> %val, ptr addrspace(3) %ptr
+;   ret void
+; }
+
+define amdgpu_ps void @raw_ptr_buffer_load_v8i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) {
+; GFX10-LABEL: raw_ptr_buffer_load_v8i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    buffer_load_dwordx4 v[1:4], off, s[0:3], 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    ds_write_b128 v0, v[1:4]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: raw_ptr_buffer_load_v8i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    buffer_load_b128 v[1:4], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    ds_store_b128 v0, v[1:4]
+; GFX11-NEXT:    s_endpgm
+  %val = call <8 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v8i16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+  store <8 x i16> %val, ptr addrspace(3) %ptr
+  ret void
+}
+
 define amdgpu_ps void @raw_ptr_buffer_load_x1_offset_merged(ptr addrspace(8) inreg %rsrc) {
 ; PREGFX10-LABEL: raw_ptr_buffer_load_x1_offset_merged:
 ; PREGFX10:       ; %bb.0: ; %main_body
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
new file mode 100644
index 0000000000000..f7f3742a90633
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11 %s
+
+; FIXME
+; define amdgpu_ps void @buffer_store_bf16(ptr addrspace(8) inreg %rsrc, bfloat %data, i32 %offset) {
+;   call void @llvm.amdgcn.raw.ptr.buffer.store.bf16(bfloat %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+;   ret void
+; }
+
+define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bfloat> %data, i32 %offset) {
+; GFX7-LABEL: buffer_store_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: buffer_store_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: buffer_store_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: buffer_store_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    buffer_store_b32 v0, v1, s[0:3], 0 offen
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  call void @llvm.amdgcn.raw.ptr.buffer.store.v2bf16(<2 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_store_v4bf16(ptr addrspace(8) inreg %rsrc, <4 x bfloat> %data, i32 %offset) #0 {
+; GFX7-LABEL: buffer_store_v4bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_alignbit_b32 v2, v3, v2, 16
+; GFX7-NEXT:    v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT:    buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: buffer_store_v4bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: buffer_store_v4bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: buffer_store_v4bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_v4bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], 0 offen
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  call void @llvm.amdgcn.raw.ptr.buffer.store.v4bf16(<4 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+  ret void
+}
+
+; FIXME
+; define amdgpu_ps void @buffer_store_v6bf16(ptr addrspace(8) inreg %rsrc, <6 x bfloat> %data, i32 %offset) #0 {
+;   call void @llvm.amdgcn.raw.ptr.buffer.store.v6bf16(<6 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+;   ret void
+; }
+
+define amdgpu_ps void @buffer_store_v8bf16(ptr addrspace(8) inreg %rsrc, <8 x bfloat> %data, i32 %offset) #0 {
+; GFX7-LABEL: buffer_store_v8bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_alignbit_b32 v6, v7, v6, 16
+; GFX7-NEXT:    v_alignbit_b32 v5, v5, v4, 16
+; GFX7-NEXT:    v_alignbit_b32 v4, v3, v2, 16
+; GFX7-NEXT:    v_alignbit_b32 v3, v1, v0, 16
+; GFX7-NEXT:    buffer_store_dwordx4 v[3:6], v8, s[0:3], 0 offen
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: buffer_store_v8bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: buffer_store_v8bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: buffer_store_v8bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_v8bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    buffer_store_b128 v[0:3], v4, s[0:3], 0 offen
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  call void @llvm.amdgcn.raw.ptr.buffer.store.v8bf16(<8 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll
index b66cccd0b7e8a..8dd9b4ab61d4f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll
@@ -246,6 +246,31 @@ main_body:
   ret void
 }
 
+;CHECK-LABEL: {{^}}buffer_store_v8f16:
+;CHECK-NOT: s_waitcnt
+;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
+define amdgpu_ps void @buffer_store_v8f16(ptr addrspace(8) inreg %rsrc, <8 x half> %data, i32 %offset) #0 {
+main_body:
+  call void @llvm.amdgcn.raw.ptr.buffer.store.v8f16(<8 x half> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_v2bf16:
+;CHECK-NOT: s_waitcnt
+;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen
+define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bfloat> %data, i32 %offset) {
+  call void @llvm.amdgcn.raw.ptr.buffer.store.v2bf16(<2 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_v4bf16:
+;CHECK-NOT: s_waitcnt
+;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
+define amdgpu_ps void @buffer_store_v4bf16(ptr addrspace(8) inreg %rsrc, <4 x bfloat> %data, i32 %offset) #0 {
+  call void @llvm.amdgcn.raw.ptr.buffer.store.v4bf16(<4 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+  ret void
+}
+
 ;CHECK-LABEL: {{^}}raw_ptr_buffer_store_i16:
 ;CHECK-NEXT: %bb.
 ;CHECK-NOT: v0
@@ -276,6 +301,22 @@ main_body:
   ret void
 }
 
+; FIXME:
+; define amdgpu_ps void @buffer_store_v6i16(ptr addrspace(8) inreg %rsrc, <6 x i16> %data, i32 %offset) #0 {
+; main_body:
+;   call void @llvm.amdgcn.raw.ptr.buffer.store.v6i16(<6 x i16> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+;   ret void
+; }
+
+;CHECK-LABEL: {{^}}buffer_store_v8i16:
+;CHECK-NOT: s_waitcnt
+;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
+define amdgpu_ps void @buffer_store_v8i16(ptr addrspace(8) inreg %rsrc, <8 x i16> %data, i32 %offset) #0 {
+main_body:
+  call void @llvm.amdgcn.raw.ptr.buffer.store.v8i16(<8 x i16> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
+  ret void
+}
+
 ;CHECK-LABEL: {{^}}raw_ptr_buffer_store_x1_offset_merged:
 ;CHECK-NOT: s_waitcnt
 ;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4



More information about the llvm-commits mailing list