[llvm] [AMDGPUInstCombineIntrinsic] Do not narrow 8,16-bit amdgcn_s_buffer_load intrinsics (PR #117997)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 28 04:11:09 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Juan Manuel Martinez CaamaƱo (jmmartinez)
<details>
<summary>Changes</summary>
If the memory address is out-of-range, the operation is not performed for any dwords that are out-of-range.
If we narrow a partially out-of-range `<i16x2>` load to `i16`, the `i16` load would read the memory values instead of `0`.
Orthogonally, due to a HW-bug on gfx12 we should not narrow s_buffer_loads to their 16/8-bit
variants for this platform. This is also covered by this change.
---
I still have some doubts about the following two cases:
```llvm
%data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
%shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> <i32 0, i32 1>
```
```llvm
%data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
%shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> <i32 1, i32 2>
```
Currently, we optimize them to:
```llvm
%data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
```
```llvm
%ofs.add = add i32 %ofs, 2
%data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs.add, i32 0)
```
**problem**: If the last element of the <4 x half> was out of bounds, I have the impression that the first transformation would still be valid, but not the second one. In the second case, both elements would be in-bounds, while in the original <4 x half> load the last two elements would have been considered out-of-bounds since they are in the same dword.
---
We can still narrow this:
```llvm
%data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
%elt1 = extractelement <4 x half> %data, i32 0
ret half %elt1
```
Into this (narrowing the load from <4 x half> to <2 x half> and keeping the extractelement):
```llvm
%data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
%elt1 = extractelement <2 x half> %data, i32 0
ret half %elt1
```
---
Patch is 21.23 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117997.diff
3 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp (+32-8)
- (modified) llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll (+22-19)
- (modified) llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll (+44-19)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 18a09c39a06387..4800fdb4493377 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1366,10 +1366,13 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
// and update offset.
DemandedElts = (1 << ActiveBits) - 1;
+ unsigned OffsetAdd = 0;
+ unsigned OffsetIdx;
+ unsigned SingleComponentSizeInBits =
+ IC.getDataLayout().getTypeSizeInBits(EltTy);
if (UnusedComponentsAtFront > 0) {
static const unsigned InvalidOffsetIdx = 0xf;
- unsigned OffsetIdx;
switch (II.getIntrinsicID()) {
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_ptr_buffer_load:
@@ -1397,15 +1400,36 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
if (OffsetIdx != InvalidOffsetIdx) {
// Clear demanded bits and update the offset.
DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
- auto *Offset = Args[OffsetIdx];
- unsigned SingleComponentSizeInBits =
- IC.getDataLayout().getTypeSizeInBits(EltTy);
- unsigned OffsetAdd =
- UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
- auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
- Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
+ OffsetAdd = UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
}
}
+
+ unsigned NewLoadWidthInBits = SingleComponentSizeInBits * DemandedElts.popcount();
+ if (II.getIntrinsicID() == Intrinsic::amdgcn_s_buffer_load &&
+ NewLoadWidthInBits < 32) {
+ // From the GCN gen3 manual, section 7.4 (Scalar Memory Operations /
+ // Alignment and Bounds Checking) Memory Address - If the memory
+ // address is out-of-range (clamped), the operation is not performed
+ // for any dwords that are out-of-range.
+ //
+ // If we narrow a partially out-of-range <i16x2> load to i16; the i16
+ // load would read the memory values instead of 0.
+ //
+ // Orthogonally, due to a HW-bug on gfx12 we should not narrow
+ // s_buffer_loads to their 16/8-bit variants for this platform. These
+ // instructions are still supported but the user must ensure some
+ // alignment restrictions on the buffer's stride and num-records.
+ // This case is also covered by this condition.
+ return nullptr;
+ }
+
+ if (OffsetAdd) {
+ // Modify the IR after the previous condition, otherwise inst-combine
+ // would never reach a fixed-point due to the CreateAdd
+ auto *Offset = Args[OffsetIdx];
+ auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
+ Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
+ }
} else {
// Image case.
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll
index ea6422e5ed8c2e..7578e5ecd17986 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll
@@ -1059,7 +1059,8 @@ declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) #1
define amdgpu_ps half @extract_elt0_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f16(
-; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT: [[DATA1:%.*]] = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT: [[DATA:%.*]] = extractelement <2 x half> [[DATA1]], i64 0
; CHECK-NEXT: ret half [[DATA]]
;
%data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
@@ -1069,8 +1070,8 @@ define amdgpu_ps half @extract_elt0_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i
define amdgpu_ps half @extract_elt1_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f16(
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
-; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT: [[DATA1:%.*]] = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT: [[DATA:%.*]] = extractelement <2 x half> [[DATA1]], i64 1
; CHECK-NEXT: ret half [[DATA]]
;
%data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
@@ -1080,8 +1081,8 @@ define amdgpu_ps half @extract_elt1_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i
define amdgpu_ps half @extract_elt1_s_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f16(
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
-; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT: [[DATA1:%.*]] = call <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT: [[DATA:%.*]] = extractelement <3 x half> [[DATA1]], i64 1
; CHECK-NEXT: ret half [[DATA]]
;
%data = call <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
@@ -1091,8 +1092,8 @@ define amdgpu_ps half @extract_elt1_s_buffer_load_v3f16(<4 x i32> inreg %rsrc, i
define amdgpu_ps half @extract_elt1_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f16(
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
-; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT: [[DATA1:%.*]] = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT: [[DATA:%.*]] = extractelement <4 x half> [[DATA1]], i64 1
; CHECK-NEXT: ret half [[DATA]]
;
%data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
@@ -1103,8 +1104,8 @@ define amdgpu_ps half @extract_elt1_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i
define amdgpu_ps half @extract_elt3_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f16(
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 6
-; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT: [[DATA1:%.*]] = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT: [[DATA:%.*]] = extractelement <4 x half> [[DATA1]], i64 3
; CHECK-NEXT: ret half [[DATA]]
;
%data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
@@ -1129,7 +1130,8 @@ declare <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32>, i32, i32) #1
define amdgpu_ps i8 @extract_elt0_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
; CHECK-LABEL: @extract_elt0_s_buffer_load_v2i8(
-; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT: [[DATA1:%.*]] = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT: [[DATA:%.*]] = extractelement <2 x i8> [[DATA1]], i64 0
; CHECK-NEXT: ret i8 [[DATA]]
;
%data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
@@ -1139,8 +1141,8 @@ define amdgpu_ps i8 @extract_elt0_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32
define amdgpu_ps i8 @extract_elt1_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
; CHECK-LABEL: @extract_elt1_s_buffer_load_v2i8(
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
-; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT: [[DATA1:%.*]] = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT: [[DATA:%.*]] = extractelement <2 x i8> [[DATA1]], i64 1
; CHECK-NEXT: ret i8 [[DATA]]
;
%data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
@@ -1150,8 +1152,8 @@ define amdgpu_ps i8 @extract_elt1_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32
define amdgpu_ps i8 @extract_elt1_s_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
; CHECK-LABEL: @extract_elt1_s_buffer_load_v3i8(
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
-; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT: [[DATA1:%.*]] = call <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT: [[DATA:%.*]] = extractelement <3 x i8> [[DATA1]], i64 1
; CHECK-NEXT: ret i8 [[DATA]]
;
%data = call <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
@@ -1161,8 +1163,8 @@ define amdgpu_ps i8 @extract_elt1_s_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32
define amdgpu_ps i8 @extract_elt1_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
; CHECK-LABEL: @extract_elt1_s_buffer_load_v4i8(
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
-; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT: [[DATA1:%.*]] = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT: [[DATA:%.*]] = extractelement <4 x i8> [[DATA1]], i64 1
; CHECK-NEXT: ret i8 [[DATA]]
;
%data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
@@ -1172,8 +1174,8 @@ define amdgpu_ps i8 @extract_elt1_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32
define amdgpu_ps i8 @extract_elt3_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
; CHECK-LABEL: @extract_elt3_s_buffer_load_v4i8(
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 3
-; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT: [[DATA1:%.*]] = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT: [[DATA:%.*]] = extractelement <4 x i8> [[DATA1]], i64 3
; CHECK-NEXT: ret i8 [[DATA]]
;
%data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
@@ -1183,7 +1185,8 @@ define amdgpu_ps i8 @extract_elt3_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32
define amdgpu_ps <2 x i8> @extract_elt0_elt1_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4i8(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT: [[DATA1:%.*]] = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT: [[DATA:%.*]] = shufflevector <4 x i8> [[DATA1]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
; CHECK-NEXT: ret <2 x i8> [[DATA]]
;
%data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
index 66184405f30922..e5c046d5416f72 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
@@ -1059,7 +1059,8 @@ declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) #1
define amdgpu_ps half @extract_elt0_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f16(
-; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT: [[DATA1:%.*]] = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT: [[DATA:%.*]] = extractelement <2 x half> [[DATA1]], i64 0
; CHECK-NEXT: ret half [[DATA]]
;
%data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
@@ -1069,8 +1070,8 @@ define amdgpu_ps half @extract_elt0_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i
define amdgpu_ps half @extract_elt1_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f16(
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
-; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT: [[DATA1:%.*]] = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT: [[DATA:%.*]] = extractelement <2 x half> [[DATA1]], i64 1
; CHECK-NEXT: ret half [[DATA]]
;
%data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
@@ -1080,8 +1081,8 @@ define amdgpu_ps half @extract_elt1_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i
define amdgpu_ps half @extract_elt1_s_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f16(
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
-; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT: [[DATA1:%.*]] = call <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT: [[DATA:%.*]] = extractelement <3 x half> [[DATA1]], i64 1
; CHECK-NEXT: ret half [[DATA]]
;
%data = call <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
@@ -1091,8 +1092,8 @@ define amdgpu_ps half @extract_elt1_s_buffer_load_v3f16(<4 x i32> inreg %rsrc, i
define amdgpu_ps half @extract_elt1_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f16(
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
-; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT: [[DATA1:%.*]] = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT: [[DATA:%.*]] = extractelement <4 x half> [[DATA1]], i64 1
; CHECK-NEXT: ret half [[DATA]]
;
%data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
@@ -1103,8 +1104,8 @@ define amdgpu_ps half @extract_elt1_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i
define amdgpu_ps half @extract_elt3_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f16(
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 6
-; CHECK-NEXT: [[DATA:%.*]] = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT: [[DATA1:%.*]] = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT: [[DATA:%.*]] = extractelement <4 x half> [[DATA1]], i64 3
; CHECK-NEXT: ret half [[DATA]]
;
%data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
@@ -1122,6 +1123,28 @@ define amdgpu_ps <2 x half> @extract_elt0_elt1_s_buffer_load_v4f16(<4 x i32> inr
ret <2 x half> %shuf
}
+define amdgpu_ps <2 x half> @extract_elt1_elt2_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v4f16(
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 2
+; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT: ret <2 x half> [[DATA]]
+;
+ %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
+ %shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> <i32 1, i32 2>
+ ret <2 x half> %shuf
+}
+
+define amdgpu_ps <2 x half> @extract_elt2_elt3_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
+; CHECK-LABEL: @extract_elt2_elt3_s_buffer_load_v4f16(
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 4
+; CHECK-NEXT: [[DATA:%.*]] = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT: ret <2 x half> [[DATA]]
+;
+ %data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
+ %shuf = shufflevector <4 x half> %data, <4 x half> poison, <2 x i32> <i32 2, i32 3>
+ ret <2 x half> %shuf
+}
+
declare half @llvm.amdgcn.s.buffer.load.f16(<4 x i32>, i32, i32) #1
declare <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32>, i32, i32) #1
declare <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(<4 x i32>, i32, i32) #1
@@ -1129,7 +1152,8 @@ declare <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32>, i32, i32) #1
define amdgpu_ps i8 @extract_elt0_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
; CHECK-LABEL: @extract_elt0_s_buffer_load_v2i8(
-; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT: [[DATA1:%.*]] = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT: [[DATA:%.*]] = extractelement <2 x i8> [[DATA1]], i64 0
; CHECK-NEXT: ret i8 [[DATA]]
;
%data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
@@ -1139,8 +1163,8 @@ define amdgpu_ps i8 @extract_elt0_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32
define amdgpu_ps i8 @extract_elt1_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
; CHECK-LABEL: @extract_elt1_s_buffer_load_v2i8(
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
-; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT: [[DATA1:%.*]] = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT: [[DATA:%.*]] = extractelement <2 x i8> [[DATA1]], i64 1
; CHECK-NEXT: ret i8 [[DATA]]
;
%data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
@@ -1150,8 +1174,8 @@ define amdgpu_ps i8 @extract_elt1_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32
define amdgpu_ps i8 @extract_elt1_s_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
; CHECK-LABEL: @extract_elt1_s_buffer_load_v3i8(
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
-; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT: [[DATA1:%.*]] = call <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT: [[DATA:%.*]] = extractelement <3 x i8> [[DATA1]], i64 1
; CHECK-NEXT: ret i8 [[DATA]]
;
%data = call <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
@@ -1161,8 +1185,8 @@ define amdgpu_ps i8 @extract_elt1_s_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32
define amdgpu_ps i8 @extract_elt1_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
; CHECK-LABEL: @extract_elt1_s_buffer_load_v4i8(
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFS:%.*]], 1
-; CHECK-NEXT: [[DATA:%.*]] = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> [[RSRC:%.*]], i32 [[TMP1]], i32 0)
+; CHECK-NEXT: [[DATA1:%.*]] = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 0)
+; CHECK-NEXT: [[DATA:%.*]] = extractelement <4 x i8> [[DATA1]], i64 1
; CHECK-NEXT: ret i8 [[DATA]]
;
%data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
@@ -1172,8 +1196,8 @@ define amdgpu_ps i8 @extract_elt1_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32
define amdgpu_ps i8 @extract_elt3_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
; CHECK-LABEL: @extract_elt3_s_buffer_load_v4i8(
-; ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/117997
More information about the llvm-commits
mailing list