[llvm] [AMDGPU] Support D16 folding for image.sample with multiple extractelement and fptrunc users (PR #141758)
via llvm-commits
llvm-commits at lists.llvm.org
Wed May 28 06:16:23 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
@llvm/pr-subscribers-backend-amdgpu
Author: Harrison Hao (harrisonGPU)
<details>
<summary>Changes</summary>
Now we only support D16 folding for `image sample` instructions with a single user: a `fptrunc` to half.
However, we can actually support D16 folding for image.sample instructions with multiple users, as long as each user follows the pattern of extractelement followed by fptrunc to half.
For example:
```
%sample = call <4 x float> @<!-- -->llvm.amdgcn.image.sample
%e0 = extractelement <4 x float> %sample, i32 0
%h0 = fptrunc float %e0 to half
%e1 = extractelement <4 x float> %sample, i32 1
%h1 = fptrunc float %e1 to half
%e2 = extractelement <4 x float> %sample, i32 2
%h2 = fptrunc float %e2 to half
```
This change enables D16 folding for such cases and avoids generating `v_cvt_f16_f32_e32` instructions.
---
Patch is 37.95 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/141758.diff
2 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp (+62)
- (modified) llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll (+415-2)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 5f6ab24182d5e..2ff1501e2b977 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -269,6 +269,68 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
ArgTys[0] = User->getType();
});
}
+ } else {
+ // Only perform D16 folding if every user of the image sample is
+ // an ExtractElementInst immediately followed by an FPTrunc to half.
+ SmallVector<ExtractElementInst *, 4> Extracts;
+ SmallVector<FPTruncInst *, 4> Truncs;
+ bool AllHalfExtracts = true;
+
+ for (User *U : II.users()) {
+ auto *Ext = dyn_cast<ExtractElementInst>(U);
+ if (!Ext || !Ext->hasOneUse()) {
+ AllHalfExtracts = false;
+ break;
+ }
+ auto *Tr = dyn_cast<FPTruncInst>(*Ext->user_begin());
+ if (!Tr || !Tr->getType()->getScalarType()->isHalfTy()) {
+ AllHalfExtracts = false;
+ break;
+ }
+ Extracts.push_back(Ext);
+ Truncs.push_back(Tr);
+ }
+
+ if (AllHalfExtracts && !Extracts.empty()) {
+ auto *VecTy = cast<VectorType>(II.getType());
+ unsigned NElts = VecTy->getElementCount().getKnownMinValue();
+ Type *HalfVecTy =
+ VectorType::get(Type::getHalfTy(II.getContext()), NElts, false);
+
+ // Obtain the original image sample intrinsic's signature
+ // and replace its return type with the half-vector for D16 folding
+ SmallVector<Type *, 8> SigTys;
+ if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), SigTys))
+ return nullptr;
+ SigTys[0] = HalfVecTy;
+
+ Module *M = II.getModule();
+ Function *HalfDecl =
+ Intrinsic::getOrInsertDeclaration(M, ImageDimIntr->Intr, SigTys);
+
+ II.mutateType(HalfVecTy);
+ II.setCalledFunction(HalfDecl);
+
+ IRBuilder<> Builder(&II);
+ for (auto [lane, Ext] : enumerate(Extracts)) {
+ FPTruncInst *Tr = Truncs[lane];
+ Value *Idx = Ext->getIndexOperand();
+
+ Builder.SetInsertPoint(Tr);
+
+ Value *HalfExtract = Builder.CreateExtractElement(&II, Idx);
+ HalfExtract->takeName(Tr);
+
+ Tr->replaceAllUsesWith(HalfExtract);
+ }
+
+ for (auto *T : Truncs)
+ IC.eraseInstFromFunction(*T);
+ for (auto *E : Extracts)
+ IC.eraseInstFromFunction(*E);
+
+ return &II;
+ }
}
}
}
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll b/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll
index 30431ad724843..d39cceb9b549a 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll
@@ -1,8 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx700 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX7 %s
; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx810 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s
-; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx900 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s
-; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx1010 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s
+; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx900 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX9 %s
+; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx1010 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX10PLUS %s
+; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx1100 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX11 %s
define amdgpu_ps half @image_sample_2d_fptrunc_to_d16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
; GFX7-LABEL: @image_sample_2d_fptrunc_to_d16(
@@ -16,6 +17,21 @@ define amdgpu_ps half @image_sample_2d_fptrunc_to_d16(<8 x i32> inreg %rsrc, <4
; GFX81PLUS-NEXT: [[TEX:%.*]] = call half @llvm.amdgcn.image.sample.lz.2d.f16.f32.v8i32.v4i32(i32 1, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
; GFX81PLUS-NEXT: ret half [[TEX]]
;
+; GFX9-LABEL: @image_sample_2d_fptrunc_to_d16(
+; GFX9-NEXT: main_body:
+; GFX9-NEXT: [[TEX:%.*]] = call half @llvm.amdgcn.image.sample.lz.2d.f16.f32.v8i32.v4i32(i32 1, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX9-NEXT: ret half [[TEX]]
+;
+; GFX10PLUS-LABEL: @image_sample_2d_fptrunc_to_d16(
+; GFX10PLUS-NEXT: main_body:
+; GFX10PLUS-NEXT: [[TEX:%.*]] = call half @llvm.amdgcn.image.sample.lz.2d.f16.f32.v8i32.v4i32(i32 1, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX10PLUS-NEXT: ret half [[TEX]]
+;
+; GFX11-LABEL: @image_sample_2d_fptrunc_to_d16(
+; GFX11-NEXT: main_body:
+; GFX11-NEXT: [[TEX:%.*]] = call half @llvm.amdgcn.image.sample.lz.2d.f16.f32.v8i32.v4i32(i32 1, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX11-NEXT: ret half [[TEX]]
+;
main_body:
%tex = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
%tex_half = fptrunc float %tex to half
@@ -40,6 +56,30 @@ define amdgpu_ps half @image_sample_2d_v2f32(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX81PLUS-NEXT: [[ADDF_SUM_0:%.*]] = fadd half [[TEX_HALF_0]], [[TEX_HALF_1]]
; GFX81PLUS-NEXT: ret half [[ADDF_SUM_0]]
;
+; GFX9-LABEL: @image_sample_2d_v2f32(
+; GFX9-NEXT: main_body:
+; GFX9-NEXT: [[TEX:%.*]] = call <2 x half> @llvm.amdgcn.image.sample.lz.2d.v2f16.f32.v8i32.v4i32(i32 3, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX9-NEXT: [[TEX_HALF_0:%.*]] = extractelement <2 x half> [[TEX]], i64 0
+; GFX9-NEXT: [[TEX_HALF_1:%.*]] = extractelement <2 x half> [[TEX]], i64 1
+; GFX9-NEXT: [[ADDF_SUM_0:%.*]] = fadd half [[TEX_HALF_0]], [[TEX_HALF_1]]
+; GFX9-NEXT: ret half [[ADDF_SUM_0]]
+;
+; GFX10PLUS-LABEL: @image_sample_2d_v2f32(
+; GFX10PLUS-NEXT: main_body:
+; GFX10PLUS-NEXT: [[TEX:%.*]] = call <2 x half> @llvm.amdgcn.image.sample.lz.2d.v2f16.f32.v8i32.v4i32(i32 3, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX10PLUS-NEXT: [[TEX_HALF_0:%.*]] = extractelement <2 x half> [[TEX]], i64 0
+; GFX10PLUS-NEXT: [[TEX_HALF_1:%.*]] = extractelement <2 x half> [[TEX]], i64 1
+; GFX10PLUS-NEXT: [[ADDF_SUM_0:%.*]] = fadd half [[TEX_HALF_0]], [[TEX_HALF_1]]
+; GFX10PLUS-NEXT: ret half [[ADDF_SUM_0]]
+;
+; GFX11-LABEL: @image_sample_2d_v2f32(
+; GFX11-NEXT: main_body:
+; GFX11-NEXT: [[TEX:%.*]] = call <2 x half> @llvm.amdgcn.image.sample.lz.2d.v2f16.f32.v8i32.v4i32(i32 3, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX11-NEXT: [[TEX_HALF_0:%.*]] = extractelement <2 x half> [[TEX]], i64 0
+; GFX11-NEXT: [[TEX_HALF_1:%.*]] = extractelement <2 x half> [[TEX]], i64 1
+; GFX11-NEXT: [[ADDF_SUM_0:%.*]] = fadd half [[TEX_HALF_0]], [[TEX_HALF_1]]
+; GFX11-NEXT: ret half [[ADDF_SUM_0]]
+;
main_body:
%tex = call <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32.v8i32.v4i32(i32 3, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
%tex_2_half = fptrunc <2 x float> %tex to <2 x half>
@@ -71,6 +111,36 @@ define amdgpu_ps half @image_sample_2d_v3f32(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX81PLUS-NEXT: [[ADDF_SUM_1:%.*]] = fadd half [[ADDF_SUM_0]], [[TEX_HALF_2]]
; GFX81PLUS-NEXT: ret half [[ADDF_SUM_1]]
;
+; GFX9-LABEL: @image_sample_2d_v3f32(
+; GFX9-NEXT: main_body:
+; GFX9-NEXT: [[TEX:%.*]] = call <3 x half> @llvm.amdgcn.image.sample.lz.2d.v3f16.f32.v8i32.v4i32(i32 7, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX9-NEXT: [[TEX_HALF_0:%.*]] = extractelement <3 x half> [[TEX]], i64 0
+; GFX9-NEXT: [[TEX_HALF_1:%.*]] = extractelement <3 x half> [[TEX]], i64 1
+; GFX9-NEXT: [[TEX_HALF_2:%.*]] = extractelement <3 x half> [[TEX]], i64 2
+; GFX9-NEXT: [[ADDF_SUM_0:%.*]] = fadd half [[TEX_HALF_0]], [[TEX_HALF_1]]
+; GFX9-NEXT: [[ADDF_SUM_1:%.*]] = fadd half [[ADDF_SUM_0]], [[TEX_HALF_2]]
+; GFX9-NEXT: ret half [[ADDF_SUM_1]]
+;
+; GFX10PLUS-LABEL: @image_sample_2d_v3f32(
+; GFX10PLUS-NEXT: main_body:
+; GFX10PLUS-NEXT: [[TEX:%.*]] = call <3 x half> @llvm.amdgcn.image.sample.lz.2d.v3f16.f32.v8i32.v4i32(i32 7, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX10PLUS-NEXT: [[TEX_HALF_0:%.*]] = extractelement <3 x half> [[TEX]], i64 0
+; GFX10PLUS-NEXT: [[TEX_HALF_1:%.*]] = extractelement <3 x half> [[TEX]], i64 1
+; GFX10PLUS-NEXT: [[TEX_HALF_2:%.*]] = extractelement <3 x half> [[TEX]], i64 2
+; GFX10PLUS-NEXT: [[ADDF_SUM_0:%.*]] = fadd half [[TEX_HALF_0]], [[TEX_HALF_1]]
+; GFX10PLUS-NEXT: [[ADDF_SUM_1:%.*]] = fadd half [[ADDF_SUM_0]], [[TEX_HALF_2]]
+; GFX10PLUS-NEXT: ret half [[ADDF_SUM_1]]
+;
+; GFX11-LABEL: @image_sample_2d_v3f32(
+; GFX11-NEXT: main_body:
+; GFX11-NEXT: [[TEX:%.*]] = call <3 x half> @llvm.amdgcn.image.sample.lz.2d.v3f16.f32.v8i32.v4i32(i32 7, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX11-NEXT: [[TEX_HALF_0:%.*]] = extractelement <3 x half> [[TEX]], i64 0
+; GFX11-NEXT: [[TEX_HALF_1:%.*]] = extractelement <3 x half> [[TEX]], i64 1
+; GFX11-NEXT: [[TEX_HALF_2:%.*]] = extractelement <3 x half> [[TEX]], i64 2
+; GFX11-NEXT: [[ADDF_SUM_0:%.*]] = fadd half [[TEX_HALF_0]], [[TEX_HALF_1]]
+; GFX11-NEXT: [[ADDF_SUM_1:%.*]] = fadd half [[ADDF_SUM_0]], [[TEX_HALF_2]]
+; GFX11-NEXT: ret half [[ADDF_SUM_1]]
+;
main_body:
%tex = call <3 x float> @llvm.amdgcn.image.sample.lz.2d.v3f32.f32.v8i32.v4i32(i32 7, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
%tex_3_half = fptrunc <3 x float> %tex to <3 x half>
@@ -108,6 +178,42 @@ define amdgpu_ps half @image_sample_2d_v4f32(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX81PLUS-NEXT: [[ADDF_SUM_2:%.*]] = fadd half [[ADDF_SUM_0]], [[ADDF_SUM_1]]
; GFX81PLUS-NEXT: ret half [[ADDF_SUM_2]]
;
+; GFX9-LABEL: @image_sample_2d_v4f32(
+; GFX9-NEXT: main_body:
+; GFX9-NEXT: [[TEX:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.2d.v4f16.f32.v8i32.v4i32(i32 15, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX9-NEXT: [[TEX_HALF_0:%.*]] = extractelement <4 x half> [[TEX]], i64 0
+; GFX9-NEXT: [[TEX_HALF_1:%.*]] = extractelement <4 x half> [[TEX]], i64 1
+; GFX9-NEXT: [[TEX_HALF_2:%.*]] = extractelement <4 x half> [[TEX]], i64 2
+; GFX9-NEXT: [[TEX_HALF_3:%.*]] = extractelement <4 x half> [[TEX]], i64 3
+; GFX9-NEXT: [[ADDF_SUM_0:%.*]] = fadd half [[TEX_HALF_0]], [[TEX_HALF_1]]
+; GFX9-NEXT: [[ADDF_SUM_1:%.*]] = fadd half [[TEX_HALF_2]], [[TEX_HALF_3]]
+; GFX9-NEXT: [[ADDF_SUM_2:%.*]] = fadd half [[ADDF_SUM_0]], [[ADDF_SUM_1]]
+; GFX9-NEXT: ret half [[ADDF_SUM_2]]
+;
+; GFX10PLUS-LABEL: @image_sample_2d_v4f32(
+; GFX10PLUS-NEXT: main_body:
+; GFX10PLUS-NEXT: [[TEX:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.2d.v4f16.f32.v8i32.v4i32(i32 15, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX10PLUS-NEXT: [[TEX_HALF_0:%.*]] = extractelement <4 x half> [[TEX]], i64 0
+; GFX10PLUS-NEXT: [[TEX_HALF_1:%.*]] = extractelement <4 x half> [[TEX]], i64 1
+; GFX10PLUS-NEXT: [[TEX_HALF_2:%.*]] = extractelement <4 x half> [[TEX]], i64 2
+; GFX10PLUS-NEXT: [[TEX_HALF_3:%.*]] = extractelement <4 x half> [[TEX]], i64 3
+; GFX10PLUS-NEXT: [[ADDF_SUM_0:%.*]] = fadd half [[TEX_HALF_0]], [[TEX_HALF_1]]
+; GFX10PLUS-NEXT: [[ADDF_SUM_1:%.*]] = fadd half [[TEX_HALF_2]], [[TEX_HALF_3]]
+; GFX10PLUS-NEXT: [[ADDF_SUM_2:%.*]] = fadd half [[ADDF_SUM_0]], [[ADDF_SUM_1]]
+; GFX10PLUS-NEXT: ret half [[ADDF_SUM_2]]
+;
+; GFX11-LABEL: @image_sample_2d_v4f32(
+; GFX11-NEXT: main_body:
+; GFX11-NEXT: [[TEX:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.2d.v4f16.f32.v8i32.v4i32(i32 15, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX11-NEXT: [[TEX_HALF_0:%.*]] = extractelement <4 x half> [[TEX]], i64 0
+; GFX11-NEXT: [[TEX_HALF_1:%.*]] = extractelement <4 x half> [[TEX]], i64 1
+; GFX11-NEXT: [[TEX_HALF_2:%.*]] = extractelement <4 x half> [[TEX]], i64 2
+; GFX11-NEXT: [[TEX_HALF_3:%.*]] = extractelement <4 x half> [[TEX]], i64 3
+; GFX11-NEXT: [[ADDF_SUM_0:%.*]] = fadd half [[TEX_HALF_0]], [[TEX_HALF_1]]
+; GFX11-NEXT: [[ADDF_SUM_1:%.*]] = fadd half [[TEX_HALF_2]], [[TEX_HALF_3]]
+; GFX11-NEXT: [[ADDF_SUM_2:%.*]] = fadd half [[ADDF_SUM_0]], [[ADDF_SUM_1]]
+; GFX11-NEXT: ret half [[ADDF_SUM_2]]
+;
main_body:
%tex = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32.v8i32.v4i32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
%tex_4_half = fptrunc <4 x float> %tex to <4 x half>
@@ -121,6 +227,79 @@ main_body:
ret half %addf_sum.2
}
+define void @image_sample_2d_multi_fptrunc_to_d16(<8 x i32> %surf_desc, <4 x i32> %samp, float %u, float %v, ptr addrspace(7) %out) {
+; GFX7-LABEL: @image_sample_2d_multi_fptrunc_to_d16(
+; GFX7-NEXT: main_body:
+; GFX7-NEXT: [[SAMPLE:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.lz.2d.v3f32.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0)
+; GFX7-NEXT: [[E0:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 0
+; GFX7-NEXT: [[H0:%.*]] = fptrunc float [[E0]] to half
+; GFX7-NEXT: [[E1:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 1
+; GFX7-NEXT: [[H1:%.*]] = fptrunc float [[E1]] to half
+; GFX7-NEXT: [[E2:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 2
+; GFX7-NEXT: [[H2:%.*]] = fptrunc float [[E2]] to half
+; GFX7-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]]
+; GFX7-NEXT: [[RES:%.*]] = fadd half [[MUL]], [[H2]]
+; GFX7-NEXT: store half [[RES]], ptr addrspace(7) [[OUT:%.*]], align 2
+; GFX7-NEXT: ret void
+;
+; GFX81PLUS-LABEL: @image_sample_2d_multi_fptrunc_to_d16(
+; GFX81PLUS-NEXT: main_body:
+; GFX81PLUS-NEXT: [[SAMPLE:%.*]] = call <3 x half> @llvm.amdgcn.image.sample.lz.2d.v3f16.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0)
+; GFX81PLUS-NEXT: [[H0:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 0
+; GFX81PLUS-NEXT: [[H1:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 1
+; GFX81PLUS-NEXT: [[H2:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 2
+; GFX81PLUS-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]]
+; GFX81PLUS-NEXT: [[RES:%.*]] = fadd half [[MUL]], [[H2]]
+; GFX81PLUS-NEXT: store half [[RES]], ptr addrspace(7) [[OUT:%.*]], align 2
+; GFX81PLUS-NEXT: ret void
+;
+; GFX9-LABEL: @image_sample_2d_multi_fptrunc_to_d16(
+; GFX9-NEXT: main_body:
+; GFX9-NEXT: [[SAMPLE:%.*]] = call <3 x half> @llvm.amdgcn.image.sample.lz.2d.v3f16.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0)
+; GFX9-NEXT: [[HALF_EXT2:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 0
+; GFX9-NEXT: [[HALF_EXT1:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 1
+; GFX9-NEXT: [[HALF_EXT:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 2
+; GFX9-NEXT: [[MUL:%.*]] = fmul half [[HALF_EXT2]], [[HALF_EXT1]]
+; GFX9-NEXT: [[RES:%.*]] = fadd half [[MUL]], [[HALF_EXT]]
+; GFX9-NEXT: store half [[RES]], ptr addrspace(7) [[OUT:%.*]], align 2
+; GFX9-NEXT: ret void
+;
+; GFX10PLUS-LABEL: @image_sample_2d_multi_fptrunc_to_d16(
+; GFX10PLUS-NEXT: main_body:
+; GFX10PLUS-NEXT: [[SAMPLE:%.*]] = call <3 x half> @llvm.amdgcn.image.sample.lz.2d.v3f16.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0)
+; GFX10PLUS-NEXT: [[HALF_EXT2:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 0
+; GFX10PLUS-NEXT: [[HALF_EXT1:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 1
+; GFX10PLUS-NEXT: [[HALF_EXT:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 2
+; GFX10PLUS-NEXT: [[MUL:%.*]] = fmul half [[HALF_EXT2]], [[HALF_EXT1]]
+; GFX10PLUS-NEXT: [[RES:%.*]] = fadd half [[MUL]], [[HALF_EXT]]
+; GFX10PLUS-NEXT: store half [[RES]], ptr addrspace(7) [[OUT:%.*]], align 2
+; GFX10PLUS-NEXT: ret void
+;
+; GFX11-LABEL: @image_sample_2d_multi_fptrunc_to_d16(
+; GFX11-NEXT: main_body:
+; GFX11-NEXT: [[SAMPLE:%.*]] = call <3 x half> @llvm.amdgcn.image.sample.lz.2d.v3f16.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0)
+; GFX11-NEXT: [[HALF_EXT2:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 0
+; GFX11-NEXT: [[HALF_EXT1:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 1
+; GFX11-NEXT: [[HALF_EXT:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 2
+; GFX11-NEXT: [[MUL:%.*]] = fmul half [[HALF_EXT2]], [[HALF_EXT1]]
+; GFX11-NEXT: [[RES:%.*]] = fadd half [[MUL]], [[HALF_EXT]]
+; GFX11-NEXT: store half [[RES]], ptr addrspace(7) [[OUT:%.*]], align 2
+; GFX11-NEXT: ret void
+;
+main_body:
+ %sample = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32.v8i32.v4i32(i32 15, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %samp, i1 false, i32 0, i32 0)
+ %e0 = extractelement <4 x float> %sample, i32 0
+ %h0 = fptrunc float %e0 to half
+ %e1 = extractelement <4 x float> %sample, i32 1
+ %h1 = fptrunc float %e1 to half
+ %e2 = extractelement <4 x float> %sample, i32 2
+ %h2 = fptrunc float %e2 to half
+ %mul = fmul half %h0, %h1
+ %res = fadd half %mul, %h2
+ store half %res, ptr addrspace(7) %out, align 2
+ ret void
+}
+
define amdgpu_ps half @image_gather4_2d_v4f32(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
; GFX7-LABEL: @image_gather4_2d_v4f32(
; GFX7-NEXT: main_body:
@@ -147,6 +326,42 @@ define amdgpu_ps half @image_gather4_2d_v4f32(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX81PLUS-NEXT: [[ADDF_SUM_2:%.*]] = fadd half [[ADDF_SUM_0]], [[ADDF_SUM_1]]
; GFX81PLUS-NEXT: ret half [[ADDF_SUM_2]]
;
+; GFX9-LABEL: @image_gather4_2d_v4f32(
+; GFX9-NEXT: main_body:
+; GFX9-NEXT: [[TEX:%.*]] = call <4 x half> @llvm.amdgcn.image.gather4.2d.v4f16.f16.v8i32.v4i32(i32 1, half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX9-NEXT: [[TEX_HALF_0:%.*]] = extractelement <4 x half> [[TEX]], i64 0
+; GFX9-NEXT: [[TEX_HALF_1:%.*]] = extractelement <4 x half> [[TEX]], i64 1
+; GFX9-NEXT: [[TEX_HALF_2:%.*]] = extractelement <4 x half> [[TEX]], i64 2
+; GFX9-NEXT: [[TEX_HALF_3:%.*]] = extractelement <4 x half> [[TEX]], i64 3
+; GFX9-NEXT: [[ADDF_SUM_0:%.*]] = fadd half [[TEX_HALF_0]], [[TEX_HALF_1]]
+; GFX9-NEXT: [[ADDF_SUM_1:%.*]] = fadd half [[TEX_HALF_2]], [[TEX_HALF_3]]
+; GFX9-NEXT: [[ADDF_SUM_2:%.*]] = fadd half [[ADDF_SUM_0]], [[ADDF_SUM_1]]
+; GFX9-NEXT: ret half [[ADDF_SUM_2]]
+;
+; GFX10PLUS-LABEL: @image_gather4_2d_v4f32(
+; GFX10PLUS-NEXT: main_body:
+; GFX10PLUS-NEXT: [[TEX:%.*]] = call <4 x half> @llvm.amdgcn.image.gather4.2d.v4f16.f16.v8i32.v4i32(i32 1, half [[S:%.*]], half [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX10PLUS-NEXT: [[TEX_HALF_0:%.*]] = extractelement <4 x half> [[TEX]], i64 0
+; GFX10PLUS-NEXT: [[TEX_HALF_1:%.*]] = extractelement <4 x half> [[TEX]], i64 1
+; GFX10PLUS-NEXT: [[TEX_HALF_2:%.*]] = extractelement <4 x half> [[TEX]], i64 2
+; GFX10PLUS-NEXT:...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/141758
More information about the llvm-commits
mailing list