[llvm] [AMDGPU] Optimize image sample followed by llvm.amdgcn.cvt.pkrtz into d16 variant (PR #145203)
Harrison Hao via llvm-commits
llvm-commits at lists.llvm.org
Sat Jun 21 20:35:20 PDT 2025
https://github.com/harrisonGPU created https://github.com/llvm/llvm-project/pull/145203
This patch folds the pattern where the result of an `image sample` is passed to
`llvm.amdgcn.cvt.pkrtz` and the packed result is then read with
`extractelement <2 x half>, index 0`. In that case we can emit the d16 variant
of the image intrinsic directly, which simplifies the generated code, avoids an
unnecessary `v_cvt_pk_rtz_f16_f32_e64` instruction, and reduces VGPR usage.
Folding is only applied if:
- The `image sample` result has exactly one use: a call to `amdgcn.cvt.pkrtz`.
- The `pkrtz` result has exactly one use: an `extractelement` with index 0.
- Index 1 is never accessed, since the d16 sample defines only element 0 and
  element 1 would be poison (see the counterexample below).
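For example, the following is left as-is because element 1 is read (this
mirrors the `image_sample_2d_single_pkrtz_high_no_d16` test added below);
after a d16 fold that element would be poison:
```llvm
%sample = call float @llvm.amdgcn.image.sample
%pack = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float %sample)
%h1 = extractelement <2 x half> %pack, i64 1
```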
Example matched pattern:
```llvm
%sample = call float @llvm.amdgcn.image.sample
%pack = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %sample, float %any)
%h0 = extractelement <2 x half> %pack, i64 0
```
This gets folded into:
```llvm
%sample = call half @llvm.amdgcn.image.sample
```
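The fold is likewise skipped when the sample value has any user other than the
`cvt.pkrtz` call (first criterion above). A minimal sketch, hypothetical rather
than taken from the tests below (`%sum` is an invented name):
```llvm
%sample = call float @llvm.amdgcn.image.sample
%pack = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %sample, float %any)
%h0 = extractelement <2 x half> %pack, i64 0
%sum = fadd float %sample, 1.000000e+00   ; second user of %sample blocks the fold
```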
From b034a4fd062dd29cd7a51e626523989bd73dc2b7 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Sun, 22 Jun 2025 00:06:25 +0800
Subject: [PATCH] [AMDGPU] Optimize image sample followed by
llvm.amdgcn.cvt.pkrtz into d16 variant
---
.../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 36 +++++
.../InstCombine/AMDGPU/image-d16.ll | 134 ++++++++++++++++++
2 files changed, 170 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 5477c5eae9392..8a944c58a0ea7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -247,6 +247,42 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
ArgTys[0] = User->getType();
});
}
+
+ // Fold image.sample + cvt.pkrtz -> extractelement idx0 into a single
+ // d16 image sample.
+ // Pattern to match:
+ // %sample = call float @llvm.amdgcn.image.sample...
+ // %pack = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %sample,
+ // float %any)
+ // %low = extractelement <2 x half> %pack, i64 0
+ // Replacement:
+ // call half @llvm.amdgcn.image.sample
+ //
+ // Folding criteria:
+ // 1. The only user of the image.sample intrinsic is amdgcn.cvt.pkrtz.
+ // 2. That cvt.pkrtz call has exactly one use.
+ // 3. Its sole user is an extractelement instruction with index zero.
+ // Otherwise, folding is not performed, because D16 sampling only
+ // guarantees that the element at index 0 is defined; index 1 is
+ // undefined and using it will result in poison.
+ if (auto *CvtPkrtzCall = dyn_cast<CallInst>(User)) {
+ if (CvtPkrtzCall->getIntrinsicID() == Intrinsic::amdgcn_cvt_pkrtz &&
+ CvtPkrtzCall->hasOneUse()) {
+ // The sole use must be an extractelement with index 0.
+ if (auto *Ext =
+ dyn_cast<ExtractElementInst>(*CvtPkrtzCall->user_begin())) {
+ auto *IdxC = dyn_cast<ConstantInt>(Ext->getIndexOperand());
+ if (IdxC && IdxC->isZero()) {
+
+ return modifyIntrinsicCall(
+ II, *CvtPkrtzCall, ImageDimIntr->Intr, IC,
+ [&](auto &Args, auto &ArgTys) {
+ ArgTys[0] = CvtPkrtzCall->getType();
+ });
+ }
+ }
+ }
+ }
}
// Only perform D16 folding if every user of the image sample is
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll b/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll
index ee5ccf5af987d..f4f74a84bcb8b 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll
@@ -239,6 +239,140 @@ main_body:
ret bfloat %res
}
+define amdgpu_ps float @image_sample_2d_single_pkrtz_to_d16(<8 x i32> %surf_desc, <4 x i32> %samp, float %u, float %v) {
+; GFX7-LABEL: @image_sample_2d_single_pkrtz_to_d16(
+; GFX7-NEXT: main_body:
+; GFX7-NEXT: [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX7-NEXT: [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[SAMPLE]], float 0.000000e+00)
+; GFX7-NEXT: [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 0
+; GFX7-NEXT: [[MUL:%.*]] = fmul reassoc arcp contract afn half [[H0]], [[H0]]
+; GFX7-NEXT: [[DIV:%.*]] = fdiv reassoc arcp contract afn half [[MUL]], [[H0]]
+; GFX7-NEXT: [[ADD:%.*]] = fadd reassoc arcp contract afn half [[DIV]], [[H0]]
+; GFX7-NEXT: [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX7-NEXT: ret float [[RES]]
+;
+; GFX81PLUS-LABEL: @image_sample_2d_single_pkrtz_to_d16(
+; GFX81PLUS-NEXT: main_body:
+; GFX81PLUS-NEXT: [[SAMPLE:%.*]] = call half @llvm.amdgcn.image.sample.lz.2d.f16.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX81PLUS-NEXT: [[MUL:%.*]] = fmul reassoc arcp contract afn half [[SAMPLE]], [[SAMPLE]]
+; GFX81PLUS-NEXT: [[DIV:%.*]] = fdiv reassoc arcp contract afn half [[MUL]], [[SAMPLE]]
+; GFX81PLUS-NEXT: [[ADD:%.*]] = fadd reassoc arcp contract afn half [[DIV]], [[SAMPLE]]
+; GFX81PLUS-NEXT: [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX81PLUS-NEXT: ret float [[RES]]
+;
+main_body:
+ %sample = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %samp, i1 false, i32 0, i32 0)
+ %pack = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %sample, float 0.000000e+00)
+ %h0 = extractelement <2 x half> %pack, i64 0
+ %mul = fmul reassoc arcp contract afn half %h0, %h0
+ %div = fdiv reassoc arcp contract afn half %mul, %h0
+ %add = fadd reassoc arcp contract afn half %div, %h0
+ %res = fpext half %add to float
+ ret float %res
+}
+
+define amdgpu_ps float @image_sample_2d_pkrtz_variable_no_d16(<8 x i32> %surf_desc, <4 x i32> %samp, float %u, float %v) {
+; GFX7-LABEL: @image_sample_2d_pkrtz_variable_no_d16(
+; GFX7-NEXT: main_body:
+; GFX7-NEXT: [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX7-NEXT: [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[SAMPLE]], float [[V]])
+; GFX7-NEXT: [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 0
+; GFX7-NEXT: [[H1:%.*]] = extractelement <2 x half> [[PACK]], i64 1
+; GFX7-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]]
+; GFX7-NEXT: [[ADD:%.*]] = fadd half [[MUL]], [[H0]]
+; GFX7-NEXT: [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX7-NEXT: ret float [[RES]]
+;
+; GFX81PLUS-LABEL: @image_sample_2d_pkrtz_variable_no_d16(
+; GFX81PLUS-NEXT: main_body:
+; GFX81PLUS-NEXT: [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX81PLUS-NEXT: [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[SAMPLE]], float [[V]])
+; GFX81PLUS-NEXT: [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 0
+; GFX81PLUS-NEXT: [[H1:%.*]] = extractelement <2 x half> [[PACK]], i64 1
+; GFX81PLUS-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]]
+; GFX81PLUS-NEXT: [[ADD:%.*]] = fadd half [[MUL]], [[H0]]
+; GFX81PLUS-NEXT: [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX81PLUS-NEXT: ret float [[RES]]
+;
+main_body:
+ %sample = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %samp, i1 false, i32 0, i32 0)
+ %pack = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %sample, float %v)
+ %h0 = extractelement <2 x half> %pack, i64 0
+ %h1 = extractelement <2 x half> %pack, i64 1
+ %mul = fmul half %h0, %h1
+ %add = fadd half %mul, %h0
+ %res = fpext half %add to float
+ ret float %res
+}
+
+define amdgpu_ps float @image_sample_2d_pkrtz_constant_no_fold(<8 x i32> %surf_desc, <4 x i32> %samp, float %u, float %v) {
+; GFX7-LABEL: @image_sample_2d_pkrtz_constant_no_fold(
+; GFX7-NEXT: main_body:
+; GFX7-NEXT: [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX7-NEXT: [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[SAMPLE]], float 0.000000e+00)
+; GFX7-NEXT: [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 0
+; GFX7-NEXT: [[H1:%.*]] = extractelement <2 x half> [[PACK]], i64 1
+; GFX7-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]]
+; GFX7-NEXT: [[ADD:%.*]] = fadd half [[MUL]], [[H0]]
+; GFX7-NEXT: [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX7-NEXT: ret float [[RES]]
+;
+; GFX81PLUS-LABEL: @image_sample_2d_pkrtz_constant_no_fold(
+; GFX81PLUS-NEXT: main_body:
+; GFX81PLUS-NEXT: [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX81PLUS-NEXT: [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[SAMPLE]], float 0.000000e+00)
+; GFX81PLUS-NEXT: [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 0
+; GFX81PLUS-NEXT: [[H1:%.*]] = extractelement <2 x half> [[PACK]], i64 1
+; GFX81PLUS-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]]
+; GFX81PLUS-NEXT: [[ADD:%.*]] = fadd half [[MUL]], [[H0]]
+; GFX81PLUS-NEXT: [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX81PLUS-NEXT: ret float [[RES]]
+;
+main_body:
+ %sample = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %samp, i1 false, i32 0, i32 0)
+ %pack = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %sample, float 0.000000e+00)
+ %h0 = extractelement <2 x half> %pack, i64 0
+ %h1 = extractelement <2 x half> %pack, i64 1
+ %mul = fmul half %h0, %h1
+ %add = fadd half %mul, %h0
+ %res = fpext half %add to float
+ ret float %res
+}
+
+define amdgpu_ps float @image_sample_2d_single_pkrtz_high_no_d16(<8 x i32> %surf_desc, <4 x i32> %samp, float %u, float %v) {
+; GFX7-LABEL: @image_sample_2d_single_pkrtz_high_no_d16(
+; GFX7-NEXT: main_body:
+; GFX7-NEXT: [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX7-NEXT: [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float [[SAMPLE]])
+; GFX7-NEXT: [[H1:%.*]] = extractelement <2 x half> [[PACK]], i64 1
+; GFX7-NEXT: [[MUL:%.*]] = fmul reassoc arcp contract afn half [[H1]], [[H1]]
+; GFX7-NEXT: [[DIV:%.*]] = fdiv reassoc arcp contract afn half [[MUL]], [[H1]]
+; GFX7-NEXT: [[ADD:%.*]] = fadd reassoc arcp contract afn half [[DIV]], [[H1]]
+; GFX7-NEXT: [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX7-NEXT: ret float [[RES]]
+;
+; GFX81PLUS-LABEL: @image_sample_2d_single_pkrtz_high_no_d16(
+; GFX81PLUS-NEXT: main_body:
+; GFX81PLUS-NEXT: [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX81PLUS-NEXT: [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float [[SAMPLE]])
+; GFX81PLUS-NEXT: [[H1:%.*]] = extractelement <2 x half> [[PACK]], i64 1
+; GFX81PLUS-NEXT: [[MUL:%.*]] = fmul reassoc arcp contract afn half [[H1]], [[H1]]
+; GFX81PLUS-NEXT: [[DIV:%.*]] = fdiv reassoc arcp contract afn half [[MUL]], [[H1]]
+; GFX81PLUS-NEXT: [[ADD:%.*]] = fadd reassoc arcp contract afn half [[DIV]], [[H1]]
+; GFX81PLUS-NEXT: [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX81PLUS-NEXT: ret float [[RES]]
+;
+main_body:
+ %sample = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %samp, i1 false, i32 0, i32 0)
+ %pack = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float %sample)
+ %h1 = extractelement <2 x half> %pack, i64 1
+ %mul = fmul reassoc arcp contract afn half %h1, %h1
+ %div = fdiv reassoc arcp contract afn half %mul, %h1
+ %add = fadd reassoc arcp contract afn half %div, %h1
+ %res = fpext half %add to float
+ ret float %res
+}
+
define amdgpu_ps half @image_gather4_2d_v4f32(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
; GFX7-LABEL: @image_gather4_2d_v4f32(
; GFX7-NEXT: main_body: