[llvm] [AMDGPU] Optimize image sample followed by llvm.amdgcn.cvt.pkrtz into d16 variant (PR #145203)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Jun 21 20:35:50 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Harrison Hao (harrisonGPU)
<details>
<summary>Changes</summary>
This patch folds the pattern where the result of an `image sample` is passed to
`llvm.amdgcn.cvt.pkrtz`, and the packed result is accessed with
`extractelement <2 x half>, index 0`. In this case, we can emit the d16 variant
of the image intrinsic directly, improving code clarity, avoiding unnecessary
`v_cvt_pk_rtz_f16_f32_e64` instructions, and reducing VGPR usage.
Folding is only applied if:
- The `image sample` result has exactly one use: a call to `amdgcn.cvt.pkrtz`.
- The result of `pkrtz` has exactly one use: an `extractelement` with index 0.

We do not fold if index 1 is accessed, since that element may contain poison.
Example matched pattern:
```llvm
%sample = call float @llvm.amdgcn.image.sample
%pack = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %sample, float %any)
%h0 = extractelement <2 x half> %pack, i64 0
```
This gets folded into:
```llvm
%sample = call half @llvm.amdgcn.image.sample
```
---
Full diff: https://github.com/llvm/llvm-project/pull/145203.diff
2 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp (+36)
- (modified) llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll (+134)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 5477c5eae9392..8a944c58a0ea7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -247,6 +247,42 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
ArgTys[0] = User->getType();
});
}
+
+ // Fold image.sample + cvt.pkrtz -> extractelement idx0 into a single
+ // d16 image sample.
+ // Pattern to match:
+ // %sample = call float @llvm.amdgcn.image.sample...
+ // %pack = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %sample,
+ // float %any)
+ // %low = extractelement <2 x half> %pack, i64 0
+ // Replacement:
+ // call half @llvm.amdgcn.image.sample
+ //
+ // Folding criteria:
+ // 1. The only user of the image.sample intrinsic is amdgcn.cvt.pkrtz.
+ // 2. That cvt.pkrtz call has exactly one use.
+ // 3. Its sole user is an extractelement instruction with index zero.
+ // Otherwise, folding is not performed, because D16 sampling only
+ // guarantees that the element at index 0 is defined; index 1 is
+ // undefined and using it will result in poison.
+ if (auto *CvtPkrtzCall = dyn_cast<CallInst>(User)) {
+ if (CvtPkrtzCall->getIntrinsicID() == Intrinsic::amdgcn_cvt_pkrtz &&
+ CvtPkrtzCall->hasOneUse()) {
+ // Unique use must be extractelement idx == 0
+ if (auto *Ext =
+ dyn_cast<ExtractElementInst>(*CvtPkrtzCall->user_begin())) {
+ if (isa<ConstantInt>(Ext->getIndexOperand()) &&
+ cast<ConstantInt>(Ext->getIndexOperand())->isZero()) {
+
+ return modifyIntrinsicCall(
+ II, *CvtPkrtzCall, ImageDimIntr->Intr, IC,
+ [&](auto &Args, auto &ArgTys) {
+ ArgTys[0] = CvtPkrtzCall->getType();
+ });
+ }
+ }
+ }
+ }
}
// Only perform D16 folding if every user of the image sample is
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll b/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll
index ee5ccf5af987d..f4f74a84bcb8b 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll
@@ -239,6 +239,140 @@ main_body:
ret bfloat %res
}
+define amdgpu_ps float @image_sample_2d_single_pkrtz_to_d16(<8 x i32> %surf_desc, <4 x i32> %samp, float %u, float %v) {
+; GFX7-LABEL: @image_sample_2d_single_pkrtz_to_d16(
+; GFX7-NEXT: main_body:
+; GFX7-NEXT: [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX7-NEXT: [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[SAMPLE]], float 0.000000e+00)
+; GFX7-NEXT: [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 0
+; GFX7-NEXT: [[MUL:%.*]] = fmul reassoc arcp contract afn half [[H0]], [[H0]]
+; GFX7-NEXT: [[DIV:%.*]] = fdiv reassoc arcp contract afn half [[MUL]], [[H0]]
+; GFX7-NEXT: [[ADD:%.*]] = fadd reassoc arcp contract afn half [[DIV]], [[H0]]
+; GFX7-NEXT: [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX7-NEXT: ret float [[RES]]
+;
+; GFX81PLUS-LABEL: @image_sample_2d_single_pkrtz_to_d16(
+; GFX81PLUS-NEXT: main_body:
+; GFX81PLUS-NEXT: [[SAMPLE:%.*]] = call half @llvm.amdgcn.image.sample.lz.2d.f16.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX81PLUS-NEXT: [[MUL:%.*]] = fmul reassoc arcp contract afn half [[SAMPLE]], [[SAMPLE]]
+; GFX81PLUS-NEXT: [[DIV:%.*]] = fdiv reassoc arcp contract afn half [[MUL]], [[SAMPLE]]
+; GFX81PLUS-NEXT: [[ADD:%.*]] = fadd reassoc arcp contract afn half [[DIV]], [[SAMPLE]]
+; GFX81PLUS-NEXT: [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX81PLUS-NEXT: ret float [[RES]]
+;
+main_body:
+ %sample = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %samp, i1 false, i32 0, i32 0)
+ %pack = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %sample, float 0.000000e+00)
+ %h0 = extractelement <2 x half> %pack, i64 0
+ %mul = fmul reassoc arcp contract afn half %h0, %h0
+ %div = fdiv reassoc arcp contract afn half %mul, %h0
+ %add = fadd reassoc arcp contract afn half %div, %h0
+ %res = fpext half %add to float
+ ret float %res
+}
+
+define amdgpu_ps float @image_sample_2d_pkrtz_variable_no_d16(<8 x i32> %surf_desc, <4 x i32> %samp, float %u, float %v) {
+; GFX7-LABEL: @image_sample_2d_pkrtz_variable_no_d16(
+; GFX7-NEXT: main_body:
+; GFX7-NEXT: [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX7-NEXT: [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[SAMPLE]], float [[V]])
+; GFX7-NEXT: [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 0
+; GFX7-NEXT: [[H1:%.*]] = extractelement <2 x half> [[PACK]], i64 1
+; GFX7-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]]
+; GFX7-NEXT: [[ADD:%.*]] = fadd half [[MUL]], [[H0]]
+; GFX7-NEXT: [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX7-NEXT: ret float [[RES]]
+;
+; GFX81PLUS-LABEL: @image_sample_2d_pkrtz_variable_no_d16(
+; GFX81PLUS-NEXT: main_body:
+; GFX81PLUS-NEXT: [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX81PLUS-NEXT: [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[SAMPLE]], float [[V]])
+; GFX81PLUS-NEXT: [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 0
+; GFX81PLUS-NEXT: [[H1:%.*]] = extractelement <2 x half> [[PACK]], i64 1
+; GFX81PLUS-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]]
+; GFX81PLUS-NEXT: [[ADD:%.*]] = fadd half [[MUL]], [[H0]]
+; GFX81PLUS-NEXT: [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX81PLUS-NEXT: ret float [[RES]]
+;
+main_body:
+ %sample = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %samp, i1 false, i32 0, i32 0)
+ %pack = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %sample, float %v)
+ %h0 = extractelement <2 x half> %pack, i64 0
+ %h1 = extractelement <2 x half> %pack, i64 1
+ %mul = fmul half %h0, %h1
+ %add = fadd half %mul, %h0
+ %res = fpext half %add to float
+ ret float %res
+}
+
+define amdgpu_ps float @image_sample_2d_pkrtz_constant_no_fold(<8 x i32> %surf_desc, <4 x i32> %samp, float %u, float %v) {
+; GFX7-LABEL: @image_sample_2d_pkrtz_constant_no_fold(
+; GFX7-NEXT: main_body:
+; GFX7-NEXT: [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX7-NEXT: [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[SAMPLE]], float 0.000000e+00)
+; GFX7-NEXT: [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 0
+; GFX7-NEXT: [[H1:%.*]] = extractelement <2 x half> [[PACK]], i64 1
+; GFX7-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]]
+; GFX7-NEXT: [[ADD:%.*]] = fadd half [[MUL]], [[H0]]
+; GFX7-NEXT: [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX7-NEXT: ret float [[RES]]
+;
+; GFX81PLUS-LABEL: @image_sample_2d_pkrtz_constant_no_fold(
+; GFX81PLUS-NEXT: main_body:
+; GFX81PLUS-NEXT: [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX81PLUS-NEXT: [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[SAMPLE]], float 0.000000e+00)
+; GFX81PLUS-NEXT: [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 0
+; GFX81PLUS-NEXT: [[H1:%.*]] = extractelement <2 x half> [[PACK]], i64 1
+; GFX81PLUS-NEXT: [[MUL:%.*]] = fmul half [[H0]], [[H1]]
+; GFX81PLUS-NEXT: [[ADD:%.*]] = fadd half [[MUL]], [[H0]]
+; GFX81PLUS-NEXT: [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX81PLUS-NEXT: ret float [[RES]]
+;
+main_body:
+ %sample = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %samp, i1 false, i32 0, i32 0)
+ %pack = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %sample, float 0.000000e+00)
+ %h0 = extractelement <2 x half> %pack, i64 0
+ %h1 = extractelement <2 x half> %pack, i64 1
+ %mul = fmul half %h0, %h1
+ %add = fadd half %mul, %h0
+ %res = fpext half %add to float
+ ret float %res
+}
+
+define amdgpu_ps float @image_sample_2d_single_pkrtz_high_no_d16(<8 x i32> %surf_desc, <4 x i32> %samp, float %u, float %v) {
+; GFX7-LABEL: @image_sample_2d_single_pkrtz_high_no_d16(
+; GFX7-NEXT: main_body:
+; GFX7-NEXT: [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX7-NEXT: [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float [[SAMPLE]])
+; GFX7-NEXT: [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 1
+; GFX7-NEXT: [[MUL:%.*]] = fmul reassoc arcp contract afn half [[H0]], [[H0]]
+; GFX7-NEXT: [[DIV:%.*]] = fdiv reassoc arcp contract afn half [[MUL]], [[H0]]
+; GFX7-NEXT: [[ADD:%.*]] = fadd reassoc arcp contract afn half [[DIV]], [[H0]]
+; GFX7-NEXT: [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX7-NEXT: ret float [[RES]]
+;
+; GFX81PLUS-LABEL: @image_sample_2d_single_pkrtz_high_no_d16(
+; GFX81PLUS-NEXT: main_body:
+; GFX81PLUS-NEXT: [[SAMPLE:%.*]] = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX81PLUS-NEXT: [[PACK:%.*]] = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float [[SAMPLE]])
+; GFX81PLUS-NEXT: [[H0:%.*]] = extractelement <2 x half> [[PACK]], i64 1
+; GFX81PLUS-NEXT: [[MUL:%.*]] = fmul reassoc arcp contract afn half [[H0]], [[H0]]
+; GFX81PLUS-NEXT: [[DIV:%.*]] = fdiv reassoc arcp contract afn half [[MUL]], [[H0]]
+; GFX81PLUS-NEXT: [[ADD:%.*]] = fadd reassoc arcp contract afn half [[DIV]], [[H0]]
+; GFX81PLUS-NEXT: [[RES:%.*]] = fpext half [[ADD]] to float
+; GFX81PLUS-NEXT: ret float [[RES]]
+;
+main_body:
+ %sample = call float @llvm.amdgcn.image.sample.lz.2d.f32.f32.v8i32.v4i32(i32 1, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %samp, i1 false, i32 0, i32 0)
+ %pack = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float %sample)
+ %h0 = extractelement <2 x half> %pack, i64 1
+ %mul = fmul reassoc arcp contract afn half %h0, %h0
+ %div = fdiv reassoc arcp contract afn half %mul, %h0
+ %add = fadd reassoc arcp contract afn half %div, %h0
+ %res = fpext half %add to float
+ ret float %res
+}
+
define amdgpu_ps half @image_gather4_2d_v4f32(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
; GFX7-LABEL: @image_gather4_2d_v4f32(
; GFX7-NEXT: main_body:
``````````
</details>
https://github.com/llvm/llvm-project/pull/145203
More information about the llvm-commits
mailing list