[llvm] 2c65908 - [AMDGPU] Don't combine memory intrs to v3i16

Wed Jul 22 03:44:11 PDT 2020

Author: Sebastian Neubauer
Date: 2020-07-22T12:44:01+02:00
New Revision: 2c659082bda6319732118e746fe025d8d5f9bfac

URL: https://github.com/llvm/llvm-project/commit/2c659082bda6319732118e746fe025d8d5f9bfac
DIFF: https://github.com/llvm/llvm-project/commit/2c659082bda6319732118e746fe025d8d5f9bfac.diff

LOG: [AMDGPU] Don't combine memory intrs to v3i16

v3i16 and v3f16 currently cannot be legalized and lowered so they should
not be emitted by inst combining.

Moved the check down to still allow extracting 1 or 2 elements via the dmask.

Fixes image intrinsics being combined to return v3x16.

Differential Revision: https://reviews.llvm.org/D84223

Added: 
    

Modified: 
    llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
    llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 7cfe4c8b5892..c7f2f4ec3ca1 100644

--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1030,12 +1030,6 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
                                                            APInt DemandedElts,
                                                            int DMaskIdx) {
 
-  // FIXME: Allow v3i16/v3f16 in buffer intrinsics when the types are fully supported.
-  if (DMaskIdx < 0 &&
-      II->getType()->getScalarSizeInBits() != 32 &&
-      DemandedElts.getActiveBits() == 3)
-    return nullptr;
-
   auto *IIVTy = cast<VectorType>(II->getType());
   unsigned VWidth = IIVTy->getNumElements();
   if (VWidth == 1)
@@ -1124,6 +1118,11 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
   if (!NewNumElts)
     return UndefValue::get(II->getType());
 
+  // FIXME: Allow v3i16/v3f16 in buffer and image intrinsics when the types are
+  // fully supported.
+  if (II->getType()->getScalarSizeInBits() == 16 && NewNumElts == 3)
+    return nullptr;
+
   if (NewNumElts >= VWidth && DemandedElts.isMask()) {
     if (DMaskIdx >= 0)
       II->setArgOperand(DMaskIdx, Args[DMaskIdx]);

diff  --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
index c2f5e2857be0..3d58f0608081 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
@@ -2965,6 +2965,64 @@ declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32, float, fl
 ; llvm.amdgcn.image.sample.cd.cl
 ; --------------------------------------------------------------------
 
+; CHECK-LABEL: @extract_elt3_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 8, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret half %data
+define amdgpu_ps half @extract_elt3_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x half> %data, i32 3
+  ret half %elt0
+}
+
+; CHECK-LABEL: @extract_elt2_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 4, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret half %data
+define amdgpu_ps half @extract_elt2_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x half> %data, i32 2
+  ret half %elt0
+}
+
+; CHECK-LABEL: @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 2, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret half %data
+define amdgpu_ps half @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x half> %data, i32 1
+  ret half %elt0
+}
+
+; FIXME: Enable load shortening when full support for v3f16 has been added (should expect call <3 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v3f16.f32.f32).
+; CHECK-LABEL: @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT: %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: %res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+; CHECK-NEXT: ret <4 x half> %res
+define amdgpu_ps <4 x half> @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+  ret <4 x half> %res
+}
+
+; CHECK-LABEL: @extract_elt_to2_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v2f16.f32.f32(i32 3, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: %res = shufflevector <2 x half> %data, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT: ret <4 x half> %res
+define amdgpu_ps <4 x half> @extract_elt_to2_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x half> %res
+}
+
+; CHECK-LABEL: @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: %res = insertelement <4 x half> undef, half %data, i64 0
+; CHECK-NEXT: ret <4 x half> %res
+define amdgpu_ps <4 x half> @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
+  ret <4 x half> %res
+}
+
 ; CHECK-LABEL: @extract_elt0_image_sample_cd_cl_1d_v4f16_f32_f32(
 ; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
 ; CHECK-NEXT: ret half %data