[llvm] 88421ea - [AMDGPU] Trim zero components from buffer and image stores

Mateja Marjanovic via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 5 03:30:29 PDT 2023


Author: Mateja Marjanovic
Date: 2023-06-05T12:30:21+02:00
New Revision: 88421ea973916e60c34beb26597a5fc33f83dd8f

URL: https://github.com/llvm/llvm-project/commit/88421ea973916e60c34beb26597a5fc33f83dd8f
DIFF: https://github.com/llvm/llvm-project/commit/88421ea973916e60c34beb26597a5fc33f83dd8f.diff

LOG: [AMDGPU] Trim zero components from buffer and image stores

For image and buffer stores, the default behaviour on GFX11 and
older is to set all unset components to zero. So passing only the
X component is equivalent to passing X000, and passing XY is
equivalent to XY00.

This patch simplifies the vector of components passed to these
intrinsics in InstCombine by removing zero components from the end.

For image stores, it also trims the DMask if necessary.
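
As a concrete illustration (a minimal sketch mirroring the new test below;
the value names are illustrative), a <4 x float> store whose trailing
components are known to be +0.0 is rewritten into a scalar store with the
DMask trimmed from 15 to 1:

  ; before
  %v1 = insertelement <4 x float> undef, float %x, i32 0
  %v2 = insertelement <4 x float> %v1, float 0.0, i32 1
  %v3 = insertelement <4 x float> %v2, float 0.0, i32 2
  %v4 = insertelement <4 x float> %v3, float 0.0, i32 3
  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %v4, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)

  ; after InstCombine: only the X component is stored
  call void @llvm.amdgcn.image.store.1d.f32.i32(float %x, i32 1, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)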

Reviewed by: arsenm, foad, nhaehnle, piotr

Added: 
    llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll

Modified: 
    llvm/include/llvm/IR/IntrinsicsAMDGPU.td
    llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 48960ee26e665..6bc751a939d4a 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -872,10 +872,12 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
 
   defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll<
               "STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
-              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand]>;
+              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand]>,
+              AMDGPUImageDMaskIntrinsic;
   defm int_amdgcn_image_store_mip : AMDGPUImageDimIntrinsicsNoMsaa<
               "STORE_MIP", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
-              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand], 1>;
+              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand], 1>,
+              AMDGPUImageDMaskIntrinsic;
 
   //////////////////////////////////////////////////////////////////////////
   // MSAA intrinsics

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 50a8f93178741..3d4fd0e1365be 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -376,6 +376,36 @@ static bool matchFPExtFromF16(Value *Arg, Value *&FPExtSrc) {
   return false;
 }
 
+// Trim all zero components from the end of the vector \p UseV and return
+// an appropriate bitset with known elements.
+static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
+                                       Instruction *I) {
+  auto *VTy = cast<FixedVectorType>(UseV->getType());
+  unsigned VWidth = VTy->getNumElements();
+  APInt DemandedElts = APInt::getAllOnes(VWidth);
+
+  for (int i = VWidth - 1; i > 0; --i) {
+    APInt DemandOneElt = APInt::getOneBitSet(VWidth, i);
+    KnownFPClass KnownFPClass =
+        computeKnownFPClass(UseV, DemandOneElt, IC.getDataLayout(),
+                            /*InterestedClasses=*/fcAllFlags,
+                            /*Depth=*/0, &IC.getTargetLibraryInfo(),
+                            &IC.getAssumptionCache(), I,
+                            &IC.getDominatorTree(),
+                            &IC.getOptimizationRemarkEmitter());
+    if (KnownFPClass.KnownFPClasses != fcPosZero)
+      break;
+    DemandedElts.clearBit(i);
+  }
+  return DemandedElts;
+}
+
+static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
+                                                    IntrinsicInst &II,
+                                                    APInt DemandedElts,
+                                                    int DMaskIdx = -1,
+                                                    bool IsLoad = true);
+
 std::optional<Instruction *>
 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   Intrinsic::ID IID = II.getIntrinsicID();
@@ -1035,26 +1065,62 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
     break;
   }
-  default: {
-    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
-            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
-      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
+  case Intrinsic::amdgcn_buffer_store_format:
+  case Intrinsic::amdgcn_raw_buffer_store_format:
+  case Intrinsic::amdgcn_struct_buffer_store_format:
+  case Intrinsic::amdgcn_raw_tbuffer_store:
+  case Intrinsic::amdgcn_struct_tbuffer_store:
+  case Intrinsic::amdgcn_tbuffer_store:
+  case Intrinsic::amdgcn_image_store_1d:
+  case Intrinsic::amdgcn_image_store_1darray:
+  case Intrinsic::amdgcn_image_store_2d:
+  case Intrinsic::amdgcn_image_store_2darray:
+  case Intrinsic::amdgcn_image_store_2darraymsaa:
+  case Intrinsic::amdgcn_image_store_2dmsaa:
+  case Intrinsic::amdgcn_image_store_3d:
+  case Intrinsic::amdgcn_image_store_cube:
+  case Intrinsic::amdgcn_image_store_mip_1d:
+  case Intrinsic::amdgcn_image_store_mip_1darray:
+  case Intrinsic::amdgcn_image_store_mip_2d:
+  case Intrinsic::amdgcn_image_store_mip_2darray:
+  case Intrinsic::amdgcn_image_store_mip_3d:
+  case Intrinsic::amdgcn_image_store_mip_cube: {
+    if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
+      break;
+
+    APInt DemandedElts =
+        trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
+
+    int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
+    if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
+                                              false)) {
+      return IC.eraseInstFromFunction(II);
     }
+
+    break;
+  }
   }
+  if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
+    return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
   }
   return std::nullopt;
 }
 
 /// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
 ///
+/// For amdgcn image and buffer store intrinsics, simplification updates the
+/// intrinsic's vector data argument (its definition) rather than the uses of
+/// the result, as is done for image and buffer loads.
 /// Note: This only supports non-TFE/LWE image intrinsic calls; those have
 ///       struct returns.
 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                     IntrinsicInst &II,
                                                     APInt DemandedElts,
-                                                    int DMaskIdx = -1) {
+                                                    int DMaskIdx, bool IsLoad) {
 
-  auto *IIVTy = cast<FixedVectorType>(II.getType());
+  auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
+                                             : II.getOperand(0)->getType());
   unsigned VWidth = IIVTy->getNumElements();
   if (VWidth == 1)
     return nullptr;
@@ -1125,13 +1191,13 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
     DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
 
     unsigned NewDMaskVal = 0;
-    unsigned OrigLoadIdx = 0;
+    unsigned OrigLdStIdx = 0;
     for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
       const unsigned Bit = 1 << SrcIdx;
       if (!!(DMaskVal & Bit)) {
-        if (!!DemandedElts[OrigLoadIdx])
+        if (!!DemandedElts[OrigLdStIdx])
           NewDMaskVal |= Bit;
-        OrigLoadIdx++;
+        OrigLdStIdx++;
       }
     }
 
@@ -1159,29 +1225,45 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
       (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
   OverloadTys[0] = NewTy;
 
+  if (!IsLoad) {
+    SmallVector<int, 8> EltMask;
+    for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
+      if (DemandedElts[OrigStoreIdx])
+        EltMask.push_back(OrigStoreIdx);
+
+    if (NewNumElts == 1)
+      Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
+    else
+      Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
+  }
+
   Function *NewIntrin = Intrinsic::getDeclaration(
       II.getModule(), II.getIntrinsicID(), OverloadTys);
   CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
   NewCall->takeName(&II);
   NewCall->copyMetadata(II);
 
-  if (NewNumElts == 1) {
-    return IC.Builder.CreateInsertElement(UndefValue::get(IIVTy), NewCall,
-                                          DemandedElts.countr_zero());
-  }
+  if (IsLoad) {
+    if (NewNumElts == 1) {
+      return IC.Builder.CreateInsertElement(UndefValue::get(IIVTy), NewCall,
+                                            DemandedElts.countr_zero());
+    }
 
-  SmallVector<int, 8> EltMask;
-  unsigned NewLoadIdx = 0;
-  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
-    if (!!DemandedElts[OrigLoadIdx])
-      EltMask.push_back(NewLoadIdx++);
-    else
-      EltMask.push_back(NewNumElts);
-  }
+    SmallVector<int, 8> EltMask;
+    unsigned NewLoadIdx = 0;
+    for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
+      if (!!DemandedElts[OrigLoadIdx])
+        EltMask.push_back(NewLoadIdx++);
+      else
+        EltMask.push_back(NewNumElts);
+    }
+
+    auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
 
-  Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
+    return Shuffle;
+  }
 
-  return Shuffle;
+  return NewCall;
 }
 
 std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(

diff  --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
new file mode 100644
index 0000000000000..3e5235683d748
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
@@ -0,0 +1,110 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mcpu=gfx900 -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck -check-prefixes=GCN %s
+; RUN: opt -mcpu=gfx1010 -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck -check-prefixes=GCN %s
+; RUN: opt -mcpu=gfx1100 -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck -check-prefixes=GCN %s
+
+define amdgpu_ps void @image_store_1d_store_all_zeros(<8 x i32> inreg %rsrc, i32 %s) #0 {
+; GCN-LABEL: @image_store_1d_store_all_zeros(
+; GCN-NEXT:    call void @llvm.amdgcn.image.store.1d.f32.i32(float 0.000000e+00, i32 1, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; GCN-NEXT:    ret void
+;
+  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> zeroinitializer, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @image_store_1d_store_insert_zeros_at_end(<8 x i32> inreg %rsrc, float %vdata1, i32 %s) #0 {
+; GCN-LABEL: @image_store_1d_store_insert_zeros_at_end(
+; GCN-NEXT:    call void @llvm.amdgcn.image.store.1d.f32.i32(float [[VDATA1:%.*]], i32 1, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; GCN-NEXT:    ret void
+;
+  %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
+  %newvdata2 = insertelement <4 x float> %newvdata1, float 0.0, i32 1
+  %newvdata3 = insertelement <4 x float> %newvdata2, float 0.0, i32 2
+  %newvdata4 = insertelement <4 x float> %newvdata3, float 0.0, i32 3
+  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %newvdata4, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @image_store_mip_1d_store_insert_zeros_at_end(<8 x i32> inreg %rsrc, float %vdata1, float %vdata2, i32 %s, i32 %mip) #0 {
+; GCN-LABEL: @image_store_mip_1d_store_insert_zeros_at_end(
+; GCN-NEXT:    [[TMP1:%.*]] = insertelement <3 x float> <float 0.000000e+00, float poison, float poison>, float [[VDATA1:%.*]], i64 1
+; GCN-NEXT:    [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[VDATA2:%.*]], i64 2
+; GCN-NEXT:    call void @llvm.amdgcn.image.store.1d.v3f32.i32(<3 x float> [[TMP2]], i32 7, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; GCN-NEXT:    ret void
+;
+  %newvdata1 = insertelement <4 x float> undef, float 0.0, i32 0
+  %newvdata2 = insertelement <4 x float> %newvdata1, float %vdata1, i32 1
+  %newvdata3 = insertelement <4 x float> %newvdata2, float %vdata2, i32 2
+  %newvdata4 = insertelement <4 x float> %newvdata3, float 0.0, i32 3
+  call void @llvm.amdgcn.image.store.mip.1d.v4f32.i32(<4 x float> %newvdata4, i32 7, i32 %s, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_store_format_insert_zeros_at_end(<4 x i32> inreg %a, float %vdata1, i32 %b) {
+; GCN-LABEL: @buffer_store_format_insert_zeros_at_end(
+; GCN-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> undef, float [[VDATA1:%.*]], i64 0
+; GCN-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
+; GCN-NEXT:    call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> [[TMP2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false)
+; GCN-NEXT:    ret void
+;
+  %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
+  %newvdata2 = insertelement <4 x float> %newvdata1, float %vdata1, i32 1
+  %newvdata3 = insertelement <4 x float> %newvdata2, float 0.0, i32 2
+  %newvdata4 = insertelement <4 x float> %newvdata3, float 0.0, i32 3
+  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %newvdata4, <4 x i32> %a, i32 %b, i32 0, i1 0, i1 0)
+  ret void
+}
+
+define amdgpu_ps void @struct_buffer_store_format_insert_zeros(<4 x i32> inreg %a, float %vdata1, i32 %b) {
+; GCN-LABEL: @struct_buffer_store_format_insert_zeros(
+; GCN-NEXT:    [[TMP1:%.*]] = insertelement <3 x float> <float poison, float 0.000000e+00, float poison>, float [[VDATA1:%.*]], i64 0
+; GCN-NEXT:    [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[VDATA1]], i64 2
+; GCN-NEXT:    call void @llvm.amdgcn.struct.buffer.store.format.v3f32(<3 x float> [[TMP2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0)
+; GCN-NEXT:    ret void
+;
+  %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
+  %newvdata2 = insertelement <4 x float> %newvdata1, float 0.0, i32 1
+  %newvdata3 = insertelement <4 x float> %newvdata2, float %vdata1, i32 2
+  %newvdata4 = insertelement <4 x float> %newvdata3, float 0.0, i32 3
+  call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %newvdata4, <4 x i32> %a, i32 %b, i32 0, i32 42, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @struct_tbuffer_store_insert_zeros_at_beginning(<4 x i32> inreg %a, float %vdata1, i32 %b) {
+; GCN-LABEL: @struct_tbuffer_store_insert_zeros_at_beginning(
+; GCN-NEXT:    [[NEWVDATA4:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[VDATA1:%.*]], i64 3
+; GCN-NEXT:    call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15)
+; GCN-NEXT:    ret void
+;
+  %newvdata1 = insertelement <4 x float> undef, float 0.0, i32 0
+  %newvdata2 = insertelement <4 x float> %newvdata1, float 0.0, i32 1
+  %newvdata3 = insertelement <4 x float> %newvdata2, float 0.0, i32 2
+  %newvdata4 = insertelement <4 x float> %newvdata3, float %vdata1, i32 3
+  call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %newvdata4, <4 x i32> %a, i32 %b, i32 0, i32 42, i32 0, i32 15)
+  ret void
+}
+
+declare void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #2
+declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
+declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #2
+declare void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32) #0
+declare void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0
+declare void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2dmsaa.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2darraymsaa.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.1d.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.2d.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.3d.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.cube.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.1darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.2darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind writeonly }
+attributes #2 = { nounwind }


        

