[llvm] [clang] [clang-tools-extra] [AMDGPU][GFX12] Default component broadcast store (PR #76212)

Fri Jan 5 00:12:13 PST 2024

https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/76212

>From 06117c6124e94953f62eff3b1b87d98146f9e25e Mon Sep 17 00:00:00 2001
From: Mateja Marjanovic <mmarjano at amd.com>
Date: Wed, 10 May 2023 16:24:38 +0200
Subject: [PATCH 1/2] [AMDGPU][GFX12] Default component broadcast store

For image and buffer stores the default behaviour on GFX12
is to set all unset components to the value of the first component.
So if we pass only X component, it will be the same as XXXX, or XY same as XYXX.

This patch simplifies the passed vector of components in InstCombine
by removing components from the end that are equal to the first component.

For image stores it also trims DMask if necessary.
---
 .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp     | 36 +++++++++++++++++--
 .../amdgcn-simplify-image-buffer-stores.ll    | 32 ++++++++---------
 2 files changed, 49 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 2bb7b6bd0674a2..da2f862308558b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -402,6 +402,35 @@ static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
   return DemandedElts;
 }
 
+// Trim elements of the end of the vector \p V, if they are
+// equal to the first element of the vector.
+static APInt defaultComponentBroadcast(Value *V) {
+  auto *VTy = cast<FixedVectorType>(V->getType());
+  unsigned VWidth = VTy->getNumElements();
+  APInt DemandedElts = APInt::getAllOnes(VWidth);
+  Value *FirstComponent = findScalarElement(V, 0);
+
+  SmallVector<int> ShuffleMask;
+  if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
+    SVI->getShuffleMask(ShuffleMask);
+
+  for (int I = VWidth - 1; I > 0; --I) {
+    if (ShuffleMask.empty()) {
+      auto *Elt = findScalarElement(V, I);
+      if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
+        break;
+    } else {
+      // Detect identical elements in the shufflevector result, even though
+      // findScalarElement cannot tell us what that element is.
+      if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
+        break;
+    }
+    DemandedElts.clearBit(I);
+  }
+
+  return DemandedElts;
+}
+
 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                     IntrinsicInst &II,
                                                     APInt DemandedElts,
@@ -1140,8 +1169,11 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
       break;
 
-    APInt DemandedElts =
-        trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
+    APInt DemandedElts;
+    if (AMDGPU::isGFX12Plus(*ST))
+      DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
+    else
+      DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
 
     int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
     if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
index f2d904cce7f00d..95b1d09bbd6036 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
@@ -23,7 +23,8 @@ define amdgpu_ps void @image_store_1d_store_insert_zeros_at_end(<8 x i32> inreg
 ; GCN-NEXT:    ret void
 ;
 ; GFX12-LABEL: @image_store_1d_store_insert_zeros_at_end(
-; GFX12-NEXT:    call void @llvm.amdgcn.image.store.1d.f32.i32(float [[VDATA1:%.*]], i32 1, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; GFX12-NEXT:    [[NEWVDATA4:%.*]] = insertelement <4 x float> <float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float [[VDATA1:%.*]], i64 0
+; GFX12-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[NEWVDATA4]], i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
 ; GFX12-NEXT:    ret void
 ;
   %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
@@ -63,9 +64,9 @@ define amdgpu_ps void @buffer_store_format_insert_zeros_at_end(<4 x i32> inreg %
 ; GCN-NEXT:    ret void
 ;
 ; GFX12-LABEL: @buffer_store_format_insert_zeros_at_end(
-; GFX12-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[VDATA1:%.*]], i64 0
-; GFX12-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
-; GFX12-NEXT:    call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> [[TMP2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false)
+; GFX12-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> <float poison, float poison, float 0.000000e+00, float 0.000000e+00>, float [[VDATA1:%.*]], i64 0
+; GFX12-NEXT:    [[NEWVDATA4:%.*]] = insertelement <4 x float> [[TMP1]], float [[VDATA1]], i64 1
+; GFX12-NEXT:    call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false)
 ; GFX12-NEXT:    ret void
 ;
   %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
@@ -84,9 +85,9 @@ define amdgpu_ps void @struct_buffer_store_format_insert_zeros(<4 x i32> inreg %
 ; GCN-NEXT:    ret void
 ;
 ; GFX12-LABEL: @struct_buffer_store_format_insert_zeros(
-; GFX12-NEXT:    [[TMP1:%.*]] = insertelement <3 x float> <float poison, float 0.000000e+00, float poison>, float [[VDATA1:%.*]], i64 0
-; GFX12-NEXT:    [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[VDATA1]], i64 2
-; GFX12-NEXT:    call void @llvm.amdgcn.struct.buffer.store.format.v3f32(<3 x float> [[TMP2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0)
+; GFX12-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> <float poison, float 0.000000e+00, float poison, float 0.000000e+00>, float [[VDATA1:%.*]], i64 0
+; GFX12-NEXT:    [[NEWVDATA4:%.*]] = insertelement <4 x float> [[TMP1]], float [[VDATA1]], i64 2
+; GFX12-NEXT:    call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0)
 ; GFX12-NEXT:    ret void
 ;
   %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
@@ -140,8 +141,8 @@ define amdgpu_ps void @image_store_1d_store_shufflevector_same(<8 x i32> inreg %
 ; GCN-NEXT:    ret void
 ;
 ; GFX12-LABEL: @image_store_1d_store_shufflevector_same(
-; GFX12-NEXT:    [[DATA:%.*]] = shufflevector <4 x float> [[VDATA1:%.*]], <4 x float> poison, <4 x i32> zeroinitializer
-; GFX12-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[DATA]], i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; GFX12-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[VDATA1:%.*]], i64 0
+; GFX12-NEXT:    call void @llvm.amdgcn.image.store.1d.f32.i32(float [[TMP1]], i32 1, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
 ; GFX12-NEXT:    ret void
 ;
   %data = shufflevector <4 x float> %vdata1, <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
@@ -155,7 +156,7 @@ define amdgpu_ps void @image_store_1d_store_shufflevector(<8 x i32> inreg %rsrc,
 ; GCN-NEXT:    ret void
 ;
 ; GFX12-LABEL: @image_store_1d_store_shufflevector(
-; GFX12-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> <float 2.000000e+00, float 2.000000e+00, float 5.000000e+00, float 2.000000e+00>, i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; GFX12-NEXT:    call void @llvm.amdgcn.image.store.1d.v3f32.i32(<3 x float> <float 2.000000e+00, float 2.000000e+00, float 5.000000e+00>, i32 7, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
 ; GFX12-NEXT:    ret void
 ;
   %data = shufflevector <4 x float> <float 2.0, float 1.0, float 2.0, float 5.0>, <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 2>
@@ -172,10 +173,8 @@ define amdgpu_ps void @struct_buffer_store_format_insert_first_at_end(<4 x i32>
 ; GCN-NEXT:    ret void
 ;
 ; GFX12-LABEL: @struct_buffer_store_format_insert_first_at_end(
-; GFX12-NEXT:    [[NEWVDATA2:%.*]] = insertelement <4 x float> <float poison, float 0.000000e+00, float poison, float poison>, float [[VDATA1:%.*]], i64 0
-; GFX12-NEXT:    [[NEWVDATA3:%.*]] = insertelement <4 x float> [[NEWVDATA2]], float [[VDATA1]], i64 2
-; GFX12-NEXT:    [[NEWVDATA4:%.*]] = insertelement <4 x float> [[NEWVDATA3]], float [[VDATA1]], i64 3
-; GFX12-NEXT:    call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0)
+; GFX12-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[VDATA1:%.*]], i64 0
+; GFX12-NEXT:    call void @llvm.amdgcn.struct.buffer.store.format.v2f32(<2 x float> [[TMP1]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0)
 ; GFX12-NEXT:    ret void
 ;
   %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
@@ -194,9 +193,8 @@ define amdgpu_ps void @struct_tbuffer_store_insert(<4 x i32> inreg %a, float %vd
 ; GCN-NEXT:    ret void
 ;
 ; GFX12-LABEL: @struct_tbuffer_store_insert(
-; GFX12-NEXT:    [[NEWVDATA3:%.*]] = insertelement <4 x float> <float poison, float 1.000000e+00, float 2.000000e+00, float poison>, float [[VDATA1:%.*]], i64 0
-; GFX12-NEXT:    [[NEWVDATA4:%.*]] = insertelement <4 x float> [[NEWVDATA3]], float [[VDATA1]], i64 3
-; GFX12-NEXT:    call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15)
+; GFX12-NEXT:    [[TMP1:%.*]] = insertelement <3 x float> <float poison, float 1.000000e+00, float 2.000000e+00>, float [[VDATA1:%.*]], i64 0
+; GFX12-NEXT:    call void @llvm.amdgcn.struct.tbuffer.store.v3f32(<3 x float> [[TMP1]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15)
 ; GFX12-NEXT:    ret void
 ;
   %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0

>From 04f0ba668e9d6558263566fe9a3e990eb417f1b3 Mon Sep 17 00:00:00 2001
From: Mariusz Sikora <mariusz.sikora at amd.com>
Date: Fri, 5 Jan 2024 08:33:17 +0100
Subject: [PATCH 2/2] Add features for default components {zero,broadcast}

---
 llvm/lib/Target/AMDGPU/AMDGPU.td              | 34 +++++++--
 .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp     |  6 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |  8 +++
 .../amdgcn-simplify-image-buffer-stores.ll    | 69 +++++++++++++++++++
 4 files changed, 108 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index d2a325d5ad898c..8e1f22ae3e47c4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -719,6 +719,18 @@ def FeatureFlatAtomicFaddF32Inst
   "Has flat_atomic_add_f32 instruction"
 >;
 
+def FeatureDefaultComponentZero : SubtargetFeature<"default-component-zero",
+  "HasDefaultComponentZero",
+  "true",
+  "BUFFER/IMAGE store instructions set unspecified components to zero"
+>;
+
+def FeatureDefaultComponentBroadcast : SubtargetFeature<"default-component-broadcast",
+  "HasDefaultComponentBroadcast",
+  "true",
+  "BUFFER/IMAGE store instructions set unspecified components to x component"
+>;
+
 def FeatureSupportsSRAMECC : SubtargetFeature<"sramecc-support",
   "SupportsSRAMECC",
   "true",
@@ -997,7 +1009,7 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
   FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts,
   FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel,
   FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts,
-  FeatureGDS, FeatureGWS
+  FeatureGDS, FeatureGWS, FeatureDefaultComponentZero
   ]
 >;
 
@@ -1008,7 +1020,7 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
   FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange,
   FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
   FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess,
-  FeatureImageInsts, FeatureGDS, FeatureGWS
+  FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero
   ]
 >;
 
@@ -1023,7 +1035,8 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
    FeatureIntClamp, FeatureTrigReducedRange, FeatureGFX8Insts,
    FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
    FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32,
-   FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS
+   FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS,
+   FeatureDefaultComponentZero
   ]
 >;
 
@@ -1041,7 +1054,7 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
    FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16,
    FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
    FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
-   FeatureNegativeScratchOffsetBug, FeatureGWS
+   FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero
   ]
 >;
 
@@ -1061,7 +1074,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
    FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
    FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16,
    FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts,
-   FeatureGDS, FeatureGWS
+   FeatureGDS, FeatureGWS, FeatureDefaultComponentZero
   ]
 >;
 
@@ -1081,7 +1094,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
    FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
    FeatureA16, FeatureFastDenormalF32, FeatureG16,
    FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS,
-   FeatureGWS
+   FeatureGWS, FeatureDefaultComponentZero
   ]
 >;
 
@@ -1101,7 +1114,7 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
    FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
    FeatureA16, FeatureFastDenormalF32, FeatureG16,
    FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
-   FeatureTrue16BitInsts
+   FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast
   ]
 >;
 
@@ -2005,6 +2018,13 @@ def HasFlatAtomicFaddF32Inst
   : Predicate<"Subtarget->hasFlatAtomicFaddF32Inst()">,
   AssemblerPredicate<(all_of FeatureFlatAtomicFaddF32Inst)>;
 
+def HasDefaultComponentZero
+  : Predicate<"Subtarget->hasDefaultComponentZero()">,
+  AssemblerPredicate<(all_of FeatureDefaultComponentZero)>;
+def HasDefaultComponentBroadcast
+  : Predicate<"Subtarget->hasDefaultComponentBroadcast()">,
+  AssemblerPredicate<(all_of FeatureDefaultComponentBroadcast)>;
+
 def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">,
   AssemblerPredicate<(all_of FeatureDsSrc2Insts)>;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index da2f862308558b..898289019c7189 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1170,10 +1170,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       break;
 
     APInt DemandedElts;
-    if (AMDGPU::isGFX12Plus(*ST))
+    if (ST->hasDefaultComponentBroadcast())
       DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
-    else
+    else if (ST->hasDefaultComponentZero())
       DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
+    else
+      break;
 
     int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
     if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 91a70930326955..52810ed7ce67ee 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -165,6 +165,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasAtomicCSubNoRtnInsts = false;
   bool HasAtomicGlobalPkAddBF16Inst = false;
   bool HasFlatAtomicFaddF32Inst = false;
+  bool HasDefaultComponentZero = false;
+  bool HasDefaultComponentBroadcast = false;
   bool SupportsSRAMECC = false;
 
   // This should not be used directly. 'TargetID' tracks the dynamic settings
@@ -799,6 +801,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
 
+  bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
+
+  bool hasDefaultComponentBroadcast() const {
+    return HasDefaultComponentBroadcast;
+  }
+
   bool hasNoSdstCMPX() const {
     return HasNoSdstCMPX;
   }
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
index 95b1d09bbd6036..9cef4a3c7cc0f6 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
@@ -3,6 +3,7 @@
 ; RUN: opt -mcpu=gfx1010 -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck -check-prefixes=GCN %s
 ; RUN: opt -mcpu=gfx1100 -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck -check-prefixes=GCN %s
 ; RUN: opt -mcpu=gfx1200 -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: opt -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck -check-prefixes=GFXUNKNOWN %s
 
 define amdgpu_ps void @image_store_1d_store_all_zeros(<8 x i32> inreg %rsrc, i32 %s) #0 {
 ; GCN-LABEL: @image_store_1d_store_all_zeros(
@@ -12,6 +13,10 @@ define amdgpu_ps void @image_store_1d_store_all_zeros(<8 x i32> inreg %rsrc, i32
 ; GFX12-LABEL: @image_store_1d_store_all_zeros(
 ; GFX12-NEXT:    call void @llvm.amdgcn.image.store.1d.f32.i32(float 0.000000e+00, i32 1, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
 ; GFX12-NEXT:    ret void
+;
+; GFXUNKNOWN-LABEL: @image_store_1d_store_all_zeros(
+; GFXUNKNOWN-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> zeroinitializer, i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; GFXUNKNOWN-NEXT:    ret void
 ;
   call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> zeroinitializer, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
   ret void
@@ -26,6 +31,11 @@ define amdgpu_ps void @image_store_1d_store_insert_zeros_at_end(<8 x i32> inreg
 ; GFX12-NEXT:    [[NEWVDATA4:%.*]] = insertelement <4 x float> <float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float [[VDATA1:%.*]], i64 0
 ; GFX12-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[NEWVDATA4]], i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
 ; GFX12-NEXT:    ret void
+;
+; GFXUNKNOWN-LABEL: @image_store_1d_store_insert_zeros_at_end(
+; GFXUNKNOWN-NEXT:    [[NEWVDATA4:%.*]] = insertelement <4 x float> <float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float [[VDATA1:%.*]], i64 0
+; GFXUNKNOWN-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[NEWVDATA4]], i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; GFXUNKNOWN-NEXT:    ret void
 ;
   %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
   %newvdata2 = insertelement <4 x float> %newvdata1, float 0.0, i32 1
@@ -47,6 +57,12 @@ define amdgpu_ps void @image_store_mip_1d_store_insert_zeros_at_end(<8 x i32> in
 ; GFX12-NEXT:    [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[VDATA2:%.*]], i64 2
 ; GFX12-NEXT:    call void @llvm.amdgcn.image.store.1d.v3f32.i32(<3 x float> [[TMP2]], i32 7, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
 ; GFX12-NEXT:    ret void
+;
+; GFXUNKNOWN-LABEL: @image_store_mip_1d_store_insert_zeros_at_end(
+; GFXUNKNOWN-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> <float 0.000000e+00, float poison, float poison, float 0.000000e+00>, float [[VDATA1:%.*]], i64 1
+; GFXUNKNOWN-NEXT:    [[NEWVDATA4:%.*]] = insertelement <4 x float> [[TMP1]], float [[VDATA2:%.*]], i64 2
+; GFXUNKNOWN-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[NEWVDATA4]], i32 7, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; GFXUNKNOWN-NEXT:    ret void
 ;
   %newvdata1 = insertelement <4 x float> undef, float 0.0, i32 0
   %newvdata2 = insertelement <4 x float> %newvdata1, float %vdata1, i32 1
@@ -68,6 +84,12 @@ define amdgpu_ps void @buffer_store_format_insert_zeros_at_end(<4 x i32> inreg %
 ; GFX12-NEXT:    [[NEWVDATA4:%.*]] = insertelement <4 x float> [[TMP1]], float [[VDATA1]], i64 1
 ; GFX12-NEXT:    call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false)
 ; GFX12-NEXT:    ret void
+;
+; GFXUNKNOWN-LABEL: @buffer_store_format_insert_zeros_at_end(
+; GFXUNKNOWN-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> <float poison, float poison, float 0.000000e+00, float 0.000000e+00>, float [[VDATA1:%.*]], i64 0
+; GFXUNKNOWN-NEXT:    [[NEWVDATA4:%.*]] = insertelement <4 x float> [[TMP1]], float [[VDATA1]], i64 1
+; GFXUNKNOWN-NEXT:    call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false)
+; GFXUNKNOWN-NEXT:    ret void
 ;
   %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
   %newvdata2 = insertelement <4 x float> %newvdata1, float %vdata1, i32 1
@@ -89,6 +111,12 @@ define amdgpu_ps void @struct_buffer_store_format_insert_zeros(<4 x i32> inreg %
 ; GFX12-NEXT:    [[NEWVDATA4:%.*]] = insertelement <4 x float> [[TMP1]], float [[VDATA1]], i64 2
 ; GFX12-NEXT:    call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0)
 ; GFX12-NEXT:    ret void
+;
+; GFXUNKNOWN-LABEL: @struct_buffer_store_format_insert_zeros(
+; GFXUNKNOWN-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> <float poison, float 0.000000e+00, float poison, float 0.000000e+00>, float [[VDATA1:%.*]], i64 0
+; GFXUNKNOWN-NEXT:    [[NEWVDATA4:%.*]] = insertelement <4 x float> [[TMP1]], float [[VDATA1]], i64 2
+; GFXUNKNOWN-NEXT:    call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0)
+; GFXUNKNOWN-NEXT:    ret void
 ;
   %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
   %newvdata2 = insertelement <4 x float> %newvdata1, float 0.0, i32 1
@@ -108,6 +136,11 @@ define amdgpu_ps void @struct_tbuffer_store_insert_zeros_at_beginning(<4 x i32>
 ; GFX12-NEXT:    [[NEWVDATA4:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[VDATA1:%.*]], i64 3
 ; GFX12-NEXT:    call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15)
 ; GFX12-NEXT:    ret void
+;
+; GFXUNKNOWN-LABEL: @struct_tbuffer_store_insert_zeros_at_beginning(
+; GFXUNKNOWN-NEXT:    [[NEWVDATA4:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[VDATA1:%.*]], i64 3
+; GFXUNKNOWN-NEXT:    call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15)
+; GFXUNKNOWN-NEXT:    ret void
 ;
   %newvdata1 = insertelement <4 x float> undef, float 0.0, i32 0
   %newvdata2 = insertelement <4 x float> %newvdata1, float 0.0, i32 1
@@ -127,6 +160,11 @@ define amdgpu_ps void @struct_tbuffer_store_insert_undefs(<4 x i32> inreg %a, fl
 ; GFX12-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> <float poison, float 1.000000e+00>, float [[VDATA1:%.*]], i64 0
 ; GFX12-NEXT:    call void @llvm.amdgcn.struct.tbuffer.store.v2f32(<2 x float> [[TMP1]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15)
 ; GFX12-NEXT:    ret void
+;
+; GFXUNKNOWN-LABEL: @struct_tbuffer_store_insert_undefs(
+; GFXUNKNOWN-NEXT:    [[NEWVDATA2:%.*]] = insertelement <4 x float> <float poison, float 1.000000e+00, float poison, float poison>, float [[VDATA1:%.*]], i64 0
+; GFXUNKNOWN-NEXT:    call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> [[NEWVDATA2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15)
+; GFXUNKNOWN-NEXT:    ret void
 ;
   %newvdata1 = insertelement <4 x float> poison, float %vdata1, i32 0
   %newvdata2 = insertelement <4 x float> %newvdata1, float 1.0, i32 1
@@ -144,6 +182,11 @@ define amdgpu_ps void @image_store_1d_store_shufflevector_same(<8 x i32> inreg %
 ; GFX12-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[VDATA1:%.*]], i64 0
 ; GFX12-NEXT:    call void @llvm.amdgcn.image.store.1d.f32.i32(float [[TMP1]], i32 1, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
 ; GFX12-NEXT:    ret void
+;
+; GFXUNKNOWN-LABEL: @image_store_1d_store_shufflevector_same(
+; GFXUNKNOWN-NEXT:    [[DATA:%.*]] = shufflevector <4 x float> [[VDATA1:%.*]], <4 x float> poison, <4 x i32> zeroinitializer
+; GFXUNKNOWN-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[DATA]], i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; GFXUNKNOWN-NEXT:    ret void
 ;
   %data = shufflevector <4 x float> %vdata1, <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
   call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %data, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -158,6 +201,10 @@ define amdgpu_ps void @image_store_1d_store_shufflevector(<8 x i32> inreg %rsrc,
 ; GFX12-LABEL: @image_store_1d_store_shufflevector(
 ; GFX12-NEXT:    call void @llvm.amdgcn.image.store.1d.v3f32.i32(<3 x float> <float 2.000000e+00, float 2.000000e+00, float 5.000000e+00>, i32 7, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
 ; GFX12-NEXT:    ret void
+;
+; GFXUNKNOWN-LABEL: @image_store_1d_store_shufflevector(
+; GFXUNKNOWN-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> <float 2.000000e+00, float 2.000000e+00, float 5.000000e+00, float 2.000000e+00>, i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; GFXUNKNOWN-NEXT:    ret void
 ;
   %data = shufflevector <4 x float> <float 2.0, float 1.0, float 2.0, float 5.0>, <4 x float> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 2>
   call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %data, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -176,6 +223,13 @@ define amdgpu_ps void @struct_buffer_store_format_insert_first_at_end(<4 x i32>
 ; GFX12-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[VDATA1:%.*]], i64 0
 ; GFX12-NEXT:    call void @llvm.amdgcn.struct.buffer.store.format.v2f32(<2 x float> [[TMP1]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0)
 ; GFX12-NEXT:    ret void
+;
+; GFXUNKNOWN-LABEL: @struct_buffer_store_format_insert_first_at_end(
+; GFXUNKNOWN-NEXT:    [[NEWVDATA2:%.*]] = insertelement <4 x float> <float poison, float 0.000000e+00, float poison, float poison>, float [[VDATA1:%.*]], i64 0
+; GFXUNKNOWN-NEXT:    [[NEWVDATA3:%.*]] = insertelement <4 x float> [[NEWVDATA2]], float [[VDATA1]], i64 2
+; GFXUNKNOWN-NEXT:    [[NEWVDATA4:%.*]] = insertelement <4 x float> [[NEWVDATA3]], float [[VDATA1]], i64 3
+; GFXUNKNOWN-NEXT:    call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0)
+; GFXUNKNOWN-NEXT:    ret void
 ;
   %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
   %newvdata2 = insertelement <4 x float> %newvdata1, float 0.0, i32 1
@@ -196,6 +250,12 @@ define amdgpu_ps void @struct_tbuffer_store_insert(<4 x i32> inreg %a, float %vd
 ; GFX12-NEXT:    [[TMP1:%.*]] = insertelement <3 x float> <float poison, float 1.000000e+00, float 2.000000e+00>, float [[VDATA1:%.*]], i64 0
 ; GFX12-NEXT:    call void @llvm.amdgcn.struct.tbuffer.store.v3f32(<3 x float> [[TMP1]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15)
 ; GFX12-NEXT:    ret void
+;
+; GFXUNKNOWN-LABEL: @struct_tbuffer_store_insert(
+; GFXUNKNOWN-NEXT:    [[NEWVDATA3:%.*]] = insertelement <4 x float> <float poison, float 1.000000e+00, float 2.000000e+00, float poison>, float [[VDATA1:%.*]], i64 0
+; GFXUNKNOWN-NEXT:    [[NEWVDATA4:%.*]] = insertelement <4 x float> [[NEWVDATA3]], float [[VDATA1]], i64 3
+; GFXUNKNOWN-NEXT:    call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15)
+; GFXUNKNOWN-NEXT:    ret void
 ;
   %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
   %newvdata2 = insertelement <4 x float> %newvdata1, float 1.0, i32 1
@@ -213,6 +273,10 @@ define amdgpu_ps void @struct_tbuffer_store_argument(<4 x i32> inreg %a, <4 x fl
 ; GFX12-LABEL: @struct_tbuffer_store_argument(
 ; GFX12-NEXT:    call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> [[VDATA4:%.*]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15)
 ; GFX12-NEXT:    ret void
+;
+; GFXUNKNOWN-LABEL: @struct_tbuffer_store_argument(
+; GFXUNKNOWN-NEXT:    call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> [[VDATA4:%.*]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15)
+; GFXUNKNOWN-NEXT:    ret void
 ;
   call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %vdata4, <4 x i32> %a, i32 %b, i32 0, i32 42, i32 0, i32 15)
   ret void
@@ -228,6 +292,11 @@ define amdgpu_ps void @struct_tbuffer_store_argument_insert_first(<4 x i32> inre
 ; GFX12-NEXT:    [[NEWVDATA4:%.*]] = insertelement <4 x float> [[VDATA4:%.*]], float [[VDATA1:%.*]], i64 0
 ; GFX12-NEXT:    call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15)
 ; GFX12-NEXT:    ret void
+;
+; GFXUNKNOWN-LABEL: @struct_tbuffer_store_argument_insert_first(
+; GFXUNKNOWN-NEXT:    [[NEWVDATA4:%.*]] = insertelement <4 x float> [[VDATA4:%.*]], float [[VDATA1:%.*]], i64 0
+; GFXUNKNOWN-NEXT:    call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15)
+; GFXUNKNOWN-NEXT:    ret void
 ;
   %newvdata4 = insertelement <4 x float> %vdata4, float %vdata1, i32 0
   call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %newvdata4, <4 x i32> %a, i32 %b, i32 0, i32 42, i32 0, i32 15)