[clang] [llvm] [AMDGPU] Restrict `@llvm.amdgcn.image.*` intrinsic's dmask argument (PR #179511)

Alexey Sachkov via cfe-commits cfe-commits at lists.llvm.org
Wed Mar 11 07:50:19 PDT 2026


https://github.com/AlexeySachkov updated https://github.com/llvm/llvm-project/pull/179511

>From 4539640c220c51f1ec4ce687dafa4029c9b7caf9 Mon Sep 17 00:00:00 2001
From: Alexey Sachkov <alexey.sachkov at amd.com>
Date: Tue, 3 Feb 2026 11:16:58 -0600
Subject: [PATCH 1/8] [AMDGPU] Fix crash in InstCombine

Added an out-of-bounds check to avoid crashes when simplifying
`@llvm.amdgcn.image.load.*` intrinsics that return vector types and the
mask argument "enables" more elements than there are in the return type.
---
 .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp     |   2 +
 .../fix-amdgcn-image-load-dmask-crash.ll      | 123 ++++++++++++++++++
 2 files changed, 125 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 2cd1902785546..0756a3c257738 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1861,6 +1861,8 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
     for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
       const unsigned Bit = 1 << SrcIdx;
       if (!!(DMaskVal & Bit)) {
+        if (OrigLdStIdx >= DemandedElts.getBitWidth())
+          break;
         if (!!DemandedElts[OrigLdStIdx])
           NewDMaskVal |= Bit;
         OrigLdStIdx++;
diff --git a/llvm/test/CodeGen/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll b/llvm/test/CodeGen/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll
new file mode 100644
index 0000000000000..571b9e4f123b7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll
@@ -0,0 +1,123 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -mtriple amdgcn -passes=instcombine %s -S -o - | FileCheck %s
+;
+; The main purpose of the test is to ensure that we do not crash when the mask
+; argument "enables" more elements than there are in return type.
+; This specific corner case was discovered by a fuzzer.
+
+define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; CHECK-LABEL: define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) {
+; CHECK-NEXT:  [[MAIN_BODY:.*:]]
+; CHECK-NEXT:    [[I:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I1:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I2:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I3:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <2 x float>] undef, <2 x float> [[I]], 0
+; CHECK-NEXT:    [[I5:%.*]] = insertvalue [4 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
+; CHECK-NEXT:    [[I6:%.*]] = insertvalue [4 x <2 x float>] [[I5]], <2 x float> [[I2]], 2
+; CHECK-NEXT:    [[I7:%.*]] = insertvalue [4 x <2 x float>] [[I6]], <2 x float> [[I3]], 3
+; CHECK-NEXT:    ret [4 x <2 x float>] [[I7]]
+;
+main_body:
+  %i = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 7, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+  %i1 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 11, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+  %i2 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 15, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+  %i3 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 -1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+  %i4 = insertvalue [4 x <2 x float>] undef, <2 x float> %i, 0
+  %i5 = insertvalue [4 x <2 x float>] %i4, <2 x float> %i1, 1
+  %i6 = insertvalue [4 x <2 x float>] %i5, <2 x float> %i2, 2
+  %i7 = insertvalue [4 x <2 x float>] %i6, <2 x float> %i3, 3
+  ret [4 x <2 x float>] %i7
+}
+
+define amdgpu_ps [4 x <3 x float>] @load_2dmsaa_v4v3f32_dmask(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; CHECK-LABEL: define amdgpu_ps [4 x <3 x float>] @load_2dmsaa_v4v3f32_dmask(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) {
+; CHECK-NEXT:  [[MAIN_BODY:.*:]]
+; CHECK-NEXT:    [[I:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32.v8i32(i32 11, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I1:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32.v8i32(i32 7, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I2:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32.v8i32(i32 7, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I3:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32.v8i32(i32 -11, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x float> [[I3]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <3 x float>] undef, <3 x float> [[I]], 0
+; CHECK-NEXT:    [[I5:%.*]] = insertvalue [4 x <3 x float>] [[I4]], <3 x float> [[I1]], 1
+; CHECK-NEXT:    [[I6:%.*]] = insertvalue [4 x <3 x float>] [[I5]], <3 x float> [[I2]], 2
+; CHECK-NEXT:    [[I7:%.*]] = insertvalue [4 x <3 x float>] [[I6]], <3 x float> [[TMP0]], 3
+; CHECK-NEXT:    ret [4 x <3 x float>] [[I7]]
+;
+main_body:
+  %i = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 11, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+  %i1 = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 15, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+  %i2 = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 31, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+  %i3 = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 -11, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+  %i4 = insertvalue [4 x <3 x float>] undef, <3 x float> %i, 0
+  %i5 = insertvalue [4 x <3 x float>] %i4, <3 x float> %i1, 1
+  %i6 = insertvalue [4 x <3 x float>] %i5, <3 x float> %i2, 2
+  %i7 = insertvalue [4 x <3 x float>] %i6, <3 x float> %i3, 3
+  ret [4 x <3 x float>] %i7
+}
+
+define amdgpu_ps [4 x <2 x float>] @load_2darraymsaa_v4v2f32_dmask(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
+; CHECK-LABEL: define amdgpu_ps [4 x <2 x float>] @load_2darraymsaa_v4v2f32_dmask(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[SLICE:%.*]]) {
+; CHECK-NEXT:  [[MAIN_BODY:.*:]]
+; CHECK-NEXT:    [[I:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I1:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I2:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I3:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <2 x float>] undef, <2 x float> [[I]], 0
+; CHECK-NEXT:    [[I5:%.*]] = insertvalue [4 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
+; CHECK-NEXT:    [[I6:%.*]] = insertvalue [4 x <2 x float>] [[I5]], <2 x float> [[I2]], 2
+; CHECK-NEXT:    [[I7:%.*]] = insertvalue [4 x <2 x float>] [[I6]], <2 x float> [[I3]], 3
+; CHECK-NEXT:    ret [4 x <2 x float>] [[I7]]
+;
+main_body:
+  %i = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 7, i32 %s, i32 %t, i32 %slice, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+  %i1 = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 11, i32 %s, i32 %t, i32 %slice, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+  %i2 = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+  %i3 = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 -1, i32 %s, i32 %t, i32 %slice, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+  %i4 = insertvalue [4 x <2 x float>] undef, <2 x float> %i, 0
+  %i5 = insertvalue [4 x <2 x float>] %i4, <2 x float> %i1, 1
+  %i6 = insertvalue [4 x <2 x float>] %i5, <2 x float> %i2, 2
+  %i7 = insertvalue [4 x <2 x float>] %i6, <2 x float> %i3, 3
+  ret [4 x <2 x float>] %i7
+}
+
+define amdgpu_ps [4 x <3 x float>] @load_2darraymsaa_v4v3f32_dmask(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
+; CHECK-LABEL: define amdgpu_ps [4 x <3 x float>] @load_2darraymsaa_v4v3f32_dmask(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[SLICE:%.*]]) {
+; CHECK-NEXT:  [[MAIN_BODY:.*:]]
+; CHECK-NEXT:    [[I:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32.v8i32(i32 11, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I1:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32.v8i32(i32 7, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I2:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32.v8i32(i32 7, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I3:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32.v8i32(i32 7, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <3 x float>] undef, <3 x float> [[I]], 0
+; CHECK-NEXT:    [[I5:%.*]] = insertvalue [4 x <3 x float>] [[I4]], <3 x float> [[I1]], 1
+; CHECK-NEXT:    [[I6:%.*]] = insertvalue [4 x <3 x float>] [[I5]], <3 x float> [[I2]], 2
+; CHECK-NEXT:    [[I7:%.*]] = insertvalue [4 x <3 x float>] [[I6]], <3 x float> [[I3]], 3
+; CHECK-NEXT:    ret [4 x <3 x float>] [[I7]]
+;
+main_body:
+  %i = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32(i32 11, i32 %s, i32 %t, i32 %slice, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+  %i1 = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+  %i2 = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32(i32 31, i32 %s, i32 %t, i32 %slice, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+  %i3 = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32(i32 -1, i32 %s, i32 %t, i32 %slice, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+  %i4 = insertvalue [4 x <3 x float>] undef, <3 x float> %i, 0
+  %i5 = insertvalue [4 x <3 x float>] %i4, <3 x float> %i1, 1
+  %i6 = insertvalue [4 x <3 x float>] %i5, <3 x float> %i2, 2
+  %i7 = insertvalue [4 x <3 x float>] %i6, <3 x float> %i3, 3
+  ret [4 x <3 x float>] %i7
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
+declare <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 immarg, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
+declare <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 immarg, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
+declare <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 immarg, i32, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
+declare <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32(i32 immarg, i32, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0

>From 93e5ccdadd7133acb94772d2ba1ca9afa5c19208 Mon Sep 17 00:00:00 2001
From: Alexey Sachkov <alexey.sachkov at amd.com>
Date: Wed, 4 Feb 2026 05:16:42 -0600
Subject: [PATCH 2/8] Apply comments

---
 .../fix-amdgcn-image-load-dmask-crash.ll      | 20 ++++++++-----------
 1 file changed, 8 insertions(+), 12 deletions(-)
 rename llvm/test/{CodeGen => Transforms/InstCombine}/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll (91%)

diff --git a/llvm/test/CodeGen/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll b/llvm/test/Transforms/InstCombine/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll
similarity index 91%
rename from llvm/test/CodeGen/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll
rename to llvm/test/Transforms/InstCombine/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll
index 571b9e4f123b7..c81f1fe7cde9a 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll
@@ -13,7 +13,7 @@ define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask(<8 x i32> inreg %r
 ; CHECK-NEXT:    [[I1:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
 ; CHECK-NEXT:    [[I2:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
 ; CHECK-NEXT:    [[I3:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
-; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <2 x float>] undef, <2 x float> [[I]], 0
+; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <2 x float>] poison, <2 x float> [[I]], 0
 ; CHECK-NEXT:    [[I5:%.*]] = insertvalue [4 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
 ; CHECK-NEXT:    [[I6:%.*]] = insertvalue [4 x <2 x float>] [[I5]], <2 x float> [[I2]], 2
 ; CHECK-NEXT:    [[I7:%.*]] = insertvalue [4 x <2 x float>] [[I6]], <2 x float> [[I3]], 3
@@ -24,7 +24,7 @@ main_body:
   %i1 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 11, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
   %i2 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 15, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
   %i3 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 -1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
-  %i4 = insertvalue [4 x <2 x float>] undef, <2 x float> %i, 0
+  %i4 = insertvalue [4 x <2 x float>] poison, <2 x float> %i, 0
   %i5 = insertvalue [4 x <2 x float>] %i4, <2 x float> %i1, 1
   %i6 = insertvalue [4 x <2 x float>] %i5, <2 x float> %i2, 2
   %i7 = insertvalue [4 x <2 x float>] %i6, <2 x float> %i3, 3
@@ -40,7 +40,7 @@ define amdgpu_ps [4 x <3 x float>] @load_2dmsaa_v4v3f32_dmask(<8 x i32> inreg %r
 ; CHECK-NEXT:    [[I2:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32.v8i32(i32 7, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
 ; CHECK-NEXT:    [[I3:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32.v8i32(i32 -11, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
 ; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x float> [[I3]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison>
-; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <3 x float>] undef, <3 x float> [[I]], 0
+; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <3 x float>] poison, <3 x float> [[I]], 0
 ; CHECK-NEXT:    [[I5:%.*]] = insertvalue [4 x <3 x float>] [[I4]], <3 x float> [[I1]], 1
 ; CHECK-NEXT:    [[I6:%.*]] = insertvalue [4 x <3 x float>] [[I5]], <3 x float> [[I2]], 2
 ; CHECK-NEXT:    [[I7:%.*]] = insertvalue [4 x <3 x float>] [[I6]], <3 x float> [[TMP0]], 3
@@ -51,7 +51,7 @@ main_body:
   %i1 = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 15, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
   %i2 = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 31, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
   %i3 = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 -11, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
-  %i4 = insertvalue [4 x <3 x float>] undef, <3 x float> %i, 0
+  %i4 = insertvalue [4 x <3 x float>] poison, <3 x float> %i, 0
   %i5 = insertvalue [4 x <3 x float>] %i4, <3 x float> %i1, 1
   %i6 = insertvalue [4 x <3 x float>] %i5, <3 x float> %i2, 2
   %i7 = insertvalue [4 x <3 x float>] %i6, <3 x float> %i3, 3
@@ -66,7 +66,7 @@ define amdgpu_ps [4 x <2 x float>] @load_2darraymsaa_v4v2f32_dmask(<8 x i32> inr
 ; CHECK-NEXT:    [[I1:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
 ; CHECK-NEXT:    [[I2:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
 ; CHECK-NEXT:    [[I3:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
-; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <2 x float>] undef, <2 x float> [[I]], 0
+; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <2 x float>] poison, <2 x float> [[I]], 0
 ; CHECK-NEXT:    [[I5:%.*]] = insertvalue [4 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
 ; CHECK-NEXT:    [[I6:%.*]] = insertvalue [4 x <2 x float>] [[I5]], <2 x float> [[I2]], 2
 ; CHECK-NEXT:    [[I7:%.*]] = insertvalue [4 x <2 x float>] [[I6]], <2 x float> [[I3]], 3
@@ -77,7 +77,7 @@ main_body:
   %i1 = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 11, i32 %s, i32 %t, i32 %slice, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
   %i2 = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
   %i3 = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 -1, i32 %s, i32 %t, i32 %slice, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
-  %i4 = insertvalue [4 x <2 x float>] undef, <2 x float> %i, 0
+  %i4 = insertvalue [4 x <2 x float>] poison, <2 x float> %i, 0
   %i5 = insertvalue [4 x <2 x float>] %i4, <2 x float> %i1, 1
   %i6 = insertvalue [4 x <2 x float>] %i5, <2 x float> %i2, 2
   %i7 = insertvalue [4 x <2 x float>] %i6, <2 x float> %i3, 3
@@ -92,7 +92,7 @@ define amdgpu_ps [4 x <3 x float>] @load_2darraymsaa_v4v3f32_dmask(<8 x i32> inr
 ; CHECK-NEXT:    [[I1:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32.v8i32(i32 7, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
 ; CHECK-NEXT:    [[I2:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32.v8i32(i32 7, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
 ; CHECK-NEXT:    [[I3:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32.v8i32(i32 7, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
-; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <3 x float>] undef, <3 x float> [[I]], 0
+; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <3 x float>] poison, <3 x float> [[I]], 0
 ; CHECK-NEXT:    [[I5:%.*]] = insertvalue [4 x <3 x float>] [[I4]], <3 x float> [[I1]], 1
 ; CHECK-NEXT:    [[I6:%.*]] = insertvalue [4 x <3 x float>] [[I5]], <3 x float> [[I2]], 2
 ; CHECK-NEXT:    [[I7:%.*]] = insertvalue [4 x <3 x float>] [[I6]], <3 x float> [[I3]], 3
@@ -103,21 +103,17 @@ main_body:
   %i1 = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
   %i2 = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32(i32 31, i32 %s, i32 %t, i32 %slice, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
   %i3 = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32(i32 -1, i32 %s, i32 %t, i32 %slice, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
-  %i4 = insertvalue [4 x <3 x float>] undef, <3 x float> %i, 0
+  %i4 = insertvalue [4 x <3 x float>] poison, <3 x float> %i, 0
   %i5 = insertvalue [4 x <3 x float>] %i4, <3 x float> %i1, 1
   %i6 = insertvalue [4 x <3 x float>] %i5, <3 x float> %i2, 2
   %i7 = insertvalue [4 x <3 x float>] %i6, <3 x float> %i3, 3
   ret [4 x <3 x float>] %i7
 }
 
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
 declare <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 immarg, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
 
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
 declare <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 immarg, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
 
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
 declare <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 immarg, i32, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
 
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
 declare <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32(i32 immarg, i32, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0

>From 6a3f4d59505e551b746ed1624ab9133f010d9c2d Mon Sep 17 00:00:00 2001
From: Alexey Sachkov <alexey.sachkov at amd.com>
Date: Thu, 5 Feb 2026 03:05:14 -0600
Subject: [PATCH 3/8] Catch the corner case in the verifier

---
 llvm/lib/IR/Verifier.cpp                      |  33 ++
 .../fix-amdgcn-image-load-dmask-crash.ll      |   4 +-
 .../intrinsic-amdgcn-image-load-dmask.ll      | 307 ++++++++++++++++++
 3 files changed, 343 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-load-dmask.ll

diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 3d44d1317ecc7..3e62d9525dacc 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -7147,6 +7147,39 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
           &Call, Op);
     break;
   }
+  case Intrinsic::amdgcn_image_load_1d:
+  case Intrinsic::amdgcn_image_load_1darray:
+  case Intrinsic::amdgcn_image_load_2d:
+  case Intrinsic::amdgcn_image_load_2darray:
+  case Intrinsic::amdgcn_image_load_2darraymsaa:
+  case Intrinsic::amdgcn_image_load_2dmsaa:
+  case Intrinsic::amdgcn_image_load_3d:
+  case Intrinsic::amdgcn_image_load_cube:
+  case Intrinsic::amdgcn_image_load_mip_1d:
+  case Intrinsic::amdgcn_image_load_mip_1darray:
+  case Intrinsic::amdgcn_image_load_mip_2d:
+  case Intrinsic::amdgcn_image_load_mip_2darray:
+  case Intrinsic::amdgcn_image_load_mip_3d:
+  case Intrinsic::amdgcn_image_load_mip_cube: {
+    // LLVM IR definition of those intrinsics says that they can return any
+    // type. The logic below is based on what is covered by the
+    // llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll test.
+    Type *T = Call.getType();
+    if (auto *ST = dyn_cast<StructType>(Call.getType()))
+      T = ST->getElementType(0);
+
+    unsigned VWidth = 1;
+    if (auto *FVT = dyn_cast<FixedVectorType>(T))
+      VWidth = FVT->getNumElements();
+
+    Value *V = Call.getArgOperand(0);
+    unsigned DMask = cast<ConstantInt>(V)->getZExtValue();
+    unsigned NumActiveBits = popcount(DMask);
+    Check(NumActiveBits <= VWidth,
+          "llvm.amdgcn.image.load.* intrinsic mask cannot have more active "
+          "bits than there are elements in the return type");
+    break;
+  }
   case Intrinsic::nvvm_setmaxnreg_inc_sync_aligned_u32:
   case Intrinsic::nvvm_setmaxnreg_dec_sync_aligned_u32: {
     Value *V = Call.getArgOperand(0);
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll b/llvm/test/Transforms/InstCombine/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll
index c81f1fe7cde9a..7f16d69623eed 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll
@@ -1,5 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -mtriple amdgcn -passes=instcombine %s -S -o - | FileCheck %s
+; RUN: opt -mtriple amdgcn -passes=instcombine --disable-verify %s -S -o - | FileCheck %s
+;
+; Verifier is disabled on purpose, because the IR is deemed invalid.
 ;
 ; The main purpose of the test is to ensure that we do not crash when the mask
 ; argument "enables" more elements than there are in return type.
diff --git a/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-load-dmask.ll b/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-load-dmask.ll
new file mode 100644
index 0000000000000..1cb6fe420f320
--- /dev/null
+++ b/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-load-dmask.ll
@@ -0,0 +1,307 @@
+; RUN: not llvm-as %s -disable-output 2>&1 | FileCheck %s
+
+define amdgpu_ps void @load_1d(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 31, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_1d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 31, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s) {
+main_body:
+  %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 -1, i32 %s, <8 x i32> %rsrc, i32 2, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 -1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2d.v4f32i32.i32(i32 31, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %r) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.3d.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_cube_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32 31, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_1darray_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %slice) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1darray.v4f32i32.i32(i32 31, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_2darray_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_2dmsaa_both(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %fragid) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2dmsaa.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darraymsaa.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_mip_1d(<8 x i32> inreg %rsrc, i32 %s, i32 %mip) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 31, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_mip_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %mip) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.1d.v4f32i32.i32(i32 31, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 2, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_mip_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_mip_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %mip) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_1d_V2_tfe(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32 -1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_1d_V1_tfe(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call {float,i32} @llvm.amdgcn.image.load.1d.f32i32.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_mip_2d_tfe_nouse(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_mip_2d_tfe_nouse_V2(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
+main_body:
+  %v = call {<2 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v2f32i32.i32(i32 7, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_mip_2d_tfe_nouse_V1(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
+main_body:
+  %v = call {float, i32} @llvm.amdgcn.image.load.mip.2d.f32i32.i32(i32 3, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_1d_tfe_V4(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 31, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_1d_tfe_V2(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s) {
+main_body:
+  %v = call {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32 -1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+
+define amdgpu_ps void @load_mip_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r, i32 %mip) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %r, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_mip_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %mip) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_mip_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice, i32 %mip) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32 31, i32 %s, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_mip_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %mip) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_1d_V1(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_1d_V2(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_1d_glc(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 31, i32 %s, <8 x i32> %rsrc, i32 0, i32 1)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_1d_slc(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 31, i32 %s, <8 x i32> %rsrc, i32 0, i32 2)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_1d_glc_slc(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 31, i32 %s, <8 x i32> %rsrc, i32 0, i32 3)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @image_load_mmo(<8 x i32> inreg %rsrc, ptr addrspace(3) %lds, <2 x i32> %c) #0 {
+  %c0 = extractelement <2 x i32> %c, i32 0
+  %c1 = extractelement <2 x i32> %c, i32 1
+  %tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 3, i32 %c0, i32 %c1, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #1
+declare {float,i32} @llvm.amdgcn.image.load.1d.f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
+declare {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.2d.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.3d.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.1darray.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.2dmsaa.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.2darraymsaa.v4f32i32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.mip.1d.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<2 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v2f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {float,i32} @llvm.amdgcn.image.load.mip.2d.f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+
+declare float @llvm.amdgcn.image.load.1d.f32.i32(i32, i32, <8 x i32>, i32, i32) #1
+declare float @llvm.amdgcn.image.load.2d.f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32, i32, <8 x i32>, i32, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone }

>From 8738f469b243bcef848e4a7cad0b6cf50b8624ab Mon Sep 17 00:00:00 2001
From: Alexey Sachkov <alexey.sachkov at amd.com>
Date: Thu, 5 Feb 2026 06:44:25 -0600
Subject: [PATCH 4/8] Adjust clang codegen tests accordingly

---
 clang/test/CodeGen/builtins-image-load.c | 168 +++++++++++------------
 1 file changed, 84 insertions(+), 84 deletions(-)

diff --git a/clang/test/CodeGen/builtins-image-load.c b/clang/test/CodeGen/builtins-image-load.c
index 8442124416338..0efa7725c2363 100644
--- a/clang/test/CodeGen/builtins-image-load.c
+++ b/clang/test/CodeGen/builtins-image-load.c
@@ -24,12 +24,12 @@ typedef half half4 __attribute__((ext_vector_type(4)));
 // CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
-// CHECK-NEXT:    [[TMP3:%.*]] = call float @llvm.amdgcn.image.load.2d.f32.i32.v8i32(i32 12, i32 [[TMP0]], i32 [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], i32 106, i32 103)
+// CHECK-NEXT:    [[TMP3:%.*]] = call float @llvm.amdgcn.image.load.2d.f32.i32.v8i32(i32 1, i32 [[TMP0]], i32 [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], i32 106, i32 103)
 // CHECK-NEXT:    ret float [[TMP3]]
 //
 float test_builtin_image_load_2d(float f32, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_2d_f32_i32(12, i32, i32, tex, 106, 103);
+  return __builtin_amdgcn_image_load_2d_f32_i32(1, i32, i32, tex, 106, 103);
 }
 
 // CHECK-LABEL: define dso_local <4 x float> @test_builtin_image_load_2d_1(
@@ -50,12 +50,12 @@ float test_builtin_image_load_2d(float f32, int i32, __amdgpu_texture_t tex) {
 // CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
-// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32.v8i32(i32 100, i32 [[TMP0]], i32 [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32.v8i32(i32 15, i32 [[TMP0]], i32 [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x float> [[TMP3]]
 //
 float4 test_builtin_image_load_2d_1(float4 v4f32, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_2d_v4f32_i32(100, i32, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_2d_v4f32_i32(15, i32, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_builtin_image_load_2d_2(
@@ -76,12 +76,12 @@ float4 test_builtin_image_load_2d_1(float4 v4f32, int i32, __amdgpu_texture_t te
 // CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
-// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i32.v8i32(i32 100, i32 [[TMP0]], i32 [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i32.v8i32(i32 15, i32 [[TMP0]], i32 [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP3]]
 //
 half4 test_builtin_image_load_2d_2(half4 v4f16, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_2d_v4f16_i32(100, i32, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_2d_v4f16_i32(15, i32, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local float @test_builtin_image_load_2darray(
@@ -103,12 +103,12 @@ half4 test_builtin_image_load_2d_2(half4 v4f16, int i32, __amdgpu_texture_t tex)
 // CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
-// CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.amdgcn.image.load.2darray.f32.i32.v8i32(i32 100, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.amdgcn.image.load.2darray.f32.i32.v8i32(i32 1, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret float [[TMP4]]
 //
 float test_builtin_image_load_2darray(float f32, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_2darray_f32_i32(100, i32, i32, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_2darray_f32_i32(1, i32, i32, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x float> @test_builtin_image_load_2darray_1(
@@ -130,12 +130,12 @@ float test_builtin_image_load_2darray(float f32, int i32, __amdgpu_texture_t tex
 // CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
-// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32.v8i32(i32 100, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32.v8i32(i32 15, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x float> [[TMP4]]
 //
 float4 test_builtin_image_load_2darray_1(float4 v4f32, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_2darray_v4f32_i32(100, i32, i32, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_2darray_v4f32_i32(15, i32, i32, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_builtin_image_load_2darray_2(
@@ -157,12 +157,12 @@ float4 test_builtin_image_load_2darray_1(float4 v4f32, int i32, __amdgpu_texture
 // CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
-// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.load.2darray.v4f16.i32.v8i32(i32 100, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.load.2darray.v4f16.i32.v8i32(i32 15, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP4]]
 //
 half4 test_builtin_image_load_2darray_2(half4 v4f16, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_2darray_v4f16_i32(100, i32, i32, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_2darray_v4f16_i32(15, i32, i32, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x float> @test_builtin_image_load_1d_1(
@@ -182,12 +182,12 @@ half4 test_builtin_image_load_2darray_2(half4 v4f16, int i32, __amdgpu_texture_t
 // CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32.v8i32(i32 100, i32 [[TMP0]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32.v8i32(i32 15, i32 [[TMP0]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x float> [[TMP2]]
 //
 float4 test_builtin_image_load_1d_1(float4 v4f32, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_1d_v4f32_i32(100, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_1d_v4f32_i32(15, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_builtin_image_load_1d_2(
@@ -207,12 +207,12 @@ float4 test_builtin_image_load_1d_1(float4 v4f32, int i32, __amdgpu_texture_t te
 // CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32
-// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i32.v8i32(i32 100, i32 [[TMP0]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i32.v8i32(i32 15, i32 [[TMP0]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP2]]
 //
 half4 test_builtin_image_load_1d_2(half4 v4f16, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_1d_v4f16_i32(100, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_1d_v4f16_i32(15, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x float> @test_builtin_image_load_1darray_1(
@@ -233,12 +233,12 @@ half4 test_builtin_image_load_1d_2(half4 v4f16, int i32, __amdgpu_texture_t tex)
 // CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
-// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32.v8i32(i32 100, i32 [[TMP0]], i32 [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32.v8i32(i32 15, i32 [[TMP0]], i32 [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x float> [[TMP3]]
 //
 float4 test_builtin_image_load_1darray_1(float4 v4f32, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_1darray_v4f32_i32(100, i32, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_1darray_v4f32_i32(15, i32, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_builtin_image_load_1darray_2(
@@ -259,12 +259,12 @@ float4 test_builtin_image_load_1darray_1(float4 v4f32, int i32, __amdgpu_texture
 // CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
-// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x half> @llvm.amdgcn.image.load.1darray.v4f16.i32.v8i32(i32 100, i32 [[TMP0]], i32 [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x half> @llvm.amdgcn.image.load.1darray.v4f16.i32.v8i32(i32 15, i32 [[TMP0]], i32 [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP3]]
 //
 half4 test_builtin_image_load_1darray_2(half4 v4f16, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_1darray_v4f16_i32(100, i32, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_1darray_v4f16_i32(15, i32, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x float> @test_builtin_image_load_3d_1(
@@ -286,12 +286,12 @@ half4 test_builtin_image_load_1darray_2(half4 v4f16, int i32, __amdgpu_texture_t
 // CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
-// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32.v8i32(i32 100, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32.v8i32(i32 15, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x float> [[TMP4]]
 //
 float4 test_builtin_image_load_3d_1(float4 v4f32, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_3d_v4f32_i32(100, i32, i32, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_3d_v4f32_i32(15, i32, i32, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_builtin_image_load_3d_2(
@@ -313,12 +313,12 @@ float4 test_builtin_image_load_3d_1(float4 v4f32, int i32, __amdgpu_texture_t te
 // CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
-// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.load.3d.v4f16.i32.v8i32(i32 100, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.load.3d.v4f16.i32.v8i32(i32 15, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP4]]
 //
 half4 test_builtin_image_load_3d_2(half4 v4f16, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_3d_v4f16_i32(100, i32, i32, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_3d_v4f16_i32(15, i32, i32, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x float> @test_builtin_image_load_cube_1(
@@ -340,12 +340,12 @@ half4 test_builtin_image_load_3d_2(half4 v4f16, int i32, __amdgpu_texture_t tex)
 // CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
-// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32.v8i32(i32 100, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32.v8i32(i32 15, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x float> [[TMP4]]
 //
 float4 test_builtin_image_load_cube_1(float4 v4f32, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_cube_v4f32_i32(100, i32, i32, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_cube_v4f32_i32(15, i32, i32, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_builtin_image_load_cube_2(
@@ -367,12 +367,12 @@ float4 test_builtin_image_load_cube_1(float4 v4f32, int i32, __amdgpu_texture_t
 // CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
-// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.load.cube.v4f16.i32.v8i32(i32 100, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.load.cube.v4f16.i32.v8i32(i32 15, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP4]]
 //
 half4 test_builtin_image_load_cube_2(half4 v4f16, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_cube_v4f16_i32(100, i32, i32, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_cube_v4f16_i32(15, i32, i32, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x float> @test_builtin_image_load_mip_1d_1(
@@ -393,12 +393,12 @@ half4 test_builtin_image_load_cube_2(half4 v4f16, int i32, __amdgpu_texture_t te
 // CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
-// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32.v8i32(i32 100, i32 [[TMP0]], i32 [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32.v8i32(i32 15, i32 [[TMP0]], i32 [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x float> [[TMP3]]
 //
 float4 test_builtin_image_load_mip_1d_1(float4 v4f32, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_mip_1d_v4f32_i32(100, i32, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_mip_1d_v4f32_i32(15, i32, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_builtin_image_load_mip_1d_2(
@@ -419,12 +419,12 @@ float4 test_builtin_image_load_mip_1d_1(float4 v4f32, int i32, __amdgpu_texture_
 // CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
-// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x half> @llvm.amdgcn.image.load.mip.1d.v4f16.i32.v8i32(i32 100, i32 [[TMP0]], i32 [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x half> @llvm.amdgcn.image.load.mip.1d.v4f16.i32.v8i32(i32 15, i32 [[TMP0]], i32 [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP3]]
 //
 half4 test_builtin_image_load_mip_1d_2(half4 v4f16, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_mip_1d_v4f16_i32(100, i32, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_mip_1d_v4f16_i32(15, i32, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x float> @test_builtin_image_load_mip_1darray_1(
@@ -446,12 +446,12 @@ half4 test_builtin_image_load_mip_1d_2(half4 v4f16, int i32, __amdgpu_texture_t
 // CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
-// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32.v8i32(i32 100, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32.v8i32(i32 15, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x float> [[TMP4]]
 //
 float4 test_builtin_image_load_mip_1darray_1(float4 v4f32, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_mip_1darray_v4f32_i32(100, i32, i32, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_mip_1darray_v4f32_i32(15, i32, i32, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_builtin_image_load_mip_1darray_2(
@@ -473,12 +473,12 @@ float4 test_builtin_image_load_mip_1darray_1(float4 v4f32, int i32, __amdgpu_tex
 // CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
-// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.load.mip.1darray.v4f16.i32.v8i32(i32 100, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.load.mip.1darray.v4f16.i32.v8i32(i32 15, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP4]]
 //
 half4 test_builtin_image_load_mip_1darray_2(half4 v4f16, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_mip_1darray_v4f16_i32(100, i32, i32, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_mip_1darray_v4f16_i32(15, i32, i32, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local float @test_builtin_image_load_mip_2d(
@@ -500,12 +500,12 @@ half4 test_builtin_image_load_mip_1darray_2(half4 v4f16, int i32, __amdgpu_textu
 // CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
-// CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.amdgcn.image.load.mip.2d.f32.i32.v8i32(i32 100, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.amdgcn.image.load.mip.2d.f32.i32.v8i32(i32 1, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret float [[TMP4]]
 //
 float test_builtin_image_load_mip_2d(float f32, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_mip_2d_f32_i32(100, i32, i32, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_mip_2d_f32_i32(1, i32, i32, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x float> @test_builtin_image_load_mip_2d_1(
@@ -527,12 +527,12 @@ float test_builtin_image_load_mip_2d(float f32, int i32, __amdgpu_texture_t tex)
 // CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
-// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32.v8i32(i32 100, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32.v8i32(i32 15, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x float> [[TMP4]]
 //
 float4 test_builtin_image_load_mip_2d_1(float4 v4f32, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_mip_2d_v4f32_i32(100, i32, i32, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_mip_2d_v4f32_i32(15, i32, i32, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_builtin_image_load_mip_2d_2(
@@ -554,12 +554,12 @@ float4 test_builtin_image_load_mip_2d_1(float4 v4f32, int i32, __amdgpu_texture_
 // CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
-// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.load.mip.2d.v4f16.i32.v8i32(i32 100, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.load.mip.2d.v4f16.i32.v8i32(i32 15, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP4]]
 //
 half4 test_builtin_image_load_mip_2d_2(half4 v4f16, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_mip_2d_v4f16_i32(100, i32, i32, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_mip_2d_v4f16_i32(15, i32, i32, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local float @test_builtin_image_load_mip_2darray(
@@ -582,12 +582,12 @@ half4 test_builtin_image_load_mip_2d_2(half4 v4f16, int i32, __amdgpu_texture_t
 // CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
-// CHECK-NEXT:    [[TMP5:%.*]] = call float @llvm.amdgcn.image.load.mip.2darray.f32.i32.v8i32(i32 100, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP5:%.*]] = call float @llvm.amdgcn.image.load.mip.2darray.f32.i32.v8i32(i32 1, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret float [[TMP5]]
 //
 float test_builtin_image_load_mip_2darray(float f32, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_mip_2darray_f32_i32(100, i32, i32, i32, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_mip_2darray_f32_i32(1, i32, i32, i32, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x float> @test_builtin_image_load_mip_2darray_1(
@@ -610,12 +610,12 @@ float test_builtin_image_load_mip_2darray(float f32, int i32, __amdgpu_texture_t
 // CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
-// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i32.v8i32(i32 100, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i32.v8i32(i32 15, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x float> [[TMP5]]
 //
 float4 test_builtin_image_load_mip_2darray_1(float4 v4f32, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_mip_2darray_v4f32_i32(100, i32, i32, i32, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_mip_2darray_v4f32_i32(15, i32, i32, i32, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_builtin_image_load_mip_2darray_2(
@@ -638,12 +638,12 @@ float4 test_builtin_image_load_mip_2darray_1(float4 v4f32, int i32, __amdgpu_tex
 // CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
-// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.load.mip.2darray.v4f16.i32.v8i32(i32 100, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.load.mip.2darray.v4f16.i32.v8i32(i32 15, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP5]]
 //
 half4 test_builtin_image_load_mip_2darray_2(half4 v4f16, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_mip_2darray_v4f16_i32(100, i32, i32, i32, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_mip_2darray_v4f16_i32(15, i32, i32, i32, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x float> @test_builtin_image_load_mip_3d_1(
@@ -666,12 +666,12 @@ half4 test_builtin_image_load_mip_2darray_2(half4 v4f16, int i32, __amdgpu_textu
 // CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
-// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32.v8i32(i32 100, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32.v8i32(i32 15, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x float> [[TMP5]]
 //
 float4 test_builtin_image_load_mip_3d_1(float4 v4f32, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_mip_3d_v4f32_i32(100, i32, i32, i32, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_mip_3d_v4f32_i32(15, i32, i32, i32, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_builtin_image_load_mip_3d_2(
@@ -694,12 +694,12 @@ float4 test_builtin_image_load_mip_3d_1(float4 v4f32, int i32, __amdgpu_texture_
 // CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
-// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.load.mip.3d.v4f16.i32.v8i32(i32 100, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.load.mip.3d.v4f16.i32.v8i32(i32 15, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP5]]
 //
 half4 test_builtin_image_load_mip_3d_2(half4 v4f16, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_mip_3d_v4f16_i32(100, i32, i32, i32, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_mip_3d_v4f16_i32(15, i32, i32, i32, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x float> @test_builtin_image_load_mip_cube_1(
@@ -722,12 +722,12 @@ half4 test_builtin_image_load_mip_3d_2(half4 v4f16, int i32, __amdgpu_texture_t
 // CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
-// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32.v8i32(i32 100, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32.v8i32(i32 15, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x float> [[TMP5]]
 //
 float4 test_builtin_image_load_mip_cube_1(float4 v4f32, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_mip_cube_v4f32_i32(100, i32, i32, i32, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_mip_cube_v4f32_i32(15, i32, i32, i32, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_builtin_image_load_mip_cube_2(
@@ -750,12 +750,12 @@ float4 test_builtin_image_load_mip_cube_1(float4 v4f32, int i32, __amdgpu_textur
 // CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
-// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.load.mip.cube.v4f16.i32.v8i32(i32 100, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.load.mip.cube.v4f16.i32.v8i32(i32 15, i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP5]]
 //
 half4 test_builtin_image_load_mip_cube_2(half4 v4f16, int i32, __amdgpu_texture_t tex) {
 
-  return __builtin_amdgcn_image_load_mip_cube_v4f16_i32(100, i32, i32, i32, i32, tex, 120, 110);
+  return __builtin_amdgcn_image_load_mip_cube_v4f16_i32(15, i32, i32, i32, i32, tex, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x float> @test_builtin_amdgcn_image_sample_1d_v4f32_f32(
@@ -782,11 +782,11 @@ half4 test_builtin_image_load_mip_cube_2(half4 v4f16, int i32, __amdgpu_texture_
 // CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32
 // CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP2]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32.v8i32.v4i32(i32 15, float [[TMP0]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP2]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x float> [[TMP3]]
 //
 float4 test_builtin_amdgcn_image_sample_1d_v4f32_f32(float4 v4f32, int i32, float f32, __amdgpu_texture_t tex, int4 vec4i32) {
-       return __builtin_amdgcn_image_sample_1d_v4f32_f32(100, f32, tex, vec4i32, 0, 120, 110);
+       return __builtin_amdgcn_image_sample_1d_v4f32_f32(15, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_builtin_amdgcn_image_sample_1d_v4f16_f32(
@@ -813,11 +813,11 @@ float4 test_builtin_amdgcn_image_sample_1d_v4f32_f32(float4 v4f32, int i32, floa
 // CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32
 // CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.1d.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP2]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.1d.v4f16.f32.v8i32.v4i32(i32 15, float [[TMP0]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP2]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP3]]
 //
 half4 test_builtin_amdgcn_image_sample_1d_v4f16_f32(half4 v4f16, int i32, float f32, __amdgpu_texture_t tex, int4 vec4i32) {
-       return __builtin_amdgcn_image_sample_1d_v4f16_f32(100, f32, tex, vec4i32, 0, 120, 110);
+       return __builtin_amdgcn_image_sample_1d_v4f16_f32(15, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x float> @test_builtin_amdgcn_image_sample_1darray_v4f32_f32(
@@ -842,11 +842,11 @@ half4 test_builtin_amdgcn_image_sample_1d_v4f16_f32(half4 v4f16, int i32, float
 // CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
 // CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x float> [[TMP4]]
 //
 float4 test_builtin_amdgcn_image_sample_1darray_v4f32_f32(int i32, float f32, __amdgpu_texture_t tex, int4 vec4i32) {
-       return __builtin_amdgcn_image_sample_1darray_v4f32_f32(100, f32, f32, tex, vec4i32, 0, 120, 110);
+       return __builtin_amdgcn_image_sample_1darray_v4f32_f32(15, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_builtin_amdgcn_image_sample_1darray_v4f16_f32(
@@ -874,11 +874,11 @@ float4 test_builtin_amdgcn_image_sample_1darray_v4f32_f32(int i32, float f32, __
 // CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
 // CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.1darray.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.1darray.v4f16.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP4]]
 //
 half4 test_builtin_amdgcn_image_sample_1darray_v4f16_f32(half4 v4f16, int i32, float f32, __amdgpu_texture_t tex, int4 vec4i32) {
-       return __builtin_amdgcn_image_sample_1darray_v4f16_f32(100, f32, f32, tex, vec4i32, 0, 120, 110);
+       return __builtin_amdgcn_image_sample_1darray_v4f16_f32(15, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local float @test_builtin_amdgcn_image_sample_2d_f32_f32(
@@ -903,11 +903,11 @@ half4 test_builtin_amdgcn_image_sample_1darray_v4f16_f32(half4 v4f16, int i32, f
 // CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
 // CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.amdgcn.image.sample.2d.f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.amdgcn.image.sample.2d.f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret float [[TMP4]]
 //
 float test_builtin_amdgcn_image_sample_2d_f32_f32(int i32, float f32, __amdgpu_texture_t tex, int4 vec4i32) {
-       return __builtin_amdgcn_image_sample_2d_f32_f32(100, f32, f32, tex, vec4i32, 0, 120, 110);
+       return __builtin_amdgcn_image_sample_2d_f32_f32(1, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x float> @test_builtin_amdgcn_image_sample_2d_v4f32_f32(
@@ -935,11 +935,11 @@ float test_builtin_amdgcn_image_sample_2d_f32_f32(int i32, float f32, __amdgpu_t
 // CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
 // CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x float> [[TMP4]]
 //
 float4 test_builtin_amdgcn_image_sample_2d_v4f32_f32(float4 v4f32, int i32, float f32, __amdgpu_texture_t tex, int4 vec4i32) {
-       return __builtin_amdgcn_image_sample_2d_v4f32_f32(100, f32, f32, tex, vec4i32, 0, 120, 110);
+       return __builtin_amdgcn_image_sample_2d_v4f32_f32(15, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_builtin_amdgcn_image_sample_2d_v4f16_f32(
@@ -967,11 +967,11 @@ float4 test_builtin_amdgcn_image_sample_2d_v4f32_f32(float4 v4f32, int i32, floa
 // CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
 // CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.2d.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.2d.v4f16.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP4]]
 //
 half4 test_builtin_amdgcn_image_sample_2d_v4f16_f32(half4 v4f16, int i32, float f32, __amdgpu_texture_t tex, int4 vec4i32) {
-       return __builtin_amdgcn_image_sample_2d_v4f16_f32(100, f32, f32, tex, vec4i32, 0, 120, 110);
+       return __builtin_amdgcn_image_sample_2d_v4f16_f32(15, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local float @test_builtin_amdgcn_image_sample_2darray_f32_f32(
@@ -997,11 +997,11 @@ half4 test_builtin_amdgcn_image_sample_2d_v4f16_f32(half4 v4f16, int i32, float
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
 // CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = call float @llvm.amdgcn.image.sample.2darray.f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP5:%.*]] = call float @llvm.amdgcn.image.sample.2darray.f32.f32.v8i32.v4i32(i32 1, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret float [[TMP5]]
 //
 float test_builtin_amdgcn_image_sample_2darray_f32_f32(int i32, float f32, __amdgpu_texture_t tex, int4 vec4i32) {
-       return __builtin_amdgcn_image_sample_2darray_f32_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+       return __builtin_amdgcn_image_sample_2darray_f32_f32(1, f32, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x float> @test_builtin_amdgcn_image_sample_2darray_v4f32_f32(
@@ -1030,11 +1030,11 @@ float test_builtin_amdgcn_image_sample_2darray_f32_f32(int i32, float f32, __amd
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
 // CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x float> [[TMP5]]
 //
 float4 test_builtin_amdgcn_image_sample_2darray_v4f32_f32(float4 v4f32, int i32, float f32, __amdgpu_texture_t tex, int4 vec4i32) {
-       return __builtin_amdgcn_image_sample_2darray_v4f32_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+       return __builtin_amdgcn_image_sample_2darray_v4f32_f32(15, f32, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_builtin_amdgcn_image_sample_2darray_v4f16_f32(
@@ -1063,11 +1063,11 @@ float4 test_builtin_amdgcn_image_sample_2darray_v4f32_f32(float4 v4f32, int i32,
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
 // CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.2darray.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.2darray.v4f16.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP5]]
 //
 half4 test_builtin_amdgcn_image_sample_2darray_v4f16_f32(half4 v4f16, int i32, float f32, __amdgpu_texture_t tex, int4 vec4i32) {
-       return __builtin_amdgcn_image_sample_2darray_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+       return __builtin_amdgcn_image_sample_2darray_v4f16_f32(15, f32, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x float> @test_builtin_amdgcn_image_sample_3d_v4f32_f32(
@@ -1096,11 +1096,11 @@ half4 test_builtin_amdgcn_image_sample_2darray_v4f16_f32(half4 v4f16, int i32, f
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
 // CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x float> [[TMP5]]
 //
 float4 test_builtin_amdgcn_image_sample_3d_v4f32_f32(float4 v4f32, int i32, float f32, __amdgpu_texture_t tex, int4 vec4i32) {
-       return __builtin_amdgcn_image_sample_3d_v4f32_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+       return __builtin_amdgcn_image_sample_3d_v4f32_f32(15, f32, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_builtin_amdgcn_image_sample_3d_v4f16_f32(
@@ -1129,11 +1129,11 @@ float4 test_builtin_amdgcn_image_sample_3d_v4f32_f32(float4 v4f32, int i32, floa
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
 // CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.3d.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.3d.v4f16.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP5]]
 //
 half4 test_builtin_amdgcn_image_sample_3d_v4f16_f32(half4 v4f16, int i32, float f32, __amdgpu_texture_t tex, int4 vec4i32) {
-       return __builtin_amdgcn_image_sample_3d_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+       return __builtin_amdgcn_image_sample_3d_v4f16_f32(15, f32, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x float> @test_builtin_amdgcn_image_sample_cube_v4f32_f32(
@@ -1162,11 +1162,11 @@ half4 test_builtin_amdgcn_image_sample_3d_v4f16_f32(half4 v4f16, int i32, float
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
 // CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x float> [[TMP5]]
 //
 float4 test_builtin_amdgcn_image_sample_cube_v4f32_f32(float4 v4f32, int i32, float f32, __amdgpu_texture_t tex, int4 vec4i32) {
-       return __builtin_amdgcn_image_sample_cube_v4f32_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+       return __builtin_amdgcn_image_sample_cube_v4f32_f32(15, f32, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_builtin_amdgcn_image_sample_cube_v4f16_f32(
@@ -1195,9 +1195,9 @@ float4 test_builtin_amdgcn_image_sample_cube_v4f32_f32(float4 v4f32, int i32, fl
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
 // CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.cube.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.cube.v4f16.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP5]]
 //
 half4 test_builtin_amdgcn_image_sample_cube_v4f16_f32(half4 v4f16, int i32, float f32, __amdgpu_texture_t tex, int4 vec4i32) {
-       return __builtin_amdgcn_image_sample_cube_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+       return __builtin_amdgcn_image_sample_cube_v4f16_f32(15, f32, f32, f32, tex, vec4i32, 0, 120, 110);
 }

>From a63396f36562658e1e795ff8dc2748841ec26881 Mon Sep 17 00:00:00 2001
From: Alexey Sachkov <alexey.sachkov at amd.com>
Date: Thu, 5 Feb 2026 06:44:40 -0600
Subject: [PATCH 5/8] Add more context to the verifier error

---
 llvm/lib/IR/Verifier.cpp                      |   2 +-
 .../intrinsic-amdgcn-image-load-dmask.ll      | 249 +++++++++---------
 2 files changed, 120 insertions(+), 131 deletions(-)

diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 3e62d9525dacc..b7516681eeb8f 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -7177,7 +7177,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
     unsigned NumActiveBits = popcount(DMask);
     Check(NumActiveBits <= VWidth,
           "llvm.amdgcn.image.load.* intrinsic mask cannot have more active "
-          "bits than there are elements in the return type");
+          "bits than there are elements in the return type", &Call);
     break;
   }
   case Intrinsic::nvvm_setmaxnreg_inc_sync_aligned_u32:
diff --git a/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-load-dmask.ll b/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-load-dmask.ll
index 1cb6fe420f320..ac27e96f3b2b0 100644
--- a/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-load-dmask.ll
+++ b/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-load-dmask.ll
@@ -4,295 +4,284 @@ define amdgpu_ps void @load_1d(<8 x i32> inreg %rsrc, i32 %s) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 31, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
 ; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.1d.v4f32.i32
   ret void
 }
 
 define amdgpu_ps void @load_1d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s) {
 main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 31, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 31, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32
   ret void
 }
 
 define amdgpu_ps void @load_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s) {
 main_body:
   %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 -1, i32 %s, <8 x i32> %rsrc, i32 2, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32
+  ret void
+}
+
+define amdgpu_ps void @load_1d_V2_tfe(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call {<2 x float>, i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32 -1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.1d.sl_v2f32i32s.i32
+  ret void
+}
+
+define amdgpu_ps void @load_1d_V1_tfe(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call {float, i32} @llvm.amdgcn.image.load.1d.f32i32.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.1d.sl_f32i32s.i32
+  ret void
+}
+
+define amdgpu_ps void @load_1d_V2(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.1d.v2f32.i32
+  ret void
+}
+
+define amdgpu_ps void @load_1d_V1(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.1d.f32.i32
   ret void
 }
 
 define amdgpu_ps void @load_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 -1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.2d.v4f32.i32
   ret void
 }
 
 define amdgpu_ps void @load_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t) {
 main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2d.v4f32i32.i32(i32 31, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.2d.v4f32i32.i32(i32 31, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32
+  ret void
+}
+
+define amdgpu_ps void @image_load_mmo(<8 x i32> inreg %rsrc, ptr addrspace(3) %lds, <2 x i32> %c) #0 {
+  %c0 = extractelement <2 x i32> %c, i32 0
+  %c1 = extractelement <2 x i32> %c, i32 1
+  %tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 3, i32 %c0, i32 %c1, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.2d.f32.i32
   ret void
 }
 
 define amdgpu_ps void @load_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.3d.v4f32.i32
   ret void
 }
 
 define amdgpu_ps void @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %r) {
 main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.3d.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.3d.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0)
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32
   ret void
 }
 
 define amdgpu_ps void @load_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.cube.v4f32.i32
   ret void
 }
 
 define amdgpu_ps void @load_cube_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice) {
 main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.cube.sl_v4f32i32s.i32
   ret void
 }
 
 define amdgpu_ps void @load_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32 31, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.1darray.v4f32.i32
   ret void
 }
 
 define amdgpu_ps void @load_1darray_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %slice) {
 main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1darray.v4f32i32.i32(i32 31, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 1, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.1darray.v4f32i32.i32(i32 31, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.1darray.sl_v4f32i32s.i32
   ret void
 }
 
 define amdgpu_ps void @load_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.2darray.v4f32.i32
   ret void
 }
 
 define amdgpu_ps void @load_2darray_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice) {
 main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.2darray.sl_v4f32i32s.i32
   ret void
 }
 
 define amdgpu_ps void @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.2dmsaa.v4f32.i32
   ret void
 }
 
 define amdgpu_ps void @load_2dmsaa_both(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %fragid) {
 main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2dmsaa.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.2dmsaa.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0)
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.2dmsaa.sl_v4f32i32s.i32
   ret void
 }
 
 define amdgpu_ps void @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32
   ret void
 }
 
 define amdgpu_ps void @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
 main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darraymsaa.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.2darraymsaa.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32
   ret void
 }
 
 define amdgpu_ps void @load_mip_1d(<8 x i32> inreg %rsrc, i32 %s, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 31, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.mip.1d.v4f32.i32
   ret void
 }
 
 define amdgpu_ps void @load_mip_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %mip) {
 main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.1d.v4f32i32.i32(i32 31, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 2, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.mip.1d.v4f32i32.i32(i32 31, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 2, i32 0)
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.mip.1d.sl_v4f32i32s.i32
   ret void
 }
 
 define amdgpu_ps void @load_mip_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.mip.2d.v4f32.i32
   ret void
 }
 
 define amdgpu_ps void @load_mip_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %mip) {
 main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
-  ret void
-}
-
-define amdgpu_ps void @load_1d_V2_tfe(<8 x i32> inreg %rsrc, i32 %s) {
-main_body:
-  %v = call {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32 -1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
-  ret void
-}
-
-define amdgpu_ps void @load_1d_V1_tfe(<8 x i32> inreg %rsrc, i32 %s) {
-main_body:
-  %v = call {float,i32} @llvm.amdgcn.image.load.1d.f32i32.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
-  ret void
-}
-
-define amdgpu_ps void @load_mip_2d_tfe_nouse(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
-main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.mip.2d.sl_v4f32i32s.i32
   ret void
 }
 
 define amdgpu_ps void @load_mip_2d_tfe_nouse_V2(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
 main_body:
-  %v = call {<2 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v2f32i32.i32(i32 7, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  %v = call {<2 x float>, i32} @llvm.amdgcn.image.load.mip.2d.v2f32i32.i32(i32 7, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.mip.2d.sl_v2f32i32s.i32
   ret void
 }
 
 define amdgpu_ps void @load_mip_2d_tfe_nouse_V1(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
 main_body:
   %v = call {float, i32} @llvm.amdgcn.image.load.mip.2d.f32i32.i32(i32 3, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
-  ret void
-}
-
-define amdgpu_ps void @load_1d_tfe_V4(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s) {
-main_body:
-  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 31, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
-  ret void
-}
-
-define amdgpu_ps void @load_1d_tfe_V2(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s) {
-main_body:
-  %v = call {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32 -1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.mip.2d.sl_f32i32s.i32
   ret void
 }
 
-
 define amdgpu_ps void @load_mip_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %r, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.mip.3d.v4f32.i32
   ret void
 }
 
 define amdgpu_ps void @load_mip_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.mip.cube.v4f32.i32
   ret void
 }
 
 define amdgpu_ps void @load_mip_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32 31, i32 %s, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.mip.1darray.v4f32.i32
   ret void
 }
 
 define amdgpu_ps void @load_mip_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
-  ret void
-}
-
-define amdgpu_ps void @load_1d_V1(<8 x i32> inreg %rsrc, i32 %s) {
-main_body:
-  %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
-  ret void
-}
-
-define amdgpu_ps void @load_1d_V2(<8 x i32> inreg %rsrc, i32 %s) {
-main_body:
-  %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
-  ret void
-}
-
-define amdgpu_ps void @load_1d_glc(<8 x i32> inreg %rsrc, i32 %s) {
-main_body:
-  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 31, i32 %s, <8 x i32> %rsrc, i32 0, i32 1)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
-  ret void
-}
-
-define amdgpu_ps void @load_1d_slc(<8 x i32> inreg %rsrc, i32 %s) {
-main_body:
-  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 31, i32 %s, <8 x i32> %rsrc, i32 0, i32 2)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
-  ret void
-}
-
-define amdgpu_ps void @load_1d_glc_slc(<8 x i32> inreg %rsrc, i32 %s) {
-main_body:
-  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 31, i32 %s, <8 x i32> %rsrc, i32 0, i32 3)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
-  ret void
-}
-
-define amdgpu_ps void @image_load_mmo(<8 x i32> inreg %rsrc, ptr addrspace(3) %lds, <2 x i32> %c) #0 {
-  %c0 = extractelement <2 x i32> %c, i32 0
-  %c1 = extractelement <2 x i32> %c, i32 1
-  %tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 3, i32 %c0, i32 %c1, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.load.mip.2darray.v4f32.i32
   ret void
 }
 
 declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #1
-declare {float,i32} @llvm.amdgcn.image.load.1d.f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
-declare {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
-declare {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
+declare {float, i32} @llvm.amdgcn.image.load.1d.f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
+declare {<2 x float>, i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>, i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
-declare {<4 x float>,i32} @llvm.amdgcn.image.load.2d.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>, i32} @llvm.amdgcn.image.load.2d.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
-declare {<4 x float>,i32} @llvm.amdgcn.image.load.3d.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>, i32} @llvm.amdgcn.image.load.3d.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
-declare {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>, i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
-declare {<4 x float>,i32} @llvm.amdgcn.image.load.1darray.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>, i32} @llvm.amdgcn.image.load.1darray.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
-declare {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>, i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
-declare {<4 x float>,i32} @llvm.amdgcn.image.load.2dmsaa.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>, i32} @llvm.amdgcn.image.load.2dmsaa.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
-declare {<4 x float>,i32} @llvm.amdgcn.image.load.2darraymsaa.v4f32i32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>, i32} @llvm.amdgcn.image.load.2darraymsaa.v4f32i32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 
 declare <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
-declare {<4 x float>,i32} @llvm.amdgcn.image.load.mip.1d.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
-declare {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
-declare {<2 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v2f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
-declare {float,i32} @llvm.amdgcn.image.load.mip.2d.f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>, i32} @llvm.amdgcn.image.load.mip.1d.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>, i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<2 x float>, i32} @llvm.amdgcn.image.load.mip.2d.v2f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {float, i32} @llvm.amdgcn.image.load.mip.2d.f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
 declare <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1

>From 21540fba1c72c3ba0af7c4b9a3d8797e9de3b128 Mon Sep 17 00:00:00 2001
From: Alexey Sachkov <alexey.sachkov at amd.com>
Date: Thu, 5 Feb 2026 07:18:14 -0600
Subject: [PATCH 6/8] Revert the original change

---
 .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp     |   2 -
 .../fix-amdgcn-image-load-dmask-crash.ll      | 121 ------------------
 2 files changed, 123 deletions(-)
 delete mode 100644 llvm/test/Transforms/InstCombine/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 0756a3c257738..2cd1902785546 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1861,8 +1861,6 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
     for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
       const unsigned Bit = 1 << SrcIdx;
       if (!!(DMaskVal & Bit)) {
-        if (OrigLdStIdx >= DemandedElts.getBitWidth())
-          break;
         if (!!DemandedElts[OrigLdStIdx])
           NewDMaskVal |= Bit;
         OrigLdStIdx++;
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll b/llvm/test/Transforms/InstCombine/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll
deleted file mode 100644
index 7f16d69623eed..0000000000000
--- a/llvm/test/Transforms/InstCombine/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll
+++ /dev/null
@@ -1,121 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -mtriple amdgcn -passes=instcombine --disable-verify %s -S -o - | FileCheck %s
-;
-; Verifier is disabled on purpose, because the IR is deemed invalid.
-;
-; The main purpose of the test is to ensure that we do not crash when the mask
-; argument "enables" more elements than there are in return type.
-; This specific corner case was discovered by a fuzzer.
-
-define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
-; CHECK-LABEL: define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask(
-; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) {
-; CHECK-NEXT:  [[MAIN_BODY:.*:]]
-; CHECK-NEXT:    [[I:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
-; CHECK-NEXT:    [[I1:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
-; CHECK-NEXT:    [[I2:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
-; CHECK-NEXT:    [[I3:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
-; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <2 x float>] poison, <2 x float> [[I]], 0
-; CHECK-NEXT:    [[I5:%.*]] = insertvalue [4 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
-; CHECK-NEXT:    [[I6:%.*]] = insertvalue [4 x <2 x float>] [[I5]], <2 x float> [[I2]], 2
-; CHECK-NEXT:    [[I7:%.*]] = insertvalue [4 x <2 x float>] [[I6]], <2 x float> [[I3]], 3
-; CHECK-NEXT:    ret [4 x <2 x float>] [[I7]]
-;
-main_body:
-  %i = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 7, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-  %i1 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 11, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
-  %i2 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 15, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
-  %i3 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 -1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
-  %i4 = insertvalue [4 x <2 x float>] poison, <2 x float> %i, 0
-  %i5 = insertvalue [4 x <2 x float>] %i4, <2 x float> %i1, 1
-  %i6 = insertvalue [4 x <2 x float>] %i5, <2 x float> %i2, 2
-  %i7 = insertvalue [4 x <2 x float>] %i6, <2 x float> %i3, 3
-  ret [4 x <2 x float>] %i7
-}
-
-define amdgpu_ps [4 x <3 x float>] @load_2dmsaa_v4v3f32_dmask(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
-; CHECK-LABEL: define amdgpu_ps [4 x <3 x float>] @load_2dmsaa_v4v3f32_dmask(
-; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) {
-; CHECK-NEXT:  [[MAIN_BODY:.*:]]
-; CHECK-NEXT:    [[I:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32.v8i32(i32 11, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
-; CHECK-NEXT:    [[I1:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32.v8i32(i32 7, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
-; CHECK-NEXT:    [[I2:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32.v8i32(i32 7, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
-; CHECK-NEXT:    [[I3:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32.v8i32(i32 -11, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x float> [[I3]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison>
-; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <3 x float>] poison, <3 x float> [[I]], 0
-; CHECK-NEXT:    [[I5:%.*]] = insertvalue [4 x <3 x float>] [[I4]], <3 x float> [[I1]], 1
-; CHECK-NEXT:    [[I6:%.*]] = insertvalue [4 x <3 x float>] [[I5]], <3 x float> [[I2]], 2
-; CHECK-NEXT:    [[I7:%.*]] = insertvalue [4 x <3 x float>] [[I6]], <3 x float> [[TMP0]], 3
-; CHECK-NEXT:    ret [4 x <3 x float>] [[I7]]
-;
-main_body:
-  %i = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 11, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-  %i1 = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 15, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
-  %i2 = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 31, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
-  %i3 = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 -11, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
-  %i4 = insertvalue [4 x <3 x float>] poison, <3 x float> %i, 0
-  %i5 = insertvalue [4 x <3 x float>] %i4, <3 x float> %i1, 1
-  %i6 = insertvalue [4 x <3 x float>] %i5, <3 x float> %i2, 2
-  %i7 = insertvalue [4 x <3 x float>] %i6, <3 x float> %i3, 3
-  ret [4 x <3 x float>] %i7
-}
-
-define amdgpu_ps [4 x <2 x float>] @load_2darraymsaa_v4v2f32_dmask(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
-; CHECK-LABEL: define amdgpu_ps [4 x <2 x float>] @load_2darraymsaa_v4v2f32_dmask(
-; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[SLICE:%.*]]) {
-; CHECK-NEXT:  [[MAIN_BODY:.*:]]
-; CHECK-NEXT:    [[I:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
-; CHECK-NEXT:    [[I1:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
-; CHECK-NEXT:    [[I2:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
-; CHECK-NEXT:    [[I3:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
-; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <2 x float>] poison, <2 x float> [[I]], 0
-; CHECK-NEXT:    [[I5:%.*]] = insertvalue [4 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
-; CHECK-NEXT:    [[I6:%.*]] = insertvalue [4 x <2 x float>] [[I5]], <2 x float> [[I2]], 2
-; CHECK-NEXT:    [[I7:%.*]] = insertvalue [4 x <2 x float>] [[I6]], <2 x float> [[I3]], 3
-; CHECK-NEXT:    ret [4 x <2 x float>] [[I7]]
-;
-main_body:
-  %i = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 7, i32 %s, i32 %t, i32 %slice, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-  %i1 = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 11, i32 %s, i32 %t, i32 %slice, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
-  %i2 = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
-  %i3 = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 -1, i32 %s, i32 %t, i32 %slice, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
-  %i4 = insertvalue [4 x <2 x float>] poison, <2 x float> %i, 0
-  %i5 = insertvalue [4 x <2 x float>] %i4, <2 x float> %i1, 1
-  %i6 = insertvalue [4 x <2 x float>] %i5, <2 x float> %i2, 2
-  %i7 = insertvalue [4 x <2 x float>] %i6, <2 x float> %i3, 3
-  ret [4 x <2 x float>] %i7
-}
-
-define amdgpu_ps [4 x <3 x float>] @load_2darraymsaa_v4v3f32_dmask(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
-; CHECK-LABEL: define amdgpu_ps [4 x <3 x float>] @load_2darraymsaa_v4v3f32_dmask(
-; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[SLICE:%.*]]) {
-; CHECK-NEXT:  [[MAIN_BODY:.*:]]
-; CHECK-NEXT:    [[I:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32.v8i32(i32 11, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
-; CHECK-NEXT:    [[I1:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32.v8i32(i32 7, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
-; CHECK-NEXT:    [[I2:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32.v8i32(i32 7, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
-; CHECK-NEXT:    [[I3:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32.v8i32(i32 7, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
-; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <3 x float>] poison, <3 x float> [[I]], 0
-; CHECK-NEXT:    [[I5:%.*]] = insertvalue [4 x <3 x float>] [[I4]], <3 x float> [[I1]], 1
-; CHECK-NEXT:    [[I6:%.*]] = insertvalue [4 x <3 x float>] [[I5]], <3 x float> [[I2]], 2
-; CHECK-NEXT:    [[I7:%.*]] = insertvalue [4 x <3 x float>] [[I6]], <3 x float> [[I3]], 3
-; CHECK-NEXT:    ret [4 x <3 x float>] [[I7]]
-;
-main_body:
-  %i = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32(i32 11, i32 %s, i32 %t, i32 %slice, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
-  %i1 = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
-  %i2 = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32(i32 31, i32 %s, i32 %t, i32 %slice, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
-  %i3 = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32(i32 -1, i32 %s, i32 %t, i32 %slice, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
-  %i4 = insertvalue [4 x <3 x float>] poison, <3 x float> %i, 0
-  %i5 = insertvalue [4 x <3 x float>] %i4, <3 x float> %i1, 1
-  %i6 = insertvalue [4 x <3 x float>] %i5, <3 x float> %i2, 2
-  %i7 = insertvalue [4 x <3 x float>] %i6, <3 x float> %i3, 3
-  ret [4 x <3 x float>] %i7
-}
-
-declare <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 immarg, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
-
-declare <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 immarg, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
-
-declare <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 immarg, i32, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
-
-declare <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32(i32 immarg, i32, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0

>From 64f4b6d54ef4c3182573071b3d29185f7b175f5a Mon Sep 17 00:00:00 2001
From: Alexey Sachkov <alexey.sachkov at amd.com>
Date: Thu, 5 Feb 2026 07:22:07 -0600
Subject: [PATCH 7/8] Apply clang-format

---
 llvm/lib/IR/Verifier.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index b7516681eeb8f..aee9816beb541 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -7177,7 +7177,8 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
     unsigned NumActiveBits = popcount(DMask);
     Check(NumActiveBits <= VWidth,
           "llvm.amdgcn.image.load.* intrinsic mask cannot have more active "
-          "bits than there are elements in the return type", &Call);
+          "bits than there are elements in the return type",
+          &Call);
     break;
   }
   case Intrinsic::nvvm_setmaxnreg_inc_sync_aligned_u32:

>From 2d5f8e1b51d50abbbe543683a6381b0372847ae6 Mon Sep 17 00:00:00 2001
From: Alexey Sachkov <alexey.sachkov at amd.com>
Date: Mon, 16 Feb 2026 09:19:54 -0600
Subject: [PATCH 8/8] Improve the diagnostic and extend it to more image intrinsics

---
 clang/test/CodeGen/builtins-extended-image.c  |  92 +--
 clang/test/CodeGen/builtins-image-store.c     | 108 ++--
 llvm/lib/IR/Verifier.cpp                      | 535 +++++++++++++++++-
 .../llvm.amdgcn.image.load.1d.d16.ll          |  30 +-
 .../GlobalISel/llvm.amdgcn.image.load.1d.ll   |  29 +-
 llvm/test/CodeGen/AMDGPU/image-schedule.ll    |   2 +-
 llvm/test/CodeGen/AMDGPU/skip-if-dead.ll      |  10 +-
 .../intrinsic-amdgcn-image-load-dmask.ll      |  64 +--
 .../intrinsic-amdgcn-image-sample-dmask.ll    | 416 ++++++++++++++
 ...trinsic-amdgcn-image-sample-noret-dmask.ll | 147 +++++
 .../intrinsic-amdgcn-image-store-dmask.ll     | 157 +++++
 11 files changed, 1422 insertions(+), 168 deletions(-)
 create mode 100644 llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-sample-dmask.ll
 create mode 100644 llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-sample-noret-dmask.ll
 create mode 100644 llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-store-dmask.ll

diff --git a/clang/test/CodeGen/builtins-extended-image.c b/clang/test/CodeGen/builtins-extended-image.c
index 491bbcf7d5412..9ac7ec42d4e50 100644
--- a/clang/test/CodeGen/builtins-extended-image.c
+++ b/clang/test/CodeGen/builtins-extended-image.c
@@ -151,12 +151,12 @@ float4 test_amdgcn_image_gather4_lz_2d_v4f32_f32_a(float4 v4f32, float f32, int
 // CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32
 // CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP2]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32.v8i32.v4i32(i32 15, float [[TMP0]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP2]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x float> [[TMP3]]
 //
 float4 test_amdgcn_image_sample_lz_1d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
 
-  return __builtin_amdgcn_image_sample_lz_1d_v4f32_f32(100, f32, tex, vec4i32, 0, 120, 110);
+  return __builtin_amdgcn_image_sample_lz_1d_v4f32_f32(15, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_l_1d_v4f32_f32(
@@ -182,12 +182,12 @@ float4 test_amdgcn_image_sample_lz_1d_v4f32_f32(float4 v4f32, float f32, int i32
 // CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
 // CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x float> [[TMP4]]
 //
 float4 test_amdgcn_image_sample_l_1d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
 
-  return __builtin_amdgcn_image_sample_l_1d_v4f32_f32(100, f32, f32, tex, vec4i32, 0, 120, 110);
+  return __builtin_amdgcn_image_sample_l_1d_v4f32_f32(15, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_d_1d_v4f32_f32(
@@ -214,12 +214,12 @@ float4 test_amdgcn_image_sample_l_1d_v4f32_f32(float4 v4f32, float f32, int i32,
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
 // CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x float> [[TMP5]]
 //
 float4 test_amdgcn_image_sample_d_1d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
 
-  return __builtin_amdgcn_image_sample_d_1d_v4f32_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+  return __builtin_amdgcn_image_sample_d_1d_v4f32_f32(15, f32, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_lz_2d_v4f32_f32(
@@ -245,12 +245,12 @@ float4 test_amdgcn_image_sample_d_1d_v4f32_f32(float4 v4f32, float f32, int i32,
 // CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
 // CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x float> [[TMP4]]
 //
 float4 test_amdgcn_image_sample_lz_2d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
 
-  return __builtin_amdgcn_image_sample_lz_2d_v4f32_f32(100, f32, f32, tex, vec4i32, 0, 120, 110);
+  return __builtin_amdgcn_image_sample_lz_2d_v4f32_f32(15, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_l_2d_v4f32_f32(
@@ -312,12 +312,12 @@ float4 test_amdgcn_image_sample_l_2d_v4f32_f32(float4 v4f32, float f32, int i32,
 // CHECK-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP6]], align 32
 // CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP7]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP7]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x float> [[TMP8]]
 //
 float4 test_amdgcn_image_sample_d_2d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
 
-  return __builtin_amdgcn_image_sample_d_2d_v4f32_f32(100, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+  return __builtin_amdgcn_image_sample_d_2d_v4f32_f32(15, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 // CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_lz_3d_v4f32_f32(
 // CHECK-SAME: <4 x float> noundef [[V4F32:%.*]], float noundef [[F32:%.*]], i32 noundef [[I32:%.*]], ptr [[TEX:%.*]], <4 x i32> noundef [[VEC4I32:%.*]]) #[[ATTR0]] {
@@ -343,12 +343,12 @@ float4 test_amdgcn_image_sample_d_2d_v4f32_f32(float4 v4f32, float f32, int i32,
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
 // CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.3d.v4f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.lz.3d.v4f32.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x float> [[TMP5]]
 //
 float4 test_amdgcn_image_sample_lz_3d_v4f32_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
 
-  return __builtin_amdgcn_image_sample_lz_3d_v4f32_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+  return __builtin_amdgcn_image_sample_lz_3d_v4f32_f32(15, f32, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x float> @test_amdgcn_image_sample_l_3d_v4f32_f32(
@@ -706,12 +706,12 @@ float4 test_amdgcn_image_sample_d_2darray_v4f32_f32(float4 v4f32, float f32, int
 // CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP1]], align 32
 // CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.1d.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP2]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.1d.v4f16.f32.v8i32.v4i32(i32 15, float [[TMP0]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP2]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP3]]
 //
 half4 test_amdgcn_image_sample_lz_1d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
 
-  return __builtin_amdgcn_image_sample_lz_1d_v4f16_f32(100, f32, tex, vec4i32, 0, 120, 110);
+  return __builtin_amdgcn_image_sample_lz_1d_v4f16_f32(15, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_l_1d_v4f16_f32(
@@ -737,12 +737,12 @@ half4 test_amdgcn_image_sample_lz_1d_v4f16_f32(float4 v4f32, float f32, int i32,
 // CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
 // CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.l.1d.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.l.1d.v4f16.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP4]]
 //
 half4 test_amdgcn_image_sample_l_1d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
 
-  return __builtin_amdgcn_image_sample_l_1d_v4f16_f32(100, f32, f32, tex, vec4i32, 0, 120, 110);
+  return __builtin_amdgcn_image_sample_l_1d_v4f16_f32(15, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_d_1d_v4f16_f32(
@@ -769,12 +769,12 @@ half4 test_amdgcn_image_sample_l_1d_v4f16_f32(float4 v4f32, float f32, int i32,
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
 // CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.d.1d.v4f16.f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.d.1d.v4f16.f32.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP5]]
 //
 half4 test_amdgcn_image_sample_d_1d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
 
-  return __builtin_amdgcn_image_sample_d_1d_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+  return __builtin_amdgcn_image_sample_d_1d_v4f16_f32(15, f32, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_lz_2d_v4f16_f32(
@@ -800,12 +800,12 @@ half4 test_amdgcn_image_sample_d_1d_v4f16_f32(float4 v4f32, float f32, int i32,
 // CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
 // CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.2d.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.2d.v4f16.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP4]]
 //
 half4 test_amdgcn_image_sample_lz_2d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
 
-  return __builtin_amdgcn_image_sample_lz_2d_v4f16_f32(100, f32, f32, tex, vec4i32, 0, 120, 110);
+  return __builtin_amdgcn_image_sample_lz_2d_v4f16_f32(15, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_l_2d_v4f16_f32(
@@ -832,12 +832,12 @@ half4 test_amdgcn_image_sample_lz_2d_v4f16_f32(float4 v4f32, float f32, int i32,
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
 // CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.l.2d.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.l.2d.v4f16.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP5]]
 //
 half4 test_amdgcn_image_sample_l_2d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
 
-  return __builtin_amdgcn_image_sample_l_2d_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+  return __builtin_amdgcn_image_sample_l_2d_v4f16_f32(15, f32, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_d_2d_v4f16_f32(
@@ -867,12 +867,12 @@ half4 test_amdgcn_image_sample_l_2d_v4f16_f32(float4 v4f32, float f32, int i32,
 // CHECK-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP6]], align 32
 // CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP8:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.d.2d.v4f16.f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP7]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP8:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.d.2d.v4f16.f32.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP7]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP8]]
 //
 half4 test_amdgcn_image_sample_d_2d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
 
-  return __builtin_amdgcn_image_sample_d_2d_v4f16_f32(100, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+  return __builtin_amdgcn_image_sample_d_2d_v4f16_f32(15, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_lz_3d_v4f16_f32(
@@ -899,12 +899,12 @@ half4 test_amdgcn_image_sample_d_2d_v4f16_f32(float4 v4f32, float f32, int i32,
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
 // CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.3d.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.3d.v4f16.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP5]]
 //
 half4 test_amdgcn_image_sample_lz_3d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
 
-  return __builtin_amdgcn_image_sample_lz_3d_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+  return __builtin_amdgcn_image_sample_lz_3d_v4f16_f32(15, f32, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_l_3d_v4f16_f32(
@@ -932,12 +932,12 @@ half4 test_amdgcn_image_sample_lz_3d_v4f16_f32(float4 v4f32, float f32, int i32,
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
 // CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.l.3d.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.l.3d.v4f16.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP6]]
 //
 half4 test_amdgcn_image_sample_l_3d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
 
-  return __builtin_amdgcn_image_sample_l_3d_v4f16_f32(100, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+  return __builtin_amdgcn_image_sample_l_3d_v4f16_f32(15, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_d_3d_v4f16_f32(
@@ -970,12 +970,12 @@ half4 test_amdgcn_image_sample_l_3d_v4f16_f32(float4 v4f32, float f32, int i32,
 // CHECK-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP9]], align 32
 // CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP11:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.d.3d.v4f16.f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], float [[TMP6]], float [[TMP7]], float [[TMP8]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP10]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP11:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.d.3d.v4f16.f32.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], float [[TMP6]], float [[TMP7]], float [[TMP8]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP10]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP11]]
 //
 half4 test_amdgcn_image_sample_d_3d_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
 
-  return __builtin_amdgcn_image_sample_d_3d_v4f16_f32(100, f32, f32, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+  return __builtin_amdgcn_image_sample_d_3d_v4f16_f32(15, f32, f32, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_lz_cube_v4f16_f32(
@@ -1002,12 +1002,12 @@ half4 test_amdgcn_image_sample_d_3d_v4f16_f32(float4 v4f32, float f32, int i32,
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
 // CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.cube.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.cube.v4f16.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP5]]
 //
 half4 test_amdgcn_image_sample_lz_cube_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
 
-  return __builtin_amdgcn_image_sample_lz_cube_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+  return __builtin_amdgcn_image_sample_lz_cube_v4f16_f32(15, f32, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_l_cube_v4f16_f32(
@@ -1035,12 +1035,12 @@ half4 test_amdgcn_image_sample_lz_cube_v4f16_f32(float4 v4f32, float f32, int i3
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
 // CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.l.cube.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.l.cube.v4f16.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP6]]
 //
 half4 test_amdgcn_image_sample_l_cube_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
 
-  return __builtin_amdgcn_image_sample_l_cube_v4f16_f32(100, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+  return __builtin_amdgcn_image_sample_l_cube_v4f16_f32(15, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_lz_1darray_v4f16_f32(
@@ -1066,12 +1066,12 @@ half4 test_amdgcn_image_sample_l_cube_v4f16_f32(float4 v4f32, float f32, int i32
 // CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
 // CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.1darray.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.1darray.v4f16.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP3]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP4]]
 //
 half4 test_amdgcn_image_sample_lz_1darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
 
-  return __builtin_amdgcn_image_sample_lz_1darray_v4f16_f32(100, f32, f32, tex, vec4i32, 0, 120, 110);
+  return __builtin_amdgcn_image_sample_lz_1darray_v4f16_f32(15, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_l_1darray_v4f16_f32(
@@ -1098,12 +1098,12 @@ half4 test_amdgcn_image_sample_lz_1darray_v4f16_f32(float4 v4f32, float f32, int
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
 // CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.l.1darray.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.l.1darray.v4f16.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP5]]
 //
 half4 test_amdgcn_image_sample_l_1darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
 
-  return __builtin_amdgcn_image_sample_l_1darray_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+  return __builtin_amdgcn_image_sample_l_1darray_v4f16_f32(15, f32, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_d_1darray_v4f16_f32(
@@ -1131,12 +1131,12 @@ half4 test_amdgcn_image_sample_l_1darray_v4f16_f32(float4 v4f32, float f32, int
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
 // CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.d.1darray.v4f16.f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.d.1darray.v4f16.f32.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP6]]
 //
 half4 test_amdgcn_image_sample_d_1darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
 
-  return __builtin_amdgcn_image_sample_d_1darray_v4f16_f32(100, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+  return __builtin_amdgcn_image_sample_d_1darray_v4f16_f32(15, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_lz_2darray_v4f16_f32(
@@ -1163,12 +1163,12 @@ half4 test_amdgcn_image_sample_d_1darray_v4f16_f32(float4 v4f32, float f32, int
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
 // CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.2darray.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.lz.2darray.v4f16.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], float [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP4]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP5]]
 //
 half4 test_amdgcn_image_sample_lz_2darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
 
-  return __builtin_amdgcn_image_sample_lz_2darray_v4f16_f32(100, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+  return __builtin_amdgcn_image_sample_lz_2darray_v4f16_f32(15, f32, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_l_2darray_v4f16_f32(
@@ -1196,12 +1196,12 @@ half4 test_amdgcn_image_sample_lz_2darray_v4f16_f32(float4 v4f32, float f32, int
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
 // CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.l.2darray.v4f16.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.l.2darray.v4f16.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP5]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP6]]
 //
 half4 test_amdgcn_image_sample_l_2darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
 
-  return __builtin_amdgcn_image_sample_l_2darray_v4f16_f32(100, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+  return __builtin_amdgcn_image_sample_l_2darray_v4f16_f32(15, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local <4 x half> @test_amdgcn_image_sample_d_2darray_v4f16_f32(
@@ -1232,12 +1232,12 @@ half4 test_amdgcn_image_sample_l_2darray_v4f16_f32(float4 v4f32, float f32, int
 // CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP7]], align 32
 // CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr [[VEC4I32_ADDR_ASCAST]], align 16
-// CHECK-NEXT:    [[TMP9:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.d.2darray.v4f16.f32.f32.v8i32.v4i32(i32 100, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], float [[TMP6]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP8]], i1 false, i32 120, i32 110)
+// CHECK-NEXT:    [[TMP9:%.*]] = call <4 x half> @llvm.amdgcn.image.sample.d.2darray.v4f16.f32.f32.v8i32.v4i32(i32 15, float [[TMP0]], float [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], float [[TMP6]], <8 x i32> [[TEX_RSRC_VAL]], <4 x i32> [[TMP8]], i1 false, i32 120, i32 110)
 // CHECK-NEXT:    ret <4 x half> [[TMP9]]
 //
 half4 test_amdgcn_image_sample_d_2darray_v4f16_f32(float4 v4f32, float f32, int i32, __amdgpu_texture_t tex, int4 vec4i32) {
 
-  return __builtin_amdgcn_image_sample_d_2darray_v4f16_f32(100, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
+  return __builtin_amdgcn_image_sample_d_2darray_v4f16_f32(15, f32, f32, f32, f32, f32, f32, f32, tex, vec4i32, 0, 120, 110);
 }
 
 // CHECK-LABEL: define dso_local float @test_amdgcn_image_sample_lz_2d_f32_f32(
diff --git a/clang/test/CodeGen/builtins-image-store.c b/clang/test/CodeGen/builtins-image-store.c
index 5309a16df7033..9a5620209b104 100644
--- a/clang/test/CodeGen/builtins-image-store.c
+++ b/clang/test/CodeGen/builtins-image-store.c
@@ -47,12 +47,12 @@ void test_builtin_image_store_2d(float f32, int i32, __amdgpu_texture_t tex) {
 // CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.2d.v4f32.i32.v8i32(<4 x float> [[TMP0]], i32 100, i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.2d.v4f32.i32.v8i32(<4 x float> [[TMP0]], i32 15, i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_2d_1(float4 v4f32, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_2d_v4f32_i32(v4f32, 100, i32, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_2d_v4f32_i32(v4f32, 15, i32, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_2d_2(
@@ -72,12 +72,12 @@ void test_builtin_image_store_2d_1(float4 v4f32, int i32, __amdgpu_texture_t tex
 // CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.2d.v4f16.i32.v8i32(<4 x half> [[TMP0]], i32 100, i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.2d.v4f16.i32.v8i32(<4 x half> [[TMP0]], i32 15, i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_2d_2(half4 v4f16, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_2d_v4f16_i32(v4f16, 100, i32, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_2d_v4f16_i32(v4f16, 15, i32, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_2darray(
@@ -98,12 +98,12 @@ void test_builtin_image_store_2d_2(half4 v4f16, int i32, __amdgpu_texture_t tex)
 // CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.2darray.f32.i32.v8i32(float [[TMP0]], i32 100, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.2darray.f32.i32.v8i32(float [[TMP0]], i32 15, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_2darray(float f32, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_2darray_f32_i32(f32, 100, i32, i32, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_2darray_f32_i32(f32, 15, i32, i32, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_2darray_1(
@@ -124,12 +124,12 @@ void test_builtin_image_store_2darray(float f32, int i32, __amdgpu_texture_t tex
 // CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.2darray.v4f32.i32.v8i32(<4 x float> [[TMP0]], i32 100, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.2darray.v4f32.i32.v8i32(<4 x float> [[TMP0]], i32 15, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_2darray_1(float4 v4f32, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_2darray_v4f32_i32(v4f32, 100, i32, i32, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_2darray_v4f32_i32(v4f32, 15, i32, i32, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_2darray_2(
@@ -150,12 +150,12 @@ void test_builtin_image_store_2darray_1(float4 v4f32, int i32, __amdgpu_texture_
 // CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.2darray.v4f16.i32.v8i32(<4 x half> [[TMP0]], i32 100, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.2darray.v4f16.i32.v8i32(<4 x half> [[TMP0]], i32 15, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_2darray_2(half4 v4f16, int i32, __amdgpu_texture_t tex) {
 
-  __builtin_amdgcn_image_store_2darray_v4f16_i32(v4f16, 100, i32, i32, i32, tex, 120, 110);
+  __builtin_amdgcn_image_store_2darray_v4f16_i32(v4f16, 15, i32, i32, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_1d_1(
@@ -174,12 +174,12 @@ void test_builtin_image_store_2darray_2(half4 v4f16, int i32, __amdgpu_texture_t
 // CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f32.i32.v8i32(<4 x float> [[TMP0]], i32 100, i32 [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f32.i32.v8i32(<4 x float> [[TMP0]], i32 15, i32 [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_1d_1(float4 v4f32, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_1d_v4f32_i32(v4f32, 100, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_1d_v4f32_i32(v4f32, 15, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_1d_2(
@@ -198,12 +198,12 @@ void test_builtin_image_store_1d_1(float4 v4f32, int i32, __amdgpu_texture_t tex
 // CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP2]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f16.i32.v8i32(<4 x half> [[TMP0]], i32 100, i32 [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f16.i32.v8i32(<4 x half> [[TMP0]], i32 15, i32 [[TMP1]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_1d_2(half4 v4f16, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_1d_v4f16_i32(v4f16, 100, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_1d_v4f16_i32(v4f16, 15, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_1darray_1(
@@ -223,12 +223,12 @@ void test_builtin_image_store_1d_2(half4 v4f16, int i32, __amdgpu_texture_t tex)
 // CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.1darray.v4f32.i32.v8i32(<4 x float> [[TMP0]], i32 100, i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.1darray.v4f32.i32.v8i32(<4 x float> [[TMP0]], i32 15, i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_1darray_1(float4 v4f32, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_1darray_v4f32_i32(v4f32, 100, i32, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_1darray_v4f32_i32(v4f32, 15, i32, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_1darray_2(
@@ -248,12 +248,12 @@ void test_builtin_image_store_1darray_1(float4 v4f32, int i32, __amdgpu_texture_
 // CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.1darray.v4f16.i32.v8i32(<4 x half> [[TMP0]], i32 100, i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.1darray.v4f16.i32.v8i32(<4 x half> [[TMP0]], i32 15, i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_1darray_2(half4 v4f16, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_1darray_v4f16_i32(v4f16, 100, i32, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_1darray_v4f16_i32(v4f16, 15, i32, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_3d_1(
@@ -274,12 +274,12 @@ void test_builtin_image_store_1darray_2(half4 v4f16, int i32, __amdgpu_texture_t
 // CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.3d.v4f32.i32.v8i32(<4 x float> [[TMP0]], i32 100, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.3d.v4f32.i32.v8i32(<4 x float> [[TMP0]], i32 15, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_3d_1(float4 v4f32, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_3d_v4f32_i32(v4f32, 100, i32, i32, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_3d_v4f32_i32(v4f32, 15, i32, i32, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_3d_2(
@@ -300,12 +300,12 @@ void test_builtin_image_store_3d_1(float4 v4f32, int i32, __amdgpu_texture_t tex
 // CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.3d.v4f16.i32.v8i32(<4 x half> [[TMP0]], i32 100, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.3d.v4f16.i32.v8i32(<4 x half> [[TMP0]], i32 15, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_3d_2(half4 v4f16, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_3d_v4f16_i32(v4f16, 100, i32, i32, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_3d_v4f16_i32(v4f16, 15, i32, i32, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_cube_1(
@@ -326,12 +326,12 @@ void test_builtin_image_store_3d_2(half4 v4f16, int i32, __amdgpu_texture_t tex)
 // CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.cube.v4f32.i32.v8i32(<4 x float> [[TMP0]], i32 100, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.cube.v4f32.i32.v8i32(<4 x float> [[TMP0]], i32 15, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_cube_1(float4 v4f32, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_cube_v4f32_i32(v4f32, 100, i32, i32, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_cube_v4f32_i32(v4f32, 15, i32, i32, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_cube_2(
@@ -352,12 +352,12 @@ void test_builtin_image_store_cube_1(float4 v4f32, int i32, __amdgpu_texture_t t
 // CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.cube.v4f16.i32.v8i32(<4 x half> [[TMP0]], i32 100, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.cube.v4f16.i32.v8i32(<4 x half> [[TMP0]], i32 15, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_cube_2(half4 v4f16, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_cube_v4f16_i32(v4f16, 100, i32, i32, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_cube_v4f16_i32(v4f16, 15, i32, i32, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_mip_1d_1(
@@ -377,12 +377,12 @@ void test_builtin_image_store_cube_2(half4 v4f16, int i32, __amdgpu_texture_t te
 // CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.1d.v4f32.i32.v8i32(<4 x float> [[TMP0]], i32 100, i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.1d.v4f32.i32.v8i32(<4 x float> [[TMP0]], i32 15, i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_mip_1d_1(float4 v4f32, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_mip_1d_v4f32_i32(v4f32, 100, i32, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_mip_1d_v4f32_i32(v4f32, 15, i32, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_mip_1d_2(
@@ -402,12 +402,12 @@ void test_builtin_image_store_mip_1d_1(float4 v4f32, int i32, __amdgpu_texture_t
 // CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP3]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.1d.v4f16.i32.v8i32(<4 x half> [[TMP0]], i32 100, i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.1d.v4f16.i32.v8i32(<4 x half> [[TMP0]], i32 15, i32 [[TMP1]], i32 [[TMP2]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_mip_1d_2(half4 v4f16, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_mip_1d_v4f16_i32(v4f16, 100, i32, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_mip_1d_v4f16_i32(v4f16, 15, i32, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_mip_1darray_1(
@@ -428,12 +428,12 @@ void test_builtin_image_store_mip_1d_2(half4 v4f16, int i32, __amdgpu_texture_t
 // CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.1darray.v4f32.i32.v8i32(<4 x float> [[TMP0]], i32 100, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.1darray.v4f32.i32.v8i32(<4 x float> [[TMP0]], i32 15, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_mip_1darray_1(float4 v4f32, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_mip_1darray_v4f32_i32(v4f32, 100, i32, i32, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_mip_1darray_v4f32_i32(v4f32, 15, i32, i32, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_mip_1darray_2(
@@ -454,12 +454,12 @@ void test_builtin_image_store_mip_1darray_1(float4 v4f32, int i32, __amdgpu_text
 // CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.1darray.v4f16.i32.v8i32(<4 x half> [[TMP0]], i32 100, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.1darray.v4f16.i32.v8i32(<4 x half> [[TMP0]], i32 15, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_mip_1darray_2(half4 v4f16, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_mip_1darray_v4f16_i32(v4f16, 100, i32, i32, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_mip_1darray_v4f16_i32(v4f16, 15, i32, i32, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_mip_2d(
@@ -480,12 +480,12 @@ void test_builtin_image_store_mip_1darray_2(half4 v4f16, int i32, __amdgpu_textu
 // CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.2d.f32.i32.v8i32(float [[TMP0]], i32 100, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.2d.f32.i32.v8i32(float [[TMP0]], i32 15, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_mip_2d(float f32, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_mip_2d_f32_i32(f32, 100, i32, i32, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_mip_2d_f32_i32(f32, 15, i32, i32, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_mip_2d_1(
@@ -506,12 +506,12 @@ void test_builtin_image_store_mip_2d(float f32, int i32, __amdgpu_texture_t tex)
 // CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.2d.v4f32.i32.v8i32(<4 x float> [[TMP0]], i32 100, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.2d.v4f32.i32.v8i32(<4 x float> [[TMP0]], i32 15, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_mip_2d_1(float4 v4f32, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_mip_2d_v4f32_i32(v4f32, 100, i32, i32, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_mip_2d_v4f32_i32(v4f32, 15, i32, i32, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_mip_2d_2(
@@ -532,12 +532,12 @@ void test_builtin_image_store_mip_2d_1(float4 v4f32, int i32, __amdgpu_texture_t
 // CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP4]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.2d.v4f16.i32.v8i32(<4 x half> [[TMP0]], i32 100, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.2d.v4f16.i32.v8i32(<4 x half> [[TMP0]], i32 15, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_mip_2d_2(half4 v4f16, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_mip_2d_v4f16_i32(v4f16, 100, i32, i32, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_mip_2d_v4f16_i32(v4f16, 15, i32, i32, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_mip_2darray(
@@ -559,12 +559,12 @@ void test_builtin_image_store_mip_2d_2(half4 v4f16, int i32, __amdgpu_texture_t
 // CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP5]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.2darray.f32.i32.v8i32(float [[TMP0]], i32 100, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 [[TMP4]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.2darray.f32.i32.v8i32(float [[TMP0]], i32 15, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 [[TMP4]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_mip_2darray(float f32, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_mip_2darray_f32_i32(f32, 100, i32, i32, i32, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_mip_2darray_f32_i32(f32, 15, i32, i32, i32, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_mip_2darray_1(
@@ -586,12 +586,12 @@ void test_builtin_image_store_mip_2darray(float f32, int i32, __amdgpu_texture_t
 // CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP5]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.2darray.v4f32.i32.v8i32(<4 x float> [[TMP0]], i32 100, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 [[TMP4]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.2darray.v4f32.i32.v8i32(<4 x float> [[TMP0]], i32 15, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 [[TMP4]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_mip_2darray_1(float4 v4f32, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_mip_2darray_v4f32_i32(v4f32, 100, i32, i32, i32, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_mip_2darray_v4f32_i32(v4f32, 15, i32, i32, i32, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_mip_2darray_2(
@@ -613,12 +613,12 @@ void test_builtin_image_store_mip_2darray_1(float4 v4f32, int i32, __amdgpu_text
 // CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP5]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.2darray.v4f16.i32.v8i32(<4 x half> [[TMP0]], i32 100, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 [[TMP4]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.2darray.v4f16.i32.v8i32(<4 x half> [[TMP0]], i32 15, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 [[TMP4]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_mip_2darray_2(half4 v4f16, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_mip_2darray_v4f16_i32(v4f16, 100, i32, i32, i32, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_mip_2darray_v4f16_i32(v4f16, 15, i32, i32, i32, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_mip_3d_1(
@@ -640,12 +640,12 @@ void test_builtin_image_store_mip_2darray_2(half4 v4f16, int i32, __amdgpu_textu
 // CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP5]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.3d.v4f32.i32.v8i32(<4 x float> [[TMP0]], i32 100, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 [[TMP4]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.3d.v4f32.i32.v8i32(<4 x float> [[TMP0]], i32 15, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 [[TMP4]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_mip_3d_1(float4 v4f32, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_mip_3d_v4f32_i32(v4f32, 100, i32, i32, i32, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_mip_3d_v4f32_i32(v4f32, 15, i32, i32, i32, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_mip_3d_2(
@@ -667,12 +667,12 @@ void test_builtin_image_store_mip_3d_1(float4 v4f32, int i32, __amdgpu_texture_t
 // CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP5]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.3d.v4f16.i32.v8i32(<4 x half> [[TMP0]], i32 100, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 [[TMP4]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.3d.v4f16.i32.v8i32(<4 x half> [[TMP0]], i32 15, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 [[TMP4]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_mip_3d_2(half4 v4f16, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_mip_3d_v4f16_i32(v4f16, 100, i32, i32, i32, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_mip_3d_v4f16_i32(v4f16, 15, i32, i32, i32, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_mip_cube_1(
@@ -694,12 +694,12 @@ void test_builtin_image_store_mip_3d_2(half4 v4f16, int i32, __amdgpu_texture_t
 // CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP5]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.cube.v4f32.i32.v8i32(<4 x float> [[TMP0]], i32 100, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 [[TMP4]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.cube.v4f32.i32.v8i32(<4 x float> [[TMP0]], i32 15, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 [[TMP4]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_mip_cube_1(float4 v4f32, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_mip_cube_v4f32_i32(v4f32, 100, i32, i32, i32, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_mip_cube_v4f32_i32(v4f32, 15, i32, i32, i32, i32, tex, 120, 110);
  }
 
 // CHECK-LABEL: define dso_local void @test_builtin_image_store_mip_cube_2(
@@ -721,10 +721,10 @@ void test_builtin_image_store_mip_cube_1(float4 v4f32, int i32, __amdgpu_texture
 // CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[I32_ADDR_ASCAST]], align 4
 // CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[TEX_ADDR_ASCAST]], align 32
 // CHECK-NEXT:    [[TEX_RSRC_VAL:%.*]] = load <8 x i32>, ptr [[TMP5]], align 32
-// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.cube.v4f16.i32.v8i32(<4 x half> [[TMP0]], i32 100, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 [[TMP4]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
+// CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.cube.v4f16.i32.v8i32(<4 x half> [[TMP0]], i32 15, i32 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], i32 [[TMP4]], <8 x i32> [[TEX_RSRC_VAL]], i32 120, i32 110)
 // CHECK-NEXT:    ret void
 //
 void test_builtin_image_store_mip_cube_2(half4 v4f16, int i32, __amdgpu_texture_t tex) {
 
-   __builtin_amdgcn_image_store_mip_cube_v4f16_i32(v4f16, 100, i32, i32, i32, i32, tex, 120, 110);
+   __builtin_amdgcn_image_store_mip_cube_v4f16_i32(v4f16, 15, i32, i32, i32, i32, tex, 120, 110);
  }
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 92fcbe0b46dce..1073b39a4429d 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -7164,7 +7164,487 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
   case Intrinsic::amdgcn_image_load_mip_2d:
   case Intrinsic::amdgcn_image_load_mip_2darray:
   case Intrinsic::amdgcn_image_load_mip_3d:
-  case Intrinsic::amdgcn_image_load_mip_cube: {
+  case Intrinsic::amdgcn_image_load_mip_cube:
+  case Intrinsic::amdgcn_image_sample_1d:
+  case Intrinsic::amdgcn_image_sample_1darray:
+  case Intrinsic::amdgcn_image_sample_2d:
+  case Intrinsic::amdgcn_image_sample_2darray:
+  case Intrinsic::amdgcn_image_sample_3d:
+  case Intrinsic::amdgcn_image_sample_b_1d:
+  case Intrinsic::amdgcn_image_sample_b_1darray:
+  case Intrinsic::amdgcn_image_sample_b_2d:
+  case Intrinsic::amdgcn_image_sample_b_2darray:
+  case Intrinsic::amdgcn_image_sample_b_3d:
+  case Intrinsic::amdgcn_image_sample_b_cl_1d:
+  case Intrinsic::amdgcn_image_sample_b_cl_1darray:
+  case Intrinsic::amdgcn_image_sample_b_cl_2d:
+  case Intrinsic::amdgcn_image_sample_b_cl_2darray:
+  case Intrinsic::amdgcn_image_sample_b_cl_3d:
+  case Intrinsic::amdgcn_image_sample_b_cl_cube:
+  case Intrinsic::amdgcn_image_sample_b_cl_o_1d:
+  case Intrinsic::amdgcn_image_sample_b_cl_o_1darray:
+  case Intrinsic::amdgcn_image_sample_b_cl_o_2d:
+  case Intrinsic::amdgcn_image_sample_b_cl_o_2darray:
+  case Intrinsic::amdgcn_image_sample_b_cl_o_3d:
+  case Intrinsic::amdgcn_image_sample_b_cl_o_cube:
+  case Intrinsic::amdgcn_image_sample_b_cube:
+  case Intrinsic::amdgcn_image_sample_b_o_1d:
+  case Intrinsic::amdgcn_image_sample_b_o_1darray:
+  case Intrinsic::amdgcn_image_sample_b_o_2d:
+  case Intrinsic::amdgcn_image_sample_b_o_2darray:
+  case Intrinsic::amdgcn_image_sample_b_o_3d:
+  case Intrinsic::amdgcn_image_sample_b_o_cube:
+  case Intrinsic::amdgcn_image_sample_c_1d:
+  case Intrinsic::amdgcn_image_sample_c_1darray:
+  case Intrinsic::amdgcn_image_sample_c_2d:
+  case Intrinsic::amdgcn_image_sample_c_2darray:
+  case Intrinsic::amdgcn_image_sample_c_3d:
+  case Intrinsic::amdgcn_image_sample_c_b_1d:
+  case Intrinsic::amdgcn_image_sample_c_b_1darray:
+  case Intrinsic::amdgcn_image_sample_c_b_2d:
+  case Intrinsic::amdgcn_image_sample_c_b_2darray:
+  case Intrinsic::amdgcn_image_sample_c_b_3d:
+  case Intrinsic::amdgcn_image_sample_c_b_cl_1d:
+  case Intrinsic::amdgcn_image_sample_c_b_cl_1darray:
+  case Intrinsic::amdgcn_image_sample_c_b_cl_2d:
+  case Intrinsic::amdgcn_image_sample_c_b_cl_2darray:
+  case Intrinsic::amdgcn_image_sample_c_b_cl_3d:
+  case Intrinsic::amdgcn_image_sample_c_b_cl_cube:
+  case Intrinsic::amdgcn_image_sample_c_b_cl_o_1d:
+  case Intrinsic::amdgcn_image_sample_c_b_cl_o_1darray:
+  case Intrinsic::amdgcn_image_sample_c_b_cl_o_2d:
+  case Intrinsic::amdgcn_image_sample_c_b_cl_o_2darray:
+  case Intrinsic::amdgcn_image_sample_c_b_cl_o_3d:
+  case Intrinsic::amdgcn_image_sample_c_b_cl_o_cube:
+  case Intrinsic::amdgcn_image_sample_c_b_cube:
+  case Intrinsic::amdgcn_image_sample_c_b_o_1d:
+  case Intrinsic::amdgcn_image_sample_c_b_o_1darray:
+  case Intrinsic::amdgcn_image_sample_c_b_o_2d:
+  case Intrinsic::amdgcn_image_sample_c_b_o_2darray:
+  case Intrinsic::amdgcn_image_sample_c_b_o_3d:
+  case Intrinsic::amdgcn_image_sample_c_b_o_cube:
+  case Intrinsic::amdgcn_image_sample_c_cd_1d:
+  case Intrinsic::amdgcn_image_sample_c_cd_1darray:
+  case Intrinsic::amdgcn_image_sample_c_cd_2d:
+  case Intrinsic::amdgcn_image_sample_c_cd_2darray:
+  case Intrinsic::amdgcn_image_sample_c_cd_3d:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_1d:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_1darray:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_2d:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_2darray:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_3d:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_cube:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_o_1d:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_o_1darray:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_o_2d:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_o_2darray:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_o_3d:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_o_cube:
+  case Intrinsic::amdgcn_image_sample_c_cd_cube:
+  case Intrinsic::amdgcn_image_sample_c_cd_o_1d:
+  case Intrinsic::amdgcn_image_sample_c_cd_o_1darray:
+  case Intrinsic::amdgcn_image_sample_c_cd_o_2d:
+  case Intrinsic::amdgcn_image_sample_c_cd_o_2darray:
+  case Intrinsic::amdgcn_image_sample_c_cd_o_3d:
+  case Intrinsic::amdgcn_image_sample_c_cd_o_cube:
+  case Intrinsic::amdgcn_image_sample_c_cl_1d:
+  case Intrinsic::amdgcn_image_sample_c_cl_1darray:
+  case Intrinsic::amdgcn_image_sample_c_cl_2d:
+  case Intrinsic::amdgcn_image_sample_c_cl_2darray:
+  case Intrinsic::amdgcn_image_sample_c_cl_3d:
+  case Intrinsic::amdgcn_image_sample_c_cl_cube:
+  case Intrinsic::amdgcn_image_sample_c_cl_o_1d:
+  case Intrinsic::amdgcn_image_sample_c_cl_o_1darray:
+  case Intrinsic::amdgcn_image_sample_c_cl_o_2d:
+  case Intrinsic::amdgcn_image_sample_c_cl_o_2darray:
+  case Intrinsic::amdgcn_image_sample_c_cl_o_3d:
+  case Intrinsic::amdgcn_image_sample_c_cl_o_cube:
+  case Intrinsic::amdgcn_image_sample_c_cube:
+  case Intrinsic::amdgcn_image_sample_c_d_1d:
+  case Intrinsic::amdgcn_image_sample_c_d_1darray:
+  case Intrinsic::amdgcn_image_sample_c_d_2d:
+  case Intrinsic::amdgcn_image_sample_c_d_2darray:
+  case Intrinsic::amdgcn_image_sample_c_d_3d:
+  case Intrinsic::amdgcn_image_sample_c_d_cl_1d:
+  case Intrinsic::amdgcn_image_sample_c_d_cl_1darray:
+  case Intrinsic::amdgcn_image_sample_c_d_cl_2d:
+  case Intrinsic::amdgcn_image_sample_c_d_cl_2darray:
+  case Intrinsic::amdgcn_image_sample_c_d_cl_3d:
+  case Intrinsic::amdgcn_image_sample_c_d_cl_cube:
+  case Intrinsic::amdgcn_image_sample_c_d_cl_o_1d:
+  case Intrinsic::amdgcn_image_sample_c_d_cl_o_1darray:
+  case Intrinsic::amdgcn_image_sample_c_d_cl_o_2d:
+  case Intrinsic::amdgcn_image_sample_c_d_cl_o_2darray:
+  case Intrinsic::amdgcn_image_sample_c_d_cl_o_3d:
+  case Intrinsic::amdgcn_image_sample_c_d_cl_o_cube:
+  case Intrinsic::amdgcn_image_sample_c_d_cube:
+  case Intrinsic::amdgcn_image_sample_c_d_o_1d:
+  case Intrinsic::amdgcn_image_sample_c_d_o_1darray:
+  case Intrinsic::amdgcn_image_sample_c_d_o_2d:
+  case Intrinsic::amdgcn_image_sample_c_d_o_2darray:
+  case Intrinsic::amdgcn_image_sample_c_d_o_3d:
+  case Intrinsic::amdgcn_image_sample_c_d_o_cube:
+  case Intrinsic::amdgcn_image_sample_c_l_1d:
+  case Intrinsic::amdgcn_image_sample_c_l_1darray:
+  case Intrinsic::amdgcn_image_sample_c_l_2d:
+  case Intrinsic::amdgcn_image_sample_c_l_2darray:
+  case Intrinsic::amdgcn_image_sample_c_l_3d:
+  case Intrinsic::amdgcn_image_sample_c_l_cube:
+  case Intrinsic::amdgcn_image_sample_c_l_o_1d:
+  case Intrinsic::amdgcn_image_sample_c_l_o_1darray:
+  case Intrinsic::amdgcn_image_sample_c_l_o_2d:
+  case Intrinsic::amdgcn_image_sample_c_l_o_2darray:
+  case Intrinsic::amdgcn_image_sample_c_l_o_3d:
+  case Intrinsic::amdgcn_image_sample_c_l_o_cube:
+  case Intrinsic::amdgcn_image_sample_c_lz_1d:
+  case Intrinsic::amdgcn_image_sample_c_lz_1darray:
+  case Intrinsic::amdgcn_image_sample_c_lz_2d:
+  case Intrinsic::amdgcn_image_sample_c_lz_2darray:
+  case Intrinsic::amdgcn_image_sample_c_lz_3d:
+  case Intrinsic::amdgcn_image_sample_c_lz_cube:
+  case Intrinsic::amdgcn_image_sample_c_lz_o_1d:
+  case Intrinsic::amdgcn_image_sample_c_lz_o_1darray:
+  case Intrinsic::amdgcn_image_sample_c_lz_o_2d:
+  case Intrinsic::amdgcn_image_sample_c_lz_o_2darray:
+  case Intrinsic::amdgcn_image_sample_c_lz_o_3d:
+  case Intrinsic::amdgcn_image_sample_c_lz_o_cube:
+  case Intrinsic::amdgcn_image_sample_c_o_1d:
+  case Intrinsic::amdgcn_image_sample_c_o_1darray:
+  case Intrinsic::amdgcn_image_sample_c_o_2d:
+  case Intrinsic::amdgcn_image_sample_c_o_2darray:
+  case Intrinsic::amdgcn_image_sample_c_o_3d:
+  case Intrinsic::amdgcn_image_sample_c_o_cube:
+  case Intrinsic::amdgcn_image_sample_cd_1d:
+  case Intrinsic::amdgcn_image_sample_cd_1darray:
+  case Intrinsic::amdgcn_image_sample_cd_2d:
+  case Intrinsic::amdgcn_image_sample_cd_2darray:
+  case Intrinsic::amdgcn_image_sample_cd_3d:
+  case Intrinsic::amdgcn_image_sample_cd_cl_1d:
+  case Intrinsic::amdgcn_image_sample_cd_cl_1darray:
+  case Intrinsic::amdgcn_image_sample_cd_cl_2d:
+  case Intrinsic::amdgcn_image_sample_cd_cl_2darray:
+  case Intrinsic::amdgcn_image_sample_cd_cl_3d:
+  case Intrinsic::amdgcn_image_sample_cd_cl_cube:
+  case Intrinsic::amdgcn_image_sample_cd_cl_o_1d:
+  case Intrinsic::amdgcn_image_sample_cd_cl_o_1darray:
+  case Intrinsic::amdgcn_image_sample_cd_cl_o_2d:
+  case Intrinsic::amdgcn_image_sample_cd_cl_o_2darray:
+  case Intrinsic::amdgcn_image_sample_cd_cl_o_3d:
+  case Intrinsic::amdgcn_image_sample_cd_cl_o_cube:
+  case Intrinsic::amdgcn_image_sample_cd_cube:
+  case Intrinsic::amdgcn_image_sample_cd_o_1d:
+  case Intrinsic::amdgcn_image_sample_cd_o_1darray:
+  case Intrinsic::amdgcn_image_sample_cd_o_2d:
+  case Intrinsic::amdgcn_image_sample_cd_o_2darray:
+  case Intrinsic::amdgcn_image_sample_cd_o_3d:
+  case Intrinsic::amdgcn_image_sample_cd_o_cube:
+  case Intrinsic::amdgcn_image_sample_cl_1d:
+  case Intrinsic::amdgcn_image_sample_cl_1darray:
+  case Intrinsic::amdgcn_image_sample_cl_2d:
+  case Intrinsic::amdgcn_image_sample_cl_2darray:
+  case Intrinsic::amdgcn_image_sample_cl_3d:
+  case Intrinsic::amdgcn_image_sample_cl_cube:
+  case Intrinsic::amdgcn_image_sample_cl_o_1d:
+  case Intrinsic::amdgcn_image_sample_cl_o_1darray:
+  case Intrinsic::amdgcn_image_sample_cl_o_2d:
+  case Intrinsic::amdgcn_image_sample_cl_o_2darray:
+  case Intrinsic::amdgcn_image_sample_cl_o_3d:
+  case Intrinsic::amdgcn_image_sample_cl_o_cube:
+  case Intrinsic::amdgcn_image_sample_cube:
+  case Intrinsic::amdgcn_image_sample_d_1d:
+  case Intrinsic::amdgcn_image_sample_d_1darray:
+  case Intrinsic::amdgcn_image_sample_d_2d:
+  case Intrinsic::amdgcn_image_sample_d_2darray:
+  case Intrinsic::amdgcn_image_sample_d_3d:
+  case Intrinsic::amdgcn_image_sample_d_cl_1d:
+  case Intrinsic::amdgcn_image_sample_d_cl_1darray:
+  case Intrinsic::amdgcn_image_sample_d_cl_2d:
+  case Intrinsic::amdgcn_image_sample_d_cl_2darray:
+  case Intrinsic::amdgcn_image_sample_d_cl_3d:
+  case Intrinsic::amdgcn_image_sample_d_cl_cube:
+  case Intrinsic::amdgcn_image_sample_d_cl_o_1d:
+  case Intrinsic::amdgcn_image_sample_d_cl_o_1darray:
+  case Intrinsic::amdgcn_image_sample_d_cl_o_2d:
+  case Intrinsic::amdgcn_image_sample_d_cl_o_2darray:
+  case Intrinsic::amdgcn_image_sample_d_cl_o_3d:
+  case Intrinsic::amdgcn_image_sample_d_cl_o_cube:
+  case Intrinsic::amdgcn_image_sample_d_cube:
+  case Intrinsic::amdgcn_image_sample_d_o_1d:
+  case Intrinsic::amdgcn_image_sample_d_o_1darray:
+  case Intrinsic::amdgcn_image_sample_d_o_2d:
+  case Intrinsic::amdgcn_image_sample_d_o_2darray:
+  case Intrinsic::amdgcn_image_sample_d_o_3d:
+  case Intrinsic::amdgcn_image_sample_d_o_cube:
+  case Intrinsic::amdgcn_image_sample_l_1d:
+  case Intrinsic::amdgcn_image_sample_l_1darray:
+  case Intrinsic::amdgcn_image_sample_l_2d:
+  case Intrinsic::amdgcn_image_sample_l_2darray:
+  case Intrinsic::amdgcn_image_sample_l_3d:
+  case Intrinsic::amdgcn_image_sample_l_cube:
+  case Intrinsic::amdgcn_image_sample_l_o_1d:
+  case Intrinsic::amdgcn_image_sample_l_o_1darray:
+  case Intrinsic::amdgcn_image_sample_l_o_2d:
+  case Intrinsic::amdgcn_image_sample_l_o_2darray:
+  case Intrinsic::amdgcn_image_sample_l_o_3d:
+  case Intrinsic::amdgcn_image_sample_l_o_cube:
+  case Intrinsic::amdgcn_image_sample_lz_1d:
+  case Intrinsic::amdgcn_image_sample_lz_1darray:
+  case Intrinsic::amdgcn_image_sample_lz_2d:
+  case Intrinsic::amdgcn_image_sample_lz_2darray:
+  case Intrinsic::amdgcn_image_sample_lz_3d:
+  case Intrinsic::amdgcn_image_sample_lz_cube:
+  case Intrinsic::amdgcn_image_sample_lz_o_1d:
+  case Intrinsic::amdgcn_image_sample_lz_o_1darray:
+  case Intrinsic::amdgcn_image_sample_lz_o_2d:
+  case Intrinsic::amdgcn_image_sample_lz_o_2darray:
+  case Intrinsic::amdgcn_image_sample_lz_o_3d:
+  case Intrinsic::amdgcn_image_sample_lz_o_cube:
+  case Intrinsic::amdgcn_image_sample_o_1d:
+  case Intrinsic::amdgcn_image_sample_o_1darray:
+  case Intrinsic::amdgcn_image_sample_o_2d:
+  case Intrinsic::amdgcn_image_sample_o_2darray:
+  case Intrinsic::amdgcn_image_sample_o_3d:
+  case Intrinsic::amdgcn_image_sample_o_cube:
+  case Intrinsic::amdgcn_image_sample_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_b_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_b_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_b_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_b_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_b_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_b_cl_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_b_cl_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_b_cl_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_b_cl_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_b_cl_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_b_cl_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_b_cl_o_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_b_cl_o_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_b_cl_o_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_b_cl_o_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_b_cl_o_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_b_cl_o_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_b_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_b_o_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_b_o_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_b_o_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_b_o_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_b_o_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_b_o_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_c_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_b_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_b_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_b_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_b_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_b_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_b_cl_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_b_cl_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_b_cl_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_b_cl_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_b_cl_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_b_cl_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_c_b_cl_o_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_b_cl_o_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_b_cl_o_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_b_cl_o_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_b_cl_o_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_b_cl_o_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_c_b_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_c_b_o_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_b_o_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_b_o_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_b_o_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_b_o_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_b_o_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cd_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cd_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cd_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cd_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cd_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_o_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_o_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_o_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_o_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_o_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_o_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cd_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cd_o_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cd_o_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cd_o_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cd_o_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cd_o_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cd_o_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cl_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cl_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cl_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cl_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cl_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cl_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cl_o_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cl_o_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cl_o_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cl_o_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cl_o_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cl_o_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_c_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_c_d_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_d_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_d_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_d_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_d_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_d_cl_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_d_cl_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_d_cl_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_d_cl_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_d_cl_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_d_cl_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_c_d_cl_o_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_d_cl_o_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_d_cl_o_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_d_cl_o_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_d_cl_o_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_d_cl_o_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_c_d_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_c_d_o_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_d_o_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_d_o_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_d_o_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_d_o_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_d_o_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_c_l_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_l_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_l_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_l_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_l_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_l_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_c_l_o_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_l_o_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_l_o_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_l_o_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_l_o_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_l_o_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_c_lz_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_lz_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_lz_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_lz_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_lz_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_lz_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_c_lz_o_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_lz_o_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_lz_o_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_lz_o_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_lz_o_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_lz_o_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_c_o_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_o_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_o_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_o_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_c_o_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_c_o_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_cd_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_cd_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_cd_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_cd_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_cd_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_cd_cl_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_cd_cl_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_cd_cl_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_cd_cl_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_cd_cl_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_cd_cl_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_cd_cl_o_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_cd_cl_o_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_cd_cl_o_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_cd_cl_o_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_cd_cl_o_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_cd_cl_o_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_cd_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_cd_o_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_cd_o_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_cd_o_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_cd_o_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_cd_o_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_cd_o_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_cl_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_cl_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_cl_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_cl_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_cl_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_cl_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_cl_o_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_cl_o_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_cl_o_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_cl_o_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_cl_o_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_cl_o_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_d_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_d_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_d_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_d_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_d_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_d_cl_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_d_cl_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_d_cl_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_d_cl_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_d_cl_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_d_cl_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_d_cl_o_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_d_cl_o_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_d_cl_o_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_d_cl_o_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_d_cl_o_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_d_cl_o_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_d_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_d_o_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_d_o_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_d_o_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_d_o_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_d_o_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_d_o_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_l_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_l_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_l_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_l_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_l_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_l_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_l_o_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_l_o_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_l_o_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_l_o_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_l_o_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_l_o_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_lz_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_lz_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_lz_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_lz_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_lz_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_lz_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_lz_o_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_lz_o_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_lz_o_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_lz_o_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_lz_o_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_lz_o_cube_nortn:
+  case Intrinsic::amdgcn_image_sample_o_1d_nortn:
+  case Intrinsic::amdgcn_image_sample_o_1darray_nortn:
+  case Intrinsic::amdgcn_image_sample_o_2d_nortn:
+  case Intrinsic::amdgcn_image_sample_o_2darray_nortn:
+  case Intrinsic::amdgcn_image_sample_o_3d_nortn:
+  case Intrinsic::amdgcn_image_sample_o_cube_nortn: {
     // LLVM IR definition of those intrinsics says that they can return any
     // type. The logic below is based on what is covered by the
     // llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll test.
@@ -7177,12 +7657,53 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
       VWidth = FVT->getNumElements();
 
     Value *V = Call.getArgOperand(0);
-    unsigned DMask = cast<ConstantInt>(V)->getZExtValue();
-    unsigned NumActiveBits = popcount(DMask);
-    Check(NumActiveBits <= VWidth,
-          "llvm.amdgcn.image.load.* intrinsic mask cannot have more active "
-          "bits than there are elements in the return type",
-          &Call);
+    // Only 4 least significant bits of the dmask are considered, the rest is
+    // ignored.
+    unsigned PassedDMask = cast<ConstantInt>(V)->getZExtValue();
+    unsigned MeaningfulDMask = PassedDMask & 0xf;
+    Check(
+        PassedDMask == MeaningfulDMask,
+        "DMask is a 4 bit value and therefore at most 4 least significant "
+        "bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set",
+        &Call);
+    if (!T->isVoidTy()) {
+      // We don't know the destination for no-return intrinsics and therefore
+      // cannot perform this check
+      unsigned NumActiveBits = popcount(MeaningfulDMask);
+      Check(NumActiveBits <= VWidth,
+            "The llvm.amdgcn.image.[load|sample] intrinsic's dmask argument "
+            "cannot have more active bits than there are elements in the "
+            "return type",
+            &Call);
+    }
+    break;
+  }
+  case Intrinsic::amdgcn_image_store_1d:
+  case Intrinsic::amdgcn_image_store_1darray:
+  case Intrinsic::amdgcn_image_store_2d:
+  case Intrinsic::amdgcn_image_store_2darray:
+  case Intrinsic::amdgcn_image_store_2darraymsaa:
+  case Intrinsic::amdgcn_image_store_2dmsaa:
+  case Intrinsic::amdgcn_image_store_3d:
+  case Intrinsic::amdgcn_image_store_cube:
+  case Intrinsic::amdgcn_image_store_mip_1d:
+  case Intrinsic::amdgcn_image_store_mip_1darray:
+  case Intrinsic::amdgcn_image_store_mip_2d:
+  case Intrinsic::amdgcn_image_store_mip_2darray:
+  case Intrinsic::amdgcn_image_store_mip_3d:
+  case Intrinsic::amdgcn_image_store_mip_cube: {
+    // DMask is the second argument in store intrinsics
+    Value *V = Call.getArgOperand(1);
+    unsigned PassedDMask = cast<ConstantInt>(V)->getZExtValue();
+    unsigned MeaningfulDMask = PassedDMask & 0xf;
+    Check(
+        PassedDMask == MeaningfulDMask,
+        "DMask is a 4 bit value and therefore at most 4 least significant "
+        "bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set",
+        &Call);
+    // Store instructions can attempt to store more components than there are
+    // in an image: missing components will receive the value of the X
+    // component. As such, any combination of DMask bits is allowed, contrary
+    // to loads.
     break;
   }
   case Intrinsic::nvvm_setmaxnreg_inc_sync_aligned_u32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
index 85ab4c99c73ae..f139b6c911ea5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
@@ -1064,9 +1064,12 @@ define amdgpu_ps float @load_1d_v4f16_tfe_dmask_xyzw(<8 x i32> inreg %rsrc, i32
 ; GFX8-UNPACKED-NEXT:    s_mov_b32 s6, s8
 ; GFX8-UNPACKED-NEXT:    s_mov_b32 s7, s9
 ; GFX8-UNPACKED-NEXT:    v_mov_b32_e32 v2, v1
-; GFX8-UNPACKED-NEXT:    image_load v[1:2], v0, s[0:7] dmask:0x10 unorm tfe d16
+; GFX8-UNPACKED-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-UNPACKED-NEXT:    v_mov_b32_e32 v4, v1
+; GFX8-UNPACKED-NEXT:    v_mov_b32_e32 v5, v1
+; GFX8-UNPACKED-NEXT:    image_load v[1:5], v0, s[0:7] dmask:0xf unorm tfe d16
 ; GFX8-UNPACKED-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-UNPACKED-NEXT:    v_mov_b32_e32 v0, v2
+; GFX8-UNPACKED-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX8-UNPACKED-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-PACKED-LABEL: load_1d_v4f16_tfe_dmask_xyzw:
@@ -1081,9 +1084,10 @@ define amdgpu_ps float @load_1d_v4f16_tfe_dmask_xyzw(<8 x i32> inreg %rsrc, i32
 ; GFX8-PACKED-NEXT:    s_mov_b32 s6, s8
 ; GFX8-PACKED-NEXT:    s_mov_b32 s7, s9
 ; GFX8-PACKED-NEXT:    v_mov_b32_e32 v2, v1
-; GFX8-PACKED-NEXT:    image_load v[1:2], v0, s[0:7] dmask:0x10 unorm tfe d16
+; GFX8-PACKED-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-PACKED-NEXT:    image_load v[1:3], v0, s[0:7] dmask:0xf unorm tfe d16
 ; GFX8-PACKED-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-PACKED-NEXT:    v_mov_b32_e32 v0, v2
+; GFX8-PACKED-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-PACKED-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: load_1d_v4f16_tfe_dmask_xyzw:
@@ -1098,9 +1102,10 @@ define amdgpu_ps float @load_1d_v4f16_tfe_dmask_xyzw(<8 x i32> inreg %rsrc, i32
 ; GFX9-NEXT:    s_mov_b32 s6, s8
 ; GFX9-NEXT:    s_mov_b32 s7, s9
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v1
-; GFX9-NEXT:    image_load v[1:2], v0, s[0:7] dmask:0x10 unorm tfe d16
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    image_load v[1:3], v0, s[0:7] dmask:0xf unorm tfe d16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: load_1d_v4f16_tfe_dmask_xyzw:
@@ -1115,9 +1120,10 @@ define amdgpu_ps float @load_1d_v4f16_tfe_dmask_xyzw(<8 x i32> inreg %rsrc, i32
 ; GFX10PLUS-NEXT:    s_mov_b32 s6, s8
 ; GFX10PLUS-NEXT:    s_mov_b32 s7, s9
 ; GFX10PLUS-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10PLUS-NEXT:    image_load v[1:2], v0, s[0:7] dmask:0x10 dim:SQ_RSRC_IMG_1D unorm tfe d16
+; GFX10PLUS-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10PLUS-NEXT:    image_load v[1:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe d16
 ; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0)
-; GFX10PLUS-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10PLUS-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: load_1d_v4f16_tfe_dmask_xyzw:
@@ -1131,13 +1137,13 @@ define amdgpu_ps float @load_1d_v4f16_tfe_dmask_xyzw(<8 x i32> inreg %rsrc, i32
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
-; GFX12-NEXT:    v_mov_b32_e32 v2, v1
-; GFX12-NEXT:    image_load v[1:2], v0, s[0:7] dmask:0x10 dim:SQ_RSRC_IMG_1D tfe d16
+; GFX12-NEXT:    v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT:    image_load v[1:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D tfe d16
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX12-NEXT:    ; return to shader part epilog
-  %v = call { <4 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f16i32s.i32(i32 16, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+  %v = call { <4 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f16i32s.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
   %v.err = extractvalue { <4 x half>, i32 } %v, 1
   %vv = bitcast i32 %v.err to float
   ret float %vv
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll
index fc48664c96cee..d7f0ec9a3254b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll
@@ -848,9 +848,12 @@ define amdgpu_ps float @load_1d_v4f32_tfe_dmask_xyzw(<8 x i32> inreg %rsrc, i32
 ; GFX68-NEXT:    s_mov_b32 s6, s8
 ; GFX68-NEXT:    s_mov_b32 s7, s9
 ; GFX68-NEXT:    v_mov_b32_e32 v2, v1
-; GFX68-NEXT:    image_load v[1:2], v0, s[0:7] dmask:0x10 unorm tfe
+; GFX68-NEXT:    v_mov_b32_e32 v3, v1
+; GFX68-NEXT:    v_mov_b32_e32 v4, v1
+; GFX68-NEXT:    v_mov_b32_e32 v5, v1
+; GFX68-NEXT:    image_load v[1:5], v0, s[0:7] dmask:0xf unorm tfe
 ; GFX68-NEXT:    s_waitcnt vmcnt(0)
-; GFX68-NEXT:    v_mov_b32_e32 v0, v2
+; GFX68-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX68-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: load_1d_v4f32_tfe_dmask_xyzw:
@@ -865,9 +868,12 @@ define amdgpu_ps float @load_1d_v4f32_tfe_dmask_xyzw(<8 x i32> inreg %rsrc, i32
 ; GFX10-NEXT:    s_mov_b32 s6, s8
 ; GFX10-NEXT:    s_mov_b32 s7, s9
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10-NEXT:    image_load v[1:2], v0, s[0:7] dmask:0x10 dim:SQ_RSRC_IMG_1D unorm tfe
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, v1
+; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    image_load v[1:5], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; NOPRT-LABEL: load_1d_v4f32_tfe_dmask_xyzw:
@@ -880,10 +886,10 @@ define amdgpu_ps float @load_1d_v4f32_tfe_dmask_xyzw(<8 x i32> inreg %rsrc, i32
 ; NOPRT-NEXT:    s_mov_b32 s5, s7
 ; NOPRT-NEXT:    s_mov_b32 s6, s8
 ; NOPRT-NEXT:    s_mov_b32 s7, s9
-; NOPRT-NEXT:    v_mov_b32_e32 v1, 0
-; NOPRT-NEXT:    image_load v[0:1], v0, s[0:7] dmask:0x10 dim:SQ_RSRC_IMG_1D unorm tfe
+; NOPRT-NEXT:    v_mov_b32_e32 v4, 0
+; NOPRT-NEXT:    image_load v[0:4], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe
 ; NOPRT-NEXT:    s_waitcnt vmcnt(0)
-; NOPRT-NEXT:    v_mov_b32_e32 v0, v1
+; NOPRT-NEXT:    v_mov_b32_e32 v0, v4
 ; NOPRT-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: load_1d_v4f32_tfe_dmask_xyzw:
@@ -897,13 +903,14 @@ define amdgpu_ps float @load_1d_v4f32_tfe_dmask_xyzw(<8 x i32> inreg %rsrc, i32
 ; GFX12-NEXT:    s_mov_b32 s5, s7
 ; GFX12-NEXT:    s_mov_b32 s6, s8
 ; GFX12-NEXT:    s_mov_b32 s7, s9
-; GFX12-NEXT:    v_mov_b32_e32 v2, v1
-; GFX12-NEXT:    image_load v[1:2], v0, s[0:7] dmask:0x10 dim:SQ_RSRC_IMG_1D tfe
+; GFX12-NEXT:    v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v1
+; GFX12-NEXT:    image_load v[1:5], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D tfe
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX12-NEXT:    ; return to shader part epilog
-  %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32 16, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+  %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
   %v.err = extractvalue { <4 x float>, i32 } %v, 1
   %vv = bitcast i32 %v.err to float
   ret float %vv
diff --git a/llvm/test/CodeGen/AMDGPU/image-schedule.ll b/llvm/test/CodeGen/AMDGPU/image-schedule.ll
index 9c44b7b34e305..fb83daaa1d08c 100644
--- a/llvm/test/CodeGen/AMDGPU/image-schedule.ll
+++ b/llvm/test/CodeGen/AMDGPU/image-schedule.ll
@@ -28,7 +28,7 @@ define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg, i32 inreg %arg1
   %tmp16 = load <8 x i32>, ptr addrspace(4) %tmp15, align 16
   call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %tmp14, i32 15, i32 %tmp13.0, i32 %tmp13.1, <8 x i32> %tmp16, i32 0, i32 0) #0
   %tmp17 = load <8 x i32>, ptr addrspace(4) %tmp15, align 16
-  %tmp18 = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 165, i32 %tmp13.0, i32 %tmp13.1, <8 x i32> %tmp17, i32 0, i32 0) #0
+  %tmp18 = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %tmp13.0, i32 %tmp13.1, <8 x i32> %tmp17, i32 0, i32 0) #0
   %tmp19 = getelementptr [4294967295 x i8], ptr addrspace(4) %tmp8, i64 0, i64 64
   %tmp21 = load <8 x i32>, ptr addrspace(4) %tmp19, align 16
   call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %tmp18, i32 15, i32 %tmp13.0, i32 %tmp13.1, <8 x i32> %tmp21, i32 0, i32 0) #0
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index 4b4fe73826e01..5a212cd82b70f 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1411,7 +1411,7 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2,
 ; SI-NEXT:    s_andn2_b64 exec, exec, vcc
 ; SI-NEXT:  .LBB13_3: ; %bb4
 ; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
-; SI-NEXT:    image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10
+; SI-NEXT:    image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
 ; SI-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -1445,7 +1445,7 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2,
 ; GFX10-WAVE64-NEXT:    s_andn2_b64 exec, exec, vcc
 ; GFX10-WAVE64-NEXT:  .LBB13_3: ; %bb4
 ; GFX10-WAVE64-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX10-WAVE64-NEXT:    image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D
+; GFX10-WAVE64-NEXT:    image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
 ; GFX10-WAVE64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WAVE64-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
 ; GFX10-WAVE64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
@@ -1477,7 +1477,7 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2,
 ; GFX10-WAVE32-NEXT:    s_andn2_b32 exec_lo, exec_lo, vcc_lo
 ; GFX10-WAVE32-NEXT:  .LBB13_3: ; %bb4
 ; GFX10-WAVE32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX10-WAVE32-NEXT:    image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D
+; GFX10-WAVE32-NEXT:    image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
 ; GFX10-WAVE32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WAVE32-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v0
 ; GFX10-WAVE32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
@@ -1511,7 +1511,7 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2,
 ; GFX11-NEXT:  .LBB13_3: ; %bb4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX11-NEXT:    image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D
+; GFX11-NEXT:    image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
 ; GFX11-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cmpx_neq_f32_e32 0, v0
@@ -1536,7 +1536,7 @@ bb3:                                              ; preds = %bb
   br label %bb4
 
 bb4:                                              ; preds = %bb3, %bb
-  %tmp5 = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 16, float %arg2, float %arg3, <8 x i32> poison, <4 x i32> poison, i1 0, i32 0, i32 0)
+  %tmp5 = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 15, float %arg2, float %arg3, <8 x i32> poison, <4 x i32> poison, i1 0, i32 0, i32 0)
   %tmp6 = extractelement <4 x float> %tmp5, i32 0
   %tmp7 = fcmp une float %tmp6, 0.000000e+00
   br i1 %tmp7, label %bb8, label %bb9
diff --git a/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-load-dmask.ll b/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-load-dmask.ll
index ac27e96f3b2b0..e365d3973325c 100644
--- a/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-load-dmask.ll
+++ b/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-load-dmask.ll
@@ -3,7 +3,7 @@
 define amdgpu_ps void @load_1d(<8 x i32> inreg %rsrc, i32 %s) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 31, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.1d.v4f32.i32
   ret void
 }
@@ -11,7 +11,7 @@ main_body:
 define amdgpu_ps void @load_1d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s) {
 main_body:
   %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 31, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32
   ret void
 }
@@ -19,7 +19,7 @@ main_body:
 define amdgpu_ps void @load_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s) {
 main_body:
   %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 -1, i32 %s, <8 x i32> %rsrc, i32 2, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.1d.sl_v4f32i32s.i32
   ret void
 }
@@ -27,7 +27,7 @@ main_body:
 define amdgpu_ps void @load_1d_V2_tfe(<8 x i32> inreg %rsrc, i32 %s) {
 main_body:
   %v = call {<2 x float>, i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32 -1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.1d.sl_v2f32i32s.i32
   ret void
 }
@@ -35,7 +35,7 @@ main_body:
 define amdgpu_ps void @load_1d_V1_tfe(<8 x i32> inreg %rsrc, i32 %s) {
 main_body:
   %v = call {float, i32} @llvm.amdgcn.image.load.1d.f32i32.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: The llvm.amdgcn.image.[load|sample] intrinsic's dmask argument cannot have more active bits than there are elements in the return type
 ; CHECK-NEXT: @llvm.amdgcn.image.load.1d.sl_f32i32s.i32
   ret void
 }
@@ -43,7 +43,7 @@ main_body:
 define amdgpu_ps void @load_1d_V2(<8 x i32> inreg %rsrc, i32 %s) {
 main_body:
   %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: The llvm.amdgcn.image.[load|sample] intrinsic's dmask argument cannot have more active bits than there are elements in the return type
 ; CHECK-NEXT: @llvm.amdgcn.image.load.1d.v2f32.i32
   ret void
 }
@@ -51,7 +51,7 @@ main_body:
 define amdgpu_ps void @load_1d_V1(<8 x i32> inreg %rsrc, i32 %s) {
 main_body:
   %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: The llvm.amdgcn.image.[load|sample] intrinsic's dmask argument cannot have more active bits than there are elements in the return type
 ; CHECK-NEXT: @llvm.amdgcn.image.load.1d.f32.i32
   ret void
 }
@@ -59,7 +59,7 @@ main_body:
 define amdgpu_ps void @load_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 -1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.2d.v4f32.i32
   ret void
 }
@@ -67,7 +67,7 @@ main_body:
 define amdgpu_ps void @load_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t) {
 main_body:
   %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.2d.v4f32i32.i32(i32 31, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32
   ret void
 }
@@ -76,7 +76,7 @@ define amdgpu_ps void @image_load_mmo(<8 x i32> inreg %rsrc, ptr addrspace(3) %l
   %c0 = extractelement <2 x i32> %c, i32 0
   %c1 = extractelement <2 x i32> %c, i32 1
   %tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 3, i32 %c0, i32 %c1, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: The llvm.amdgcn.image.[load|sample] intrinsic's dmask argument cannot have more active bits than there are elements in the return type
 ; CHECK-NEXT: @llvm.amdgcn.image.load.2d.f32.i32
   ret void
 }
@@ -84,7 +84,7 @@ define amdgpu_ps void @image_load_mmo(<8 x i32> inreg %rsrc, ptr addrspace(3) %l
 define amdgpu_ps void @load_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.3d.v4f32.i32
   ret void
 }
@@ -92,7 +92,7 @@ main_body:
 define amdgpu_ps void @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %r) {
 main_body:
   %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.3d.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32
   ret void
 }
@@ -100,7 +100,7 @@ main_body:
 define amdgpu_ps void @load_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.cube.v4f32.i32
   ret void
 }
@@ -108,7 +108,7 @@ main_body:
 define amdgpu_ps void @load_cube_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice) {
 main_body:
   %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.cube.sl_v4f32i32s.i32
   ret void
 }
@@ -116,7 +116,7 @@ main_body:
 define amdgpu_ps void @load_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32 31, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.1darray.v4f32.i32
   ret void
 }
@@ -124,7 +124,7 @@ main_body:
 define amdgpu_ps void @load_1darray_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %slice) {
 main_body:
   %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.1darray.v4f32i32.i32(i32 31, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 1, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.1darray.sl_v4f32i32s.i32
   ret void
 }
@@ -132,7 +132,7 @@ main_body:
 define amdgpu_ps void @load_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.2darray.v4f32.i32
   ret void
 }
@@ -140,7 +140,7 @@ main_body:
 define amdgpu_ps void @load_2darray_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice) {
 main_body:
   %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.2darray.sl_v4f32i32s.i32
   ret void
 }
@@ -148,7 +148,7 @@ main_body:
 define amdgpu_ps void @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.2dmsaa.v4f32.i32
   ret void
 }
@@ -156,7 +156,7 @@ main_body:
 define amdgpu_ps void @load_2dmsaa_both(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %fragid) {
 main_body:
   %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.2dmsaa.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.2dmsaa.sl_v4f32i32s.i32
   ret void
 }
@@ -164,7 +164,7 @@ main_body:
 define amdgpu_ps void @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32
   ret void
 }
@@ -172,7 +172,7 @@ main_body:
 define amdgpu_ps void @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
 main_body:
   %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.2darraymsaa.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32
   ret void
 }
@@ -180,7 +180,7 @@ main_body:
 define amdgpu_ps void @load_mip_1d(<8 x i32> inreg %rsrc, i32 %s, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 31, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.mip.1d.v4f32.i32
   ret void
 }
@@ -188,7 +188,7 @@ main_body:
 define amdgpu_ps void @load_mip_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %mip) {
 main_body:
   %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.mip.1d.v4f32i32.i32(i32 31, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 2, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.mip.1d.sl_v4f32i32s.i32
   ret void
 }
@@ -196,7 +196,7 @@ main_body:
 define amdgpu_ps void @load_mip_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.mip.2d.v4f32.i32
   ret void
 }
@@ -204,7 +204,7 @@ main_body:
 define amdgpu_ps void @load_mip_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %mip) {
 main_body:
   %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.mip.2d.sl_v4f32i32s.i32
   ret void
 }
@@ -212,7 +212,7 @@ main_body:
 define amdgpu_ps void @load_mip_2d_tfe_nouse_V2(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
 main_body:
   %v = call {<2 x float>, i32} @llvm.amdgcn.image.load.mip.2d.v2f32i32.i32(i32 7, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: The llvm.amdgcn.image.[load|sample] intrinsic's dmask argument cannot have more active bits than there are elements in the return type
 ; CHECK-NEXT: @llvm.amdgcn.image.load.mip.2d.sl_v2f32i32s.i32
   ret void
 }
@@ -220,7 +220,7 @@ main_body:
 define amdgpu_ps void @load_mip_2d_tfe_nouse_V1(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
 main_body:
   %v = call {float, i32} @llvm.amdgcn.image.load.mip.2d.f32i32.i32(i32 3, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: The llvm.amdgcn.image.[load|sample] intrinsic's dmask argument cannot have more active bits than there are elements in the return type
 ; CHECK-NEXT: @llvm.amdgcn.image.load.mip.2d.sl_f32i32s.i32
   ret void
 }
@@ -228,7 +228,7 @@ main_body:
 define amdgpu_ps void @load_mip_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %r, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.mip.3d.v4f32.i32
   ret void
 }
@@ -236,7 +236,7 @@ main_body:
 define amdgpu_ps void @load_mip_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.mip.cube.v4f32.i32
   ret void
 }
@@ -244,7 +244,7 @@ main_body:
 define amdgpu_ps void @load_mip_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32 31, i32 %s, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.mip.1darray.v4f32.i32
   ret void
 }
@@ -252,7 +252,7 @@ main_body:
 define amdgpu_ps void @load_mip_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %mip) {
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
-; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
 ; CHECK-NEXT: @llvm.amdgcn.image.load.mip.2darray.v4f32.i32
   ret void
 }
diff --git a/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-sample-dmask.ll b/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-sample-dmask.ll
new file mode 100644
index 0000000000000..66137e88d5323
--- /dev/null
+++ b/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-sample-dmask.ll
@@ -0,0 +1,416 @@
+; RUN: not llvm-as %s -disable-output 2>&1 | FileCheck %s
+
+define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 31, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.1d.v4f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, ptr addrspace(1) inreg %out, float %s) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 31, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32
+  %v.vec = extractvalue {<4 x float>, i32} %v, 0
+  ret <4 x float> %v.vec
+}
+
+define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 31, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.2d.v4f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %r) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 -1, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.3d.v4f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %face) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32 31, float %s, float %t, float %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.cube.v4f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %slice) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 31, float %s, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.1darray.v4f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %slice) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32 31, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.2darray.v4f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 -1, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.c.1d.v4f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f32(i32 31, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.c.2d.v4f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f32(i32 31, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.cl.1d.v4f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f32(i32 31, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.cl.2d.v4f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32 -1, float %zcompare, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f32(i32 31, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 31, float %bias, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f32(i32 31, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32 31, float %bias, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32(i32 -1, float %bias, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 31, float %bias, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t, float %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32(i32 31, float %bias, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32 31, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f32(i32 -1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32(i32 31, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 31, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32 31, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f32(i32 -1, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s, float %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32 31, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32(i32 31, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32 -1, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f32(i32 31, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %lod) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32(i32 31, float %s, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.l.1d.v4f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %lod) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 -1, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.l.2d.v4f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %lod) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32 31, float %zcompare, float %s, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %lod) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f32(i32 31, float %zcompare, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.c.l.2d.v4f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32 -1, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.lz.1d.v4f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 31, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.lz.2d.v4f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32 -1, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32(i32 31, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) {
+main_body:
+  %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f32.f32(i32 3, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: The llvm.amdgcn.image.[load|sample] intrinsic's dmask argument cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f32.f32
+  ret float %v
+}
+
+define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, ptr addrspace(1) inreg %out) {
+main_body:
+  %v = call {float,i32} @llvm.amdgcn.image.sample.c.d.o.2darray.f32i32.f32.f32(i32 3, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+; CHECK: The llvm.amdgcn.image.[load|sample] intrinsic's dmask argument cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.sample.c.d.o.2darray.sl_f32i32s.f32.f32
+  %v.vec = extractvalue {float, i32} %v, 0
+  ret float %v.vec
+}
+
+define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) {
+main_body:
+  %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32 7, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: The llvm.amdgcn.image.[load|sample] intrinsic's dmask argument cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32
+  ret <2 x float> %v
+}
+
+define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) {
+main_body:
+  %v = call {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
+; CHECK: The llvm.amdgcn.image.[load|sample] intrinsic's dmask argument cannot have more active bits than there are elements in the return type
+; CHECK-NEXT: @llvm.amdgcn.image.sample.c.d.o.2darray.sl_v2f32i32s.f32.f32
+  %v.vec = extractvalue {<2 x float>, i32} %v, 0
+  ret <2 x float> %v.vec
+}
+
+define amdgpu_ps <4 x float> @sample_1d_unorm(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 31, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 1, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.1d.v4f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_1d_glc(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 31, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 1)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.1d.v4f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_1d_slc(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 31, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 2)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.1d.v4f32.f32
+  ret <4 x float> %v
+}
+
+define amdgpu_ps <4 x float> @sample_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 31, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 3)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.1d.v4f32.f32
+  ret <4 x float> %v
+}
+
+declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare {float, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.f32i32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone }
diff --git a/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-sample-noret-dmask.ll b/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-sample-noret-dmask.ll
new file mode 100644
index 0000000000000..47393a7a98a27
--- /dev/null
+++ b/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-sample-noret-dmask.ll
@@ -0,0 +1,147 @@
+; RUN: not llvm-as %s -disable-output 2>&1 | FileCheck %s
+
+define amdgpu_ps void @sample_1d_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
+main_body:
+  call void @llvm.amdgcn.image.sample.1d.nortn.f32(i32 31, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.1d.nortn.f32
+  ret void
+}
+
+define amdgpu_ps void @sample_2d_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
+main_body:
+  call void @llvm.amdgcn.image.sample.2d.nortn.f32(i32 31, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.2d.nortn.f32
+  ret void
+}
+
+define amdgpu_ps void @sample_3d_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %r) {
+main_body:
+  call void @llvm.amdgcn.image.sample.3d.nortn.f32(i32 31, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.3d.nortn.f32
+  ret void
+}
+
+define amdgpu_ps void @sample_cube_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %face) {
+main_body:
+  call void @llvm.amdgcn.image.sample.cube.nortn.f32(i32 31, float %s, float %t, float %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.cube.nortn.f32
+  ret void
+}
+
+define amdgpu_ps void @sample_1darray_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %slice) {
+main_body:
+  call void @llvm.amdgcn.image.sample.1darray.nortn.f32(i32 31, float %s, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.1darray.nortn.f32
+  ret void
+}
+
+define amdgpu_ps void @sample_2darray_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %slice) {
+main_body:
+  call void @llvm.amdgcn.image.sample.2darray.nortn.f32(i32 31, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.2darray.nortn.f32
+  ret void
+}
+
+define amdgpu_ps void @sample_b_1d_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s) {
+main_body:
+  call void @llvm.amdgcn.image.sample.b.1d.nortn.f32(i32 31, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.b.1d.nortn.f32
+  ret void
+}
+
+define amdgpu_ps void @sample_b_2d_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) {
+main_body:
+  call void @llvm.amdgcn.image.sample.b.2d.nortn.f32(i32 31, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.b.2d.nortn.f32
+  ret void
+}
+
+define amdgpu_ps void @sample_c_1d_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s) {
+main_body:
+  call void @llvm.amdgcn.image.sample.c.1d.nortn.f32(i32 31, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.c.1d.nortn.f32
+  ret void
+}
+
+define amdgpu_ps void @sample_c_2d_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) {
+main_body:
+  call void @llvm.amdgcn.image.sample.c.2d.nortn.f32(i32 31, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.c.2d.nortn.f32
+  ret void
+}
+
+define amdgpu_ps void @sample_d_1d_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dsdv, float %s) {
+main_body:
+  call void @llvm.amdgcn.image.sample.d.1d.nortn.f32.f32(i32 31, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.d.1d.nortn.f32.f32
+  ret void
+}
+
+define amdgpu_ps void @sample_d_2d_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) {
+main_body:
+  call void @llvm.amdgcn.image.sample.d.2d.nortn.f32.f32(i32 31, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.d.2d.nortn.f32.f32
+  ret void
+}
+
+define amdgpu_ps void @sample_l_1d_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %lod) {
+main_body:
+  call void @llvm.amdgcn.image.sample.l.1d.nortn.f32(i32 31, float %s, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.l.1d.nortn.f32
+  ret void
+}
+
+define amdgpu_ps void @sample_l_2d_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %lod) {
+main_body:
+  call void @llvm.amdgcn.image.sample.l.2d.nortn.f32(i32 31, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.l.2d.nortn.f32
+  ret void
+}
+
+define amdgpu_ps void @sample_d_1d_g16_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
+main_body:
+  call void @llvm.amdgcn.image.sample.d.1d.nortn.f16.f32(i32 -1, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.sample.d.1d.nortn.f16.f32
+  ret void
+}
+
+declare void @llvm.amdgcn.image.sample.1d.nortn.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+declare void @llvm.amdgcn.image.sample.2d.nortn.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+declare void @llvm.amdgcn.image.sample.3d.nortn.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+declare void @llvm.amdgcn.image.sample.cube.nortn.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+declare void @llvm.amdgcn.image.sample.1darray.nortn.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+declare void @llvm.amdgcn.image.sample.2darray.nortn.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+
+declare void @llvm.amdgcn.image.sample.b.1d.nortn.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+declare void @llvm.amdgcn.image.sample.b.2d.nortn.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+
+declare void @llvm.amdgcn.image.sample.c.1d.nortn.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+declare void @llvm.amdgcn.image.sample.c.2d.nortn.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+
+declare void @llvm.amdgcn.image.sample.d.1d.nortn.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+declare void @llvm.amdgcn.image.sample.d.2d.nortn.f32.f32(i32, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+
+declare void @llvm.amdgcn.image.sample.l.1d.nortn.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+declare void @llvm.amdgcn.image.sample.l.2d.nortn.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+
+declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+
+declare void @llvm.amdgcn.image.sample.d.1d.nortn.f16.f32(i32, half, half, float, <8 x i32>, <4 x i32>, i1, i32, i32) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
diff --git a/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-store-dmask.ll b/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-store-dmask.ll
new file mode 100644
index 0000000000000..699bd097c7069
--- /dev/null
+++ b/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-store-dmask.ll
@@ -0,0 +1,157 @@
+; RUN: not llvm-as %s -disable-output 2>&1 | FileCheck %s
+
+define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) {
+main_body:
+  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 31, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.store.1d.v4f32.i32
+  ret void
+}
+
+define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t) {
+main_body:
+  call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %vdata, i32 31, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.store.2d.v4f32.i32
+  ret void
+}
+
+define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %r) {
+main_body:
+  call void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float> %vdata, i32 31, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.store.3d.v4f32.i32
+  ret void
+}
+
+define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice) {
+main_body:
+  call void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float> %vdata, i32 31, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.store.cube.v4f32.i32
+  ret void
+}
+
+define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %slice) {
+main_body:
+  call void @llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float> %vdata, i32 31, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.store.1darray.v4f32.i32
+  ret void
+}
+
+define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice) {
+main_body:
+  call void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float> %vdata, i32 31, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.store.2darray.v4f32.i32
+  ret void
+}
+
+define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %fragid) {
+main_body:
+  call void @llvm.amdgcn.image.store.2dmsaa.v4f32.i32(<4 x float> %vdata, i32 31, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.store.2dmsaa.v4f32.i32
+  ret void
+}
+
+define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
+main_body:
+  call void @llvm.amdgcn.image.store.2darraymsaa.v4f32.i32(<4 x float> %vdata, i32 31, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.store.2darraymsaa.v4f32.i32
+  ret void
+}
+
+define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %mip) {
+main_body:
+  call void @llvm.amdgcn.image.store.mip.1d.v4f32.i32(<4 x float> %vdata, i32 31, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.store.mip.1d.v4f32.i32
+  ret void
+}
+
+define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %mip) {
+main_body:
+  call void @llvm.amdgcn.image.store.mip.2d.v4f32.i32(<4 x float> %vdata, i32 31, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.store.mip.2d.v4f32.i32
+  ret void
+}
+
+define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %r, i32 %mip) {
+main_body:
+  call void @llvm.amdgcn.image.store.mip.3d.v4f32.i32(<4 x float> %vdata, i32 31, i32 %s, i32 %t, i32 %r, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.store.mip.3d.v4f32.i32
+  ret void
+}
+
+define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice, i32 %mip) {
+main_body:
+  call void @llvm.amdgcn.image.store.mip.cube.v4f32.i32(<4 x float> %vdata, i32 31, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.store.mip.cube.v4f32.i32
+  ret void
+}
+
+define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %slice, i32 %mip) {
+main_body:
+  call void @llvm.amdgcn.image.store.mip.1darray.v4f32.i32(<4 x float> %vdata, i32 31, i32 %s, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.store.mip.1darray.v4f32.i32
+  ret void
+}
+
+define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice, i32 %mip) {
+main_body:
+  call void @llvm.amdgcn.image.store.mip.2darray.v4f32.i32(<4 x float> %vdata, i32 31, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.store.mip.2darray.v4f32.i32
+  ret void
+}
+
+define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, i32 %s) {
+main_body:
+  call void @llvm.amdgcn.image.store.1d.f32.i32(float %vdata, i32 31, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.store.1d.f32.i32
+  ret void
+}
+
+define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, i32 %s) {
+main_body:
+  call void @llvm.amdgcn.image.store.1d.v2f32.i32(<2 x float> %vdata, i32 31, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.store.1d.v2f32.i32
+  ret void
+}
+
+define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) {
+main_body:
+  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 31, i32 %s, <8 x i32> %rsrc, i32 0, i32 1)
+; CHECK: DMask is a 4 bit value and therefore at most 4 least significant bits of an llvm.amdgcn.image.* intrinsic's dmask argument can be set
+; CHECK-NEXT: @llvm.amdgcn.image.store.1d.v4f32.i32
+  ret void
+}
+
+declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2dmsaa.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2darraymsaa.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+
+declare void @llvm.amdgcn.image.store.mip.1d.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.2d.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.3d.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.cube.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.1darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.2darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.1d.f32.i32(float, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.1d.v2f32.i32(<2 x float>, i32, i32, <8 x i32>, i32, i32) #0
+
+attributes #0 = { nounwind }



More information about the cfe-commits mailing list