[llvm] [AMDGPU] Fix crash in InstCombine (PR #179511)

Alexey Sachkov via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 5 01:06:08 PST 2026


https://github.com/AlexeySachkov updated https://github.com/llvm/llvm-project/pull/179511

>From 4539640c220c51f1ec4ce687dafa4029c9b7caf9 Mon Sep 17 00:00:00 2001
From: Alexey Sachkov <alexey.sachkov at amd.com>
Date: Tue, 3 Feb 2026 11:16:58 -0600
Subject: [PATCH 1/3] [AMDGPU] Fix crash in InstCombine

Added an out-of-bounds check to avoid crashes when simplifying
`@llvm.amdgcn.image.load.*` intrinsics whose return type is a vector and
whose mask argument "enables" more elements than the return type contains.
---
 .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp     |   2 +
 .../fix-amdgcn-image-load-dmask-crash.ll      | 123 ++++++++++++++++++
 2 files changed, 125 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 2cd1902785546..0756a3c257738 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1861,6 +1861,8 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
     for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
       const unsigned Bit = 1 << SrcIdx;
       if (!!(DMaskVal & Bit)) {
+        if (OrigLdStIdx >= DemandedElts.getBitWidth())
+          break;
         if (!!DemandedElts[OrigLdStIdx])
           NewDMaskVal |= Bit;
         OrigLdStIdx++;
diff --git a/llvm/test/CodeGen/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll b/llvm/test/CodeGen/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll
new file mode 100644
index 0000000000000..571b9e4f123b7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll
@@ -0,0 +1,123 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -mtriple amdgcn -passes=instcombine %s -S -o - | FileCheck %s
+;
+; The main purpose of the test is to ensure that we do not crash when the mask
+; argument "enables" more elements than there are in return type.
+; This specific corner case was discovered by a fuzzer.
+
+define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; CHECK-LABEL: define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) {
+; CHECK-NEXT:  [[MAIN_BODY:.*:]]
+; CHECK-NEXT:    [[I:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I1:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I2:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I3:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <2 x float>] undef, <2 x float> [[I]], 0
+; CHECK-NEXT:    [[I5:%.*]] = insertvalue [4 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
+; CHECK-NEXT:    [[I6:%.*]] = insertvalue [4 x <2 x float>] [[I5]], <2 x float> [[I2]], 2
+; CHECK-NEXT:    [[I7:%.*]] = insertvalue [4 x <2 x float>] [[I6]], <2 x float> [[I3]], 3
+; CHECK-NEXT:    ret [4 x <2 x float>] [[I7]]
+;
+main_body:
+  %i = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 7, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+  %i1 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 11, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+  %i2 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 15, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+  %i3 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 -1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+  %i4 = insertvalue [4 x <2 x float>] undef, <2 x float> %i, 0
+  %i5 = insertvalue [4 x <2 x float>] %i4, <2 x float> %i1, 1
+  %i6 = insertvalue [4 x <2 x float>] %i5, <2 x float> %i2, 2
+  %i7 = insertvalue [4 x <2 x float>] %i6, <2 x float> %i3, 3
+  ret [4 x <2 x float>] %i7
+}
+
+define amdgpu_ps [4 x <3 x float>] @load_2dmsaa_v4v3f32_dmask(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+; CHECK-LABEL: define amdgpu_ps [4 x <3 x float>] @load_2dmsaa_v4v3f32_dmask(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) {
+; CHECK-NEXT:  [[MAIN_BODY:.*:]]
+; CHECK-NEXT:    [[I:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32.v8i32(i32 11, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I1:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32.v8i32(i32 7, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I2:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32.v8i32(i32 7, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I3:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32.v8i32(i32 -11, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x float> [[I3]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <3 x float>] undef, <3 x float> [[I]], 0
+; CHECK-NEXT:    [[I5:%.*]] = insertvalue [4 x <3 x float>] [[I4]], <3 x float> [[I1]], 1
+; CHECK-NEXT:    [[I6:%.*]] = insertvalue [4 x <3 x float>] [[I5]], <3 x float> [[I2]], 2
+; CHECK-NEXT:    [[I7:%.*]] = insertvalue [4 x <3 x float>] [[I6]], <3 x float> [[TMP0]], 3
+; CHECK-NEXT:    ret [4 x <3 x float>] [[I7]]
+;
+main_body:
+  %i = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 11, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+  %i1 = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 15, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+  %i2 = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 31, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+  %i3 = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 -11, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+  %i4 = insertvalue [4 x <3 x float>] undef, <3 x float> %i, 0
+  %i5 = insertvalue [4 x <3 x float>] %i4, <3 x float> %i1, 1
+  %i6 = insertvalue [4 x <3 x float>] %i5, <3 x float> %i2, 2
+  %i7 = insertvalue [4 x <3 x float>] %i6, <3 x float> %i3, 3
+  ret [4 x <3 x float>] %i7
+}
+
+define amdgpu_ps [4 x <2 x float>] @load_2darraymsaa_v4v2f32_dmask(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
+; CHECK-LABEL: define amdgpu_ps [4 x <2 x float>] @load_2darraymsaa_v4v2f32_dmask(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[SLICE:%.*]]) {
+; CHECK-NEXT:  [[MAIN_BODY:.*:]]
+; CHECK-NEXT:    [[I:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I1:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I2:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I3:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <2 x float>] undef, <2 x float> [[I]], 0
+; CHECK-NEXT:    [[I5:%.*]] = insertvalue [4 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
+; CHECK-NEXT:    [[I6:%.*]] = insertvalue [4 x <2 x float>] [[I5]], <2 x float> [[I2]], 2
+; CHECK-NEXT:    [[I7:%.*]] = insertvalue [4 x <2 x float>] [[I6]], <2 x float> [[I3]], 3
+; CHECK-NEXT:    ret [4 x <2 x float>] [[I7]]
+;
+main_body:
+  %i = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 7, i32 %s, i32 %t, i32 %slice, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+  %i1 = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 11, i32 %s, i32 %t, i32 %slice, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+  %i2 = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+  %i3 = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 -1, i32 %s, i32 %t, i32 %slice, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+  %i4 = insertvalue [4 x <2 x float>] undef, <2 x float> %i, 0
+  %i5 = insertvalue [4 x <2 x float>] %i4, <2 x float> %i1, 1
+  %i6 = insertvalue [4 x <2 x float>] %i5, <2 x float> %i2, 2
+  %i7 = insertvalue [4 x <2 x float>] %i6, <2 x float> %i3, 3
+  ret [4 x <2 x float>] %i7
+}
+
+define amdgpu_ps [4 x <3 x float>] @load_2darraymsaa_v4v3f32_dmask(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
+; CHECK-LABEL: define amdgpu_ps [4 x <3 x float>] @load_2darraymsaa_v4v3f32_dmask(
+; CHECK-SAME: <8 x i32> inreg [[RSRC:%.*]], i32 [[S:%.*]], i32 [[T:%.*]], i32 [[SLICE:%.*]]) {
+; CHECK-NEXT:  [[MAIN_BODY:.*:]]
+; CHECK-NEXT:    [[I:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32.v8i32(i32 11, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 0, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I1:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32.v8i32(i32 7, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I2:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32.v8i32(i32 7, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I3:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32.v8i32(i32 7, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
+; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <3 x float>] undef, <3 x float> [[I]], 0
+; CHECK-NEXT:    [[I5:%.*]] = insertvalue [4 x <3 x float>] [[I4]], <3 x float> [[I1]], 1
+; CHECK-NEXT:    [[I6:%.*]] = insertvalue [4 x <3 x float>] [[I5]], <3 x float> [[I2]], 2
+; CHECK-NEXT:    [[I7:%.*]] = insertvalue [4 x <3 x float>] [[I6]], <3 x float> [[I3]], 3
+; CHECK-NEXT:    ret [4 x <3 x float>] [[I7]]
+;
+main_body:
+  %i = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32(i32 11, i32 %s, i32 %t, i32 %slice, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+  %i1 = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+  %i2 = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32(i32 31, i32 %s, i32 %t, i32 %slice, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+  %i3 = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32(i32 -1, i32 %s, i32 %t, i32 %slice, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+  %i4 = insertvalue [4 x <3 x float>] undef, <3 x float> %i, 0
+  %i5 = insertvalue [4 x <3 x float>] %i4, <3 x float> %i1, 1
+  %i6 = insertvalue [4 x <3 x float>] %i5, <3 x float> %i2, 2
+  %i7 = insertvalue [4 x <3 x float>] %i6, <3 x float> %i3, 3
+  ret [4 x <3 x float>] %i7
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
+declare <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 immarg, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
+declare <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 immarg, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
+declare <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 immarg, i32, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
+declare <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32(i32 immarg, i32, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0

>From 93e5ccdadd7133acb94772d2ba1ca9afa5c19208 Mon Sep 17 00:00:00 2001
From: Alexey Sachkov <alexey.sachkov at amd.com>
Date: Wed, 4 Feb 2026 05:16:42 -0600
Subject: [PATCH 2/3] Apply comments

---
 .../fix-amdgcn-image-load-dmask-crash.ll      | 20 ++++++++-----------
 1 file changed, 8 insertions(+), 12 deletions(-)
 rename llvm/test/{CodeGen => Transforms/InstCombine}/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll (91%)

diff --git a/llvm/test/CodeGen/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll b/llvm/test/Transforms/InstCombine/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll
similarity index 91%
rename from llvm/test/CodeGen/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll
rename to llvm/test/Transforms/InstCombine/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll
index 571b9e4f123b7..c81f1fe7cde9a 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll
@@ -13,7 +13,7 @@ define amdgpu_ps [4 x <2 x float>] @load_2dmsaa_v4v2f32_dmask(<8 x i32> inreg %r
 ; CHECK-NEXT:    [[I1:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
 ; CHECK-NEXT:    [[I2:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
 ; CHECK-NEXT:    [[I3:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
-; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <2 x float>] undef, <2 x float> [[I]], 0
+; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <2 x float>] poison, <2 x float> [[I]], 0
 ; CHECK-NEXT:    [[I5:%.*]] = insertvalue [4 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
 ; CHECK-NEXT:    [[I6:%.*]] = insertvalue [4 x <2 x float>] [[I5]], <2 x float> [[I2]], 2
 ; CHECK-NEXT:    [[I7:%.*]] = insertvalue [4 x <2 x float>] [[I6]], <2 x float> [[I3]], 3
@@ -24,7 +24,7 @@ main_body:
   %i1 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 11, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
   %i2 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 15, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
   %i3 = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 -1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
-  %i4 = insertvalue [4 x <2 x float>] undef, <2 x float> %i, 0
+  %i4 = insertvalue [4 x <2 x float>] poison, <2 x float> %i, 0
   %i5 = insertvalue [4 x <2 x float>] %i4, <2 x float> %i1, 1
   %i6 = insertvalue [4 x <2 x float>] %i5, <2 x float> %i2, 2
   %i7 = insertvalue [4 x <2 x float>] %i6, <2 x float> %i3, 3
@@ -40,7 +40,7 @@ define amdgpu_ps [4 x <3 x float>] @load_2dmsaa_v4v3f32_dmask(<8 x i32> inreg %r
 ; CHECK-NEXT:    [[I2:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32.v8i32(i32 7, i32 [[S]], i32 [[T]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
 ; CHECK-NEXT:    [[I3:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32.v8i32(i32 -11, i32 [[S]], i32 [[T]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
 ; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x float> [[I3]], <2 x float> poison, <3 x i32> <i32 0, i32 1, i32 poison>
-; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <3 x float>] undef, <3 x float> [[I]], 0
+; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <3 x float>] poison, <3 x float> [[I]], 0
 ; CHECK-NEXT:    [[I5:%.*]] = insertvalue [4 x <3 x float>] [[I4]], <3 x float> [[I1]], 1
 ; CHECK-NEXT:    [[I6:%.*]] = insertvalue [4 x <3 x float>] [[I5]], <3 x float> [[I2]], 2
 ; CHECK-NEXT:    [[I7:%.*]] = insertvalue [4 x <3 x float>] [[I6]], <3 x float> [[TMP0]], 3
@@ -51,7 +51,7 @@ main_body:
   %i1 = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 15, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
   %i2 = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 31, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
   %i3 = call <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 -11, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
-  %i4 = insertvalue [4 x <3 x float>] undef, <3 x float> %i, 0
+  %i4 = insertvalue [4 x <3 x float>] poison, <3 x float> %i, 0
   %i5 = insertvalue [4 x <3 x float>] %i4, <3 x float> %i1, 1
   %i6 = insertvalue [4 x <3 x float>] %i5, <3 x float> %i2, 2
   %i7 = insertvalue [4 x <3 x float>] %i6, <3 x float> %i3, 3
@@ -66,7 +66,7 @@ define amdgpu_ps [4 x <2 x float>] @load_2darraymsaa_v4v2f32_dmask(<8 x i32> inr
 ; CHECK-NEXT:    [[I1:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
 ; CHECK-NEXT:    [[I2:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
 ; CHECK-NEXT:    [[I3:%.*]] = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32.v8i32(i32 3, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
-; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <2 x float>] undef, <2 x float> [[I]], 0
+; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <2 x float>] poison, <2 x float> [[I]], 0
 ; CHECK-NEXT:    [[I5:%.*]] = insertvalue [4 x <2 x float>] [[I4]], <2 x float> [[I1]], 1
 ; CHECK-NEXT:    [[I6:%.*]] = insertvalue [4 x <2 x float>] [[I5]], <2 x float> [[I2]], 2
 ; CHECK-NEXT:    [[I7:%.*]] = insertvalue [4 x <2 x float>] [[I6]], <2 x float> [[I3]], 3
@@ -77,7 +77,7 @@ main_body:
   %i1 = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 11, i32 %s, i32 %t, i32 %slice, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
   %i2 = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
   %i3 = call <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 -1, i32 %s, i32 %t, i32 %slice, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
-  %i4 = insertvalue [4 x <2 x float>] undef, <2 x float> %i, 0
+  %i4 = insertvalue [4 x <2 x float>] poison, <2 x float> %i, 0
   %i5 = insertvalue [4 x <2 x float>] %i4, <2 x float> %i1, 1
   %i6 = insertvalue [4 x <2 x float>] %i5, <2 x float> %i2, 2
   %i7 = insertvalue [4 x <2 x float>] %i6, <2 x float> %i3, 3
@@ -92,7 +92,7 @@ define amdgpu_ps [4 x <3 x float>] @load_2darraymsaa_v4v3f32_dmask(<8 x i32> inr
 ; CHECK-NEXT:    [[I1:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32.v8i32(i32 7, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 1, <8 x i32> [[RSRC]], i32 0, i32 0)
 ; CHECK-NEXT:    [[I2:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32.v8i32(i32 7, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 2, <8 x i32> [[RSRC]], i32 0, i32 0)
 ; CHECK-NEXT:    [[I3:%.*]] = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32.v8i32(i32 7, i32 [[S]], i32 [[T]], i32 [[SLICE]], i32 3, <8 x i32> [[RSRC]], i32 0, i32 0)
-; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <3 x float>] undef, <3 x float> [[I]], 0
+; CHECK-NEXT:    [[I4:%.*]] = insertvalue [4 x <3 x float>] poison, <3 x float> [[I]], 0
 ; CHECK-NEXT:    [[I5:%.*]] = insertvalue [4 x <3 x float>] [[I4]], <3 x float> [[I1]], 1
 ; CHECK-NEXT:    [[I6:%.*]] = insertvalue [4 x <3 x float>] [[I5]], <3 x float> [[I2]], 2
 ; CHECK-NEXT:    [[I7:%.*]] = insertvalue [4 x <3 x float>] [[I6]], <3 x float> [[I3]], 3
@@ -103,21 +103,17 @@ main_body:
   %i1 = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
   %i2 = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32(i32 31, i32 %s, i32 %t, i32 %slice, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
   %i3 = call <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32(i32 -1, i32 %s, i32 %t, i32 %slice, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
-  %i4 = insertvalue [4 x <3 x float>] undef, <3 x float> %i, 0
+  %i4 = insertvalue [4 x <3 x float>] poison, <3 x float> %i, 0
   %i5 = insertvalue [4 x <3 x float>] %i4, <3 x float> %i1, 1
   %i6 = insertvalue [4 x <3 x float>] %i5, <3 x float> %i2, 2
   %i7 = insertvalue [4 x <3 x float>] %i6, <3 x float> %i3, 3
   ret [4 x <3 x float>] %i7
 }
 
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
 declare <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32 immarg, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
 
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
 declare <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32 immarg, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
 
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
 declare <2 x float> @llvm.amdgcn.image.load.2darraymsaa.v2f32.i32(i32 immarg, i32, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0
 
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
 declare <3 x float> @llvm.amdgcn.image.load.2darraymsaa.v3f32.i32(i32 immarg, i32, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg) #0

>From 6a3f4d59505e551b746ed1624ab9133f010d9c2d Mon Sep 17 00:00:00 2001
From: Alexey Sachkov <alexey.sachkov at amd.com>
Date: Thu, 5 Feb 2026 03:05:14 -0600
Subject: [PATCH 3/3] Catch the corner case by verifier

---
 llvm/lib/IR/Verifier.cpp                      |  33 ++
 .../fix-amdgcn-image-load-dmask-crash.ll      |   4 +-
 .../intrinsic-amdgcn-image-load-dmask.ll      | 307 ++++++++++++++++++
 3 files changed, 343 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-load-dmask.ll

diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 3d44d1317ecc7..3e62d9525dacc 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -7147,6 +7147,39 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
           &Call, Op);
     break;
   }
+  case Intrinsic::amdgcn_image_load_1d:
+  case Intrinsic::amdgcn_image_load_1darray:
+  case Intrinsic::amdgcn_image_load_2d:
+  case Intrinsic::amdgcn_image_load_2darray:
+  case Intrinsic::amdgcn_image_load_2darraymsaa:
+  case Intrinsic::amdgcn_image_load_2dmsaa:
+  case Intrinsic::amdgcn_image_load_3d:
+  case Intrinsic::amdgcn_image_load_cube:
+  case Intrinsic::amdgcn_image_load_mip_1d:
+  case Intrinsic::amdgcn_image_load_mip_1darray:
+  case Intrinsic::amdgcn_image_load_mip_2d:
+  case Intrinsic::amdgcn_image_load_mip_2darray:
+  case Intrinsic::amdgcn_image_load_mip_3d:
+  case Intrinsic::amdgcn_image_load_mip_cube: {
+    // LLVM IR definition of those intrinsics says that they can return any
+    // type. The logic below is based on what is covered by the
+    // llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll test.
+    Type *T = Call.getType();
+    if (auto *ST = dyn_cast<StructType>(Call.getType()))
+      T = ST->getElementType(0);
+
+    unsigned VWidth = 1;
+    if (auto *FVT = dyn_cast<FixedVectorType>(T))
+      VWidth = FVT->getNumElements();
+
+    Value *V = Call.getArgOperand(0);
+    unsigned DMask = cast<ConstantInt>(V)->getZExtValue();
+    unsigned NumActiveBits = popcount(DMask);
+    Check(NumActiveBits <= VWidth,
+          "llvm.amdgcn.image.load.* intrinsic mask cannot have more active "
+          "bits than there are elements in the return type");
+    break;
+  }
   case Intrinsic::nvvm_setmaxnreg_inc_sync_aligned_u32:
   case Intrinsic::nvvm_setmaxnreg_dec_sync_aligned_u32: {
     Value *V = Call.getArgOperand(0);
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll b/llvm/test/Transforms/InstCombine/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll
index c81f1fe7cde9a..7f16d69623eed 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/fix-amdgcn-image-load-dmask-crash.ll
@@ -1,5 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -mtriple amdgcn -passes=instcombine %s -S -o - | FileCheck %s
+; RUN: opt -mtriple amdgcn -passes=instcombine --disable-verify %s -S -o - | FileCheck %s
+;
+; Verifier is disabled on purpose, because the IR is deemed invalid.
 ;
 ; The main purpose of the test is to ensure that we do not crash when the mask
 ; argument "enables" more elements than there are in return type.
diff --git a/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-load-dmask.ll b/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-load-dmask.ll
new file mode 100644
index 0000000000000..1cb6fe420f320
--- /dev/null
+++ b/llvm/test/Verifier/AMDGPU/intrinsic-amdgcn-image-load-dmask.ll
@@ -0,0 +1,307 @@
+; RUN: not llvm-as %s -disable-output 2>&1 | FileCheck %s
+
+define amdgpu_ps void @load_1d(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 31, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_1d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 31, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s) {
+main_body:
+  %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 -1, i32 %s, <8 x i32> %rsrc, i32 2, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 -1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2d.v4f32i32.i32(i32 31, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %r) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.3d.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_cube_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32 31, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_1darray_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %slice) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1darray.v4f32i32.i32(i32 31, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_2darray_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_2dmsaa_both(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %fragid) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2dmsaa.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+define amdgpu_ps void @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darraymsaa.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+; mip.1d variant: dmask=31 asks for five channels, return type <4 x float>
+; provides four -- exercises the out-of-bounds guard added by this patch.
+define amdgpu_ps void @load_mip_1d(<8 x i32> inreg %rsrc, i32 %s, i32 %mip) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 31, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+; mip.1d with the struct-returning form; dmask=31 (five bits) exceeds the four
+; elements of the <4 x float> data member. The `i32 2` operand presumably
+; selects lwe, matching the _lwe suffix -- TODO confirm.
+define amdgpu_ps void @load_mip_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %mip) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.1d.v4f32i32.i32(i32 31, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 2, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+; mip.2d variant of the over-wide-mask case: dmask=31 vs. <4 x float>.
+define amdgpu_ps void @load_mip_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+; mip.2d tfe form: dmask=31 (five bits) vs. the four-element <4 x float>
+; payload of the {<4 x float>, i32} return.
+define amdgpu_ps void @load_mip_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %mip) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+; dmask=-1 sets every mask bit, while the data payload is only <2 x float>;
+; extreme form of the mask-wider-than-result case.
+define amdgpu_ps void @load_1d_V2_tfe(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32 -1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+; dmask=3 (two active bits) against a scalar float data payload (one element),
+; tfe struct-returning form.
+define amdgpu_ps void @load_1d_V1_tfe(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call {float,i32} @llvm.amdgcn.image.load.1d.f32i32.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+; As load_mip_2d_tfe but the result %v is entirely unused, so InstCombine's
+; demanded-elements analysis sees zero demanded lanes; dmask=31 still exceeds
+; the four <4 x float> elements.
+define amdgpu_ps void @load_mip_2d_tfe_nouse(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 31, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+; Unused-result variant with dmask=7 (three active bits) but only a
+; <2 x float> data payload (two elements).
+define amdgpu_ps void @load_mip_2d_tfe_nouse_V2(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
+main_body:
+  %v = call {<2 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v2f32i32.i32(i32 7, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+; Unused-result variant with dmask=3 (two active bits) but a scalar float
+; data payload (one element).
+define amdgpu_ps void @load_mip_2d_tfe_nouse_V1(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
+main_body:
+  %v = call {float, i32} @llvm.amdgcn.image.load.mip.2d.f32i32.i32(i32 3, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+; 1d tfe form: dmask=31 (five active bits) vs. the four-element <4 x float>
+; member of the returned struct.
+define amdgpu_ps void @load_1d_tfe_V4(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s) {
+main_body:
+  %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 31, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+; 1d tfe form with an all-ones dmask (-1) against a two-element <2 x float>
+; data payload.
+define amdgpu_ps void @load_1d_tfe_V2(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s) {
+main_body:
+  %v = call {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32 -1, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+
+; mip.3d variant of the over-wide-mask case: dmask=31 vs. <4 x float>.
+define amdgpu_ps void @load_mip_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r, i32 %mip) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %r, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+; mip.cube variant of the over-wide-mask case: dmask=31 vs. <4 x float>.
+define amdgpu_ps void @load_mip_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %mip) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+; mip.1darray variant of the over-wide-mask case: dmask=31 vs. <4 x float>.
+define amdgpu_ps void @load_mip_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice, i32 %mip) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32 31, i32 %s, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+; mip.2darray variant of the over-wide-mask case: dmask=31 vs. <4 x float>.
+define amdgpu_ps void @load_mip_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %mip) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i32(i32 31, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+; Plain (non-struct) scalar return: dmask=3 requests two channels but the
+; intrinsic returns a single float.
+define amdgpu_ps void @load_1d_V1(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+; Plain vector return: dmask=7 (three active bits) against a two-element
+; <2 x float> result.
+define amdgpu_ps void @load_1d_V2(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+; Over-wide dmask=31 vs. <4 x float>, with trailing operand 1 (per the _glc
+; name, presumably the glc cache-policy bit -- TODO confirm).
+define amdgpu_ps void @load_1d_glc(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 31, i32 %s, <8 x i32> %rsrc, i32 0, i32 1)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+; Over-wide dmask=31 vs. <4 x float>, with trailing operand 2 (per the _slc
+; name, presumably the slc cache-policy bit -- TODO confirm).
+define amdgpu_ps void @load_1d_slc(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 31, i32 %s, <8 x i32> %rsrc, i32 0, i32 2)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+; Over-wide dmask=31 vs. <4 x float>, with trailing operand 3 (per the
+; _glc_slc name, presumably both cache-policy bits -- TODO confirm).
+define amdgpu_ps void @load_1d_glc_slc(<8 x i32> inreg %rsrc, i32 %s) {
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 31, i32 %s, <8 x i32> %rsrc, i32 0, i32 3)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+; 2d load whose coordinates come from extracted vector lanes; dmask=3 (two
+; active bits) against a scalar float return. The %tex result is unused.
+define amdgpu_ps void @image_load_mmo(<8 x i32> inreg %rsrc, ptr addrspace(3) %lds, <2 x i32> %c) #0 {
+  %c0 = extractelement <2 x i32> %c, i32 0
+  %c1 = extractelement <2 x i32> %c, i32 1
+  %tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 3, i32 %c0, i32 %c1, <8 x i32> %rsrc, i32 0, i32 0)
+; CHECK-NEXT: llvm.amdgcn.image.load.* intrinsic mask cannot have more active bits than there are elements in the return type
+  ret void
+}
+
+declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #1
+declare {float,i32} @llvm.amdgcn.image.load.1d.f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
+declare {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.2d.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.3d.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.1darray.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.2dmsaa.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.2darraymsaa.v4f32i32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.mip.1d.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<2 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v2f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {float,i32} @llvm.amdgcn.image.load.mip.2d.f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+
+declare float @llvm.amdgcn.image.load.1d.f32.i32(i32, i32, <8 x i32>, i32, i32) #1
+declare float @llvm.amdgcn.image.load.2d.f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32, i32, <8 x i32>, i32, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone }



More information about the llvm-commits mailing list