[llvm] [AMDGPU] Fix image intrinsic optimizer on loads from different resources (PR #69355)

Tue Oct 17 09:23:31 PDT 2023

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Jay Foad (jayfoad)

<details>
<summary>Changes</summary>

The image intrinsic optimizer pass was neglecting to check any arguments
of the load intrinsic after the VAddr arguments. For example multiple
loads from different resources should not have been combined but were,
because the pass was not checking the resource argument.


---
Full diff: https://github.com/llvm/llvm-project/pull/69355.diff


2 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp (+22-21) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll (+41) 


``````````diff

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
index acfd3407681a7fd..d1c0a1703502517 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
@@ -108,28 +108,29 @@ void addInstToMergeableList(
     if (IIList.front()->getType() != II->getType())
       continue;
 
-    // Check DMask.
-    Value *DMaskList = IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex);
-    Value *DMask = II->getArgOperand(ImageDimIntr->DMaskIndex);
-    if (DMaskList != DMask)
-      continue;
-
-    // Check VAddr (except FragId).
-    int I = ImageDimIntr->VAddrStart;
-    for (; I < ImageDimIntr->VAddrEnd - 1; ++I) {
-      if (IIList.front()->getArgOperand(I) != II->getArgOperand(I))
-        break;
+    // Check all arguments (DMask, VAddr, RSrc etc).
+    bool AllEqual = true;
+    assert(IIList.front()->arg_size() == II->arg_size());
+    for (int I = 1, E = II->arg_size(); I != E; ++I) {
+      Value *ArgList = IIList.front()->getArgOperand(I);
+      Value *Arg = II->getArgOperand(I);
+      if (I == ImageDimIntr->VAddrEnd - 1) {
+        // Check FragId group.
+        auto FragIdList = cast<ConstantInt>(IIList.front()->getArgOperand(I));
+        auto FragId = cast<ConstantInt>(II->getArgOperand(I));
+        if (FragIdList->getValue().udiv(4) != FragId->getValue().udiv(4)) {
+          AllEqual = false;
+          break;
+        }
+      } else {
+        // Check all arguments except FragId.
+        if (ArgList != Arg) {
+          AllEqual = false;
+          break;
+        }
+      }
     }
-
-    if (I != ImageDimIntr->VAddrEnd - 1)
-      continue;
-
-    // Check FragId group.
-    const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
-    Value *FragIdList = IIList.front()->getArgOperand(FragIdIndex);
-    auto IIListFragId = cast<ConstantInt>(FragIdList);
-    auto IIFragId = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
-    if (IIListFragId->getValue().udiv(4) != IIFragId->getValue().udiv(4))
+    if (!AllEqual)
       continue;
 
     // Add to the list.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll
index 853ca53767be8cc..5ffdbb0f8c5b073 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll
@@ -1184,6 +1184,47 @@ merge:
   ret [4 x float] %i25
 }
 
+define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask1_different_rsrc(<8 x i32> inreg %rsrc1, <8 x i32> inreg %rsrc2, i32 %s, i32 %t) {
+; NO-MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask1_different_rsrc(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC1:%.*]], <8 x i32> inreg [[RSRC2:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT:  main_body:
+; NO-MSAA-NEXT:    [[I:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC1]], i32 0, i32 0)
+; NO-MSAA-NEXT:    [[I1:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC1]], i32 0, i32 0)
+; NO-MSAA-NEXT:    [[I2:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC2]], i32 0, i32 0)
+; NO-MSAA-NEXT:    [[I3:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC2]], i32 0, i32 0)
+; NO-MSAA-NEXT:    [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; NO-MSAA-NEXT:    [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
+; NO-MSAA-NEXT:    [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
+; NO-MSAA-NEXT:    [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
+; NO-MSAA-NEXT:    ret [4 x float] [[I7]]
+;
+; MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask1_different_rsrc(
+; MSAA-SAME: <8 x i32> inreg [[RSRC1:%.*]], <8 x i32> inreg [[RSRC2:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT:  main_body:
+; MSAA-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC1]], i32 0, i32 0)
+; MSAA-NEXT:    [[I:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
+; MSAA-NEXT:    [[I1:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; MSAA-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC2]], i32 0, i32 0)
+; MSAA-NEXT:    [[I2:%.*]] = extractelement <4 x float> [[TMP1]], i64 0
+; MSAA-NEXT:    [[I3:%.*]] = extractelement <4 x float> [[TMP1]], i64 1
+; MSAA-NEXT:    [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; MSAA-NEXT:    [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
+; MSAA-NEXT:    [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
+; MSAA-NEXT:    [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
+; MSAA-NEXT:    ret [4 x float] [[I7]]
+;
+main_body:
+  %i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc1, i32 0, i32 0)
+  %i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc1, i32 0, i32 0)
+  %i2 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc2, i32 0, i32 0)
+  %i3 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc2, i32 0, i32 0)
+  %i4 = insertvalue [4 x float] undef, float %i, 0
+  %i5 = insertvalue [4 x float] %i4, float %i1, 1
+  %i6 = insertvalue [4 x float] %i5, float %i2, 2
+  %i7 = insertvalue [4 x float] %i6, float %i3, 3
+  ret [4 x float] %i7
+}
+
 declare float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0
 declare <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0
 declare <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0

``````````

</details>


https://github.com/llvm/llvm-project/pull/69355