[llvm] AMDGPU/UniformityAnalysis: fix G_ZEXTLOAD and G_SEXTLOAD (PR #157845)
Petar Avramovic via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 10 08:21:40 PDT 2025
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/157845
From 310a7b659ffedae12fadf446860e2221e617263f Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Wed, 10 Sep 2025 13:04:20 +0200
Subject: [PATCH] AMDGPU/UniformityAnalysis: fix G_ZEXTLOAD and G_SEXTLOAD
Use the same rules for G_ZEXTLOAD and G_SEXTLOAD as for G_LOAD.
G_ZEXTLOAD and G_SEXTLOAD from flat (addrspace 0) and private
(addrspace 5) memory should always be divergent.
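For example, with this change both of the following loads are reported
as divergent: a flat pointer may address per-lane private (scratch)
memory, so lanes executing the load with the same address can still get
different results. (A minimal sketch; the register names are
illustrative, not taken from the test.)

  %ptr:_(p0) = G_IMPLICIT_DEF
  %a:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load (s16))
  %b:_(s32) = G_SEXTLOAD %ptr(p0) :: (load (s16))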
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 15 +++++++-------
.../AMDGPU/MIR/loads-gmir.mir | 20 +++++++++++--------
2 files changed, 20 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 5c958dfe6954f..398c99b3bd127 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10281,7 +10281,7 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
InstructionUniformity
SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
- unsigned opcode = MI.getOpcode();
+ unsigned Opcode = MI.getOpcode();
auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
Register Dst = MI.getOperand(0).getReg();
@@ -10301,7 +10301,7 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
// If the target supports globally addressable scratch, the mapping from
// scratch memory to the flat aperture changes, so an address space cast
// is no longer uniform.
- if (opcode == TargetOpcode::G_ADDRSPACE_CAST)
+ if (Opcode == TargetOpcode::G_ADDRSPACE_CAST)
return HandleAddrSpaceCast(MI);
if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
@@ -10329,7 +10329,8 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
//
// All other loads are not divergent, because if threads issue loads with the
// same arguments, they will always get the same result.
- if (opcode == AMDGPU::G_LOAD) {
+ if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
+ Opcode == AMDGPU::G_SEXTLOAD) {
if (MI.memoperands_empty())
return InstructionUniformity::NeverUniform; // conservative assumption
@@ -10343,10 +10344,10 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
return InstructionUniformity::Default;
}
- if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
- opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
- opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
- AMDGPU::isGenericAtomic(opcode)) {
+ if (SIInstrInfo::isGenericAtomicRMWOpcode(Opcode) ||
+ Opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
+ Opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
+ AMDGPU::isGenericAtomic(Opcode)) {
return InstructionUniformity::NeverUniform;
}
return InstructionUniformity::Default;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/loads-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/loads-gmir.mir
index cb3c2de5b8753..d799cd2057f47 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/loads-gmir.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/loads-gmir.mir
@@ -46,13 +46,13 @@ body: |
%6:_(p5) = G_IMPLICIT_DEF
; Atomic load
- ; CHECK-NOT: DIVERGENT
-
+ ; CHECK: DIVERGENT
+ ; CHECK-SAME: G_ZEXTLOAD
%0:_(s32) = G_ZEXTLOAD %1(p0) :: (load seq_cst (s16) from `ptr undef`)
; flat load
- ; CHECK-NOT: DIVERGENT
-
+ ; CHECK: DIVERGENT
+ ; CHECK-SAME: G_ZEXTLOAD
%2:_(s32) = G_ZEXTLOAD %1(p0) :: (load (s16) from `ptr undef`)
; Global load
@@ -60,7 +60,8 @@ body: |
%3:_(s32) = G_ZEXTLOAD %4(p1) :: (load (s16) from `ptr addrspace(1) undef`, addrspace 1)
; Private load
- ; CHECK-NOT: DIVERGENT
+ ; CHECK: DIVERGENT
+ ; CHECK-SAME: G_ZEXTLOAD
%5:_(s32) = G_ZEXTLOAD %6(p5) :: (volatile load (s16) from `ptr addrspace(5) undef`, addrspace 5)
G_STORE %2(s32), %4(p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
G_STORE %3(s32), %4(p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
@@ -80,11 +81,13 @@ body: |
%6:_(p5) = G_IMPLICIT_DEF
; Atomic load
- ; CHECK-NOT: DIVERGENT
+ ; CHECK: DIVERGENT
+ ; CHECK-SAME: G_SEXTLOAD
%0:_(s32) = G_SEXTLOAD %1(p0) :: (load seq_cst (s16) from `ptr undef`)
; flat load
- ; CHECK-NOT: DIVERGENT
+ ; CHECK: DIVERGENT
+ ; CHECK-SAME: G_SEXTLOAD
%2:_(s32) = G_SEXTLOAD %1(p0) :: (load (s16) from `ptr undef`)
; Global load
@@ -92,7 +95,8 @@ body: |
%3:_(s32) = G_SEXTLOAD %4(p1) :: (load (s16) from `ptr addrspace(1) undef`, addrspace 1)
; Private load
- ; CHECK-NOT: DIVERGENT
+ ; CHECK: DIVERGENT
+ ; CHECK-SAME: G_SEXTLOAD
%5:_(s32) = G_SEXTLOAD %6(p5) :: (volatile load (s16) from `ptr addrspace(5) undef`, addrspace 5)
G_STORE %2(s32), %4(p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
G_STORE %3(s32), %4(p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
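For reference, the load handling in getGenericInstructionUniformity
after this patch reads roughly as below. This is a sketch: the
memoperand address-space check is paraphrased from the surrounding
in-tree code, which the hunks above do not show in full.

  if (Opcode == AMDGPU::G_LOAD || Opcode == AMDGPU::G_ZEXTLOAD ||
      Opcode == AMDGPU::G_SEXTLOAD) {
    if (MI.memoperands_empty())
      return InstructionUniformity::NeverUniform; // conservative assumption

    // Divergent if any memoperand is in the flat or private address
    // space: private memory is per-lane, and flat may alias it.
    if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
          return MMO->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
                 MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
        }))
      return InstructionUniformity::NeverUniform;

    return InstructionUniformity::Default;
  }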