[llvm] [AMDGPU] Fix SDWA 'preserve' transformation for instructions in different basic blocks. (PR #82406)

Thu Feb 22 09:15:26 PST 2024

https://github.com/vpykhtin updated https://github.com/llvm/llvm-project/pull/82406

>From 95d056331ed0a0d655afaae124822fe4b0a05490 Mon Sep 17 00:00:00 2001
From: Valery Pykhtin <valery.pykhtin at gmail.com>
Date: Tue, 20 Feb 2024 17:43:42 +0100
Subject: [PATCH 1/3] [AMDGPU] Prevent SDWA preserve transformation for
 instructions in different basic blocks.

---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp  |  6 +-
 llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir | 67 ++++++++++++++++++++++
 2 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 53fc2c0686245f..739ee17464edec 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -739,6 +739,11 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
     MachineInstr *SDWAInst = OrSDWADef->getParent();
     MachineInstr *OtherInst = OrOtherDef->getParent();
 
+    // Instruction and operand sources should reside in the same BB.
+    if (SDWAInst->getParent() != MI.getParent() ||
+        OtherInst->getParent() != MI.getParent())
+      break;
+
     // Check that OtherInstr is actually bitwise compatible with SDWAInst = their
     // destination patterns don't overlap. Compatible instruction can be either
     // regular instruction with compatible bitness or SDWA instruction with
@@ -815,7 +820,6 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
 
     return std::make_unique<SDWADstPreserveOperand>(
       OrDst, OrSDWADef, OrOtherDef, DstSel);
-
   }
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
index f93456ccacb806..359945ff799434 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
@@ -160,3 +160,70 @@ body:             |
     S_ENDPGM 0
 
 ...
+---
+name:            add_f16_u32_preserve_different_bb
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: sreg_64 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: vgpr_32 }
+  - { id: 5, class: vgpr_32 }
+  - { id: 6, class: vgpr_32 }
+  - { id: 7, class: vgpr_32 }
+  - { id: 8, class: vgpr_32 }
+  - { id: 9, class: vgpr_32 }
+  - { id: 10, class: vgpr_32 }
+  - { id: 11, class: vgpr_32 }
+  - { id: 12, class: vgpr_32 }
+  - { id: 13, class: vgpr_32 }
+body:             |
+  ; SDWA-LABEL: name: add_f16_u32_preserve_different_bb
+  ; SDWA: bb.0:
+  ; SDWA-NEXT:   successors: %bb.1(0x80000000)
+  ; SDWA-NEXT:   liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr30_sgpr31
+  ; SDWA-NEXT: {{  $}}
+  ; SDWA-NEXT:   [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
+  ; SDWA-NEXT:   [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+  ; SDWA-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+  ; SDWA-NEXT:   [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+  ; SDWA-NEXT:   [[FLAT_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+  ; SDWA-NEXT:   [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 65535, [[FLAT_LOAD_DWORD]], implicit $exec
+  ; SDWA-NEXT:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[FLAT_LOAD_DWORD1]], implicit $exec
+  ; SDWA-NEXT:   [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[FLAT_LOAD_DWORD]], 8, 8, implicit $exec
+  ; SDWA-NEXT:   [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 24, [[FLAT_LOAD_DWORD1]], implicit $exec
+  ; SDWA-NEXT:   [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 0, 4, 5, implicit $mode, implicit $exec
+  ; SDWA-NEXT:   [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 5, 0, 1, 3, implicit $mode, implicit $exec
+  ; SDWA-NEXT: {{  $}}
+  ; SDWA-NEXT: bb.1:
+  ; SDWA-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F16_sdwa]], [[V_MUL_F32_sdwa]], implicit $exec
+  ; SDWA-NEXT:   FLAT_STORE_DWORD [[COPY2]], [[V_OR_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+  ; SDWA-NEXT:   $sgpr30_sgpr31 = COPY [[COPY]]
+  ; SDWA-NEXT:   S_SETPC_B64_return $sgpr30_sgpr31
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr30_sgpr31
+
+    %2 = COPY $sgpr30_sgpr31
+    %1 = COPY $vgpr2_vgpr3
+    %0 = COPY $vgpr0_vgpr1
+    %3 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+    %4 = FLAT_LOAD_DWORD %1, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+
+    %5 = V_AND_B32_e32 65535, %3, implicit $exec
+    %6 = V_LSHRREV_B32_e64 16, %4, implicit $exec
+    %7 = V_BFE_U32_e64 %3, 8, 8, implicit $exec
+    %8 = V_LSHRREV_B32_e32 24, %4, implicit $exec
+
+    %9 = V_ADD_F16_e64 0, %5, 0, %6, 0, 0, implicit $mode, implicit $exec
+    %10 = V_LSHLREV_B16_e64 8, %9, implicit $exec
+    %11 = V_MUL_F32_e64 0, %7, 0, %8, 0, 0, implicit $mode, implicit $exec
+    %12 = V_LSHLREV_B32_e64 16, %11, implicit $exec
+
+  bb.1:
+    %13 = V_OR_B32_e64 %10, %12, implicit $exec
+
+    FLAT_STORE_DWORD %0, %13, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    $sgpr30_sgpr31 = COPY %2
+    S_SETPC_B64_return $sgpr30_sgpr31
+...

>From ac500352e8dfe5c1d1b50225da6d9b3307de8015 Mon Sep 17 00:00:00 2001
From: Valery Pykhtin <valery.pykhtin at gmail.com>
Date: Thu, 22 Feb 2024 17:51:44 +0100
Subject: [PATCH 2/3] fix SDWADstPreserveOperand::convertToSDWA to support
 'preserve' transform across different basic blocks

---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp  | 12 +++---------
 llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir | 15 ++++++++++-----
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 739ee17464edec..8d9053a9df1c00 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -472,12 +472,11 @@ bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
   }
 
   // Move MI before v_or_b32
-  auto MBB = MI.getParent();
-  MBB->remove(&MI);
-  MBB->insert(getParentInst(), &MI);
+  MI.getParent()->remove(&MI);
+  getParentInst()->getParent()->insert(getParentInst(), &MI);
 
   // Add Implicit use of preserved register
-  MachineInstrBuilder MIB(*MBB->getParent(), MI);
+  MachineInstrBuilder MIB(*MI.getMF(), MI);
   MIB.addReg(getPreservedOperand()->getReg(),
              RegState::ImplicitKill,
              getPreservedOperand()->getSubReg());
@@ -739,11 +738,6 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
     MachineInstr *SDWAInst = OrSDWADef->getParent();
     MachineInstr *OtherInst = OrOtherDef->getParent();
 
-    // Instruction and operand sources should reside in the same BB.
-    if (SDWAInst->getParent() != MI.getParent() ||
-        OtherInst->getParent() != MI.getParent())
-      break;
-
     // Check that OtherInstr is actually bitwise compatible with SDWAInst = their
     // destination patterns don't overlap. Compatible instruction can be either
     // regular instruction with compatible bitness or SDWA instruction with
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
index 359945ff799434..7746afc3e6b264 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
@@ -193,12 +193,15 @@ body:             |
   ; SDWA-NEXT:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[FLAT_LOAD_DWORD1]], implicit $exec
   ; SDWA-NEXT:   [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[FLAT_LOAD_DWORD]], 8, 8, implicit $exec
   ; SDWA-NEXT:   [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 24, [[FLAT_LOAD_DWORD1]], implicit $exec
-  ; SDWA-NEXT:   [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 0, 4, 5, implicit $mode, implicit $exec
-  ; SDWA-NEXT:   [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 5, 0, 1, 3, implicit $mode, implicit $exec
   ; SDWA-NEXT: {{  $}}
   ; SDWA-NEXT: bb.1:
-  ; SDWA-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F16_sdwa]], [[V_MUL_F32_sdwa]], implicit $exec
-  ; SDWA-NEXT:   FLAT_STORE_DWORD [[COPY2]], [[V_OR_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+  ; SDWA-NEXT:   successors: %bb.2(0x80000000)
+  ; SDWA-NEXT: {{  $}}
+  ; SDWA-NEXT:   [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 5, 0, 1, 3, implicit $mode, implicit $exec
+  ; SDWA-NEXT: {{  $}}
+  ; SDWA-NEXT: bb.2:
+  ; SDWA-NEXT:   [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 2, 4, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F32_sdwa]](tied-def 0)
+  ; SDWA-NEXT:   FLAT_STORE_DWORD [[COPY2]], [[V_ADD_F16_sdwa]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
   ; SDWA-NEXT:   $sgpr30_sgpr31 = COPY [[COPY]]
   ; SDWA-NEXT:   S_SETPC_B64_return $sgpr30_sgpr31
   bb.0:
@@ -217,10 +220,12 @@ body:             |
 
     %9 = V_ADD_F16_e64 0, %5, 0, %6, 0, 0, implicit $mode, implicit $exec
     %10 = V_LSHLREV_B16_e64 8, %9, implicit $exec
+
+  bb.1:
     %11 = V_MUL_F32_e64 0, %7, 0, %8, 0, 0, implicit $mode, implicit $exec
     %12 = V_LSHLREV_B32_e64 16, %11, implicit $exec
 
-  bb.1:
+  bb.2:
     %13 = V_OR_B32_e64 %10, %12, implicit $exec
 
     FLAT_STORE_DWORD %0, %13, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))

>From b196f5647e0bda116ecddcd73c40000a9478df53 Mon Sep 17 00:00:00 2001
From: Valery Pykhtin <valery.pykhtin at gmail.com>
Date: Thu, 22 Feb 2024 18:15:17 +0100
Subject: [PATCH 3/3] remove registers section, restore unrelated space change.

---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp  |  1 +
 llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir | 43 +++++++---------------
 2 files changed, 15 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 8d9053a9df1c00..afc380b4203457 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -814,6 +814,7 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
 
     return std::make_unique<SDWADstPreserveOperand>(
       OrDst, OrSDWADef, OrOtherDef, DstSel);
+
   }
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
index 7746afc3e6b264..4c61e6803febf3 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
@@ -163,21 +163,6 @@ body:             |
 ---
 name:            add_f16_u32_preserve_different_bb
 tracksRegLiveness: true
-registers:
-  - { id: 0, class: vreg_64 }
-  - { id: 1, class: vreg_64 }
-  - { id: 2, class: sreg_64 }
-  - { id: 3, class: vgpr_32 }
-  - { id: 4, class: vgpr_32 }
-  - { id: 5, class: vgpr_32 }
-  - { id: 6, class: vgpr_32 }
-  - { id: 7, class: vgpr_32 }
-  - { id: 8, class: vgpr_32 }
-  - { id: 9, class: vgpr_32 }
-  - { id: 10, class: vgpr_32 }
-  - { id: 11, class: vgpr_32 }
-  - { id: 12, class: vgpr_32 }
-  - { id: 13, class: vgpr_32 }
 body:             |
   ; SDWA-LABEL: name: add_f16_u32_preserve_different_bb
   ; SDWA: bb.0:
@@ -207,26 +192,26 @@ body:             |
   bb.0:
     liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr30_sgpr31
 
-    %2 = COPY $sgpr30_sgpr31
-    %1 = COPY $vgpr2_vgpr3
-    %0 = COPY $vgpr0_vgpr1
-    %3 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
-    %4 = FLAT_LOAD_DWORD %1, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+    %2:sreg_64 = COPY $sgpr30_sgpr31
+    %1:vreg_64 = COPY $vgpr2_vgpr3
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %3:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+    %4:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
 
-    %5 = V_AND_B32_e32 65535, %3, implicit $exec
-    %6 = V_LSHRREV_B32_e64 16, %4, implicit $exec
-    %7 = V_BFE_U32_e64 %3, 8, 8, implicit $exec
-    %8 = V_LSHRREV_B32_e32 24, %4, implicit $exec
+    %5:vgpr_32 = V_AND_B32_e32 65535, %3, implicit $exec
+    %6:vgpr_32 = V_LSHRREV_B32_e64 16, %4, implicit $exec
+    %7:vgpr_32 = V_BFE_U32_e64 %3, 8, 8, implicit $exec
+    %8:vgpr_32 = V_LSHRREV_B32_e32 24, %4, implicit $exec
 
-    %9 = V_ADD_F16_e64 0, %5, 0, %6, 0, 0, implicit $mode, implicit $exec
-    %10 = V_LSHLREV_B16_e64 8, %9, implicit $exec
+    %9:vgpr_32 = V_ADD_F16_e64 0, %5, 0, %6, 0, 0, implicit $mode, implicit $exec
+    %10:vgpr_32 = V_LSHLREV_B16_e64 8, %9, implicit $exec
 
   bb.1:
-    %11 = V_MUL_F32_e64 0, %7, 0, %8, 0, 0, implicit $mode, implicit $exec
-    %12 = V_LSHLREV_B32_e64 16, %11, implicit $exec
+    %11:vgpr_32 = V_MUL_F32_e64 0, %7, 0, %8, 0, 0, implicit $mode, implicit $exec
+    %12:vgpr_32 = V_LSHLREV_B32_e64 16, %11, implicit $exec
 
   bb.2:
-    %13 = V_OR_B32_e64 %10, %12, implicit $exec
+    %13:vgpr_32 = V_OR_B32_e64 %10, %12, implicit $exec
 
     FLAT_STORE_DWORD %0, %13, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
     $sgpr30_sgpr31 = COPY %2