[llvm] r328856 - [AMDGPU] Fix the SDWA Peephole phase to handle src for dst:UNUSED_PRESERVE.

Michael Bedy via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 29 22:03:36 PDT 2018


Author: mjbedy
Date: Thu Mar 29 22:03:36 2018
New Revision: 328856

URL: http://llvm.org/viewvc/llvm-project?rev=328856&view=rev
Log:
[AMDGPU] Fix the SDWA Peephole phase to handle src for dst:UNUSED_PRESERVE.

Summary:
The phase attempts to transform operations that extract a portion of a value
into an SDWA src operand in cases where that value is used only once. It
was not prepared for this use to be the preserved portion of a value for
dst:UNUSED_PRESERVE, resulting in a crash or assert.

This change either rejects the illegal SDWA attempt, or in the case where
dst:WORD_1 and the src_sel would be WORD_0, removes the unneeded
extract instruction.

Reviewers: arsenm, #amdgpu

Reviewed By: arsenm, #amdgpu

Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D44364

Modified:
    llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
    llvm/trunk/test/CodeGen/AMDGPU/sdwa-preserve.mir

Modified: llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp?rev=328856&r1=328855&r2=328856&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp Thu Mar 29 22:03:36 2018
@@ -366,18 +366,53 @@ MachineInstr *SDWASrcOperand::potentialT
 bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
   // Find operand in instruction that matches source operand and replace it with
   // target operand. Set corresponding src_sel
-
+  bool IsPreserveSrc = false;
   MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
   MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
   MachineOperand *SrcMods =
       TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
   assert(Src && (Src->isReg() || Src->isImm()));
   if (!isSameReg(*Src, *getReplacedOperand())) {
-    // If this is not src0 then it should be src1
+    // If this is not src0 then it could be src1
     Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
     SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
     SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
 
+    if (!Src ||
+        !isSameReg(*Src, *getReplacedOperand())) {
+      // It's possible this Src is a tied operand for
+      // UNUSED_PRESERVE, in which case we can either
+      // abandon the peephole attempt, or if legal we can
+      // copy the target operand into the tied slot
+      // if the preserve operation will effectively cause the same
+      // result by overwriting the rest of the dst.
+      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+      MachineOperand *DstUnused =
+        TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
+
+      if (Dst &&
+          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
+        // This will work if the tied src is acessing WORD_0, and the dst is
+        // writing WORD_1. Modifiers don't matter because all the bits that
+        // would be impacted are being overwritten by the dst.
+        // Any other case will not work.
+        SdwaSel DstSel = static_cast<SdwaSel>(
+            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
+        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
+            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
+          IsPreserveSrc = true;
+          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+                                                   AMDGPU::OpName::vdst);
+          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
+          Src = &MI.getOperand(TiedIdx);
+          SrcSel = nullptr;
+          SrcMods = nullptr;
+        } else {
+          // Not legal to convert this src
+          return false;
+        }
+      }
+    }
     assert(Src && Src->isReg());
 
     if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
@@ -388,11 +423,14 @@ bool SDWASrcOperand::convertToSDWA(Machi
       return false;
     }
 
-    assert(isSameReg(*Src, *getReplacedOperand()) && SrcSel && SrcMods);
+    assert(isSameReg(*Src, *getReplacedOperand()) &&
+           (IsPreserveSrc || (SrcSel && SrcMods)));
   }
   copyRegOperand(*Src, *getTargetOperand());
-  SrcSel->setImm(getSrcSel());
-  SrcMods->setImm(getSrcMods(TII, Src));
+  if (!IsPreserveSrc) {
+    SrcSel->setImm(getSrcSel());
+    SrcMods->setImm(getSrcMods(TII, Src));
+  }
   getTargetOperand()->setIsKill(false);
   return true;
 }
@@ -857,6 +895,9 @@ bool SIPeepholeSDWA::isConvertibleToSDWA
 
 bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                    const SDWAOperandsVector &SDWAOperands) {
+
+  DEBUG(dbgs() << "Convert instruction:" << MI);
+
   // Convert to sdwa
   int SDWAOpcode;
   unsigned Opcode = MI.getOpcode();
@@ -982,9 +1023,29 @@ bool SIPeepholeSDWA::convertToSDWA(Machi
     }
   }
 
+  // Check for a preserved register that needs to be copied.
+  auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
+  if (DstUnused &&
+      DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
+    // We expect, if we are here, that the instruction was already in it's SDWA form,
+    // with a tied operand.
+    assert(Dst && Dst->isTied());
+    assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
+    // We also expect a vdst, since sdst can't preserve.
+    auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
+    assert(PreserveDstIdx != -1);
+
+    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
+    auto Tied = MI.getOperand(TiedIdx);
+
+    SDWAInst.add(Tied);
+    SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
+  }
+
   // Apply all sdwa operand patterns.
   bool Converted = false;
   for (auto &Operand : SDWAOperands) {
+    DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
     // There should be no intesection between SDWA operands and potential MIs
     // e.g.:
     // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
@@ -1005,8 +1066,7 @@ bool SIPeepholeSDWA::convertToSDWA(Machi
     return false;
   }
 
-  DEBUG(dbgs() << "Convert instruction:" << MI
-               << "Into:" << *SDWAInst << '\n');
+  DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
   ++NumSDWAInstructionsPeepholed;
 
   MI.eraseFromParent();

Modified: llvm/trunk/test/CodeGen/AMDGPU/sdwa-preserve.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/sdwa-preserve.mir?rev=328856&r1=328855&r2=328856&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sdwa-preserve.mir (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/sdwa-preserve.mir Thu Mar 29 22:03:36 2018
@@ -54,3 +54,91 @@ body:             |
     FLAT_STORE_DWORD %0, %13, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
     $sgpr30_sgpr31 = COPY %2
     S_SETPC_B64_return $sgpr30_sgpr31
+
+---
+# SDWA-LABEL: sdwa_preserve_keep
+# SDWA: flat_load_dword [[FIRST:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]
+# SDWA: flat_load_dword [[SECOND:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]
+
+# SDWA: v_and_b32_e32 [[AND:v[0-9]+]], 0xff, [[FIRST]]
+# SDWA: v_mov_b32_sdwa [[AND]], [[SECOND]] dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+
+# SDWA: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], [[AND]]
+
+name:            sdwa_preserve_keep
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: sreg_64 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: vgpr_32 }
+  - { id: 5, class: sreg_32_xm0_xexec }
+  - { id: 6, class: vgpr_32 }
+  - { id: 7, class: vgpr_32 }
+  - { id: 8, class: sreg_32_xm0 }
+  - { id: 9, class: vgpr_32 }
+  - { id: 10, class: sreg_32_xm0 }
+  - { id: 11, class: vgpr_32 }
+  - { id: 17, class: vgpr_32 }
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr30_sgpr31
+
+    %2 = COPY $sgpr30_sgpr31
+    %1 = COPY $vgpr2_vgpr3
+    %0 = COPY $vgpr0_vgpr1
+    %3 = FLAT_LOAD_DWORD %0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    %4 = FLAT_LOAD_DWORD %1, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+
+    %9:vgpr_32 = V_LSHRREV_B16_e64 8, %3, implicit $exec
+    %10:sreg_32_xm0 = S_MOV_B32 255
+    %11:vgpr_32 = V_AND_B32_e64 %3, killed %10, implicit $exec
+    %17:vgpr_32 = V_MOV_B32_sdwa 0, %4, 0, 5, 2, 4, implicit $exec, implicit %11(tied-def 0)
+    FLAT_STORE_DWORD %0, %17, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    S_ENDPGM
+
+...
+---
+# SDWA-LABEL: sdwa_preserve_remove
+# SDWA: flat_load_dword [[FIRST:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]
+# SDWA: flat_load_dword [[SECOND:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]
+
+# SDWA: v_mov_b32_sdwa [[FIRST]], [[SECOND]] dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+
+# SDWA: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], [[FIRST]]
+
+name:            sdwa_preserve_remove
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: sreg_64 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: vgpr_32 }
+  - { id: 5, class: sreg_32_xm0_xexec }
+  - { id: 6, class: vgpr_32 }
+  - { id: 7, class: vgpr_32 }
+  - { id: 8, class: sreg_32_xm0 }
+  - { id: 9, class: vgpr_32 }
+  - { id: 10, class: sreg_32_xm0 }
+  - { id: 11, class: vgpr_32 }
+  - { id: 17, class: vgpr_32 }
+body:             |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr30_sgpr31
+
+    %2 = COPY $sgpr30_sgpr31
+    %1 = COPY $vgpr2_vgpr3
+    %0 = COPY $vgpr0_vgpr1
+    %3 = FLAT_LOAD_DWORD %0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+    %4 = FLAT_LOAD_DWORD %1, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+
+    %9:vgpr_32 = V_LSHRREV_B16_e64 8, %3, implicit $exec
+    %10:sreg_32_xm0 = S_MOV_B32 65535
+    %11:vgpr_32 = V_AND_B32_e64 %3, killed %10, implicit $exec
+    %17:vgpr_32 = V_MOV_B32_sdwa 0, %4, 0, 5, 2, 4, implicit $exec, implicit %11(tied-def 0)
+    FLAT_STORE_DWORD %0, %17, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+    S_ENDPGM
+
+...




More information about the llvm-commits mailing list