[llvm] 68fab44 - AMDGPU: Fix visiting physreg dest users when folding immediate copies

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 10 10:46:58 PDT 2020


Author: Matt Arsenault
Date: 2020-08-10T13:46:51-04:00
New Revision: 68fab44acfc7ce7fecd86ad784fb207f088c5366

URL: https://github.com/llvm/llvm-project/commit/68fab44acfc7ce7fecd86ad784fb207f088c5366
DIFF: https://github.com/llvm/llvm-project/commit/68fab44acfc7ce7fecd86ad784fb207f088c5366.diff

LOG: AMDGPU: Fix visiting physreg dest users when folding immediate copies

Folding the immediate into a copy with a physical destination register is
still fine, but the pass should not then look for further users of that
physical register: a physreg's use list spans the whole function, so those
uses are not necessarily reached by this def. Fixes a regression
introduced by 766cb615a3b96025192707f4670cdf171da84034.
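
For illustration, a minimal sketch of the pattern the new physreg check
guards against (it mirrors the no_fold_physreg_users_vgpr test added to
fold-imm-copy.mir below; this is an example, not additional upstream code).
Once the copy into $vgpr1 is rewritten to an immediate move, the use list of
the physical register still contains a read that sits after an intervening
clobber, and folding the constant into that read would change the program:

    $vgpr1 = COPY %0              ; %0 is 0, so this copy is folded to
                                  ; $vgpr1 = V_MOV_B32_e32 0, implicit $exec
    S_NOP 0, implicit-def $vgpr1  ; clobbers $vgpr1
    %2:vgpr_32 = COPY $vgpr1      ; appears in $vgpr1's use list, but reads
                                  ; the clobbered value; must not become 0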

Added: 
    llvm/test/CodeGen/AMDGPU/visit-physreg-vgpr-imm-folding-bug.ll

Modified: 
    llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
    llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 67b91e14fcca..3f1e980627d8 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -666,32 +666,34 @@ void SIFoldOperands::foldOperand(
       return;
 
     const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
-    if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
-      MachineRegisterInfo::use_iterator NextUse;
-      SmallVector<FoldCandidate, 4> CopyUses;
-      for (MachineRegisterInfo::use_iterator Use = MRI->use_begin(DestReg),
-                                             E = MRI->use_end();
-           Use != E; Use = NextUse) {
-        NextUse = std::next(Use);
-        // There's no point trying to fold into an implicit operand.
-        if (Use->isImplicit())
-          continue;
-
-        FoldCandidate FC = FoldCandidate(Use->getParent(), Use.getOperandNo(),
-                                         &UseMI->getOperand(1));
-        CopyUses.push_back(FC);
-      }
-      for (auto &F : CopyUses) {
-        foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList, CopiesToReplace);
+    if (!DestReg.isPhysical()) {
+      if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
+        MachineRegisterInfo::use_iterator NextUse;
+        SmallVector<FoldCandidate, 4> CopyUses;
+        for (MachineRegisterInfo::use_iterator Use = MRI->use_begin(DestReg),
+               E = MRI->use_end();
+             Use != E; Use = NextUse) {
+          NextUse = std::next(Use);
+          // There's no point trying to fold into an implicit operand.
+          if (Use->isImplicit())
+            continue;
+
+          FoldCandidate FC = FoldCandidate(Use->getParent(), Use.getOperandNo(),
+                                           &UseMI->getOperand(1));
+          CopyUses.push_back(FC);
+        }
+        for (auto &F : CopyUses) {
+          foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList, CopiesToReplace);
+        }
       }
-    }
 
-    if (DestRC == &AMDGPU::AGPR_32RegClass &&
-        TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
-      UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
-      UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
-      CopiesToReplace.push_back(UseMI);
-      return;
+      if (DestRC == &AMDGPU::AGPR_32RegClass &&
+          TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
+        UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+        CopiesToReplace.push_back(UseMI);
+        return;
+      }
     }
 
     // In order to fold immediates into copies, we need to change the

diff --git a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
index 9164e5e26791..e921248cc325 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
@@ -87,3 +87,26 @@ body:             |
     S_ENDPGM 0, implicit $vgpr0
 
 ...
+
+# The users of $vgpr1 should not be visited for further immediate
+# folding.
+
+# GCN-LABEL: name: no_fold_physreg_users_vgpr{{$}}
+# GCN: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+# GCN-NEXT: S_NOP 0, implicit-def $vgpr1
+# GCN-NEXT: %2:vgpr_32 = COPY $vgpr1
+# GCN-NEXT: $vgpr2 = COPY %2
+---
+name: no_fold_physreg_users_vgpr
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    %0:sreg_32 = S_MOV_B32 0
+    %1:vgpr_32 = COPY %0
+    $vgpr1 = COPY %0
+    S_NOP 0, implicit-def $vgpr1
+    %2:vgpr_32 = COPY $vgpr1
+    $vgpr2 = COPY %2
+    S_ENDPGM 0
+
+...

diff --git a/llvm/test/CodeGen/AMDGPU/visit-physreg-vgpr-imm-folding-bug.ll b/llvm/test/CodeGen/AMDGPU/visit-physreg-vgpr-imm-folding-bug.ll
new file mode 100644
index 000000000000..6995cf684555
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/visit-physreg-vgpr-imm-folding-bug.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+; Make sure the return value of the first call is not overwritten with
+; a constant before the fadd use.
+
+; CHECK-LABEL: vgpr_multi_use_imm_fold:
+; CHECK: v_mov_b32_e32 v0, 0{{$}}
+; CHECK: v_mov_b32_e32 v1, 2.0{{$}}
+; CHECK:    s_swappc_b64
+; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], 0
+; CHECK:    s_swappc_b64
+define amdgpu_kernel void @vgpr_multi_use_imm_fold() {
+entry:
+  store double 0.0, double addrspace(1)* undef, align 8
+  %call0 = tail call fastcc double @__ocml_log_f64(double 2.0)
+  %op = fadd double %call0, 0.0
+  %call1 = tail call fastcc double @__ocml_sqrt_f64(double %op)
+  ret void
+}
+
+declare hidden fastcc double @__ocml_log_f64(double)
+declare hidden fastcc double @__ocml_sqrt_f64(double)



