[PATCH] D23482: AMDGPU: Don't fold subregister extracts into tied operands

Fri Aug 12 18:27:53 PDT 2016

arsenm created this revision.
arsenm added reviewers: tstellarAMD, nhaehnle.
arsenm added a subscriber: llvm-commits.
Herald added subscribers: kzhuravl, arsenm.

This should fix the problem found with D22556.

https://reviews.llvm.org/D23482

Files:
  lib/Target/AMDGPU/SIFoldOperands.cpp
  test/CodeGen/AMDGPU/operand-folding.ll

Index: test/CodeGen/AMDGPU/operand-folding.ll
===================================================================

--- test/CodeGen/AMDGPU/operand-folding.ll
+++ test/CodeGen/AMDGPU/operand-folding.ll
@@ -109,6 +109,21 @@
   ret void
 }
 
+; A subregister use operand should not be tied.
+; CHECK-LABEL: {{^}}no_fold_tied_subregister:
+; CHECK: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; CHECK: v_mac_f32_e32 v[[LO]], 0x41200000, v[[HI]]
+; CHECK: buffer_store_dword v[[LO]]
+define void @no_fold_tied_subregister() {
+  %tmp1 = load volatile <2 x float>, <2 x float> addrspace(1)* undef
+  %tmp2 = extractelement <2 x float> %tmp1, i32 0
+  %tmp3 = extractelement <2 x float> %tmp1, i32 1
+  %tmp4 = fmul float %tmp3, 10.0
+  %tmp5 = fadd float %tmp4, %tmp2
+  store volatile float %tmp5, float addrspace(1)* undef
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
 attributes #0 = { nounwind readnone }
Index: lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- lib/Target/AMDGPU/SIFoldOperands.cpp
+++ lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -197,9 +197,21 @@
   const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
 
   // FIXME: Fold operands with subregs.
-  if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) ||
-      UseOp.isImplicit())) {
-    return;
+  if (UseOp.isReg() && OpToFold.isReg()) {
+    if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
+      return;
+
+    // Don't fold subregister extracts into tied operands; only fold a full
+    // copy, since a subregister use tied to a full register def doesn't really
+    // make sense. e.g. don't fold:
+    //
+    // %vreg1 = COPY %vreg0:sub1
+    // %vreg2<tied3> = V_MAC_F32 %vreg3, %vreg4, %vreg1<tied0>
+    //
+    //  into
+    // %vreg2<tied3> = V_MAC_F32 %vreg3, %vreg4, %vreg0:sub1<tied0>
+    if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
+      return;
   }
 
   bool FoldingImm = OpToFold.isImm();


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D23482.67951.patch
Type: text/x-patch
Size: 2036 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20160813/4ecca69f/attachment.bin>