[PATCH][CodeGen] Fix CombineToPostIndexedLoadStore in DAGCombiner.cpp

Fri Apr 3 06:13:21 PDT 2015

Hi,

    I am posting a patch that fixes a missed optimization in LLVM's
DAGCombiner.cpp, in the selection of post-indexed load and store operations.

    The code in the function CombineToPostIndexedLoadStore that checks 
the uses of an ADD/SUB operation does not correctly check the real uses.

    Also, in the function canFoldInAddressingMode, VT is computed as the
type of the destination/source value of a LOAD/STORE operation, instead of
the memory type of the operation (the two differ, for example, for an
extending load or a truncating store). On targets that apply a scaling
factor to the offset of LOAD/STORE operations, the function may therefore
return false for cases that are actually valid.

    I could reproduce this problem on the ARM target; a test case that
exposes it is attached to this mail.

   Please review the patch and commit it if it is OK. Thanks.

    Regards,

     - François.
-------------- next part --------------
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================

--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp	(revision 233547)
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp	(working copy)
@@ -8560,11 +8560,11 @@
   if (LoadSDNode *LD  = dyn_cast<LoadSDNode>(Use)) {
     if (LD->isIndexed() || LD->getBasePtr().getNode() != N)
       return false;
-    VT = Use->getValueType(0);
+    VT = LD->getMemoryVT();
   } else if (StoreSDNode *ST  = dyn_cast<StoreSDNode>(Use)) {
     if (ST->isIndexed() || ST->getBasePtr().getNode() != N)
       return false;
-    VT = ST->getValue().getValueType();
+    VT = ST->getMemoryVT();
   } else
     return false;
 
@@ -8852,8 +8852,7 @@
     return false;
 
   for (SDNode *Op : Ptr.getNode()->uses()) {
-    if (Op == N ||
-        (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB))
+    if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
       continue;
 
     SDValue BasePtr;
@@ -8876,28 +8875,18 @@
         continue;
 
       // Check for #1.
-      bool TryNext = false;
-      for (SDNode *Use : BasePtr.getNode()->uses()) {
-        if (Use == Ptr.getNode())
-          continue;
-
-        // If all the uses are load / store addresses, then don't do the
-        // transformation.
-        if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB){
-          bool RealUse = false;
-          for (SDNode *UseUse : Use->uses()) {
-            if (!canFoldInAddressingMode(Use, UseUse, DAG, TLI))
-              RealUse = true;
-          }
-
-          if (!RealUse) {
-            TryNext = true;
-            break;
-          }
+      // Look for a RealUse, i.e. a use that is not a load / store op, or one
+      // that cannot be folded into an addressing mode.
+      // If one use is not a load / store address, then do the transformation.
+      bool RealUse = false;
+      for (SDNode *OpUse : Op->uses()) {
+        if (!canFoldInAddressingMode(Op, OpUse, DAG, TLI)) {
+          RealUse = true;
+          break;
         }
       }
 
-      if (TryNext)
+      if (!RealUse)
         continue;
 
       // Check for #2
-------------- next part --------------
; Test that checks that automod addressing mode is selected
;
; llc -march=arm automod_test.ll
;
; ======================================================
; Without the fix, the generated code is the following :
;
;	ldrh	r3, [r2, #2]
;	strh	r3, [r1, #-2]
;	ldrh	r3, [r2]
;	sub	r2, r2, #6
;	strh	r3, [r1]
;	ldr	r3, [r0], #48
;	add	r1, r1, #6
;	cmp	r3, #0
;	bne	.LBB0_1
;
; With the patch, post modifying addressing modes are selected :
;
;	ldrh	r3, [r2, #2]
;	strh	r3, [r1, #-2]
;	ldrh	r3, [r2], #-6
;	strh	r3, [r1], #6
;	ldr	r3, [r0], #48
;	cmp	r3, #0
;	bne	.LBB0_1
; ======================================================

; Two 32-element i16 arrays, zero-initialized with common linkage. The loop in
; @compute reads from @input_tab64 and writes to @output_tab64.
@input_tab64 = common global [32 x i16] zeroinitializer, align 2
@output_tab64 = common global [32 x i16] zeroinitializer, align 2

; @compute(i32* %IDX): copies pairs of i16 elements from @input_tab64 (walked
; downwards from index 31) to @output_tab64 (walked upwards from index 0),
; advancing the counter i by 3 per iteration. The loop is entered only if
; IDX[0] is non-zero and repeats while IDX[(i + 3) << 2] is non-zero.
;
; Per iteration the i16 indices move by 3 elements (6 bytes) and the IDX index
; by 12 i32 elements (48 bytes); these are the #-6 / #6 / #48 post-index
; amounts shown in the expected assembly in the header comment of this test.
; Function Attrs: nounwind
define void @compute(i32* nocapture readonly %IDX) #0 {
entry:

  ; Skip the loop entirely when IDX[0] == 0.
  %0 = load i32* %IDX, align 4
  %tobool14 = icmp eq i32 %0, 0
  br i1 %tobool14, label %for.end, label %for.body

for.body:                                         ; preds = %entry, %for.body
  ; i starts at 0 and takes the value i + 3 (%add8) on each back edge.
  %i.015 = phi i32 [ %add8, %for.body ], [ 0, %entry ]
  ; output_tab64[i] = input_tab64[32 - i - 1]
  %sub = sub nsw i32 32, %i.015
  %sub1 = add nsw i32 %sub, -1
  %arrayidx2 = getelementptr inbounds [32 x i16]* @input_tab64, i32 0, i32 %sub1
  %1 = load i16* %arrayidx2, align 2
  %arrayidx3 = getelementptr inbounds [32 x i16]* @output_tab64, i32 0, i32 %i.015
  store i16 %1, i16* %arrayidx3, align 2
  ; output_tab64[i + 1] = input_tab64[32 - i - 2]
  %sub5 = add nsw i32 %sub, -2
  %arrayidx6 = getelementptr inbounds [32 x i16]* @input_tab64, i32 0, i32 %sub5
  %2 = load i16* %arrayidx6, align 2
  %add = add nsw i32 %i.015, 1
  %arrayidx7 = getelementptr inbounds [32 x i16]* @output_tab64, i32 0, i32 %add
  store i16 %2, i16* %arrayidx7, align 2
  ; Loop condition: continue while IDX[(i + 3) << 2] != 0.
  %add8 = add nsw i32 %i.015, 3
  %shl = shl i32 %add8, 2
  %arrayidx = getelementptr inbounds i32* %IDX, i32 %shl
  %3 = load i32* %arrayidx, align 4
  %tobool = icmp eq i32 %3, 0
  br i1 %tobool, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void
}