[llvm] r203230 - CodeGenPrep: sink extends of illegal types into use block.

Fri Mar 7 03:04:31 PST 2014

Author: tnorthover
Date: Fri Mar  7 05:04:30 2014
New Revision: 203230

URL: http://llvm.org/viewvc/llvm-project?rev=203230&view=rev
Log:
CodeGenPrep: sink extends of illegal types into use block.

This helps the instruction selector to lower an i64 * i64 -> i128
multiplication into a single instruction on targets which support it.

Patch by Manuel Jacob.

Added:
    llvm/trunk/test/CodeGen/X86/mul128_sext_loop.ll
    llvm/trunk/test/Transforms/CodeGenPrepare/X86/extend-sink-hoist.ll
Modified:
    llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
    llvm/trunk/lib/Target/X86/README.txt

Modified: llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp?rev=203230&r1=203229&r2=203230&view=diff
==============================================================================

--- llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp (original)
+++ llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp Fri Mar  7 05:04:30 2014
@@ -129,6 +129,7 @@ typedef DenseMap<Instruction *, Type *>
     bool OptimizeMemoryInst(Instruction *I, Value *Addr, Type *AccessTy);
     bool OptimizeInlineAsmInst(CallInst *CS);
     bool OptimizeCallInst(CallInst *CI);
+    bool SinkExtExpand(CastInst *I);
     bool MoveExtToFormExtLoad(Instruction *I);
     bool OptimizeExtUses(Instruction *I);
     bool OptimizeSelectInst(SelectInst *SI);
@@ -465,40 +466,8 @@ void CodeGenPrepare::EliminateMostlyEmpt
   DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
 }
 
-/// OptimizeNoopCopyExpression - If the specified cast instruction is a noop
-/// copy (e.g. it's casting from one pointer type to another, i32->i8 on PPC),
-/// sink it into user blocks to reduce the number of virtual
-/// registers that must be created and coalesced.
-///
-/// Return true if any changes are made.
-///
-static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){
-  // If this is a noop copy,
-  EVT SrcVT = TLI.getValueType(CI->getOperand(0)->getType());
-  EVT DstVT = TLI.getValueType(CI->getType());
-
-  // This is an fp<->int conversion?
-  if (SrcVT.isInteger() != DstVT.isInteger())
-    return false;
-
-  // If this is an extension, it will be a zero or sign extension, which
-  // isn't a noop.
-  if (SrcVT.bitsLT(DstVT)) return false;
-
-  // If these values will be promoted, find out what they will be promoted
-  // to.  This helps us consider truncates on PPC as noop copies when they
-  // are.
-  if (TLI.getTypeAction(CI->getContext(), SrcVT) ==
-      TargetLowering::TypePromoteInteger)
-    SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT);
-  if (TLI.getTypeAction(CI->getContext(), DstVT) ==
-      TargetLowering::TypePromoteInteger)
-    DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT);
-
-  // If, after promotion, these are the same types, this is a noop copy.
-  if (SrcVT != DstVT)
-    return false;
-
+/// SinkCast - Sink the specified cast instruction into its user blocks.
+static bool SinkCast(CastInst *CI) {
   BasicBlock *DefBB = CI->getParent();
 
   /// InsertedCasts - Only insert a cast in each block once.
@@ -548,6 +517,43 @@ static bool OptimizeNoopCopyExpression(C
   return MadeChange;
 }
 
+/// OptimizeNoopCopyExpression - If the specified cast instruction is a noop
+/// copy (e.g. it's casting from one pointer type to another, i32->i8 on PPC),
+/// sink it into user blocks to reduce the number of virtual
+/// registers that must be created and coalesced.
+///
+/// Return true if any changes are made.
+///
+static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){
+  // If this is a noop copy,
+  EVT SrcVT = TLI.getValueType(CI->getOperand(0)->getType());
+  EVT DstVT = TLI.getValueType(CI->getType());
+
+  // This is an fp<->int conversion?
+  if (SrcVT.isInteger() != DstVT.isInteger())
+    return false;
+
+  // If this is an extension, it will be a zero or sign extension, which
+  // isn't a noop.
+  if (SrcVT.bitsLT(DstVT)) return false;
+
+  // If these values will be promoted, find out what they will be promoted
+  // to.  This helps us consider truncates on PPC as noop copies when they
+  // are.
+  if (TLI.getTypeAction(CI->getContext(), SrcVT) ==
+      TargetLowering::TypePromoteInteger)
+    SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT);
+  if (TLI.getTypeAction(CI->getContext(), DstVT) ==
+      TargetLowering::TypePromoteInteger)
+    DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT);
+
+  // If, after promotion, these are the same types, this is a noop copy.
+  if (SrcVT != DstVT)
+    return false;
+
+  return SinkCast(CI);
+}
+
 /// OptimizeCmpExpression - sink the given CmpInst into user blocks to reduce
 /// the number of virtual registers that must be created and coalesced.  This is
 /// a clear win except on targets with multiple condition code registers
@@ -2522,6 +2528,16 @@ bool CodeGenPrepare::OptimizeInlineAsmIn
   return MadeChange;
 }
 
+/// SinkExtExpand - Sink a zext or sext into its user blocks if the result type
+/// must be expanded by the target, i.e. it does not fit in one register.
+bool CodeGenPrepare::SinkExtExpand(CastInst *CI) {
+  if (TLI &&
+      TLI->getTypeAction(CI->getContext(), TLI->getValueType(CI->getType())) ==
+          TargetLowering::TypeExpandInteger)
+    return SinkCast(CI);
+  return false;
+}
+
 /// MoveExtToFormExtLoad - Move a zext or sext fed by a load into the same
 /// basic block as the load, unless conditions are unfavorable. This allows
 /// SelectionDAG to fold the extend into the load.
@@ -2535,6 +2551,12 @@ bool CodeGenPrepare::MoveExtToFormExtLoa
   if (LI->getParent() == I->getParent())
     return false;
 
+  // Do not hoist extends to expanded types; that would undo SinkExtExpand.
+  if (TLI &&
+      TLI->getTypeAction(I->getContext(), TLI->getValueType(I->getType())) ==
+          TargetLowering::TypeExpandInteger)
+    return false;
+
   // If the load has other users and the truncate is not free, this probably
   // isn't worthwhile.
   if (!LI->hasOneUse() &&
@@ -2821,6 +2843,8 @@ bool CodeGenPrepare::OptimizeInst(Instru
       return true;
 
     if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
+      if (SinkExtExpand(CI))
+        return true;
       bool MadeChange = MoveExtToFormExtLoad(I);
       return MadeChange | OptimizeExtUses(I);
     }

Modified: llvm/trunk/lib/Target/X86/README.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/README.txt?rev=203230&r1=203229&r2=203230&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/README.txt (original)
+++ llvm/trunk/lib/Target/X86/README.txt Fri Mar  7 05:04:30 2014
@@ -1444,54 +1444,6 @@ it would be nice to produce "into" somed
 
 //===---------------------------------------------------------------------===//
 
-This code:
-
-void vec_mpys1(int y[], const int x[], int scaler) {
-int i;
-for (i = 0; i < 150; i++)
- y[i] += (((long long)scaler * (long long)x[i]) >> 31);
-}
-
-Compiles to this loop with GCC 3.x:
-
-.L5:
-	movl	%ebx, %eax
-	imull	(%edi,%ecx,4)
-	shrdl	$31, %edx, %eax
-	addl	%eax, (%esi,%ecx,4)
-	incl	%ecx
-	cmpl	$149, %ecx
-	jle	.L5
-
-llvm-gcc compiles it to the much uglier:
-
-LBB1_1:	## bb1
-	movl	24(%esp), %eax
-	movl	(%eax,%edi,4), %ebx
-	movl	%ebx, %ebp
-	imull	%esi, %ebp
-	movl	%ebx, %eax
-	mull	%ecx
-	addl	%ebp, %edx
-	sarl	$31, %ebx
-	imull	%ecx, %ebx
-	addl	%edx, %ebx
-	shldl	$1, %eax, %ebx
-	movl	20(%esp), %eax
-	addl	%ebx, (%eax,%edi,4)
-	incl	%edi
-	cmpl	$150, %edi
-	jne	LBB1_1	## bb1
-
-The issue is that we hoist the cast of "scaler" to long long outside of the
-loop, the value comes into the loop as two values, and
-RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
-constructed BUILD_PAIR which represents the cast value.
-
-This can be handled by making CodeGenPrepare sink the cast.
-
-//===---------------------------------------------------------------------===//
-
 Test instructions can be eliminated by using EFLAGS values from arithmetic
 instructions. This is currently not done for mul, and, or, xor, neg, shl,
 sra, srl, shld, shrd, atomic ops, and others. It is also currently not done

Added: llvm/trunk/test/CodeGen/X86/mul128_sext_loop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/mul128_sext_loop.ll?rev=203230&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/mul128_sext_loop.ll (added)
+++ llvm/trunk/test/CodeGen/X86/mul128_sext_loop.ll Fri Mar  7 05:04:30 2014
@@ -0,0 +1,32 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
+
+define void @test(i64* nocapture %arr, i64 %arrsize, i64 %factor) nounwind uwtable {
+  %1 = icmp sgt i64 %arrsize, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0
+  %2 = sext i64 %factor to i128
+  br label %3
+
+; <label>:3                                       ; preds = %3, %.lr.ph
+; CHECK-NOT: mul
+; CHECK: imulq
+; CHECK-NOT: mul
+  %carry.02 = phi i128 [ 0, %.lr.ph ], [ %10, %3 ]
+  %i.01 = phi i64 [ 0, %.lr.ph ], [ %11, %3 ]
+  %4 = getelementptr inbounds i64* %arr, i64 %i.01
+  %5 = load i64* %4, align 8
+  %6 = sext i64 %5 to i128
+  %7 = mul nsw i128 %6, %2
+  %8 = add nsw i128 %7, %carry.02
+  %.tr = trunc i128 %8 to i64
+  %9 = and i64 %.tr, 9223372036854775807
+  store i64 %9, i64* %4, align 8
+  %10 = ashr i128 %8, 63
+  %11 = add nsw i64 %i.01, 1
+  %exitcond = icmp eq i64 %11, %arrsize
+  br i1 %exitcond, label %._crit_edge, label %3
+
+._crit_edge:                                      ; preds = %3, %0
+  ret void
+}

Added: llvm/trunk/test/Transforms/CodeGenPrepare/X86/extend-sink-hoist.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/CodeGenPrepare/X86/extend-sink-hoist.ll?rev=203230&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/CodeGenPrepare/X86/extend-sink-hoist.ll (added)
+++ llvm/trunk/test/Transforms/CodeGenPrepare/X86/extend-sink-hoist.ll Fri Mar  7 05:04:30 2014
@@ -0,0 +1,46 @@
+; RUN: opt -codegenprepare -disable-cgp-branch-opts -S < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; The first cast should be sunk into block2, in order that the
+; instruction selector can form an efficient
+; i64 * i64 -> i128 multiplication.
+define i128 @sink(i64* %mem1, i64* %mem2) {
+; CHECK-LABEL: block1:
+; CHECK-NEXT: load
+block1:
+  %l1 = load i64* %mem1
+  %s1 = sext i64 %l1 to i128
+  br label %block2
+
+; CHECK-LABEL: block2:
+; CHECK-NEXT: sext
+; CHECK-NEXT: load
+; CHECK-NEXT: sext
+block2:
+  %l2 = load i64* %mem2
+  %s2 = sext i64 %l2 to i128
+  %res = mul i128 %s1, %s2
+  ret i128 %res
+}
+
+; The first cast should be hoisted into block1, in order that the
+; instruction selector can form an extend-load.
+define i64 @hoist(i32* %mem1, i32* %mem2) {
+; CHECK-LABEL: block1:
+; CHECK-NEXT: load
+; CHECK-NEXT: sext
+block1:
+  %l1 = load i32* %mem1
+  br label %block2
+
+; CHECK-LABEL: block2:
+; CHECK-NEXT: load
+; CHECK-NEXT: sext
+block2:
+  %s1 = sext i32 %l1 to i64
+  %l2 = load i32* %mem2
+  %s2 = sext i32 %l2 to i64
+  %res = mul i64 %s1, %s2
+  ret i64 %res
+}