[llvm] r347734 - [SystemZ::TTI] Improve costs for i16 add, sub and mul against memory.

Wed Nov 28 00:31:50 PST 2018

Author: jonpa
Date: Wed Nov 28 00:31:50 2018
New Revision: 347734

URL: http://llvm.org/viewvc/llvm-project?rev=347734&view=rev
Log:
[SystemZ::TTI] Improve costs for i16 add, sub and mul against memory.

AH, SH and MH costs are already covered in the cases where LHS is 32 bits and
RHS is 16 bits of memory sign-extended to i32.

As these instructions are also used when the LHS is i16, this patch recognizes
that the load gets folded in that case as well.

Review: Ulrich Weigand
https://reviews.llvm.org/D54940

Modified:
    llvm/trunk/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
    llvm/trunk/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll

Modified: llvm/trunk/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp?rev=347734&r1=347733&r2=347734&view=diff
==============================================================================

--- llvm/trunk/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp Wed Nov 28 00:31:50 2018
@@ -908,6 +908,10 @@ isFoldableLoad(const LoadInst *Ld, const
        UserI->getOpcode() == Instruction::UDiv) &&
       UserI->getOperand(1) != FoldedValue)
     return false; // Not commutative, only RHS foldable.
+  // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an
+  // extension was made of the load.
+  unsigned LoadOrTruncBits =
+      ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
   switch (UserI->getOpcode()) {
   case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
   case Instruction::Sub:
@@ -919,6 +923,8 @@ isFoldableLoad(const LoadInst *Ld, const
         (SExtBits == 32 ||
          (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
       return true;
+    if (LoadOrTruncBits == 16)
+      return true;
     LLVM_FALLTHROUGH;
   case Instruction::SDiv:// SE: 32->64
     if (LoadedBits == 32 && SExtBits == 64)
@@ -938,16 +944,12 @@ isFoldableLoad(const LoadInst *Ld, const
     // case Instruction::FDiv:
 
     // All possible extensions of memory checked above.
-    if (SExtBits || ZExtBits)
-      return false;
 
     // Comparison between memory and immediate.
     if (UserI->getOpcode() == Instruction::ICmp)
       if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
         if (isUInt<16>(CI->getZExtValue()))
           return true;
-
-    unsigned LoadOrTruncBits = (TruncBits ? TruncBits : LoadedBits);
     return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
     break;
   }

Modified: llvm/trunk/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll?rev=347734&r1=347733&r2=347734&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll Wed Nov 28 00:31:50 2018
@@ -85,6 +85,37 @@ define void @add() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %10 = add i32 %sext_3, undef
 }
 
+define void @add_i16_mem16(i16 %Arg, i16* %Src1, i16* %Src2, i16* %Dst, i32* %Src32) {
+  %L1 = load i16, i16* %Src1
+  %S0 = add i16 %L1, %Arg
+  store volatile i16 %S0, i16* %Dst
+
+  %L2 = load i16, i16* %Src1
+  %L3 = load i16, i16* %Src2
+  %S1 = add i16 %L2, %L3
+  store volatile i16 %S1, i16* %Dst
+
+  ; Truncated load
+  %L32 = load i32, i32* %Src32
+  %tr = trunc i32 %L32 to i16
+  %S2 = add i16 %tr, %Arg
+  store volatile i16 %S2, i16* %Dst
+
+  ret void
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'add_i16_mem16':
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %L1 = load i16, i16* %Src1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %S0 = add i16 %L1, %Arg
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store volatile i16 %S0, i16* %Dst
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %L2 = load i16, i16* %Src1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %L3 = load i16, i16* %Src2
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %S1 = add i16 %L2, %L3
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store volatile i16 %S1, i16* %Dst
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %L32 = load i32, i32* %Src32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr = trunc i32 %L32 to i16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %S2 = add i16 %tr, %Arg
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store volatile i16 %S2, i16* %Dst
+}
+
 define void @sub_lhs_mem() {
   %li32 = load i32, i32* undef
   sub i32 %li32, undef
@@ -228,6 +259,37 @@ define void @sub_rhs_mem() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %8 = sub i32 undef, %sext_3
 }
 
+define void @sub_i16_mem16(i16 %Arg, i16* %Src1, i16* %Src2, i16* %Dst, i32* %Src32) {
+  %L1 = load i16, i16* %Src1
+  %D0 = sub i16 %Arg, %L1
+  store volatile i16 %D0, i16* %Dst
+
+  %L2 = load i16, i16* %Src1
+  %L3 = load i16, i16* %Src2
+  %D1 = sub i16 %L2, %L3
+  store volatile i16 %D1, i16* %Dst
+
+  ; Truncated load
+  %L32 = load i32, i32* %Src32
+  %tr = trunc i32 %L32 to i16
+  %D2 = sub i16 %Arg, %tr
+  store volatile i16 %D2, i16* %Dst
+
+  ret void
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'sub_i16_mem16':
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %L1 = load i16, i16* %Src1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %D0 = sub i16 %Arg, %L1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store volatile i16 %D0, i16* %Dst
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %L2 = load i16, i16* %Src1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %L3 = load i16, i16* %Src2
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %D1 = sub i16 %L2, %L3
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store volatile i16 %D1, i16* %Dst
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %L32 = load i32, i32* %Src32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr = trunc i32 %L32 to i16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %D2 = sub i16 %Arg, %tr
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store volatile i16 %D2, i16* %Dst
+}
+
 define void @mul() {
   %li32 = load i32, i32* undef
   mul i32 %li32, undef
@@ -305,6 +367,37 @@ define void @mul() {
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %10 = mul i32 %sext_3, undef
 }
 
+define void @mul_i16_mem16(i16 %Arg, i16* %Src1, i16* %Src2, i16* %Dst, i32* %Src32) {
+  %L1 = load i16, i16* %Src1
+  %P0 = mul i16 %Arg, %L1
+  store volatile i16 %P0, i16* %Dst
+
+  %L2 = load i16, i16* %Src1
+  %L3 = load i16, i16* %Src2
+  %P1 = mul i16 %L2, %L3
+  store volatile i16 %P1, i16* %Dst
+
+  ; Truncated load
+  %L32 = load i32, i32* %Src32
+  %tr = trunc i32 %L32 to i16
+  %P2 = mul i16 %Arg, %tr
+  store volatile i16 %P2, i16* %Dst
+
+  ret void
+; CHECK: Printing analysis 'Cost Model Analysis' for function 'mul_i16_mem16':
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %L1 = load i16, i16* %Src1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %P0 = mul i16 %Arg, %L1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store volatile i16 %P0, i16* %Dst
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %L2 = load i16, i16* %Src1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %L3 = load i16, i16* %Src2
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %P1 = mul i16 %L2, %L3
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store volatile i16 %P1, i16* %Dst
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %L32 = load i32, i32* %Src32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr = trunc i32 %L32 to i16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %P2 = mul i16 %Arg, %tr
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store volatile i16 %P2, i16* %Dst
+}
+
 define void @sdiv_lhs(i32 %arg32, i64 %arg64) {
   %li32 = load i32, i32* undef
   sdiv i32 %li32, %arg32