[llvm] 5110ff0 - [AArch64][CostModel] Fix cost for mul <2 x i64>

Mon Nov 30 03:37:24 PST 2020

Author: Sjoerd Meijer
Date: 2020-11-30T11:36:55Z
New Revision: 5110ff08176f29eefd7638e328d65dfd1c1ad042

URL: https://github.com/llvm/llvm-project/commit/5110ff08176f29eefd7638e328d65dfd1c1ad042
DIFF: https://github.com/llvm/llvm-project/commit/5110ff08176f29eefd7638e328d65dfd1c1ad042.diff

LOG: [AArch64][CostModel] Fix cost for mul <2 x i64>

A mul <2 x i64> was modeled to have a cost of 1, but since we do not have a
MUL.2d instruction, it is scalarized into vector inserts/extracts and scalar
muls.

The motivating, precommitted test is
test/Transforms/SLPVectorizer/AArch64/mul.ll, which we do not want to SLP
vectorize.

Test Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
unfortunately needed changing, but the reason is documented in
LoopVectorize.cpp:6855:

  // The cost of executing VF copies of the scalar instruction. This opcode
  // is unknown. Assume that it is the same as 'mul'.

which I will address next in a follow-up to this change.

Differential Revision: https://reviews.llvm.org/D92208

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
    llvm/test/Analysis/CostModel/AArch64/mul.ll
    llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/mul.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 37a34023b8d0..d97570755291 100644

--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -644,8 +644,20 @@ int AArch64TTIImpl::getArithmeticInstrCost(
     }
     return Cost;
 
-  case ISD::ADD:
   case ISD::MUL:
+    if (LT.second != MVT::v2i64)
+      return (Cost + 1) * LT.first;
+    // Since we do not have a MUL.2d instruction, a mul <2 x i64> is expensive
+    // as elements are extracted from the vectors and the muls scalarized.
+    // As getScalarizationOverhead is a bit too pessimistic, we estimate the
+    // cost for an i64 vector directly here, which is:
+    // - four i64 extracts,
+    // - two i64 inserts, and
+    // - two muls.
+    // So, for a v2i64 with LT.first = 1 the cost is 8, and for a v4i64 with
+    // LT.first = 2 the cost is 16.
+    return LT.first * 8;
+  case ISD::ADD:
   case ISD::XOR:
   case ISD::OR:
   case ISD::AND:

diff  --git a/llvm/test/Analysis/CostModel/AArch64/mul.ll b/llvm/test/Analysis/CostModel/AArch64/mul.ll
index 6a29c6d772d4..e98463a9fcf4 100644
--- a/llvm/test/Analysis/CostModel/AArch64/mul.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/mul.ll
@@ -113,7 +113,7 @@ define <8 x i32> @t12(<8 x i32> %a, <8 x i32> %b)  {
 
 define <2 x i64> @t13(<2 x i64> %a, <2 x i64> %b)  {
 ; THROUGHPUT-LABEL: 't13'
-; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = mul nsw <2 x i64> %a, %b
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %1 = mul nsw <2 x i64> %a, %b
 ; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %1
 ;
   %1 = mul nsw <2 x i64> %a, %b
@@ -122,7 +122,7 @@ define <2 x i64> @t13(<2 x i64> %a, <2 x i64> %b)  {
 
 define <4 x i64> @t14(<4 x i64> %a, <4 x i64> %b)  {
 ; THROUGHPUT-LABEL: 't14'
-; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = mul nsw <4 x i64> %a, %b
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %1 = mul nsw <4 x i64> %a, %b
 ; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %1
 ;
   %1 = mul nsw <4 x i64> %a, %b

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
index 80d2e282176a..37c1f4eec32a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
@@ -9,8 +9,8 @@
 ; leaving cost 3 for scalarizing the result + 2 for executing the op with VF 2.
 
 ; CM: LV: Scalar loop costs: 7.
-; CM: LV: Found an estimated cost of 5 for VF 2 For instruction:   %a = extractvalue { i64, i64 } %sv, 0
-; CM-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction:   %b = extractvalue { i64, i64 } %sv, 1
+; CM: LV: Found an estimated cost of 19 for VF 2 For instruction:   %a = extractvalue { i64, i64 } %sv, 0
+; CM-NEXT: LV: Found an estimated cost of 19 for VF 2 For instruction:   %b = extractvalue { i64, i64 } %sv, 1
 
 ; Check that the extractvalue operands are actually free in vector code.
 

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/mul.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/mul.ll
index 228a4d773f0c..7e941adc8cd5 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/mul.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/mul.ll
@@ -27,8 +27,7 @@ target triple = "aarch64--linux-gnu"
 ;        str     q0, [x0]
 ;        ret
 ;
-; but if we don't SLP vectorise these examples we get this which is smaller
-; and faster:
+; If we don't SLP vectorise but scalarize this we get this instead:
 ;
 ;        ldp     x8, x9, [x1]
 ;        ldp     x10, x11, [x0]
@@ -37,20 +36,19 @@ target triple = "aarch64--linux-gnu"
 ;        stp     x8, x9, [x0]
 ;        ret
 ;
-; FIXME: don't SLP vectorise this.
-
 define void @mul(i64* noalias nocapture %a, i64* noalias nocapture readonly %b) {
 ; CHECK-LABEL: @mul(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[B:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[B]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[TMP2]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <2 x i64> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[A]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[B:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[A:%.*]], align 8
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i64 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    store i64 [[MUL]], i64* [[A]], align 8
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[ARRAYIDX2]], align 8
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[ARRAYIDX3]], align 8
+; CHECK-NEXT:    [[MUL4:%.*]] = mul nsw i64 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    store i64 [[MUL4]], i64* [[ARRAYIDX3]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -79,16 +77,18 @@ entry:
 define void @mac(i64* noalias nocapture %a, i64* noalias nocapture readonly %b) {
 ; CHECK-LABEL: @mac(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[B:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[B]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[TMP2]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <2 x i64> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i64> [[TMP4]], [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64* [[A]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* [[B:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[A:%.*]], align 8
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i64 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[ARRAYIDX2]], align 8
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[ARRAYIDX3]], align 8
+; CHECK-NEXT:    [[MUL4:%.*]] = mul nsw i64 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[MUL]], [[TMP0]]
+; CHECK-NEXT:    store i64 [[ADD]], i64* [[A]], align 8
+; CHECK-NEXT:    [[ADD9:%.*]] = add nsw i64 [[MUL4]], [[TMP2]]
+; CHECK-NEXT:    store i64 [[ADD9]], i64* [[ARRAYIDX3]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry: