[llvm] dbaa6a1 - Revert "[AArch64] Attempt to sink mul operands"

Martin Storsjö via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 14 07:29:00 PST 2021

Author: Martin Storsjö
Date: 2021-01-14T17:28:18+02:00
New Revision: dbaa6a1858a42f72b683f700d3bd7a9632f7a518

URL: https://github.com/llvm/llvm-project/commit/dbaa6a1858a42f72b683f700d3bd7a9632f7a518
DIFF: https://github.com/llvm/llvm-project/commit/dbaa6a1858a42f72b683f700d3bd7a9632f7a518.diff

LOG: Revert "[AArch64] Attempt to sink mul operands"

This reverts commit dda60035e9f0769c8907cdf6561489e0435c2275.

This commit caused compilation failures for some sources, with the backend
erroring out with "error in backend: Cannot select: t85: v2i32 = AArch64ISD::DUP t15";
see https://reviews.llvm.org/D91271 for the full reproduction case.




diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 082fdf390786..b500cd534a1f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10956,43 +10956,6 @@ bool AArch64TargetLowering::shouldSinkOperands(
     return true;
-  case Instruction::Mul: {
-    bool IsProfitable = false;
-    for (auto &Op : I->operands()) {
-      // Make sure we are not already sinking this operand
-      if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
-        continue;
-      ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
-      if (!Shuffle || !Shuffle->isZeroEltSplat())
-        continue;
-      Value *ShuffleOperand = Shuffle->getOperand(0);
-      InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
-      if (!Insert)
-        continue;
-      Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
-      if (!OperandInstr)
-        continue;
-      ConstantInt *ElementConstant =
-          dyn_cast<ConstantInt>(Insert->getOperand(2));
-      // Check that the insertelement is inserting into element 0
-      if (!ElementConstant || ElementConstant->getZExtValue() != 0)
-        continue;
-      unsigned Opcode = OperandInstr->getOpcode();
-      if (Opcode != Instruction::SExt && Opcode != Instruction::ZExt)
-        continue;
-      Ops.push_back(&Shuffle->getOperandUse(0));
-      Ops.push_back(&Op);
-      IsProfitable = true;
-    }
-    return IsProfitable;
-  }
     return false;

diff  --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
deleted file mode 100644
index 966cf7b46daa..000000000000
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ /dev/null
@@ -1,186 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -o -| FileCheck %s
-define void @matrix_mul_unsigned(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
-; CHECK-LABEL: matrix_mul_unsigned:
-; CHECK:       // %bb.0: // %vector.header
-; CHECK-NEXT:    and w9, w3, #0xffff
-; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    and x8, x0, #0xfffffff8
-; CHECK-NEXT:    dup v0.4h, w9
-; CHECK-NEXT:  .LBB0_1: // %vector.body
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add x9, x2, w0, uxtw #1
-; CHECK-NEXT:    ldp d1, d2, [x9]
-; CHECK-NEXT:    add x9, x1, w0, uxtw #2
-; CHECK-NEXT:    subs x8, x8, #8 // =8
-; CHECK-NEXT:    add w0, w0, #8 // =8
-; CHECK-NEXT:    umull v1.4s, v0.4h, v1.4h
-; CHECK-NEXT:    umull v2.4s, v0.4h, v2.4h
-; CHECK-NEXT:    stp q1, q2, [x9]
-; CHECK-NEXT:    b.ne .LBB0_1
-; CHECK-NEXT:  // %bb.2: // %for.end12
-; CHECK-NEXT:    ret
-  %conv4 = zext i16 %val to i32
-  %wide.trip.count = zext i32 %N to i64
-  %0 = add nsw i64 %wide.trip.count, -1
-  %min.iters.check = icmp ult i32 %N, 8
-  %1 = trunc i64 %0 to i32
-  %2 = icmp ugt i64 %0, 4294967295
-  %n.vec = and i64 %wide.trip.count, 4294967288
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
-  %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
-  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
-  br label %vector.body
-vector.body:                                      ; preds = %vector.header, %vector.body
-  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
-  %3 = trunc i64 %index to i32
-  %4 = add i32 %N, %3
-  %5 = zext i32 %4 to i64
-  %6 = getelementptr inbounds i16, i16* %A, i64 %5
-  %7 = bitcast i16* %6 to <4 x i16>*
-  %wide.load = load <4 x i16>, <4 x i16>* %7, align 2
-  %8 = getelementptr inbounds i16, i16* %6, i64 4
-  %9 = bitcast i16* %8 to <4 x i16>*
-  %wide.load30 = load <4 x i16>, <4 x i16>* %9, align 2
-  %10 = zext <4 x i16> %wide.load to <4 x i32>
-  %11 = zext <4 x i16> %wide.load30 to <4 x i32>
-  %12 = mul nuw nsw <4 x i32> %broadcast.splat, %10
-  %13 = mul nuw nsw <4 x i32> %broadcast.splat32, %11
-  %14 = getelementptr inbounds i32, i32* %C, i64 %5
-  %15 = bitcast i32* %14 to <4 x i32>*
-  store <4 x i32> %12, <4 x i32>* %15, align 4
-  %16 = getelementptr inbounds i32, i32* %14, i64 4
-  %17 = bitcast i32* %16 to <4 x i32>*
-  store <4 x i32> %13, <4 x i32>* %17, align 4
-  %index.next = add i64 %index, 8
-  %18 = icmp eq i64 %index.next, %n.vec
-  br i1 %18, label %for.end12, label %vector.body
-for.end12:                                        ; preds = %vector.body
-  ret void
-define void @matrix_mul_signed(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
-; CHECK-LABEL: matrix_mul_signed:
-; CHECK:       // %bb.0: // %vector.header
-; CHECK-NEXT:    sxth w9, w3
-; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    and x8, x0, #0xfffffff8
-; CHECK-NEXT:    dup v0.4h, w9
-; CHECK-NEXT:  .LBB1_1: // %vector.body
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add x9, x2, w0, sxtw #1
-; CHECK-NEXT:    ldp d1, d2, [x9]
-; CHECK-NEXT:    add x9, x1, w0, sxtw #2
-; CHECK-NEXT:    subs x8, x8, #8 // =8
-; CHECK-NEXT:    add w0, w0, #8 // =8
-; CHECK-NEXT:    smull v1.4s, v0.4h, v1.4h
-; CHECK-NEXT:    smull v2.4s, v0.4h, v2.4h
-; CHECK-NEXT:    stp q1, q2, [x9]
-; CHECK-NEXT:    b.ne .LBB1_1
-; CHECK-NEXT:  // %bb.2: // %for.end12
-; CHECK-NEXT:    ret
-  %conv4 = sext i16 %val to i32
-  %wide.trip.count = sext i32 %N to i64
-  %0 = add nsw i64 %wide.trip.count, -1
-  %min.iters.check = icmp ult i32 %N, 8
-  %1 = trunc i64 %0 to i32
-  %2 = icmp ugt i64 %0, 4294967295
-  %n.vec = and i64 %wide.trip.count, 4294967288
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
-  %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
-  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
-  br label %vector.body
-vector.body:                                      ; preds = %vector.header, %vector.body
-  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
-  %3 = trunc i64 %index to i32
-  %4 = add i32 %N, %3
-  %5 = sext i32 %4 to i64
-  %6 = getelementptr inbounds i16, i16* %A, i64 %5
-  %7 = bitcast i16* %6 to <4 x i16>*
-  %wide.load = load <4 x i16>, <4 x i16>* %7, align 2
-  %8 = getelementptr inbounds i16, i16* %6, i64 4
-  %9 = bitcast i16* %8 to <4 x i16>*
-  %wide.load30 = load <4 x i16>, <4 x i16>* %9, align 2
-  %10 = sext <4 x i16> %wide.load to <4 x i32>
-  %11 = sext <4 x i16> %wide.load30 to <4 x i32>
-  %12 = mul nsw <4 x i32> %broadcast.splat, %10
-  %13 = mul nsw <4 x i32> %broadcast.splat32, %11
-  %14 = getelementptr inbounds i32, i32* %C, i64 %5
-  %15 = bitcast i32* %14 to <4 x i32>*
-  store <4 x i32> %12, <4 x i32>* %15, align 4
-  %16 = getelementptr inbounds i32, i32* %14, i64 4
-  %17 = bitcast i32* %16 to <4 x i32>*
-  store <4 x i32> %13, <4 x i32>* %17, align 4
-  %index.next = add i64 %index, 8
-  %18 = icmp eq i64 %index.next, %n.vec
-  br i1 %18, label %for.end12, label %vector.body
-for.end12:                                        ; preds = %vector.body
-  ret void
-define void @matrix_mul_double_shuffle(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
-; CHECK-LABEL: matrix_mul_double_shuffle:
-; CHECK:       // %bb.0: // %vector.header
-; CHECK-NEXT:    and w9, w3, #0xffff
-; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    and x8, x0, #0xfffffff8
-; CHECK-NEXT:    dup v0.4h, w9
-; CHECK-NEXT:  .LBB2_1: // %vector.body
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldrh w9, [x2], #16
-; CHECK-NEXT:    mov w10, w0
-; CHECK-NEXT:    subs x8, x8, #8 // =8
-; CHECK-NEXT:    lsl x10, x10, #2
-; CHECK-NEXT:    dup v1.4h, w9
-; CHECK-NEXT:    umull v1.4s, v0.4h, v1.4h
-; CHECK-NEXT:    add w0, w0, #8 // =8
-; CHECK-NEXT:    str q1, [x1, x10]
-; CHECK-NEXT:    b.ne .LBB2_1
-; CHECK-NEXT:  // %bb.2: // %for.end12
-; CHECK-NEXT:    ret
-  %conv4 = zext i16 %val to i32
-  %wide.trip.count = zext i32 %N to i64
-  %0 = add nsw i64 %wide.trip.count, -1
-  %min.iters.check = icmp ult i32 %N, 8
-  %1 = trunc i64 %0 to i32
-  %2 = icmp ugt i64 %0, 4294967295
-  %n.vec = and i64 %wide.trip.count, 4294967288
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
-  br label %vector.body
-vector.body:                                      ; preds = %vector.header, %vector.body
-  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
-  %g = getelementptr inbounds i16, i16* %A, i64 %index
-  %val1 = load i16, i16* %g
-  %splat.input.ext = zext i16 %val1 to i32
-  %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %splat.input.ext, i32 0
-  %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> %broadcast.splat, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-  %3 = trunc i64 %index to i32
-  %4 = add i32 %N, %3
-  %5 = zext i32 %4 to i64
-  %6 = mul nuw nsw <4 x i32> %broadcast.splat, %broadcast.splat32
-  %7 = getelementptr inbounds i32, i32* %C, i64 %5
-  %8 = bitcast i32* %7 to <4 x i32>*
-  store <4 x i32> %6, <4 x i32>* %8, align 4
-  %index.next = add i64 %index, 8
-  %9 = icmp eq i64 %index.next, %n.vec
-  br i1 %9, label %for.end12, label %vector.body
-for.end12:                                        ; preds = %vector.body
-  ret void


