[llvm] 28de5f9 - [AArch64] Sink to umull if we know top bits are zero.

David Green via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 16 02:44:43 PST 2023


Author: David Green
Date: 2023-01-16T10:44:38Z
New Revision: 28de5f99bbe10d8982500abf5df24a80ab1bae79

URL: https://github.com/llvm/llvm-project/commit/28de5f99bbe10d8982500abf5df24a80ab1bae79
DIFF: https://github.com/llvm/llvm-project/commit/28de5f99bbe10d8982500abf5df24a80ab1bae79.diff

LOG: [AArch64] Sink to umull if we know top bits are zero.

This is an extension to the code for sinking splats to multiplies, where
if we can detect that the top bits are known-zero we can treat the
instruction like a zext. The existing code was also adjusted in the
process to make it more precise about only sinking if both operands are
zext or sext.

Differential Revision: https://reviews.llvm.org/D141275

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
    llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 476a3ef7a5555..93aac68e0997e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -33,6 +33,7 @@
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/ObjCARCUtil.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/CallingConvLower.h"
@@ -14016,12 +14017,20 @@ bool AArch64TargetLowering::shouldSinkOperands(
     return true;
   }
   case Instruction::Mul: {
-    bool IsProfitable = false;
+    int NumZExts = 0, NumSExts = 0;
     for (auto &Op : I->operands()) {
       // Make sure we are not already sinking this operand
       if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
         continue;
 
+      if (match(&Op, m_SExt(m_Value()))) {
+        NumSExts++;
+        continue;
+      } else if (match(&Op, m_ZExt(m_Value()))) {
+        NumZExts++;
+        continue;
+      }
+
       ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
 
       // If the Shuffle is a splat and the operand is a zext/sext, sinking the
@@ -14031,11 +14040,14 @@ bool AArch64TargetLowering::shouldSinkOperands(
           match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
         Ops.push_back(&Shuffle->getOperandUse(0));
         Ops.push_back(&Op);
-        IsProfitable = true;
+        if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
+          NumSExts++;
+        else
+          NumZExts++;
         continue;
       }
 
-      if (!Shuffle || !Shuffle->isZeroEltSplat())
+      if (!Shuffle)
         continue;
 
       Value *ShuffleOperand = Shuffle->getOperand(0);
@@ -14054,15 +14066,27 @@ bool AArch64TargetLowering::shouldSinkOperands(
         continue;
 
       unsigned Opcode = OperandInstr->getOpcode();
-      if (Opcode != Instruction::SExt && Opcode != Instruction::ZExt)
-        continue;
+      if (Opcode == Instruction::SExt)
+        NumSExts++;
+      else if (Opcode == Instruction::ZExt)
+        NumZExts++;
+      else {
+        // If we find that the top bits are known 0, then we can sink and allow
+        // the backend to generate a umull.
+        unsigned Bitwidth = I->getType()->getScalarSizeInBits();
+        APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
+        const DataLayout &DL = I->getFunction()->getParent()->getDataLayout();
+        if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
+          continue;
+        NumZExts++;
+      }
 
       Ops.push_back(&Shuffle->getOperandUse(0));
       Ops.push_back(&Op);
-      IsProfitable = true;
     }
 
-    return IsProfitable;
+    // Is it profitable to sink if we found two of the same type of extends.
+    return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
   }
   default:
     return false;

diff  --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index 827f0ea1b1942..49a15528c041a 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -427,13 +427,12 @@ define i16 @red_mla_dup_ext_u8_s8_s16(i8* noalias nocapture noundef readonly %A,
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB5_4: // %vector.ph
-; CHECK-NEXT:    dup v2.8b, w9
 ; CHECK-NEXT:    and x11, x10, #0xfffffff0
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
 ; CHECK-NEXT:    add x8, x0, #8
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
 ; CHECK-NEXT:    mov x12, x11
-; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    dup v2.8h, w9
 ; CHECK-NEXT:  .LBB5_5: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldp d3, d4, [x8, #-8]
@@ -704,9 +703,8 @@ define void @matrix_mul_unsigned_and(i32 %N, i32* nocapture %C, i16* nocapture r
 ; CHECK:       // %bb.0: // %vector.header
 ; CHECK-NEXT:    and w8, w3, #0xffff
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    dup v0.4s, w8
+; CHECK-NEXT:    dup v0.4h, w8
 ; CHECK-NEXT:    and x8, x0, #0xfffffff8
-; CHECK-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-NEXT:  .LBB10_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x2, w0, uxtw #1
@@ -767,27 +765,24 @@ for.end12:                                        ; preds = %vector.body
 define void @matrix_mul_unsigned_and_double(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i32 %val) {
 ; CHECK-LABEL: matrix_mul_unsigned_and_double:
 ; CHECK:       // %bb.0: // %vector.header
-; CHECK-NEXT:    and w8, w3, #0xffff
+; CHECK-NEXT:    and w9, w3, #0xffff
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    dup v0.4s, w8
 ; CHECK-NEXT:    and x8, x0, #0xfffffff0
-; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    dup v0.8h, w9
 ; CHECK-NEXT:  .LBB11_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x2, w0, uxtw #1
+; CHECK-NEXT:    add x10, x1, w0, uxtw #2
 ; CHECK-NEXT:    subs x8, x8, #16
+; CHECK-NEXT:    add w0, w0, #16
 ; CHECK-NEXT:    ldr q1, [x9]
 ; CHECK-NEXT:    ldur q2, [x9, #8]
-; CHECK-NEXT:    add x9, x1, w0, uxtw #2
-; CHECK-NEXT:    add w0, w0, #16
-; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    ext v4.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT:    umull2 v3.4s, v0.8h, v1.8h
 ; CHECK-NEXT:    umull v1.4s, v0.4h, v1.4h
+; CHECK-NEXT:    umull2 v4.4s, v0.8h, v2.8h
 ; CHECK-NEXT:    umull v2.4s, v0.4h, v2.4h
-; CHECK-NEXT:    umull v3.4s, v0.4h, v3.4h
-; CHECK-NEXT:    umull v4.4s, v0.4h, v4.4h
-; CHECK-NEXT:    stp q1, q3, [x9]
-; CHECK-NEXT:    stp q2, q4, [x9, #32]
+; CHECK-NEXT:    stp q1, q3, [x10]
+; CHECK-NEXT:    stp q2, q4, [x10, #32]
 ; CHECK-NEXT:    b.ne .LBB11_1
 ; CHECK-NEXT:  // %bb.2: // %for.end12
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
index acba83364210a..f29054bd06211 100644
--- a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
@@ -313,12 +313,12 @@ define <4 x i32> @sink_insertelement(i16 %e, i8 %f) {
 ; CHECK-NEXT:  for.cond4.preheader.lr.ph:
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[F:%.*]], 0
 ; CHECK-NEXT:    [[CONV25:%.*]] = sext i16 [[E:%.*]] to i32
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT143:%.*]] = insertelement <4 x i32> poison, i32 [[CONV25]], i32 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_COND4_PREHEADER_US_PREHEADER:%.*]], label [[FOR_COND4_PREHEADER_PREHEADER:%.*]]
 ; CHECK:       for.cond4.preheader.us.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[CONV25]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT144:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i32> zeroinitializer, [[BROADCAST_SPLAT144]]
-; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+; CHECK-NEXT:    [[BROADCAST_SPLAT144:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT143]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = mul <4 x i32> zeroinitializer, [[BROADCAST_SPLAT144]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP0]]
 ; CHECK:       for.cond4.preheader.preheader:
 ; CHECK-NEXT:    ret <4 x i32> zeroinitializer
 ;


        


More information about the llvm-commits mailing list