[PATCH] D60890: [AArch64] splat before (f)mul to allow mul-by-element isel

Thu Apr 18 15:07:05 PDT 2019

spatel created this revision.
spatel added reviewers: efriedma, sdesmalen, dmgreen.
Herald added subscribers: hiraditya, kristof.beyls, javed.absar, mcrosier.
Herald added a project: LLVM.

A splat of a vector multiply (either integer or FP) can be turned into a multiply-by-element:

  splat (mul X, Y), Lane --> mul (splat X, Lane), (splat Y, Lane) --> mul-by-element (splat X, Lane), Y.[Lane]

These patterns showed up as an ARM regression in D60214 <https://reviews.llvm.org/D60214>, but the same transform already exists in IR, so, if I understand correctly, this is a pre-existing problem rather than one introduced by that patch.

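The constant cases look better (the separate dup is folded into the multiply), but I'm not sure this is a win when both operands are variables: in those tests the instruction count is unchanged, because "mul + dup" simply becomes "dup + mul-by-element".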


https://reviews.llvm.org/D60890

Files:
  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
  llvm/test/CodeGen/AArch64/mul_by_elt.ll


Index: llvm/test/CodeGen/AArch64/mul_by_elt.ll
===================================================================

--- llvm/test/CodeGen/AArch64/mul_by_elt.ll
+++ llvm/test/CodeGen/AArch64/mul_by_elt.ll
@@ -19,8 +19,7 @@
 ; CHECK-LABEL: splat0_after_fmul_constant:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov v1.4s, #3.00000000
-; CHECK-NEXT:    fmul v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    fmul v0.4s, v1.4s, v0.s[0]
 ; CHECK-NEXT:    ret
   %mul = fmul <4 x float> %a, <float 3.0, float 42.0, float 3.0, float 3.0>
   %splat = shufflevector <4 x float> %mul, <4 x float> undef, <4 x i32> zeroinitializer
@@ -44,8 +43,7 @@
 ; CHECK-LABEL: splat1_after_fmul_constant:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov v1.2d, #5.00000000
-; CHECK-NEXT:    fmul v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    dup v0.2d, v0.d[1]
+; CHECK-NEXT:    fmul v0.2d, v1.2d, v0.d[1]
 ; CHECK-NEXT:    ret
   %mul = fmul <2 x double> %a, <double -1.0, double 5.0>
   %splat = shufflevector <2 x double> %mul, <2 x double> undef, <2 x i32> <i32 1, i32 1>
@@ -57,8 +55,8 @@
 define <2 x double> @splat1_before_fmul(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: splat1_before_fmul:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmul v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    dup v0.2d, v0.d[1]
+; CHECK-NEXT:    fmul v0.2d, v0.2d, v1.d[1]
 ; CHECK-NEXT:    ret
   %splata = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 1>
   %splatb = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> <i32 1, i32 1>
@@ -69,8 +67,8 @@
 define <2 x double> @splat1_after_fmul(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: splat1_after_fmul:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmul v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    dup v0.2d, v0.d[1]
+; CHECK-NEXT:    fmul v0.2d, v0.2d, v1.d[1]
 ; CHECK-NEXT:    ret
   %mul = fmul <2 x double> %a, %b
   %splat = shufflevector <2 x double> %mul, <2 x double> undef, <2 x i32> <i32 1, i32 1>
@@ -94,8 +92,7 @@
 ; CHECK-LABEL: splat2_after_mul_constant:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.4s, #3
-; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    dup v0.4s, v0.s[2]
+; CHECK-NEXT:    mul v0.4s, v1.4s, v0.s[2]
 ; CHECK-NEXT:    ret
   %mul = mul <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
   %splat = shufflevector <4 x i32> %mul, <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 2, i32 undef>
@@ -105,8 +102,8 @@
 define <8 x i16> @splat1_before_mul(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: splat1_before_mul:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    dup v0.8h, v0.h[1]
+; CHECK-NEXT:    mul v0.8h, v0.8h, v1.h[1]
 ; CHECK-NEXT:    ret
   %splata = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   %splatb = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -117,11 +114,10 @@
 define <8 x i16> @splat1_after_mul(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: splat1_after_mul:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    dup v0.8h, v0.h[1]
+; CHECK-NEXT:    mul v0.8h, v0.8h, v1.h[1]
 ; CHECK-NEXT:    ret
   %mul = mul <8 x i16> %a, %b
   %splat = shufflevector <8 x i16> %mul, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   ret <8 x i16> %splat
 }
-
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6597,6 +6597,17 @@
         !isa<ConstantSDNode>(V1.getOperand(Lane)))
       return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
 
+    // Splat multiply operands to allow selecting this as an (f)mul by element:
+    // splat (mul X, Y), Lane --> mul (splat X, Lane), (splat Y, Lane)
+    if (V1.hasOneUse() &&
+        (V1.getOpcode() == ISD::FMUL || V1.getOpcode() == ISD::MUL)) {
+      SDValue SplatX = DAG.getVectorShuffle(VT, dl, V1.getOperand(0),
+                                            DAG.getUNDEF(VT), ShuffleMask);
+      SDValue SplatY = DAG.getVectorShuffle(VT, dl, V1.getOperand(1),
+                                            DAG.getUNDEF(VT), ShuffleMask);
+      return DAG.getNode(V1.getOpcode(), dl, VT, SplatX, SplatY);
+    }
+
     // Otherwise, duplicate from the lane of the input vector.
     unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
 


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D60890.195817.patch
Type: text/x-patch
Size: 4576 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20190418/a3e9b2c3/attachment-0001.bin>