[llvm] r372824 - [AArch64] Convert neon_ushl and neon_sshl with positive constants to VSHL.

Wed Sep 25 01:22:05 PDT 2019

Author: fhahn
Date: Wed Sep 25 01:22:05 2019
New Revision: 372824

URL: http://llvm.org/viewvc/llvm-project?rev=372824&view=rev
Log:
[AArch64] Convert neon_ushl and neon_sshl with positive constants to VSHL.

I think we should be able to use shl instead of sshl and ushl for
positive constant shift values, unless I am missing something.

We already have the machinery in place to ensure we only replace
nodes if the shift value is positive and <= the element width.

This is a generalization of an earlier patch rL372565.

Reviewers: t.p.northover, samparker, dmgreen, anemet

Reviewed By: anemet

Differential Revision: https://reviews.llvm.org/D67955

Modified:
    llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/trunk/test/CodeGen/AArch64/arm64-vshift.ll

Modified: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp?rev=372824&r1=372823&r2=372824&view=diff
==============================================================================

--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp Wed Sep 25 01:22:05 2019
@@ -10333,29 +10333,14 @@ static SDValue tryCombineShiftImm(unsign
     IsRightShift = false;
     break;
   case Intrinsic::aarch64_neon_sshl:
-  case Intrinsic::aarch64_neon_ushl: {
-    // ushll/ushll2 provide unsigned shifts with immediate operands and
-    // sshll/sshll2 provide signed shifts with immediates, so we have to make
-    // sure we only match patterns here we can later match to them.
-    SDValue Op0 = N->getOperand(1);
-    if (Op0.getNode()->getOpcode() != (IID == Intrinsic::aarch64_neon_ushl
-                                           ? ISD::ZERO_EXTEND
-                                           : ISD::SIGN_EXTEND))
-      return SDValue();
-
-    EVT FromType = Op0.getOperand(0).getValueType();
-    EVT ToType = Op0.getValueType();
-    unsigned FromSize = FromType.getScalarSizeInBits();
-    if (!FromType.isVector() || !ToType.isVector() ||
-        (FromSize != 8 && FromSize != 16 && FromSize != 32) ||
-        2 * FromSize != ToType.getScalarSizeInBits())
-      return SDValue();
-
+  case Intrinsic::aarch64_neon_ushl:
+    // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
+    // left shift for positive shift amounts. Below, we only replace the current
+    // node with VSHL, if this condition is met.
     Opcode = AArch64ISD::VSHL;
     IsRightShift = false;
     break;
   }
-  }
 
   if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
     SDLoc dl(N);

Modified: llvm/trunk/test/CodeGen/AArch64/arm64-vshift.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-vshift.ll?rev=372824&r1=372823&r2=372824&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-vshift.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-vshift.ll Wed Sep 25 01:22:05 2019
@@ -1208,27 +1208,25 @@ define <8 x i16> @neon.ushll8h_constant_
 
 define <8 x i16> @neon.ushl8h_no_constant_shift(<8 x i8>* %A) nounwind {
 ;CHECK-LABEL: neon.ushl8h_no_constant_shift
-;CHECK:	ushl.8h	v0, v0, v0
+;CHECK: ushl.8h v0, v0, v0
   %tmp1 = load <8 x i8>, <8 x i8>* %A
   %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
   %tmp3 = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> %tmp2, <8 x i16> %tmp2)
   ret <8 x i16> %tmp3
 }
 
-; Here we do not extend to the double the bitwidth, so we cannot fold to ushll.
-define <4 x i32> @neon.ushll8h_constant_shift_extend_not_2x(<4 x i8>* %A) nounwind {
-;CHECK-LABEL: @neon.ushll8h_constant_shift_extend_not_2x
+define <4 x i32> @neon.ushl8h_constant_shift_extend_not_2x(<4 x i8>* %A) nounwind {
+;CHECK-LABEL: @neon.ushl8h_constant_shift_extend_not_2x
 ;CHECK-NOT: ushll.8h v0,
-;CHECK:	ldrb	w8, [x0]
-;CHECK:	movi.4s	v1, #1
-;CHECK:	fmov	s0, w8
-;CHECK:	ldrb	w8, [x0, #1]
-;CHECK:	mov.s	v0[1], w8
-;CHECK:	ldrb	w8, [x0, #2]
-;CHECK:	mov.s	v0[2], w8
-;CHECK:	ldrb	w8, [x0, #3]
-;CHECK:	mov.s	v0[3], w8
-;CHECK:	ushl.4s	v0, v0, v1
+;CHECK: ldrb    w8, [x0]
+;CHECK: fmov    s0, w8
+;CHECK: ldrb    w8, [x0, #1]
+;CHECK: mov.s   v0[1], w8
+;CHECK: ldrb    w8, [x0, #2]
+;CHECK: mov.s   v0[2], w8
+;CHECK: ldrb    w8, [x0, #3]
+;CHECK: mov.s   v0[3], w8
+;CHECK: shl.4s v0, v0, #1
   %tmp1 = load <4 x i8>, <4 x i8>* %A
   %tmp2 = zext <4 x i8> %tmp1 to <4 x i32>
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
@@ -1238,8 +1236,7 @@ define <4 x i32> @neon.ushll8h_constant_
 define <8 x i16> @neon.ushl8_noext_constant_shift(<8 x i16>* %A) nounwind {
 ; CHECK-LABEL: neon.ushl8_noext_constant_shift
 ; CHECK:      ldr       q0, [x0]
-; CHECK-NEXT: movi.8h   v1, #1
-; CHECK-NEXT: ushl.8h   v0, v0, v1
+; CHECK-NEXT: shl.8h   v0, v0, #1
 ; CHECK-NEXT: ret
   %tmp1 = load <8 x i16>, <8 x i16>* %A
   %tmp3 = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
@@ -1270,8 +1267,7 @@ define <4 x i32> @neon.ushll4s_neg_const
 ; FIXME: should be constant folded.
 define <4 x i32> @neon.ushll4s_constant_fold() nounwind {
 ; CHECK-LABEL: neon.ushll4s_constant_fold
-; CHECK:      movi.4s v1, #1
-; CHECK-NEXT: ushl.4s v0, v0, v1
+; CHECK: shl.4s v0, v0, #1
 ;
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
   ret <4 x i32> %tmp3
@@ -1311,12 +1307,28 @@ declare <2 x i64> @llvm.aarch64.neon.ssh
 
 define <16 x i8> @neon.sshl16b_constant_shift(<16 x i8>* %A) nounwind {
 ;CHECK-LABEL: neon.sshl16b_constant_shift
-;CHECK: sshl.16b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+;CHECK: shl.16b {{v[0-9]+}}, {{v[0-9]+}}, #1
         %tmp1 = load <16 x i8>, <16 x i8>* %A
         %tmp2 = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
         ret <16 x i8> %tmp2
 }
 
+define <16 x i8> @neon.sshl16b_non_splat_constant_shift(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: neon.sshl16b_non_splat_constant_shift
+;CHECK: sshl.16b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+        %tmp1 = load <16 x i8>, <16 x i8>* %A
+        %tmp2 = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 6, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+        ret <16 x i8> %tmp2
+}
+
+define <16 x i8> @neon.sshl16b_neg_constant_shift(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: neon.sshl16b_neg_constant_shift
+;CHECK: sshl.16b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+        %tmp1 = load <16 x i8>, <16 x i8>* %A
+        %tmp2 = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -2, i8 -2, i8 -2, i8 -2, i8 -2, i8 -2, i8 -2, i8 -2, i8 -2, i8 -2, i8 -2, i8 -2, i8 -2, i8 -2, i8 -2, i8 -2>)
+        ret <16 x i8> %tmp2
+}
+
 define <8 x i16> @neon.sshll8h_constant_shift(<8 x i8>* %A) nounwind {
 ;CHECK-LABEL: neon.sshll8h_constant_shift
 ;CHECK: sshll.8h v0, {{v[0-9]+}}, #1
@@ -1328,14 +1340,21 @@ define <8 x i16> @neon.sshll8h_constant_
 
 define <4 x i32> @neon.sshl4s_wrong_ext_constant_shift(<4 x i8>* %A) nounwind {
 ;CHECK-LABEL: neon.sshl4s_wrong_ext_constant_shift
-;CHECK: sshl.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+;CHECK:       ldrsb   w8, [x0]
+;CHECK-NEXT:  fmov    s0, w8
+;CHECK-NEXT:  ldrsb   w8, [x0, #1]
+;CHECK-NEXT:  mov.s   v0[1], w8
+;CHECK-NEXT:  ldrsb   w8, [x0, #2]
+;CHECK-NEXT:  mov.s   v0[2], w8
+;CHECK-NEXT:  ldrsb   w8, [x0, #3]
+;CHECK-NEXT:  mov.s   v0[3], w8
+;CHECK-NEXT:  shl.4s  v0, v0, #1
         %tmp1 = load <4 x i8>, <4 x i8>* %A
         %tmp2 = sext <4 x i8> %tmp1 to <4 x i32>
         %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
         ret <4 x i32> %tmp3
 }
 
-
 define <4 x i32> @neon.sshll4s_constant_shift(<4 x i16>* %A) nounwind {
 ;CHECK-LABEL: neon.sshll4s_constant_shift
 ;CHECK: sshll.4s v0, {{v[0-9]+}}, #1
@@ -1359,14 +1378,14 @@ define <4 x i32> @neon.sshll4s_neg_const
 ; FIXME: should be constant folded.
 define <4 x i32> @neon.sshl4s_constant_fold() nounwind {
 ;CHECK-LABEL: neon.sshl4s_constant_fold
-;CHECK: sshl.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+;CHECK: shl.4s {{v[0-9]+}}, {{v[0-9]+}}, #2
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> <i32 2, i32 2, i32 2, i32 2>)
         ret <4 x i32> %tmp3
 }
 
 define <4 x i32> @neon.sshl4s_no_fold(<4 x i32>* %A) nounwind {
 ;CHECK-LABEL: neon.sshl4s_no_fold
-;CHECK: sshl.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+;CHECK: shl.4s {{v[0-9]+}}, {{v[0-9]+}}, #1
         %tmp1 = load <4 x i32>, <4 x i32>* %A
         %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
         ret <4 x i32> %tmp3
@@ -1384,14 +1403,14 @@ define <2 x i64> @neon.sshll2d_constant_
 ; FIXME: should be constant folded.
 define <2 x i64> @neon.sshl2d_constant_fold() nounwind {
 ;CHECK-LABEL: neon.sshl2d_constant_fold
-;CHECK: sshl.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+;CHECK: shl.2d {{v[0-9]+}}, {{v[0-9]+}}, #1
         %tmp3 = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> <i64 99, i64 1000>, <2 x i64> <i64 1, i64 1>)
         ret <2 x i64> %tmp3
 }
 
 define <2 x i64> @neon.sshl2d_no_fold(<2 x i64>* %A) nounwind {
 ;CHECK-LABEL: neon.sshl2d_no_fold
-;CHECK: sshl.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+;CHECK: shl.2d {{v[0-9]+}}, {{v[0-9]+}}, #2
         %tmp2 = load <2 x i64>, <2 x i64>* %A
         %tmp3 = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> %tmp2, <2 x i64> <i64 2, i64 2>)
         ret <2 x i64> %tmp3