[llvm] r321824 - [AArch64] Improve code generation of vector build
Evandro Menezes via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 4 13:43:12 PST 2018
Author: evandro
Date: Thu Jan 4 13:43:12 2018
New Revision: 321824
URL: http://llvm.org/viewvc/llvm-project?rev=321824&view=rev
Log:
[AArch64] Improve code generation of vector build
Instead of using, for example, `dup v0.4s, wzr`, which transfers between
the general purpose and the SIMD register files, use the more efficient
`movi v0.4s, #0`.
Differential revision: https://reviews.llvm.org/D41515
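
As an illustration (not part of the original log message), consider a
function that builds a mostly zero vector; the function name here is
hypothetical, but the shapes mirror the tests added below:

    define <4 x i32> @zero_but_one(i32 %s) {
      %v = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 undef>, i32 %s, i32 3
      ret <4 x i32> %v
    }

Per the log above, llc previously materialized the zero vector with a
transfer from the integer register file:

    dup  v0.4s, wzr
    mov  v0.s[3], w0
    ret

With the patterns added in this patch, the vector is built entirely
within the SIMD unit, as checked by the new build-one-lane.ll test:

    movi v0.4s, #0
    mov  v0.s[3], w0
    ret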
Added:
llvm/trunk/test/CodeGen/AArch64/build-one-lane.ll
Modified:
llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td
llvm/trunk/test/CodeGen/AArch64/arm64-build-vector.ll
llvm/trunk/test/CodeGen/AArch64/arm64-vector-insertion.ll
Modified: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td?rev=321824&r1=321823&r2=321824&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td Thu Jan 4 13:43:12 2018
@@ -4592,10 +4592,8 @@ def : Pat<(v4i32 immAllOnesV), (MOVIv2d_
def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>;
def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>;
-def : Pat<(v2f64 (AArch64dup (f64 fpimm0))), (MOVIv2d_ns (i32 0))>;
-def : Pat<(v4f32 (AArch64dup (f32 fpimm0))), (MOVIv2d_ns (i32 0))>;
-
// EDIT per word & halfword: 2s, 4h, 4s, & 8h
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">;
def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
@@ -4617,6 +4615,7 @@ def : Pat<(v4i16 (AArch64movi_shift imm0
def : Pat<(v8i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
(MOVIv8i16 imm0_255:$imm8, imm:$shift)>;
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
// EDIT per word: 2s & 4s with MSL shifter
def MOVIv2s_msl : SIMDModifiedImmMoveMSL<0, 0, {1,1,0,?}, V64, "movi", ".2s",
[(set (v2i32 V64:$Rd),
@@ -4629,13 +4628,31 @@ def MOVIv4s_msl : SIMDModifiedImmMoveMS
def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1110, V64, imm0_255,
"movi", ".8b",
[(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>;
+
def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1110, V128, imm0_255,
"movi", ".16b",
[(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>;
+}
+
+// Use the more efficient MOVI instead of DUP from ZR to zero up vectors
+def : Pat<(v2f32 (AArch64dup (f32 fpimm0))), (MOVIv2i32 (i32 0), (i32 0))>;
+
+def : Pat<(v2i32 (AArch64dup (i32 0))), (MOVIv2i32 (i32 0), (i32 0))>;
+def : Pat<(v4i16 (AArch64dup (i32 0))), (MOVIv4i16 (i32 0), (i32 0))>;
+def : Pat<(v8i8 (AArch64dup (i32 0))), (MOVIv8b_ns (i32 0))>;
+
+def : Pat<(v2f64 (AArch64dup (f64 fpimm0))), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v4f32 (AArch64dup (f32 fpimm0))), (MOVIv4i32 (i32 0), (i32 0))>;
+
+def : Pat<(v2i64 (AArch64dup (i64 0))), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v4i32 (AArch64dup (i32 0))), (MOVIv4i32 (i32 0), (i32 0))>;
+def : Pat<(v8i16 (AArch64dup (i32 0))), (MOVIv8i16 (i32 0), (i32 0))>;
+def : Pat<(v16i8 (AArch64dup (i32 0))), (MOVIv16b_ns (i32 0))>;
// AdvSIMD MVNI
// EDIT per word & halfword: 2s, 4h, 4s, & 8h
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
defm MVNI : SIMDModifiedImmVectorShift<1, 0b10, 0b00, "mvni">;
def : InstAlias<"mvni $Vd.4h, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
@@ -4658,12 +4675,14 @@ def : Pat<(v8i16 (AArch64mvni_shift imm0
(MVNIv8i16 imm0_255:$imm8, imm:$shift)>;
// EDIT per word: 2s & 4s with MSL shifter
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
def MVNIv2s_msl : SIMDModifiedImmMoveMSL<0, 1, {1,1,0,?}, V64, "mvni", ".2s",
[(set (v2i32 V64:$Rd),
(AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s",
[(set (v4i32 V128:$Rd),
(AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+}
//----------------------------------------------------------------------------
// AdvSIMD indexed element
Modified: llvm/trunk/test/CodeGen/AArch64/arm64-build-vector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-build-vector.ll?rev=321824&r1=321823&r2=321824&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-build-vector.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-build-vector.ll Thu Jan 4 13:43:12 2018
@@ -1,23 +1,5 @@
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
-; Check that building up a vector w/ only one non-zero lane initializes
-; intelligently.
-define void @one_lane(i32* nocapture %out_int, i32 %skip0) nounwind {
-; CHECK-LABEL: one_lane:
-; CHECK: dup.16b v[[REG:[0-9]+]], wzr
-; CHECK-NEXT: mov.b v[[REG]][0], w1
-; v and q are aliases, and str is preferred against st.16b when possible
-; rdar://11246289
-; CHECK: str q[[REG]], [x0]
-; CHECK: ret
- %conv = trunc i32 %skip0 to i8
- %vset_lane = insertelement <16 x i8> <i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, i8 %conv, i32 0
- %tmp = bitcast i32* %out_int to <4 x i32>*
- %tmp1 = bitcast <16 x i8> %vset_lane to <4 x i32>
- store <4 x i32> %tmp1, <4 x i32>* %tmp, align 16
- ret void
-}
-
; Check that building a vector from floats doesn't insert an unnecessary
; copy for lane zero.
define <4 x float> @foo(float %a, float %b, float %c, float %d) nounwind {
Modified: llvm/trunk/test/CodeGen/AArch64/arm64-vector-insertion.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-vector-insertion.ll?rev=321824&r1=321823&r2=321824&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-vector-insertion.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-vector-insertion.ll Thu Jan 4 13:43:12 2018
@@ -8,7 +8,7 @@ entry:
ret void
; CHECK-LABEL: test0f
- ; CHECK: movi.2d v[[TEMP:[0-9]+]], #0000000000000000
+ ; CHECK: movi.4s v[[TEMP:[0-9]+]], #0
; CHECK: mov.s v[[TEMP]][0], v{{[0-9]+}}[0]
; CHECK: str q[[TEMP]], [x0]
; CHECK: ret
@@ -16,7 +16,6 @@ entry:
}
-
define void @test1f(float* nocapture %x, float %a) #0 {
entry:
%0 = insertelement <4 x float> <float undef, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, float %a, i32 0
Added: llvm/trunk/test/CodeGen/AArch64/build-one-lane.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/build-one-lane.ll?rev=321824&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/build-one-lane.ll (added)
+++ llvm/trunk/test/CodeGen/AArch64/build-one-lane.ll Thu Jan 4 13:43:12 2018
@@ -0,0 +1,84 @@
+; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s
+
+; Check that building up a vector w/ only one non-zero lane initializes
+; intelligently.
+
+define <8 x i8> @v8i8(i8 %t, i8 %s) nounwind {
+ %v = insertelement <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef>, i8 %s, i32 7
+ ret <8 x i8> %v
+
+; CHECK: movi v[[R:[0-9]+]].8b, #0
+; CHECK: mov v[[R]].b[7], w{{[0-9]+}}
+}
+
+define <16 x i8> @v16i8(i8 %t, i8 %s) nounwind {
+ %v = insertelement <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef>, i8 %s, i32 15
+ ret <16 x i8> %v
+
+; CHECK: movi v[[R:[0-9]+]].16b, #0
+; CHECK: mov v[[R]].b[15], w{{[0-9]+}}
+}
+
+define <4 x i16> @v4i16(i16 %t, i16 %s) nounwind {
+ %v = insertelement <4 x i16> <i16 0, i16 0, i16 0, i16 undef>, i16 %s, i32 3
+ ret <4 x i16> %v
+
+; CHECK: movi v[[R:[0-9]+]].4h, #0
+; CHECK: mov v[[R]].h[3], w{{[0-9]+}}
+}
+
+define <8 x i16> @v8i16(i16 %t, i16 %s) nounwind {
+ %v = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 undef>, i16 %s, i32 7
+ ret <8 x i16> %v
+
+; CHECK: movi v[[R:[0-9]+]].8h, #0
+; CHECK: mov v[[R]].h[7], w{{[0-9]+}}
+}
+
+define <2 x i32> @v2i32(i32 %t, i32 %s) nounwind {
+ %v = insertelement <2 x i32> <i32 0, i32 undef>, i32 %s, i32 1
+ ret <2 x i32> %v
+
+; CHECK: movi v[[R:[0-9]+]].2s, #0
+; CHECK: mov v[[R]].s[1], w{{[0-9]+}}
+}
+
+define <4 x i32> @v4i32(i32 %t, i32 %s) nounwind {
+ %v = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 undef>, i32 %s, i32 3
+ ret <4 x i32> %v
+
+; CHECK: movi v[[R:[0-9]+]].4s, #0
+; CHECK: mov v[[R]].s[3], w{{[0-9]+}}
+}
+
+define <2 x i64> @v2i64(i64 %t, i64 %s) nounwind {
+ %v = insertelement <2 x i64> <i64 0, i64 undef>, i64 %s, i32 1
+ ret <2 x i64> %v
+
+; CHECK: movi v[[R:[0-9]+]].2d, #0
+; CHECK: mov v[[R]].d[1], x{{[0-9]+}}
+}
+
+define <2 x float> @v2f32(float %t, float %s) nounwind {
+ %v = insertelement <2 x float> <float 0.0, float undef>, float %s, i32 1
+ ret <2 x float> %v
+
+; CHECK: movi v[[R:[0-9]+]].2s, #0
+; CHECK: mov v[[R]].s[1], v{{[0-9]+}}.s[0]
+}
+
+define <4 x float> @v4f32(float %t, float %s) nounwind {
+ %v = insertelement <4 x float> <float 0.0, float 0.0, float 0.0, float undef>, float %s, i32 3
+ ret <4 x float> %v
+
+; CHECK: movi v[[R:[0-9]+]].4s, #0
+; CHECK: mov v[[R]].s[3], v{{[0-9]+}}.s[0]
+}
+
+define <2 x double> @v2f64(double %t, double %s) nounwind {
+ %v = insertelement <2 x double> <double 0.0, double undef>, double %s, i32 1
+ ret <2 x double> %v
+
+; CHECK: movi v[[R:[0-9]+]].2d, #0
+; CHECK: mov v[[R]].d[1], v{{[0-9]+}}.d[0]
+}