[llvm] r332251 - [AArch64] Improve single vector lane stores
Evandro Menezes via llvm-commits
llvm-commits at lists.llvm.org
Mon May 14 08:26:35 PDT 2018
Author: evandro
Date: Mon May 14 08:26:35 2018
New Revision: 332251
URL: http://llvm.org/viewvc/llvm-project?rev=332251&view=rev
Log:
[AArch64] Improve single vector lane stores
When storing the 0th lane of a vector, use a simpler and usually more efficient scalar store instead.
Differential revision: https://reviews.llvm.org/D46655
Modified:
llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td
llvm/trunk/test/CodeGen/AArch64/arm64-neon-copy.ll
llvm/trunk/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
llvm/trunk/test/CodeGen/AArch64/arm64-st1.ll
llvm/trunk/test/CodeGen/AArch64/fp16-vector-load-store.ll
Modified: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td?rev=332251&r1=332250&r2=332251&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td Mon May 14 08:26:35 2018
@@ -2249,12 +2249,11 @@ multiclass VecROStoreLane0Pat<ROAddrMode
let AddedComplexity = 19 in {
defm : VecROStoreLane0Pat<ro16, truncstorei16, v8i16, i32, hsub, STRHroW, STRHroX>;
- defm : VecROStoreLane0Pat<ro16, store , v8i16, i16, hsub, STRHroW, STRHroX>;
- defm : VecROStoreLane0Pat<ro32, truncstorei32, v4i32, i32, ssub, STRSroW, STRSroX>;
- defm : VecROStoreLane0Pat<ro32, store , v4i32, i32, ssub, STRSroW, STRSroX>;
- defm : VecROStoreLane0Pat<ro32, store , v4f32, f32, ssub, STRSroW, STRSroX>;
- defm : VecROStoreLane0Pat<ro64, store , v2i64, i64, dsub, STRDroW, STRDroX>;
- defm : VecROStoreLane0Pat<ro64, store , v2f64, f64, dsub, STRDroW, STRDroX>;
+ defm : VecROStoreLane0Pat<ro16, store, v8f16, f16, hsub, STRHroW, STRHroX>;
+ defm : VecROStoreLane0Pat<ro32, store, v4i32, i32, ssub, STRSroW, STRSroX>;
+ defm : VecROStoreLane0Pat<ro32, store, v4f32, f32, ssub, STRSroW, STRSroX>;
+ defm : VecROStoreLane0Pat<ro64, store, v2i64, i64, dsub, STRDroW, STRDroX>;
+ defm : VecROStoreLane0Pat<ro64, store, v2f64, f64, dsub, STRDroW, STRDroX>;
}
//---
@@ -2288,8 +2287,16 @@ defm STRBB : StoreUIz<0b00, 0, 0b00, GPR
(am_indexed8 GPR64sp:$Rn,
uimm12s1:$offset))]>;
-// Match all store 64 bits width whose type is compatible with FPR64
let AddedComplexity = 10 in {
+
+// Match all store 64 bits width whose type is compatible with FPR64
+def : Pat<(store (v1i64 FPR64:$Rt),
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+ (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(store (v1f64 FPR64:$Rt),
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+ (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+
let Predicates = [IsLE] in {
// We must use ST1 to store vectors in big-endian.
def : Pat<(store (v2f32 FPR64:$Rt),
@@ -2308,14 +2315,12 @@ let Predicates = [IsLE] in {
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
(STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
}
-def : Pat<(store (v1f64 FPR64:$Rt),
- (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
- (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
-def : Pat<(store (v1i64 FPR64:$Rt),
- (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
- (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
// Match all store 128 bits width whose type is compatible with FPR128
+def : Pat<(store (f128 FPR128:$Rt),
+ (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+ (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+
let Predicates = [IsLE] in {
// We must use ST1 to store vectors in big-endian.
def : Pat<(store (v4f32 FPR128:$Rt),
@@ -2340,9 +2345,6 @@ let Predicates = [IsLE] in {
(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
}
-def : Pat<(store (f128 FPR128:$Rt),
- (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
- (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
// truncstore i64
def : Pat<(truncstorei32 GPR64:$Rt,
@@ -2356,6 +2358,26 @@ def : Pat<(truncstorei8 GPR64:$Rt, (am_i
} // AddedComplexity = 10
+// Match stores from lane 0 to the appropriate subreg's store.
+multiclass VecStoreLane0Pat<Operand UIAddrMode, SDPatternOperator storeop,
+ ValueType VTy, ValueType STy,
+ SubRegIndex SubRegIdx, Operand IndexType,
+ Instruction STR> {
+ def : Pat<(storeop (STy (vector_extract (VTy VecListOne128:$Vt), 0)),
+ (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
+ (STR (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
+ GPR64sp:$Rn, IndexType:$offset)>;
+}
+
+let AddedComplexity = 19 in {
+ defm : VecStoreLane0Pat<am_indexed16, truncstorei16, v8i16, i32, hsub, uimm12s2, STRHui>;
+ defm : VecStoreLane0Pat<am_indexed16, store, v8f16, f16, hsub, uimm12s2, STRHui>;
+ defm : VecStoreLane0Pat<am_indexed32, store, v4i32, i32, ssub, uimm12s4, STRSui>;
+ defm : VecStoreLane0Pat<am_indexed32, store, v4f32, f32, ssub, uimm12s4, STRSui>;
+ defm : VecStoreLane0Pat<am_indexed64, store, v2i64, i64, dsub, uimm12s8, STRDui>;
+ defm : VecStoreLane0Pat<am_indexed64, store, v2f64, f64, dsub, uimm12s8, STRDui>;
+}
+
//---
// (unscaled immediate)
defm STURX : StoreUnscaled<0b11, 0, 0b00, GPR64z, "stur",
@@ -2387,6 +2409,13 @@ defm STURBB : StoreUnscaled<0b00, 0, 0b0
(am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
// Match all store 64 bits width whose type is compatible with FPR64
+def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+ (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(store (v1i64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+ (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+
+let AddedComplexity = 10 in {
+
let Predicates = [IsLE] in {
// We must use ST1 to store vectors in big-endian.
def : Pat<(store (v2f32 FPR64:$Rt),
@@ -2405,12 +2434,11 @@ let Predicates = [IsLE] in {
(am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
(STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
}
-def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
- (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
-def : Pat<(store (v1i64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
- (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
// Match all store 128 bits width whose type is compatible with FPR128
+def : Pat<(store (f128 FPR128:$Rt), (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+ (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+
let Predicates = [IsLE] in {
// We must use ST1 to store vectors in big-endian.
def : Pat<(store (v4f32 FPR128:$Rt),
@@ -2439,6 +2467,8 @@ let Predicates = [IsLE] in {
(STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
}
+} // AddedComplexity = 10
+
// unscaled i64 truncating stores
def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)),
(STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
@@ -4151,12 +4181,18 @@ def : Pat<(v4i16 (scalar_to_vector GPR32
(SUBREG_TO_REG (i32 0),
(f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
+def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
+ (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
+ (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+
def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))),
(v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
(i32 FPR32:$Rn), ssub))>;
def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))),
(v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
(i32 FPR32:$Rn), ssub))>;
+
def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))),
(v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
(i64 FPR64:$Rn), dsub))>;
@@ -4170,6 +4206,7 @@ def : Pat<(v4f32 (scalar_to_vector (f32
(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
(INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
+
def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))),
(INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>;
Modified: llvm/trunk/test/CodeGen/AArch64/arm64-neon-copy.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-neon-copy.ll?rev=332251&r1=332250&r2=332251&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-neon-copy.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-neon-copy.ll Mon May 14 08:26:35 2018
@@ -925,7 +925,7 @@ define <4 x i16> @test_extracts_inserts_
; CHECK-LABEL: test_extracts_inserts_varidx_insert:
; CHECK: and [[MASKED_IDX:x[0-9]+]], x0, #0x3
; CHECK: bfi x9, [[MASKED_IDX]], #1, #2
-; CHECK: st1 { v0.h }[0], [x9]
+; CHECK: str h0, [x9]
; CHECK-DAG: ldr d[[R:[0-9]+]]
; CHECK-DAG: mov v[[R]].h[1], v0.h[1]
; CHECK-DAG: mov v[[R]].h[2], v0.h[2]
Modified: llvm/trunk/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll?rev=332251&r1=332250&r2=332251&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll Mon May 14 08:26:35 2018
@@ -391,6 +391,15 @@ entry:
ret void
}
+define void @test_vst1q_lane0_s16(i16* %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vst1q_lane0_s16:
+; CHECK: str {{h[0-9]+}}, [x0]
+entry:
+ %0 = extractelement <8 x i16> %b, i32 0
+ store i16 %0, i16* %a, align 2
+ ret void
+}
+
define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) {
; CHECK-LABEL: test_vst1q_lane_s32:
; CHECK: st1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
@@ -400,6 +409,15 @@ entry:
ret void
}
+define void @test_vst1q_lane0_s32(i32* %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vst1q_lane0_s32:
+; CHECK: str {{s[0-9]+}}, [x0]
+entry:
+ %0 = extractelement <4 x i32> %b, i32 0
+ store i32 %0, i32* %a, align 4
+ ret void
+}
+
define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) {
; CHECK-LABEL: test_vst1q_lane_s64:
; CHECK: st1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
@@ -409,6 +427,15 @@ entry:
ret void
}
+define void @test_vst1q_lane0_s64(i64* %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vst1q_lane0_s64:
+; CHECK: str {{d[0-9]+}}, [x0]
+entry:
+ %0 = extractelement <2 x i64> %b, i32 0
+ store i64 %0, i64* %a, align 8
+ ret void
+}
+
define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) {
; CHECK-LABEL: test_vst1q_lane_f32:
; CHECK: st1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
@@ -418,6 +445,15 @@ entry:
ret void
}
+define void @test_vst1q_lane0_f32(float* %a, <4 x float> %b) {
+; CHECK-LABEL: test_vst1q_lane0_f32:
+; CHECK: str {{s[0-9]+}}, [x0]
+entry:
+ %0 = extractelement <4 x float> %b, i32 0
+ store float %0, float* %a, align 4
+ ret void
+}
+
define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) {
; CHECK-LABEL: test_vst1q_lane_f64:
; CHECK: st1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
@@ -427,6 +463,15 @@ entry:
ret void
}
+define void @test_vst1q_lane0_f64(double* %a, <2 x double> %b) {
+; CHECK-LABEL: test_vst1q_lane0_f64:
+; CHECK: str {{d[0-9]+}}, [x0]
+entry:
+ %0 = extractelement <2 x double> %b, i32 0
+ store double %0, double* %a, align 8
+ ret void
+}
+
define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) {
; CHECK-LABEL: test_vst1_lane_s8:
; CHECK: st1 { {{v[0-9]+}}.b }[{{[0-9]+}}], [x0]
@@ -445,6 +490,15 @@ entry:
ret void
}
+define void @test_vst1_lane0_s16(i16* %a, <4 x i16> %b) {
+; CHECK-LABEL: test_vst1_lane0_s16:
+; CHECK: str {{h[0-9]+}}, [x0]
+entry:
+ %0 = extractelement <4 x i16> %b, i32 0
+ store i16 %0, i16* %a, align 2
+ ret void
+}
+
define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) {
; CHECK-LABEL: test_vst1_lane_s32:
; CHECK: st1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
@@ -454,9 +508,18 @@ entry:
ret void
}
+define void @test_vst1_lane0_s32(i32* %a, <2 x i32> %b) {
+; CHECK-LABEL: test_vst1_lane0_s32:
+; CHECK: str {{s[0-9]+}}, [x0]
+entry:
+ %0 = extractelement <2 x i32> %b, i32 0
+ store i32 %0, i32* %a, align 4
+ ret void
+}
+
define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) {
; CHECK-LABEL: test_vst1_lane_s64:
-; CHECK: st1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
+; CHECK: str {{d[0-9]+}}, [x0]
entry:
%0 = extractelement <1 x i64> %b, i32 0
store i64 %0, i64* %a, align 8
@@ -471,6 +534,15 @@ entry:
store float %0, float* %a, align 4
ret void
}
+
+define void @test_vst1_lane0_f32(float* %a, <2 x float> %b) {
+; CHECK-LABEL: test_vst1_lane0_f32:
+; CHECK: str {{s[0-9]+}}, [x0]
+entry:
+ %0 = extractelement <2 x float> %b, i32 0
+ store float %0, float* %a, align 4
+ ret void
+}
define void @test_vst1_lane_f64(double* %a, <1 x double> %b) {
; CHECK-LABEL: test_vst1_lane_f64:
Modified: llvm/trunk/test/CodeGen/AArch64/arm64-st1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-st1.ll?rev=332251&r1=332250&r2=332251&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-st1.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-st1.ll Mon May 14 08:26:35 2018
@@ -10,6 +10,23 @@ define void @st1lane_16b(<16 x i8> %A, i
ret void
}
+define void @st1lane0_16b(<16 x i8> %A, i8* %D) {
+; CHECK-LABEL: st1lane0_16b
+; CHECK: st1.b
+ %tmp = extractelement <16 x i8> %A, i32 0
+ store i8 %tmp, i8* %D
+ ret void
+}
+
+define void @st1lane0u_16b(<16 x i8> %A, i8* %D) {
+; CHECK-LABEL: st1lane0u_16b
+; CHECK: st1.b
+ %ptr = getelementptr i8, i8* %D, i64 -1
+ %tmp = extractelement <16 x i8> %A, i32 0
+ store i8 %tmp, i8* %ptr
+ ret void
+}
+
define void @st1lane_ro_16b(<16 x i8> %A, i8* %D, i64 %offset) {
; CHECK-LABEL: st1lane_ro_16b
; CHECK: add x[[XREG:[0-9]+]], x0, x1
@@ -38,6 +55,23 @@ define void @st1lane_8h(<8 x i16> %A, i1
ret void
}
+define void @st1lane0_8h(<8 x i16> %A, i16* %D) {
+; CHECK-LABEL: st1lane0_8h
+; CHECK: str
+ %tmp = extractelement <8 x i16> %A, i32 0
+ store i16 %tmp, i16* %D
+ ret void
+}
+
+define void @st1lane0u_8h(<8 x i16> %A, i16* %D) {
+; CHECK-LABEL: st1lane0u_8h
+; CHECK: st1.h
+ %ptr = getelementptr i16, i16* %D, i64 -1
+ %tmp = extractelement <8 x i16> %A, i32 0
+ store i16 %tmp, i16* %ptr
+ ret void
+}
+
define void @st1lane_ro_8h(<8 x i16> %A, i16* %D, i64 %offset) {
; CHECK-LABEL: st1lane_ro_8h
; CHECK: add x[[XREG:[0-9]+]], x0, x1
@@ -65,6 +99,23 @@ define void @st1lane_4s(<4 x i32> %A, i3
ret void
}
+define void @st1lane0_4s(<4 x i32> %A, i32* %D) {
+; CHECK-LABEL: st1lane0_4s
+; CHECK: str
+ %tmp = extractelement <4 x i32> %A, i32 0
+ store i32 %tmp, i32* %D
+ ret void
+}
+
+define void @st1lane0u_4s(<4 x i32> %A, i32* %D) {
+; CHECK-LABEL: st1lane0u_4s
+; CHECK: st1.s
+ %ptr = getelementptr i32, i32* %D, i64 -1
+ %tmp = extractelement <4 x i32> %A, i32 0
+ store i32 %tmp, i32* %ptr
+ ret void
+}
+
define void @st1lane_ro_4s(<4 x i32> %A, i32* %D, i64 %offset) {
; CHECK-LABEL: st1lane_ro_4s
; CHECK: add x[[XREG:[0-9]+]], x0, x1
@@ -92,6 +143,23 @@ define void @st1lane_4s_float(<4 x float
ret void
}
+define void @st1lane0_4s_float(<4 x float> %A, float* %D) {
+; CHECK-LABEL: st1lane0_4s_float
+; CHECK: str
+ %tmp = extractelement <4 x float> %A, i32 0
+ store float %tmp, float* %D
+ ret void
+}
+
+define void @st1lane0u_4s_float(<4 x float> %A, float* %D) {
+; CHECK-LABEL: st1lane0u_4s_float
+; CHECK: st1.s
+ %ptr = getelementptr float, float* %D, i64 -1
+ %tmp = extractelement <4 x float> %A, i32 0
+ store float %tmp, float* %ptr
+ ret void
+}
+
define void @st1lane_ro_4s_float(<4 x float> %A, float* %D, i64 %offset) {
; CHECK-LABEL: st1lane_ro_4s_float
; CHECK: add x[[XREG:[0-9]+]], x0, x1
@@ -119,6 +187,23 @@ define void @st1lane_2d(<2 x i64> %A, i6
ret void
}
+define void @st1lane0_2d(<2 x i64> %A, i64* %D) {
+; CHECK-LABEL: st1lane0_2d
+; CHECK: str
+ %tmp = extractelement <2 x i64> %A, i32 0
+ store i64 %tmp, i64* %D
+ ret void
+}
+
+define void @st1lane0u_2d(<2 x i64> %A, i64* %D) {
+; CHECK-LABEL: st1lane0u_2d
+; CHECK: st1.d
+ %ptr = getelementptr i64, i64* %D, i64 -1
+ %tmp = extractelement <2 x i64> %A, i32 0
+ store i64 %tmp, i64* %ptr
+ ret void
+}
+
define void @st1lane_ro_2d(<2 x i64> %A, i64* %D, i64 %offset) {
; CHECK-LABEL: st1lane_ro_2d
; CHECK: add x[[XREG:[0-9]+]], x0, x1
@@ -146,6 +231,23 @@ define void @st1lane_2d_double(<2 x doub
ret void
}
+define void @st1lane0_2d_double(<2 x double> %A, double* %D) {
+; CHECK-LABEL: st1lane0_2d_double
+; CHECK: str
+ %tmp = extractelement <2 x double> %A, i32 0
+ store double %tmp, double* %D
+ ret void
+}
+
+define void @st1lane0u_2d_double(<2 x double> %A, double* %D) {
+; CHECK-LABEL: st1lane0u_2d_double
+; CHECK: st1.d
+ %ptr = getelementptr double, double* %D, i64 -1
+ %tmp = extractelement <2 x double> %A, i32 0
+ store double %tmp, double* %ptr
+ ret void
+}
+
define void @st1lane_ro_2d_double(<2 x double> %A, double* %D, i64 %offset) {
; CHECK-LABEL: st1lane_ro_2d_double
; CHECK: add x[[XREG:[0-9]+]], x0, x1
@@ -201,6 +303,23 @@ define void @st1lane_4h(<4 x i16> %A, i1
ret void
}
+define void @st1lane0_4h(<4 x i16> %A, i16* %D) {
+; CHECK-LABEL: st1lane0_4h
+; CHECK: str
+ %tmp = extractelement <4 x i16> %A, i32 0
+ store i16 %tmp, i16* %D
+ ret void
+}
+
+define void @st1lane0u_4h(<4 x i16> %A, i16* %D) {
+; CHECK-LABEL: st1lane0u_4h
+; CHECK: st1.h
+ %ptr = getelementptr i16, i16* %D, i64 -1
+ %tmp = extractelement <4 x i16> %A, i32 0
+ store i16 %tmp, i16* %ptr
+ ret void
+}
+
define void @st1lane_ro_4h(<4 x i16> %A, i16* %D, i64 %offset) {
; CHECK-LABEL: st1lane_ro_4h
; CHECK: add x[[XREG:[0-9]+]], x0, x1
@@ -228,6 +347,23 @@ define void @st1lane_2s(<2 x i32> %A, i3
ret void
}
+define void @st1lane0_2s(<2 x i32> %A, i32* %D) {
+; CHECK-LABEL: st1lane0_2s
+; CHECK: str
+ %tmp = extractelement <2 x i32> %A, i32 0
+ store i32 %tmp, i32* %D
+ ret void
+}
+
+define void @st1lane0u_2s(<2 x i32> %A, i32* %D) {
+; CHECK-LABEL: st1lane0u_2s
+; CHECK: st1.s
+ %ptr = getelementptr i32, i32* %D, i64 -1
+ %tmp = extractelement <2 x i32> %A, i32 0
+ store i32 %tmp, i32* %ptr
+ ret void
+}
+
define void @st1lane_ro_2s(<2 x i32> %A, i32* %D, i64 %offset) {
; CHECK-LABEL: st1lane_ro_2s
; CHECK: add x[[XREG:[0-9]+]], x0, x1
@@ -255,6 +391,23 @@ define void @st1lane_2s_float(<2 x float
ret void
}
+define void @st1lane0_2s_float(<2 x float> %A, float* %D) {
+; CHECK-LABEL: st1lane0_2s_float
+; CHECK: str
+ %tmp = extractelement <2 x float> %A, i32 0
+ store float %tmp, float* %D
+ ret void
+}
+
+define void @st1lane0u_2s_float(<2 x float> %A, float* %D) {
+; CHECK-LABEL: st1lane0u_2s_float
+; CHECK: st1.s
+ %ptr = getelementptr float, float* %D, i64 -1
+ %tmp = extractelement <2 x float> %A, i32 0
+ store float %tmp, float* %ptr
+ ret void
+}
+
define void @st1lane_ro_2s_float(<2 x float> %A, float* %D, i64 %offset) {
; CHECK-LABEL: st1lane_ro_2s_float
; CHECK: add x[[XREG:[0-9]+]], x0, x1
@@ -274,6 +427,58 @@ define void @st1lane0_ro_2s_float(<2 x f
ret void
}
+define void @st1lane0_1d(<1 x i64> %A, i64* %D) {
+; CHECK-LABEL: st1lane0_1d
+; CHECK: str
+ %tmp = extractelement <1 x i64> %A, i32 0
+ store i64 %tmp, i64* %D
+ ret void
+}
+
+define void @st1lane0u_1d(<1 x i64> %A, i64* %D) {
+; CHECK-LABEL: st1lane0u_1d
+; CHECK: st1.d
+ %ptr = getelementptr i64, i64* %D, i64 -1
+ %tmp = extractelement <1 x i64> %A, i32 0
+ store i64 %tmp, i64* %ptr
+ ret void
+}
+
+define void @st1lane0_ro_1d(<1 x i64> %A, i64* %D, i64 %offset) {
+; CHECK-LABEL: st1lane0_ro_1d
+; CHECK: str d0, [x0, x1, lsl #3]
+ %ptr = getelementptr i64, i64* %D, i64 %offset
+ %tmp = extractelement <1 x i64> %A, i32 0
+ store i64 %tmp, i64* %ptr
+ ret void
+}
+
+define void @st1lane0_1d_double(<1 x double> %A, double* %D) {
+; CHECK-LABEL: st1lane0_1d_double
+; CHECK: str
+ %tmp = extractelement <1 x double> %A, i32 0
+ store double %tmp, double* %D
+ ret void
+}
+
+define void @st1lane0u_1d_double(<1 x double> %A, double* %D) {
+; CHECK-LABEL: st1lane0u_1d_double
+; CHECK: stur
+ %ptr = getelementptr double, double* %D, i64 -1
+ %tmp = extractelement <1 x double> %A, i32 0
+ store double %tmp, double* %ptr
+ ret void
+}
+
+define void @st1lane0_ro_1d_double(<1 x double> %A, double* %D, i64 %offset) {
+; CHECK-LABEL: st1lane0_ro_1d_double
+; CHECK: str d0, [x0, x1, lsl #3]
+ %ptr = getelementptr double, double* %D, i64 %offset
+ %tmp = extractelement <1 x double> %A, i32 0
+ store double %tmp, double* %ptr
+ ret void
+}
+
define void @st2lane_16b(<16 x i8> %A, <16 x i8> %B, i8* %D) {
; CHECK-LABEL: st2lane_16b
; CHECK: st2.b
Modified: llvm/trunk/test/CodeGen/AArch64/fp16-vector-load-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/fp16-vector-load-store.ll?rev=332251&r1=332250&r2=332251&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/fp16-vector-load-store.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/fp16-vector-load-store.ll Mon May 14 08:26:35 2018
@@ -88,6 +88,45 @@ entry:
ret void
}
+define void @store_lane0_64(half* nocapture %a, <4 x half> %b) #1 {
+; CHECK-LABEL: store_lane0_64:
+; CHECK: str h0, [x0]
+entry:
+ %0 = extractelement <4 x half> %b, i32 0
+ store half %0, half* %a, align 2
+ ret void
+}
+
+define void @storeu_lane0_64(half* nocapture %a, <4 x half> %b) #1 {
+; CHECK-LABEL: storeu_lane0_64:
+; CHECK: st1 { v0.h }[0], [x{{[0-9]+}}]
+entry:
+ %0 = getelementptr half, half* %a, i64 -1
+ %1 = extractelement <4 x half> %b, i32 0
+ store half %1, half* %0, align 2
+ ret void
+}
+
+define void @storero_lane_64(half* nocapture %a, <4 x half> %b, i64 %c) #1 {
+; CHECK-LABEL: storero_lane_64:
+; CHECK: st1 { v0.h }[2], [x{{[0-9]+}}]
+entry:
+ %0 = getelementptr half, half* %a, i64 %c
+ %1 = extractelement <4 x half> %b, i32 2
+ store half %1, half* %0, align 2
+ ret void
+}
+
+define void @storero_lane0_64(half* nocapture %a, <4 x half> %b, i64 %c) #1 {
+; CHECK-LABEL: storero_lane0_64:
+; CHECK: str h0, [x0, x1, lsl #1]
+entry:
+ %0 = getelementptr half, half* %a, i64 %c
+ %1 = extractelement <4 x half> %b, i32 0
+ store half %1, half* %0, align 2
+ ret void
+}
+
; Store from one lane of v8f16
define void @store_lane_128(half* nocapture %a, <8 x half> %b) #1 {
; CHECK-LABEL: store_lane_128:
@@ -98,6 +137,45 @@ entry:
ret void
}
+define void @store_lane0_128(half* nocapture %a, <8 x half> %b) #1 {
+; CHECK-LABEL: store_lane0_128:
+; CHECK: str h0, [x0]
+entry:
+ %0 = extractelement <8 x half> %b, i32 0
+ store half %0, half* %a, align 2
+ ret void
+}
+
+define void @storeu_lane0_128(half* nocapture %a, <8 x half> %b) #1 {
+; CHECK-LABEL: storeu_lane0_128:
+; CHECK: st1 { v0.h }[0], [x{{[0-9]+}}]
+entry:
+ %0 = getelementptr half, half* %a, i64 -1
+ %1 = extractelement <8 x half> %b, i32 0
+ store half %1, half* %0, align 2
+ ret void
+}
+
+define void @storero_lane_128(half* nocapture %a, <8 x half> %b, i64 %c) #1 {
+; CHECK-LABEL: storero_lane_128:
+; CHECK: st1 { v0.h }[4], [x{{[0-9]+}}]
+entry:
+ %0 = getelementptr half, half* %a, i64 %c
+ %1 = extractelement <8 x half> %b, i32 4
+ store half %1, half* %0, align 2
+ ret void
+}
+
+define void @storero_lane0_128(half* nocapture %a, <8 x half> %b, i64 %c) #1 {
+; CHECK-LABEL: storero_lane0_128:
+; CHECK: str h0, [x0, x1, lsl #1]
+entry:
+ %0 = getelementptr half, half* %a, i64 %c
+ %1 = extractelement <8 x half> %b, i32 0
+ store half %1, half* %0, align 2
+ ret void
+}
+
; NEON intrinsics - (de-)interleaving loads and stores
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>*)
More information about the llvm-commits mailing list