[llvm] r332394 - [AArch64] Improve single vector lane unscaled stores
Evandro Menezes via llvm-commits
llvm-commits at lists.llvm.org
Tue May 15 13:41:13 PDT 2018
Author: evandro
Date: Tue May 15 13:41:12 2018
New Revision: 332394
URL: http://llvm.org/viewvc/llvm-project?rev=332394&view=rev
Log:
[AArch64] Improve single vector lane unscaled stores
When storing the 0th lane of a vector, use a simpler and usually more
efficient scalar store instead. This change applies the same approach
to stores that use an unscaled offset (STUR).
Differential revision: https://reviews.llvm.org/D46762
Modified:
llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td
llvm/trunk/test/CodeGen/AArch64/arm64-st1.ll
llvm/trunk/test/CodeGen/AArch64/fp16-vector-load-store.ll
Modified: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td?rev=332394&r1=332393&r2=332394&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td Tue May 15 13:41:12 2018
@@ -2477,6 +2477,22 @@ def : Pat<(truncstorei16 GPR64:$Rt, (am_
def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
(STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+// Match stores from lane 0 to the appropriate subreg's store.
+multiclass VecStoreULane0Pat<SDPatternOperator StoreOp,
+ ValueType VTy, ValueType STy,
+ SubRegIndex SubRegIdx, Instruction STR> {
+ defm : VecStoreLane0Pat<am_unscaled128, StoreOp, VTy, STy, SubRegIdx, simm9, STR>;
+}
+
+let AddedComplexity = 19 in {
+ defm : VecStoreULane0Pat<truncstorei16, v8i16, i32, hsub, STURHi>;
+ defm : VecStoreULane0Pat<store, v8f16, f16, hsub, STURHi>;
+ defm : VecStoreULane0Pat<store, v4i32, i32, ssub, STURSi>;
+ defm : VecStoreULane0Pat<store, v4f32, f32, ssub, STURSi>;
+ defm : VecStoreULane0Pat<store, v2i64, i64, dsub, STURDi>;
+ defm : VecStoreULane0Pat<store, v2f64, f64, dsub, STURDi>;
+}
+
//---
// STR mnemonics fall back to STUR for negative or unaligned offsets.
def : InstAlias<"str $Rt, [$Rn, $offset]",
Modified: llvm/trunk/test/CodeGen/AArch64/arm64-st1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-st1.ll?rev=332394&r1=332393&r2=332394&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-st1.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-st1.ll Tue May 15 13:41:12 2018
@@ -4,23 +4,25 @@
define void @st1lane_16b(<16 x i8> %A, i8* %D) {
; CHECK-LABEL: st1lane_16b
-; CHECK: st1.b
+; CHECK: st1.b { v0 }[1], [x{{[0-9]+}}]
+ %ptr = getelementptr i8, i8* %D, i64 1
%tmp = extractelement <16 x i8> %A, i32 1
- store i8 %tmp, i8* %D
+ store i8 %tmp, i8* %ptr
ret void
}
define void @st1lane0_16b(<16 x i8> %A, i8* %D) {
; CHECK-LABEL: st1lane0_16b
-; CHECK: st1.b
+; CHECK: st1.b { v0 }[0], [x{{[0-9]+}}]
+ %ptr = getelementptr i8, i8* %D, i64 1
%tmp = extractelement <16 x i8> %A, i32 0
- store i8 %tmp, i8* %D
+ store i8 %tmp, i8* %ptr
ret void
}
define void @st1lane0u_16b(<16 x i8> %A, i8* %D) {
; CHECK-LABEL: st1lane0u_16b
-; CHECK: st1.b
+; CHECK: st1.b { v0 }[0], [x{{[0-9]+}}]
%ptr = getelementptr i8, i8* %D, i64 -1
%tmp = extractelement <16 x i8> %A, i32 0
store i8 %tmp, i8* %ptr
@@ -49,23 +51,25 @@ define void @st1lane0_ro_16b(<16 x i8> %
define void @st1lane_8h(<8 x i16> %A, i16* %D) {
; CHECK-LABEL: st1lane_8h
-; CHECK: st1.h
+; CHECK: st1.h { v0 }[1], [x{{[0-9]+}}]
+ %ptr = getelementptr i16, i16* %D, i64 1
%tmp = extractelement <8 x i16> %A, i32 1
- store i16 %tmp, i16* %D
+ store i16 %tmp, i16* %ptr
ret void
}
define void @st1lane0_8h(<8 x i16> %A, i16* %D) {
; CHECK-LABEL: st1lane0_8h
-; CHECK: str
+; CHECK: str h0, [x0, #2]
+ %ptr = getelementptr i16, i16* %D, i64 1
%tmp = extractelement <8 x i16> %A, i32 0
- store i16 %tmp, i16* %D
+ store i16 %tmp, i16* %ptr
ret void
}
define void @st1lane0u_8h(<8 x i16> %A, i16* %D) {
; CHECK-LABEL: st1lane0u_8h
-; CHECK: st1.h
+; CHECK: stur h0, [x0, #-2]
%ptr = getelementptr i16, i16* %D, i64 -1
%tmp = extractelement <8 x i16> %A, i32 0
store i16 %tmp, i16* %ptr
@@ -93,23 +97,25 @@ define void @st1lane0_ro_8h(<8 x i16> %A
define void @st1lane_4s(<4 x i32> %A, i32* %D) {
; CHECK-LABEL: st1lane_4s
-; CHECK: st1.s
+; CHECK: st1.s { v0 }[1], [x{{[0-9]+}}]
+ %ptr = getelementptr i32, i32* %D, i64 1
%tmp = extractelement <4 x i32> %A, i32 1
- store i32 %tmp, i32* %D
+ store i32 %tmp, i32* %ptr
ret void
}
define void @st1lane0_4s(<4 x i32> %A, i32* %D) {
; CHECK-LABEL: st1lane0_4s
-; CHECK: str
+; CHECK: str s0, [x0, #4]
+ %ptr = getelementptr i32, i32* %D, i64 1
%tmp = extractelement <4 x i32> %A, i32 0
- store i32 %tmp, i32* %D
+ store i32 %tmp, i32* %ptr
ret void
}
define void @st1lane0u_4s(<4 x i32> %A, i32* %D) {
; CHECK-LABEL: st1lane0u_4s
-; CHECK: st1.s
+; CHECK: stur s0, [x0, #-4]
%ptr = getelementptr i32, i32* %D, i64 -1
%tmp = extractelement <4 x i32> %A, i32 0
store i32 %tmp, i32* %ptr
@@ -137,23 +143,25 @@ define void @st1lane0_ro_4s(<4 x i32> %A
define void @st1lane_4s_float(<4 x float> %A, float* %D) {
; CHECK-LABEL: st1lane_4s_float
-; CHECK: st1.s
+; CHECK: st1.s { v0 }[1], [x{{[0-9]+}}]
+ %ptr = getelementptr float, float* %D, i64 1
%tmp = extractelement <4 x float> %A, i32 1
- store float %tmp, float* %D
+ store float %tmp, float* %ptr
ret void
}
define void @st1lane0_4s_float(<4 x float> %A, float* %D) {
; CHECK-LABEL: st1lane0_4s_float
-; CHECK: str
+; CHECK: str s0, [x0, #4]
+ %ptr = getelementptr float, float* %D, i64 1
%tmp = extractelement <4 x float> %A, i32 0
- store float %tmp, float* %D
+ store float %tmp, float* %ptr
ret void
}
define void @st1lane0u_4s_float(<4 x float> %A, float* %D) {
; CHECK-LABEL: st1lane0u_4s_float
-; CHECK: st1.s
+; CHECK: stur s0, [x0, #-4]
%ptr = getelementptr float, float* %D, i64 -1
%tmp = extractelement <4 x float> %A, i32 0
store float %tmp, float* %ptr
@@ -181,23 +189,25 @@ define void @st1lane0_ro_4s_float(<4 x f
define void @st1lane_2d(<2 x i64> %A, i64* %D) {
; CHECK-LABEL: st1lane_2d
-; CHECK: st1.d
+; CHECK: st1.d { v0 }[1], [x{{[0-9]+}}]
+ %ptr = getelementptr i64, i64* %D, i64 1
%tmp = extractelement <2 x i64> %A, i32 1
- store i64 %tmp, i64* %D
+ store i64 %tmp, i64* %ptr
ret void
}
define void @st1lane0_2d(<2 x i64> %A, i64* %D) {
; CHECK-LABEL: st1lane0_2d
-; CHECK: str
+; CHECK: str d0, [x0, #8]
+ %ptr = getelementptr i64, i64* %D, i64 1
%tmp = extractelement <2 x i64> %A, i32 0
- store i64 %tmp, i64* %D
+ store i64 %tmp, i64* %ptr
ret void
}
define void @st1lane0u_2d(<2 x i64> %A, i64* %D) {
; CHECK-LABEL: st1lane0u_2d
-; CHECK: st1.d
+; CHECK: stur d0, [x0, #-8]
%ptr = getelementptr i64, i64* %D, i64 -1
%tmp = extractelement <2 x i64> %A, i32 0
store i64 %tmp, i64* %ptr
@@ -225,23 +235,25 @@ define void @st1lane0_ro_2d(<2 x i64> %A
define void @st1lane_2d_double(<2 x double> %A, double* %D) {
; CHECK-LABEL: st1lane_2d_double
-; CHECK: st1.d
+; CHECK: st1.d { v0 }[1], [x{{[0-9]+}}]
+ %ptr = getelementptr double, double* %D, i64 1
%tmp = extractelement <2 x double> %A, i32 1
- store double %tmp, double* %D
+ store double %tmp, double* %ptr
ret void
}
define void @st1lane0_2d_double(<2 x double> %A, double* %D) {
; CHECK-LABEL: st1lane0_2d_double
-; CHECK: str
+; CHECK: str d0, [x0, #8]
+ %ptr = getelementptr double, double* %D, i64 1
%tmp = extractelement <2 x double> %A, i32 0
- store double %tmp, double* %D
+ store double %tmp, double* %ptr
ret void
}
define void @st1lane0u_2d_double(<2 x double> %A, double* %D) {
; CHECK-LABEL: st1lane0u_2d_double
-; CHECK: st1.d
+; CHECK: stur d0, [x0, #-8]
%ptr = getelementptr double, double* %D, i64 -1
%tmp = extractelement <2 x double> %A, i32 0
store double %tmp, double* %ptr
@@ -269,9 +281,10 @@ define void @st1lane0_ro_2d_double(<2 x
define void @st1lane_8b(<8 x i8> %A, i8* %D) {
; CHECK-LABEL: st1lane_8b
-; CHECK: st1.b
+; CHECK: st1.b { v0 }[1], [x{{[0-9]+}}]
+ %ptr = getelementptr i8, i8* %D, i64 1
%tmp = extractelement <8 x i8> %A, i32 1
- store i8 %tmp, i8* %D
+ store i8 %tmp, i8* %ptr
ret void
}
@@ -297,23 +310,25 @@ define void @st1lane0_ro_8b(<8 x i8> %A,
define void @st1lane_4h(<4 x i16> %A, i16* %D) {
; CHECK-LABEL: st1lane_4h
-; CHECK: st1.h
+; CHECK: st1.h { v0 }[1], [x{{[0-9]+}}]
+ %ptr = getelementptr i16, i16* %D, i64 1
%tmp = extractelement <4 x i16> %A, i32 1
- store i16 %tmp, i16* %D
+ store i16 %tmp, i16* %ptr
ret void
}
define void @st1lane0_4h(<4 x i16> %A, i16* %D) {
; CHECK-LABEL: st1lane0_4h
-; CHECK: str
+; CHECK: str h0, [x0, #2]
+ %ptr = getelementptr i16, i16* %D, i64 1
%tmp = extractelement <4 x i16> %A, i32 0
- store i16 %tmp, i16* %D
+ store i16 %tmp, i16* %ptr
ret void
}
define void @st1lane0u_4h(<4 x i16> %A, i16* %D) {
; CHECK-LABEL: st1lane0u_4h
-; CHECK: st1.h
+; CHECK: stur h0, [x0, #-2]
%ptr = getelementptr i16, i16* %D, i64 -1
%tmp = extractelement <4 x i16> %A, i32 0
store i16 %tmp, i16* %ptr
@@ -341,23 +356,25 @@ define void @st1lane0_ro_4h(<4 x i16> %A
define void @st1lane_2s(<2 x i32> %A, i32* %D) {
; CHECK-LABEL: st1lane_2s
-; CHECK: st1.s
+; CHECK: st1.s { v0 }[1], [x{{[0-9]+}}]
+ %ptr = getelementptr i32, i32* %D, i64 1
%tmp = extractelement <2 x i32> %A, i32 1
- store i32 %tmp, i32* %D
+ store i32 %tmp, i32* %ptr
ret void
}
define void @st1lane0_2s(<2 x i32> %A, i32* %D) {
; CHECK-LABEL: st1lane0_2s
-; CHECK: str
+; CHECK: str s0, [x0, #4]
+ %ptr = getelementptr i32, i32* %D, i64 1
%tmp = extractelement <2 x i32> %A, i32 0
- store i32 %tmp, i32* %D
+ store i32 %tmp, i32* %ptr
ret void
}
define void @st1lane0u_2s(<2 x i32> %A, i32* %D) {
; CHECK-LABEL: st1lane0u_2s
-; CHECK: st1.s
+; CHECK: stur s0, [x0, #-4]
%ptr = getelementptr i32, i32* %D, i64 -1
%tmp = extractelement <2 x i32> %A, i32 0
store i32 %tmp, i32* %ptr
@@ -385,23 +402,25 @@ define void @st1lane0_ro_2s(<2 x i32> %A
define void @st1lane_2s_float(<2 x float> %A, float* %D) {
; CHECK-LABEL: st1lane_2s_float
-; CHECK: st1.s
+; CHECK: st1.s { v0 }[1], [x{{[0-9]+}}]
+ %ptr = getelementptr float, float* %D, i64 1
%tmp = extractelement <2 x float> %A, i32 1
- store float %tmp, float* %D
+ store float %tmp, float* %ptr
ret void
}
define void @st1lane0_2s_float(<2 x float> %A, float* %D) {
; CHECK-LABEL: st1lane0_2s_float
-; CHECK: str
+; CHECK: str s0, [x0, #4]
+ %ptr = getelementptr float, float* %D, i64 1
%tmp = extractelement <2 x float> %A, i32 0
- store float %tmp, float* %D
+ store float %tmp, float* %ptr
ret void
}
define void @st1lane0u_2s_float(<2 x float> %A, float* %D) {
; CHECK-LABEL: st1lane0u_2s_float
-; CHECK: st1.s
+; CHECK: stur s0, [x0, #-4]
%ptr = getelementptr float, float* %D, i64 -1
%tmp = extractelement <2 x float> %A, i32 0
store float %tmp, float* %ptr
@@ -429,15 +448,16 @@ define void @st1lane0_ro_2s_float(<2 x f
define void @st1lane0_1d(<1 x i64> %A, i64* %D) {
; CHECK-LABEL: st1lane0_1d
-; CHECK: str
+; CHECK: str d0, [x0, #8]
+ %ptr = getelementptr i64, i64* %D, i64 1
%tmp = extractelement <1 x i64> %A, i32 0
- store i64 %tmp, i64* %D
+ store i64 %tmp, i64* %ptr
ret void
}
define void @st1lane0u_1d(<1 x i64> %A, i64* %D) {
; CHECK-LABEL: st1lane0u_1d
-; CHECK: st1.d
+; CHECK: stur d0, [x0, #-8]
%ptr = getelementptr i64, i64* %D, i64 -1
%tmp = extractelement <1 x i64> %A, i32 0
store i64 %tmp, i64* %ptr
@@ -455,15 +475,16 @@ define void @st1lane0_ro_1d(<1 x i64> %A
define void @st1lane0_1d_double(<1 x double> %A, double* %D) {
; CHECK-LABEL: st1lane0_1d_double
-; CHECK: str
+; CHECK: str d0, [x0, #8]
+ %ptr = getelementptr double, double* %D, i64 1
%tmp = extractelement <1 x double> %A, i32 0
- store double %tmp, double* %D
+ store double %tmp, double* %ptr
ret void
}
define void @st1lane0u_1d_double(<1 x double> %A, double* %D) {
; CHECK-LABEL: st1lane0u_1d_double
-; CHECK: stur
+; CHECK: stur d0, [x0, #-8]
%ptr = getelementptr double, double* %D, i64 -1
%tmp = extractelement <1 x double> %A, i32 0
store double %tmp, double* %ptr
Modified: llvm/trunk/test/CodeGen/AArch64/fp16-vector-load-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/fp16-vector-load-store.ll?rev=332394&r1=332393&r2=332394&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/fp16-vector-load-store.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/fp16-vector-load-store.ll Tue May 15 13:41:12 2018
@@ -99,7 +99,7 @@ entry:
define void @storeu_lane0_64(half* nocapture %a, <4 x half> %b) #1 {
; CHECK-LABEL: storeu_lane0_64:
-; CHECK: st1 { v0.h }[0], [x{{[0-9]+}}]
+; CHECK: stur h0, [x{{[0-9]+}}, #-2]
entry:
%0 = getelementptr half, half* %a, i64 -1
%1 = extractelement <4 x half> %b, i32 0
@@ -148,7 +148,7 @@ entry:
define void @storeu_lane0_128(half* nocapture %a, <8 x half> %b) #1 {
; CHECK-LABEL: storeu_lane0_128:
-; CHECK: st1 { v0.h }[0], [x{{[0-9]+}}]
+; CHECK: stur h0, [x{{[0-9]+}}, #-2]
entry:
%0 = getelementptr half, half* %a, i64 -1
%1 = extractelement <8 x half> %b, i32 0
More information about the llvm-commits
mailing list