[llvm] r332394 - [AArch64] Improve single vector lane unscaled stores

Evandro Menezes via llvm-commits llvm-commits at lists.llvm.org
Tue May 15 13:41:13 PDT 2018


Author: evandro
Date: Tue May 15 13:41:12 2018
New Revision: 332394

URL: http://llvm.org/viewvc/llvm-project?rev=332394&view=rev
Log:
[AArch64] Improve single vector lane unscaled stores

When storing the 0th lane of a vector, use a simpler and usually more
efficient scalar store instead.  This change extends that to stores
addressed with an unscaled offset, which now select STUR.

Differential revision: https://reviews.llvm.org/D46762

Modified:
    llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td
    llvm/trunk/test/CodeGen/AArch64/arm64-st1.ll
    llvm/trunk/test/CodeGen/AArch64/fp16-vector-load-store.ll

Modified: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td?rev=332394&r1=332393&r2=332394&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td Tue May 15 13:41:12 2018
@@ -2477,6 +2477,22 @@ def : Pat<(truncstorei16 GPR64:$Rt, (am_
 def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
   (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
 
+// Match stores from lane 0 to the appropriate subreg's store.
+multiclass VecStoreULane0Pat<SDPatternOperator StoreOp,
+                             ValueType VTy, ValueType STy,
+                             SubRegIndex SubRegIdx, Instruction STR> {
+  defm : VecStoreLane0Pat<am_unscaled128, StoreOp, VTy, STy, SubRegIdx, simm9, STR>;
+}
+
+let AddedComplexity = 19 in {
+  defm : VecStoreULane0Pat<truncstorei16, v8i16, i32, hsub, STURHi>;
+  defm : VecStoreULane0Pat<store,         v8f16, f16, hsub, STURHi>;
+  defm : VecStoreULane0Pat<store,         v4i32, i32, ssub, STURSi>;
+  defm : VecStoreULane0Pat<store,         v4f32, f32, ssub, STURSi>;
+  defm : VecStoreULane0Pat<store,         v2i64, i64, dsub, STURDi>;
+  defm : VecStoreULane0Pat<store,         v2f64, f64, dsub, STURDi>;
+}
+
 //---
 // STR mnemonics fall back to STUR for negative or unaligned offsets.
 def : InstAlias<"str $Rt, [$Rn, $offset]",

Modified: llvm/trunk/test/CodeGen/AArch64/arm64-st1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-st1.ll?rev=332394&r1=332393&r2=332394&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-st1.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-st1.ll Tue May 15 13:41:12 2018
@@ -4,23 +4,25 @@
 
 define void @st1lane_16b(<16 x i8> %A, i8* %D) {
 ; CHECK-LABEL: st1lane_16b
-; CHECK: st1.b
+; CHECK: st1.b { v0 }[1], [x{{[0-9]+}}]
+  %ptr = getelementptr i8, i8* %D, i64 1
   %tmp = extractelement <16 x i8> %A, i32 1
-  store i8 %tmp, i8* %D
+  store i8 %tmp, i8* %ptr
   ret void
 }
 
 define void @st1lane0_16b(<16 x i8> %A, i8* %D) {
 ; CHECK-LABEL: st1lane0_16b
-; CHECK: st1.b
+; CHECK: st1.b { v0 }[0], [x{{[0-9]+}}]
+  %ptr = getelementptr i8, i8* %D, i64 1
   %tmp = extractelement <16 x i8> %A, i32 0
-  store i8 %tmp, i8* %D
+  store i8 %tmp, i8* %ptr
   ret void
 }
 
 define void @st1lane0u_16b(<16 x i8> %A, i8* %D) {
 ; CHECK-LABEL: st1lane0u_16b
-; CHECK: st1.b
+; CHECK: st1.b { v0 }[0], [x{{[0-9]+}}]
   %ptr = getelementptr i8, i8* %D, i64 -1
   %tmp = extractelement <16 x i8> %A, i32 0
   store i8 %tmp, i8* %ptr
@@ -49,23 +51,25 @@ define void @st1lane0_ro_16b(<16 x i8> %
 
 define void @st1lane_8h(<8 x i16> %A, i16* %D) {
 ; CHECK-LABEL: st1lane_8h
-; CHECK: st1.h
+; CHECK: st1.h { v0 }[1], [x{{[0-9]+}}]
+  %ptr = getelementptr i16, i16* %D, i64 1
   %tmp = extractelement <8 x i16> %A, i32 1
-  store i16 %tmp, i16* %D
+  store i16 %tmp, i16* %ptr
   ret void
 }
 
 define void @st1lane0_8h(<8 x i16> %A, i16* %D) {
 ; CHECK-LABEL: st1lane0_8h
-; CHECK: str
+; CHECK: str h0, [x0, #2]
+  %ptr = getelementptr i16, i16* %D, i64 1
   %tmp = extractelement <8 x i16> %A, i32 0
-  store i16 %tmp, i16* %D
+  store i16 %tmp, i16* %ptr
   ret void
 }
 
 define void @st1lane0u_8h(<8 x i16> %A, i16* %D) {
 ; CHECK-LABEL: st1lane0u_8h
-; CHECK: st1.h
+; CHECK: stur h0, [x0, #-2]
   %ptr = getelementptr i16, i16* %D, i64 -1
   %tmp = extractelement <8 x i16> %A, i32 0
   store i16 %tmp, i16* %ptr
@@ -93,23 +97,25 @@ define void @st1lane0_ro_8h(<8 x i16> %A
 
 define void @st1lane_4s(<4 x i32> %A, i32* %D) {
 ; CHECK-LABEL: st1lane_4s
-; CHECK: st1.s
+; CHECK: st1.s { v0 }[1], [x{{[0-9]+}}]
+  %ptr = getelementptr i32, i32* %D, i64 1
   %tmp = extractelement <4 x i32> %A, i32 1
-  store i32 %tmp, i32* %D
+  store i32 %tmp, i32* %ptr
   ret void
 }
 
 define void @st1lane0_4s(<4 x i32> %A, i32* %D) {
 ; CHECK-LABEL: st1lane0_4s
-; CHECK: str
+; CHECK: str s0, [x0, #4]
+  %ptr = getelementptr i32, i32* %D, i64 1
   %tmp = extractelement <4 x i32> %A, i32 0
-  store i32 %tmp, i32* %D
+  store i32 %tmp, i32* %ptr
   ret void
 }
 
 define void @st1lane0u_4s(<4 x i32> %A, i32* %D) {
 ; CHECK-LABEL: st1lane0u_4s
-; CHECK: st1.s
+; CHECK: stur s0, [x0, #-4]
   %ptr = getelementptr i32, i32* %D, i64 -1
   %tmp = extractelement <4 x i32> %A, i32 0
   store i32 %tmp, i32* %ptr
@@ -137,23 +143,25 @@ define void @st1lane0_ro_4s(<4 x i32> %A
 
 define void @st1lane_4s_float(<4 x float> %A, float* %D) {
 ; CHECK-LABEL: st1lane_4s_float
-; CHECK: st1.s
+; CHECK: st1.s { v0 }[1], [x{{[0-9]+}}]
+  %ptr = getelementptr float, float* %D, i64 1
   %tmp = extractelement <4 x float> %A, i32 1
-  store float %tmp, float* %D
+  store float %tmp, float* %ptr
   ret void
 }
 
 define void @st1lane0_4s_float(<4 x float> %A, float* %D) {
 ; CHECK-LABEL: st1lane0_4s_float
-; CHECK: str
+; CHECK: str s0, [x0, #4]
+  %ptr = getelementptr float, float* %D, i64 1
   %tmp = extractelement <4 x float> %A, i32 0
-  store float %tmp, float* %D
+  store float %tmp, float* %ptr
   ret void
 }
 
 define void @st1lane0u_4s_float(<4 x float> %A, float* %D) {
 ; CHECK-LABEL: st1lane0u_4s_float
-; CHECK: st1.s
+; CHECK: stur s0, [x0, #-4]
   %ptr = getelementptr float, float* %D, i64 -1
   %tmp = extractelement <4 x float> %A, i32 0
   store float %tmp, float* %ptr
@@ -181,23 +189,25 @@ define void @st1lane0_ro_4s_float(<4 x f
 
 define void @st1lane_2d(<2 x i64> %A, i64* %D) {
 ; CHECK-LABEL: st1lane_2d
-; CHECK: st1.d
+; CHECK: st1.d { v0 }[1], [x{{[0-9]+}}]
+  %ptr = getelementptr i64, i64* %D, i64 1
   %tmp = extractelement <2 x i64> %A, i32 1
-  store i64 %tmp, i64* %D
+  store i64 %tmp, i64* %ptr
   ret void
 }
 
 define void @st1lane0_2d(<2 x i64> %A, i64* %D) {
 ; CHECK-LABEL: st1lane0_2d
-; CHECK: str
+; CHECK: str d0, [x0, #8]
+  %ptr = getelementptr i64, i64* %D, i64 1
   %tmp = extractelement <2 x i64> %A, i32 0
-  store i64 %tmp, i64* %D
+  store i64 %tmp, i64* %ptr
   ret void
 }
 
 define void @st1lane0u_2d(<2 x i64> %A, i64* %D) {
 ; CHECK-LABEL: st1lane0u_2d
-; CHECK: st1.d
+; CHECK: stur d0, [x0, #-8]
   %ptr = getelementptr i64, i64* %D, i64 -1
   %tmp = extractelement <2 x i64> %A, i32 0
   store i64 %tmp, i64* %ptr
@@ -225,23 +235,25 @@ define void @st1lane0_ro_2d(<2 x i64> %A
 
 define void @st1lane_2d_double(<2 x double> %A, double* %D) {
 ; CHECK-LABEL: st1lane_2d_double
-; CHECK: st1.d
+; CHECK: st1.d { v0 }[1], [x{{[0-9]+}}]
+  %ptr = getelementptr double, double* %D, i64 1
   %tmp = extractelement <2 x double> %A, i32 1
-  store double %tmp, double* %D
+  store double %tmp, double* %ptr
   ret void
 }
 
 define void @st1lane0_2d_double(<2 x double> %A, double* %D) {
 ; CHECK-LABEL: st1lane0_2d_double
-; CHECK: str
+; CHECK: str d0, [x0, #8]
+  %ptr = getelementptr double, double* %D, i64 1
   %tmp = extractelement <2 x double> %A, i32 0
-  store double %tmp, double* %D
+  store double %tmp, double* %ptr
   ret void
 }
 
 define void @st1lane0u_2d_double(<2 x double> %A, double* %D) {
 ; CHECK-LABEL: st1lane0u_2d_double
-; CHECK: st1.d
+; CHECK: stur d0, [x0, #-8]
   %ptr = getelementptr double, double* %D, i64 -1
   %tmp = extractelement <2 x double> %A, i32 0
   store double %tmp, double* %ptr
@@ -269,9 +281,10 @@ define void @st1lane0_ro_2d_double(<2 x
 
 define void @st1lane_8b(<8 x i8> %A, i8* %D) {
 ; CHECK-LABEL: st1lane_8b
-; CHECK: st1.b
+; CHECK: st1.b { v0 }[1], [x{{[0-9]+}}]
+  %ptr = getelementptr i8, i8* %D, i64 1
   %tmp = extractelement <8 x i8> %A, i32 1
-  store i8 %tmp, i8* %D
+  store i8 %tmp, i8* %ptr
   ret void
 }
 
@@ -297,23 +310,25 @@ define void @st1lane0_ro_8b(<8 x i8> %A,
 
 define void @st1lane_4h(<4 x i16> %A, i16* %D) {
 ; CHECK-LABEL: st1lane_4h
-; CHECK: st1.h
+; CHECK: st1.h { v0 }[1], [x{{[0-9]+}}]
+  %ptr = getelementptr i16, i16* %D, i64 1
   %tmp = extractelement <4 x i16> %A, i32 1
-  store i16 %tmp, i16* %D
+  store i16 %tmp, i16* %ptr
   ret void
 }
 
 define void @st1lane0_4h(<4 x i16> %A, i16* %D) {
 ; CHECK-LABEL: st1lane0_4h
-; CHECK: str
+; CHECK: str h0, [x0, #2]
+  %ptr = getelementptr i16, i16* %D, i64 1
   %tmp = extractelement <4 x i16> %A, i32 0
-  store i16 %tmp, i16* %D
+  store i16 %tmp, i16* %ptr
   ret void
 }
 
 define void @st1lane0u_4h(<4 x i16> %A, i16* %D) {
 ; CHECK-LABEL: st1lane0u_4h
-; CHECK: st1.h
+; CHECK: stur h0, [x0, #-2]
   %ptr = getelementptr i16, i16* %D, i64 -1
   %tmp = extractelement <4 x i16> %A, i32 0
   store i16 %tmp, i16* %ptr
@@ -341,23 +356,25 @@ define void @st1lane0_ro_4h(<4 x i16> %A
 
 define void @st1lane_2s(<2 x i32> %A, i32* %D) {
 ; CHECK-LABEL: st1lane_2s
-; CHECK: st1.s
+; CHECK: st1.s { v0 }[1], [x{{[0-9]+}}]
+  %ptr = getelementptr i32, i32* %D, i64 1
   %tmp = extractelement <2 x i32> %A, i32 1
-  store i32 %tmp, i32* %D
+  store i32 %tmp, i32* %ptr
   ret void
 }
 
 define void @st1lane0_2s(<2 x i32> %A, i32* %D) {
 ; CHECK-LABEL: st1lane0_2s
-; CHECK: str
+; CHECK: str s0, [x0, #4]
+  %ptr = getelementptr i32, i32* %D, i64 1
   %tmp = extractelement <2 x i32> %A, i32 0
-  store i32 %tmp, i32* %D
+  store i32 %tmp, i32* %ptr
   ret void
 }
 
 define void @st1lane0u_2s(<2 x i32> %A, i32* %D) {
 ; CHECK-LABEL: st1lane0u_2s
-; CHECK: st1.s
+; CHECK: stur s0, [x0, #-4]
   %ptr = getelementptr i32, i32* %D, i64 -1
   %tmp = extractelement <2 x i32> %A, i32 0
   store i32 %tmp, i32* %ptr
@@ -385,23 +402,25 @@ define void @st1lane0_ro_2s(<2 x i32> %A
 
 define void @st1lane_2s_float(<2 x float> %A, float* %D) {
 ; CHECK-LABEL: st1lane_2s_float
-; CHECK: st1.s
+; CHECK: st1.s { v0 }[1], [x{{[0-9]+}}]
+  %ptr = getelementptr float, float* %D, i64 1
   %tmp = extractelement <2 x float> %A, i32 1
-  store float %tmp, float* %D
+  store float %tmp, float* %ptr
   ret void
 }
 
 define void @st1lane0_2s_float(<2 x float> %A, float* %D) {
 ; CHECK-LABEL: st1lane0_2s_float
-; CHECK: str
+; CHECK: str s0, [x0, #4]
+  %ptr = getelementptr float, float* %D, i64 1
   %tmp = extractelement <2 x float> %A, i32 0
-  store float %tmp, float* %D
+  store float %tmp, float* %ptr
   ret void
 }
 
 define void @st1lane0u_2s_float(<2 x float> %A, float* %D) {
 ; CHECK-LABEL: st1lane0u_2s_float
-; CHECK: st1.s
+; CHECK: stur s0, [x0, #-4]
   %ptr = getelementptr float, float* %D, i64 -1
   %tmp = extractelement <2 x float> %A, i32 0
   store float %tmp, float* %ptr
@@ -429,15 +448,16 @@ define void @st1lane0_ro_2s_float(<2 x f
 
 define void @st1lane0_1d(<1 x i64> %A, i64* %D) {
 ; CHECK-LABEL: st1lane0_1d
-; CHECK: str
+; CHECK: str d0, [x0, #8]
+  %ptr = getelementptr i64, i64* %D, i64 1
   %tmp = extractelement <1 x i64> %A, i32 0
-  store i64 %tmp, i64* %D
+  store i64 %tmp, i64* %ptr
   ret void
 }
 
 define void @st1lane0u_1d(<1 x i64> %A, i64* %D) {
 ; CHECK-LABEL: st1lane0u_1d
-; CHECK: st1.d
+; CHECK: stur d0, [x0, #-8]
   %ptr = getelementptr i64, i64* %D, i64 -1
   %tmp = extractelement <1 x i64> %A, i32 0
   store i64 %tmp, i64* %ptr
@@ -455,15 +475,16 @@ define void @st1lane0_ro_1d(<1 x i64> %A
 
 define void @st1lane0_1d_double(<1 x double> %A, double* %D) {
 ; CHECK-LABEL: st1lane0_1d_double
-; CHECK: str
+; CHECK: str d0, [x0, #8]
+  %ptr = getelementptr double, double* %D, i64 1
   %tmp = extractelement <1 x double> %A, i32 0
-  store double %tmp, double* %D
+  store double %tmp, double* %ptr
   ret void
 }
 
 define void @st1lane0u_1d_double(<1 x double> %A, double* %D) {
 ; CHECK-LABEL: st1lane0u_1d_double
-; CHECK: stur
+; CHECK: stur d0, [x0, #-8]
   %ptr = getelementptr double, double* %D, i64 -1
   %tmp = extractelement <1 x double> %A, i32 0
   store double %tmp, double* %ptr

Modified: llvm/trunk/test/CodeGen/AArch64/fp16-vector-load-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/fp16-vector-load-store.ll?rev=332394&r1=332393&r2=332394&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/fp16-vector-load-store.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/fp16-vector-load-store.ll Tue May 15 13:41:12 2018
@@ -99,7 +99,7 @@ entry:
 
 define void @storeu_lane0_64(half* nocapture %a, <4 x half> %b) #1 {
 ; CHECK-LABEL: storeu_lane0_64:
-; CHECK: st1 { v0.h }[0], [x{{[0-9]+}}]
+; CHECK: stur h0, [x{{[0-9]+}}, #-2]
 entry:
   %0 = getelementptr half, half* %a, i64 -1
   %1 = extractelement <4 x half> %b, i32 0
@@ -148,7 +148,7 @@ entry:
 
 define void @storeu_lane0_128(half* nocapture %a, <8 x half> %b) #1 {
 ; CHECK-LABEL: storeu_lane0_128:
-; CHECK: st1 { v0.h }[0], [x{{[0-9]+}}]
+; CHECK: stur h0, [x{{[0-9]+}}, #-2]
 entry:
   %0 = getelementptr half, half* %a, i64 -1
   %1 = extractelement <8 x half> %b, i32 0




More information about the llvm-commits mailing list