[llvm] r205294 - ARM64: add patterns for more lane-wise ld1/st1 operations.

Tim Northover tnorthover at apple.com
Tue Apr 1 03:37:10 PDT 2014


Author: tnorthover
Date: Tue Apr  1 05:37:09 2014
New Revision: 205294

URL: http://llvm.org/viewvc/llvm-project?rev=205294&view=rev
Log:
ARM64: add patterns for more lane-wise ld1/st1 operations.

Modified:
    llvm/trunk/lib/Target/ARM64/ARM64InstrFormats.td
    llvm/trunk/lib/Target/ARM64/ARM64InstrInfo.td
    llvm/trunk/test/CodeGen/ARM64/ld1.ll
    llvm/trunk/test/CodeGen/ARM64/st1.ll

Modified: llvm/trunk/lib/Target/ARM64/ARM64InstrFormats.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM64/ARM64InstrFormats.td?rev=205294&r1=205293&r2=205294&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM64/ARM64InstrFormats.td (original)
+++ llvm/trunk/lib/Target/ARM64/ARM64InstrFormats.td Tue Apr  1 05:37:09 2014
@@ -7971,8 +7971,7 @@ multiclass SIMDLdSingleSTied<bit R, bits
 }
 let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
 multiclass SIMDLdSingleDTied<bit R, bits<3> opcode, bits<2> size, string asm,
-                         RegisterOperand listtype,
-                         RegisterOperand GPR64pi> {
+                         RegisterOperand listtype, RegisterOperand GPR64pi> {
   def i64 : SIMDLdStSingleDTied<1, R, opcode, size, asm,
                             (outs listtype:$dst),
                             (ins listtype:$Vt, VectorIndexD:$idx,
@@ -7985,12 +7984,10 @@ multiclass SIMDLdSingleDTied<bit R, bits
 }
 let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
 multiclass SIMDStSingleB<bit R, bits<3> opcode, string asm,
-                         RegisterOperand listtype, list<dag> pattern,
-                         RegisterOperand GPR64pi> {
+                         RegisterOperand listtype, RegisterOperand GPR64pi> {
   def i8 : SIMDLdStSingleB<0, R, opcode, asm,
                            (outs), (ins listtype:$Vt, VectorIndexB:$idx,
-                                        am_simdnoindex:$vaddr),
-                           pattern>;
+                                        am_simdnoindex:$vaddr), []>;
 
   def i8_POST : SIMDLdStSingleBPost<0, R, opcode, asm,
                             (outs), (ins listtype:$Vt, VectorIndexB:$idx,
@@ -7998,12 +7995,10 @@ multiclass SIMDStSingleB<bit R, bits<3>
 }
 let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
 multiclass SIMDStSingleH<bit R, bits<3> opcode, bit size, string asm,
-                         RegisterOperand listtype, list<dag> pattern,
-                         RegisterOperand GPR64pi> {
+                         RegisterOperand listtype, RegisterOperand GPR64pi> {
   def i16 : SIMDLdStSingleH<0, R, opcode, size, asm,
                             (outs), (ins listtype:$Vt, VectorIndexH:$idx,
-                                         am_simdnoindex:$vaddr),
-                        pattern>;
+                                         am_simdnoindex:$vaddr), []>;
 
   def i16_POST : SIMDLdStSingleHPost<0, R, opcode, size, asm,
                             (outs), (ins listtype:$Vt, VectorIndexH:$idx,
@@ -8011,12 +8006,10 @@ multiclass SIMDStSingleH<bit R, bits<3>
 }
 let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
 multiclass SIMDStSingleS<bit R, bits<3> opcode, bits<2> size,string asm,
-                         RegisterOperand listtype, list<dag> pattern,
-                         RegisterOperand GPR64pi> {
+                         RegisterOperand listtype, RegisterOperand GPR64pi> {
   def i32 : SIMDLdStSingleS<0, R, opcode, size, asm,
                             (outs), (ins listtype:$Vt, VectorIndexS:$idx,
-                                         am_simdnoindex:$vaddr),
-                            pattern>;
+                                         am_simdnoindex:$vaddr), []>;
 
   def i32_POST : SIMDLdStSingleSPost<0, R, opcode, size, asm,
                             (outs), (ins listtype:$Vt, VectorIndexS:$idx,
@@ -8024,11 +8017,10 @@ multiclass SIMDStSingleS<bit R, bits<3>
 }
 let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
 multiclass SIMDStSingleD<bit R, bits<3> opcode, bits<2> size, string asm,
-                         RegisterOperand listtype, list<dag> pattern,
-                         RegisterOperand GPR64pi> {
+                         RegisterOperand listtype, RegisterOperand GPR64pi> {
   def i64 : SIMDLdStSingleD<0, R, opcode, size, asm,
                             (outs), (ins listtype:$Vt, VectorIndexD:$idx,
-                                         am_simdnoindex:$vaddr), pattern>;
+                                         am_simdnoindex:$vaddr), []>;
 
   def i64_POST : SIMDLdStSingleDPost<0, R, opcode, size, asm,
                             (outs), (ins listtype:$Vt, VectorIndexD:$idx,

Modified: llvm/trunk/lib/Target/ARM64/ARM64InstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM64/ARM64InstrInfo.td?rev=205294&r1=205293&r2=205294&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM64/ARM64InstrInfo.td (original)
+++ llvm/trunk/lib/Target/ARM64/ARM64InstrInfo.td Tue Apr  1 05:37:09 2014
@@ -4087,18 +4087,32 @@ def : Pat<(v2f64 (ARM64dup (f64 (load am
 def : Pat<(v1f64 (ARM64dup (f64 (load am_simdnoindex:$vaddr)))),
           (LD1Rv1d am_simdnoindex:$vaddr)>;
 
-def : Pat<(vector_insert (v16i8 VecListOne128:$Rd),
-            (i32 (extloadi8 am_simdnoindex:$vaddr)), VectorIndexB:$idx),
-          (LD1i8 VecListOne128:$Rd, VectorIndexB:$idx, am_simdnoindex:$vaddr)>;
-def : Pat<(vector_insert (v8i16 VecListOne128:$Rd),
-            (i32 (extloadi16 am_simdnoindex:$vaddr)), VectorIndexH:$idx),
-          (LD1i16 VecListOne128:$Rd, VectorIndexH:$idx, am_simdnoindex:$vaddr)>;
-def : Pat<(vector_insert (v4i32 VecListOne128:$Rd),
-            (i32 (load am_simdnoindex:$vaddr)), VectorIndexS:$idx),
-          (LD1i32 VecListOne128:$Rd, VectorIndexS:$idx, am_simdnoindex:$vaddr)>;
-def : Pat<(vector_insert (v2i64 VecListOne128:$Rd),
-            (i64 (load am_simdnoindex:$vaddr)), VectorIndexD:$idx),
-          (LD1i64 VecListOne128:$Rd, VectorIndexD:$idx, am_simdnoindex:$vaddr)>;
+class Ld1Lane128Pat<SDPatternOperator scalar_load, Operand VecIndex,
+                    ValueType VTy, ValueType STy, Instruction LD1>
+  : Pat<(vector_insert (VTy VecListOne128:$Rd),
+           (STy (scalar_load am_simdnoindex:$vaddr)), VecIndex:$idx),
+        (LD1 VecListOne128:$Rd, VecIndex:$idx, am_simdnoindex:$vaddr)>;
+
+def : Ld1Lane128Pat<extloadi8,  VectorIndexB, v16i8, i32, LD1i8>;
+def : Ld1Lane128Pat<extloadi16, VectorIndexH, v8i16, i32, LD1i16>;
+def : Ld1Lane128Pat<load,       VectorIndexS, v4i32, i32, LD1i32>;
+def : Ld1Lane128Pat<load,       VectorIndexS, v4f32, f32, LD1i32>;
+def : Ld1Lane128Pat<load,       VectorIndexD, v2i64, i64, LD1i64>;
+def : Ld1Lane128Pat<load,       VectorIndexD, v2f64, f64, LD1i64>;
+
+class Ld1Lane64Pat<SDPatternOperator scalar_load, Operand VecIndex,
+                   ValueType VTy, ValueType STy, Instruction LD1>
+  : Pat<(vector_insert (VTy VecListOne64:$Rd),
+           (STy (scalar_load am_simdnoindex:$vaddr)), VecIndex:$idx),
+        (EXTRACT_SUBREG
+            (LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub),
+                          VecIndex:$idx, am_simdnoindex:$vaddr),
+            dsub)>;
+
+def : Ld1Lane64Pat<extloadi8,  VectorIndexB, v8i8,  i32, LD1i8>;
+def : Ld1Lane64Pat<extloadi16, VectorIndexH, v4i16, i32, LD1i16>;
+def : Ld1Lane64Pat<load,       VectorIndexS, v2i32, i32, LD1i32>;
+def : Ld1Lane64Pat<load,       VectorIndexS, v2f32, f32, LD1i32>;
 
 
 defm LD1 : SIMDLdSt1SingleAliases<"ld1">;
@@ -4107,38 +4121,53 @@ defm LD3 : SIMDLdSt3SingleAliases<"ld3">
 defm LD4 : SIMDLdSt4SingleAliases<"ld4">;
 
 // Stores
-let AddedComplexity = 8 in {
-defm ST1 : SIMDStSingleB<0, 0b000,       "st1", VecListOneb,
-  [(truncstorei8
-      (i32 (vector_extract (v16i8 VecListOneb:$Vt), VectorIndexB:$idx)),
-      am_simdnoindex:$vaddr)], GPR64pi1>;
-defm ST1 : SIMDStSingleH<0, 0b010, 0,    "st1", VecListOneh,
-  [(truncstorei16
-      (i32 (vector_extract (v8i16 VecListOneh:$Vt), VectorIndexH:$idx)),
-      am_simdnoindex:$vaddr)], GPR64pi2>;
-defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes,
-  [(store
-      (i32 (vector_extract (v4i32 VecListOnes:$Vt), VectorIndexS:$idx)),
-      am_simdnoindex:$vaddr)], GPR64pi4>;
-defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned,
-  [(store
-      (i64 (vector_extract (v2i64 VecListOned:$Vt), VectorIndexD:$idx)),
-      am_simdnoindex:$vaddr)], GPR64pi8>;
-}
+defm ST1 : SIMDStSingleB<0, 0b000,       "st1", VecListOneb, GPR64pi1>;
+defm ST1 : SIMDStSingleH<0, 0b010, 0,    "st1", VecListOneh, GPR64pi2>;
+defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>;
+defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>;
+
+let AddedComplexity = 8 in
+class St1Lane128Pat<SDPatternOperator scalar_store, Operand VecIndex,
+                    ValueType VTy, ValueType STy, Instruction ST1>
+  : Pat<(scalar_store
+             (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)),
+             am_simdnoindex:$vaddr),
+        (ST1 VecListOne128:$Vt, VecIndex:$idx, am_simdnoindex:$vaddr)>;
+
+def : St1Lane128Pat<truncstorei8,  VectorIndexB, v16i8, i32, ST1i8>;
+def : St1Lane128Pat<truncstorei16, VectorIndexH, v8i16, i32, ST1i16>;
+def : St1Lane128Pat<store,         VectorIndexS, v4i32, i32, ST1i32>;
+def : St1Lane128Pat<store,         VectorIndexS, v4f32, f32, ST1i32>;
+def : St1Lane128Pat<store,         VectorIndexD, v2i64, i64, ST1i64>;
+def : St1Lane128Pat<store,         VectorIndexD, v2f64, f64, ST1i64>;
+
+let AddedComplexity = 8 in
+class St1Lane64Pat<SDPatternOperator scalar_store, Operand VecIndex,
+                   ValueType VTy, ValueType STy, Instruction ST1>
+  : Pat<(scalar_store
+             (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)),
+             am_simdnoindex:$vaddr),
+        (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
+             VecIndex:$idx, am_simdnoindex:$vaddr)>;
+
+def : St1Lane64Pat<truncstorei8,  VectorIndexB, v8i8, i32, ST1i8>;
+def : St1Lane64Pat<truncstorei16, VectorIndexH, v4i16, i32, ST1i16>;
+def : St1Lane64Pat<store,         VectorIndexS, v2i32, i32, ST1i32>;
+def : St1Lane64Pat<store,         VectorIndexS, v2f32, f32, ST1i32>;
 
 let mayStore = 1, neverHasSideEffects = 1 in {
-defm ST2 : SIMDStSingleB<1, 0b000,       "st2", VecListTwob,   [], GPR64pi2>;
-defm ST2 : SIMDStSingleH<1, 0b010, 0,    "st2", VecListTwoh,   [], GPR64pi4>;
-defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos,   [], GPR64pi8>;
-defm ST2 : SIMDStSingleD<1, 0b100, 0b01, "st2", VecListTwod,   [], GPR64pi16>;
-defm ST3 : SIMDStSingleB<0, 0b001,       "st3", VecListThreeb, [], GPR64pi3>;
-defm ST3 : SIMDStSingleH<0, 0b011, 0,    "st3", VecListThreeh, [], GPR64pi6>;
-defm ST3 : SIMDStSingleS<0, 0b101, 0b00, "st3", VecListThrees, [], GPR64pi12>;
-defm ST3 : SIMDStSingleD<0, 0b101, 0b01, "st3", VecListThreed, [], GPR64pi24>;
-defm ST4 : SIMDStSingleB<1, 0b001,       "st4", VecListFourb,  [], GPR64pi4>;
-defm ST4 : SIMDStSingleH<1, 0b011, 0,    "st4", VecListFourh,  [], GPR64pi8>;
-defm ST4 : SIMDStSingleS<1, 0b101, 0b00, "st4", VecListFours,  [], GPR64pi16>;
-defm ST4 : SIMDStSingleD<1, 0b101, 0b01, "st4", VecListFourd,  [], GPR64pi32>;
+defm ST2 : SIMDStSingleB<1, 0b000,       "st2", VecListTwob,   GPR64pi2>;
+defm ST2 : SIMDStSingleH<1, 0b010, 0,    "st2", VecListTwoh,   GPR64pi4>;
+defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos,   GPR64pi8>;
+defm ST2 : SIMDStSingleD<1, 0b100, 0b01, "st2", VecListTwod,   GPR64pi16>;
+defm ST3 : SIMDStSingleB<0, 0b001,       "st3", VecListThreeb, GPR64pi3>;
+defm ST3 : SIMDStSingleH<0, 0b011, 0,    "st3", VecListThreeh, GPR64pi6>;
+defm ST3 : SIMDStSingleS<0, 0b101, 0b00, "st3", VecListThrees, GPR64pi12>;
+defm ST3 : SIMDStSingleD<0, 0b101, 0b01, "st3", VecListThreed, GPR64pi24>;
+defm ST4 : SIMDStSingleB<1, 0b001,       "st4", VecListFourb,  GPR64pi4>;
+defm ST4 : SIMDStSingleH<1, 0b011, 0,    "st4", VecListFourh,  GPR64pi8>;
+defm ST4 : SIMDStSingleS<1, 0b101, 0b00, "st4", VecListFours,  GPR64pi16>;
+defm ST4 : SIMDStSingleD<1, 0b101, 0b01, "st4", VecListFourd,  GPR64pi32>;
 }
 
 defm ST1 : SIMDLdSt1SingleAliases<"st1">;

Modified: llvm/trunk/test/CodeGen/ARM64/ld1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/ld1.ll?rev=205294&r1=205293&r2=205294&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/ld1.ll (original)
+++ llvm/trunk/test/CodeGen/ARM64/ld1.ll Tue Apr  1 05:37:09 2014
@@ -5,7 +5,7 @@
 %struct.__neon_int8x8x4_t = type { <8 x i8>,  <8 x i8>, <8 x i8>,  <8 x i8> }
 
 define %struct.__neon_int8x8x2_t @ld2_8b(i8* %A) nounwind {
-; CHECK: ld2_8b
+; CHECK-LABEL: ld2_8b
 ; Make sure we are loading into the results defined by the ABI (i.e., v0, v1)
 ; and from the argument of the function also defined by ABI (i.e., x0)
 ; CHECK ld2.8b { v0, v1 }, [x0]
@@ -15,7 +15,7 @@ define %struct.__neon_int8x8x2_t @ld2_8b
 }
 
 define %struct.__neon_int8x8x3_t @ld3_8b(i8* %A) nounwind {
-; CHECK: ld3_8b
+; CHECK-LABEL: ld3_8b
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld3.8b { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
@@ -24,7 +24,7 @@ define %struct.__neon_int8x8x3_t @ld3_8b
 }
 
 define %struct.__neon_int8x8x4_t @ld4_8b(i8* %A) nounwind {
-; CHECK: ld4_8b
+; CHECK-LABEL: ld4_8b
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld4.8b { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
@@ -41,7 +41,7 @@ declare %struct.__neon_int8x8x4_t @llvm.
 %struct.__neon_int8x16x4_t = type { <16 x i8>,  <16 x i8>, <16 x i8>,  <16 x i8> }
 
 define %struct.__neon_int8x16x2_t @ld2_16b(i8* %A) nounwind {
-; CHECK: ld2_16b
+; CHECK-LABEL: ld2_16b
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld2.16b { v0, v1 }, [x0]
 ; CHECK-NEXT ret
@@ -50,7 +50,7 @@ define %struct.__neon_int8x16x2_t @ld2_1
 }
 
 define %struct.__neon_int8x16x3_t @ld3_16b(i8* %A) nounwind {
-; CHECK: ld3_16b
+; CHECK-LABEL: ld3_16b
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld3.16b { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
@@ -59,7 +59,7 @@ define %struct.__neon_int8x16x3_t @ld3_1
 }
 
 define %struct.__neon_int8x16x4_t @ld4_16b(i8* %A) nounwind {
-; CHECK: ld4_16b
+; CHECK-LABEL: ld4_16b
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld4.16b { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
@@ -76,7 +76,7 @@ declare %struct.__neon_int8x16x4_t @llvm
 %struct.__neon_int16x4x4_t = type { <4 x i16>,  <4 x i16>, <4 x i16>,  <4 x i16> }
 
 define %struct.__neon_int16x4x2_t @ld2_4h(i16* %A) nounwind {
-; CHECK: ld2_4h
+; CHECK-LABEL: ld2_4h
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld2.4h { v0, v1 }, [x0]
 ; CHECK-NEXT ret
@@ -85,7 +85,7 @@ define %struct.__neon_int16x4x2_t @ld2_4
 }
 
 define %struct.__neon_int16x4x3_t @ld3_4h(i16* %A) nounwind {
-; CHECK: ld3_4h
+; CHECK-LABEL: ld3_4h
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld3.4h { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
@@ -94,7 +94,7 @@ define %struct.__neon_int16x4x3_t @ld3_4
 }
 
 define %struct.__neon_int16x4x4_t @ld4_4h(i16* %A) nounwind {
-; CHECK: ld4_4h
+; CHECK-LABEL: ld4_4h
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld4.4h { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
@@ -111,7 +111,7 @@ declare %struct.__neon_int16x4x4_t @llvm
 %struct.__neon_int16x8x4_t = type { <8 x i16>,  <8 x i16>, <8 x i16>,  <8 x i16> }
 
 define %struct.__neon_int16x8x2_t @ld2_8h(i16* %A) nounwind {
-; CHECK: ld2_8h
+; CHECK-LABEL: ld2_8h
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld2.8h { v0, v1 }, [x0]
 ; CHECK-NEXT ret
@@ -120,7 +120,7 @@ define %struct.__neon_int16x8x2_t @ld2_8
 }
 
 define %struct.__neon_int16x8x3_t @ld3_8h(i16* %A) nounwind {
-; CHECK: ld3_8h
+; CHECK-LABEL: ld3_8h
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld3.8h { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
@@ -129,7 +129,7 @@ define %struct.__neon_int16x8x3_t @ld3_8
 }
 
 define %struct.__neon_int16x8x4_t @ld4_8h(i16* %A) nounwind {
-; CHECK: ld4_8h
+; CHECK-LABEL: ld4_8h
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld4.8h { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
@@ -146,7 +146,7 @@ declare %struct.__neon_int16x8x4_t @llvm
 %struct.__neon_int32x2x4_t = type { <2 x i32>,  <2 x i32>, <2 x i32>,  <2 x i32> }
 
 define %struct.__neon_int32x2x2_t @ld2_2s(i32* %A) nounwind {
-; CHECK: ld2_2s
+; CHECK-LABEL: ld2_2s
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld2.2s { v0, v1 }, [x0]
 ; CHECK-NEXT ret
@@ -155,7 +155,7 @@ define %struct.__neon_int32x2x2_t @ld2_2
 }
 
 define %struct.__neon_int32x2x3_t @ld3_2s(i32* %A) nounwind {
-; CHECK: ld3_2s
+; CHECK-LABEL: ld3_2s
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld3.2s { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
@@ -164,7 +164,7 @@ define %struct.__neon_int32x2x3_t @ld3_2
 }
 
 define %struct.__neon_int32x2x4_t @ld4_2s(i32* %A) nounwind {
-; CHECK: ld4_2s
+; CHECK-LABEL: ld4_2s
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld4.2s { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
@@ -181,7 +181,7 @@ declare %struct.__neon_int32x2x4_t @llvm
 %struct.__neon_int32x4x4_t = type { <4 x i32>,  <4 x i32>, <4 x i32>,  <4 x i32> }
 
 define %struct.__neon_int32x4x2_t @ld2_4s(i32* %A) nounwind {
-; CHECK: ld2_4s
+; CHECK-LABEL: ld2_4s
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld2.4s { v0, v1 }, [x0]
 ; CHECK-NEXT ret
@@ -190,7 +190,7 @@ define %struct.__neon_int32x4x2_t @ld2_4
 }
 
 define %struct.__neon_int32x4x3_t @ld3_4s(i32* %A) nounwind {
-; CHECK: ld3_4s
+; CHECK-LABEL: ld3_4s
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld3.4s { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
@@ -199,7 +199,7 @@ define %struct.__neon_int32x4x3_t @ld3_4
 }
 
 define %struct.__neon_int32x4x4_t @ld4_4s(i32* %A) nounwind {
-; CHECK: ld4_4s
+; CHECK-LABEL: ld4_4s
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld4.4s { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
@@ -216,7 +216,7 @@ declare %struct.__neon_int32x4x4_t @llvm
 %struct.__neon_int64x2x4_t = type { <2 x i64>,  <2 x i64>, <2 x i64>,  <2 x i64> }
 
 define %struct.__neon_int64x2x2_t @ld2_2d(i64* %A) nounwind {
-; CHECK: ld2_2d
+; CHECK-LABEL: ld2_2d
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld2.2d { v0, v1 }, [x0]
 ; CHECK-NEXT ret
@@ -225,7 +225,7 @@ define %struct.__neon_int64x2x2_t @ld2_2
 }
 
 define %struct.__neon_int64x2x3_t @ld3_2d(i64* %A) nounwind {
-; CHECK: ld3_2d
+; CHECK-LABEL: ld3_2d
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld3.2d { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
@@ -234,7 +234,7 @@ define %struct.__neon_int64x2x3_t @ld3_2
 }
 
 define %struct.__neon_int64x2x4_t @ld4_2d(i64* %A) nounwind {
-; CHECK: ld4_2d
+; CHECK-LABEL: ld4_2d
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld4.2d { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
@@ -252,7 +252,7 @@ declare %struct.__neon_int64x2x4_t @llvm
 
 
 define %struct.__neon_int64x1x2_t @ld2_1di64(i64* %A) nounwind {
-; CHECK: ld2_1di64
+; CHECK-LABEL: ld2_1di64
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld1.1d { v0, v1 }, [x0]
 ; CHECK-NEXT ret
@@ -261,7 +261,7 @@ define %struct.__neon_int64x1x2_t @ld2_1
 }
 
 define %struct.__neon_int64x1x3_t @ld3_1di64(i64* %A) nounwind {
-; CHECK: ld3_1di64
+; CHECK-LABEL: ld3_1di64
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld1.1d { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
@@ -270,7 +270,7 @@ define %struct.__neon_int64x1x3_t @ld3_1
 }
 
 define %struct.__neon_int64x1x4_t @ld4_1di64(i64* %A) nounwind {
-; CHECK: ld4_1di64
+; CHECK-LABEL: ld4_1di64
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld1.1d { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
@@ -289,7 +289,7 @@ declare %struct.__neon_int64x1x4_t @llvm
 
 
 define %struct.__neon_float64x1x2_t @ld2_1df64(double* %A) nounwind {
-; CHECK: ld2_1df64
+; CHECK-LABEL: ld2_1df64
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld1.1d { v0, v1 }, [x0]
 ; CHECK-NEXT ret
@@ -298,7 +298,7 @@ define %struct.__neon_float64x1x2_t @ld2
 }
 
 define %struct.__neon_float64x1x3_t @ld3_1df64(double* %A) nounwind {
-; CHECK: ld3_1df64
+; CHECK-LABEL: ld3_1df64
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld1.1d { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
@@ -307,7 +307,7 @@ define %struct.__neon_float64x1x3_t @ld3
 }
 
 define %struct.__neon_float64x1x4_t @ld4_1df64(double* %A) nounwind {
-; CHECK: ld4_1df64
+; CHECK-LABEL: ld4_1df64
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld1.1d { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
@@ -800,7 +800,7 @@ declare %struct.__neon_int64x2x3_t @llvm
 declare %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4r.v2i64.p0i64(i64*) nounwind readonly
 
 define <16 x i8> @ld1_16b(<16 x i8> %V, i8* %bar) {
-; CHECK: ld1_16b
+; CHECK-LABEL: ld1_16b
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld1.b { v0 }[0], [x0]
 ; CHECK-NEXT ret
@@ -810,7 +810,7 @@ define <16 x i8> @ld1_16b(<16 x i8> %V,
 }
 
 define <8 x i16> @ld1_8h(<8 x i16> %V, i16* %bar) {
-; CHECK: ld1_8h
+; CHECK-LABEL: ld1_8h
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld1.h { v0 }[0], [x0]
 ; CHECK-NEXT ret
@@ -820,7 +820,7 @@ define <8 x i16> @ld1_8h(<8 x i16> %V, i
 }
 
 define <4 x i32> @ld1_4s(<4 x i32> %V, i32* %bar) {
-; CHECK: ld1_4s
+; CHECK-LABEL: ld1_4s
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld1.s { v0 }[0], [x0]
 ; CHECK-NEXT ret
@@ -829,8 +829,18 @@ define <4 x i32> @ld1_4s(<4 x i32> %V, i
   ret <4 x i32> %tmp2
 }
 
+define <4 x float> @ld1_4s_float(<4 x float> %V, float* %bar) {
+; CHECK-LABEL: ld1_4s_float:
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.s { v0 }[0], [x0]
+; CHECK-NEXT ret
+  %tmp1 = load float* %bar
+  %tmp2 = insertelement <4 x float> %V, float %tmp1, i32 0
+  ret <4 x float> %tmp2
+}
+
 define <2 x i64> @ld1_2d(<2 x i64> %V, i64* %bar) {
-; CHECK: ld1_2d
+; CHECK-LABEL: ld1_2d
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld1.d { v0 }[0], [x0]
 ; CHECK-NEXT ret
@@ -839,8 +849,18 @@ define <2 x i64> @ld1_2d(<2 x i64> %V, i
   ret <2 x i64> %tmp2
 }
 
+define <2 x double> @ld1_2d_double(<2 x double> %V, double* %bar) {
+; CHECK-LABEL: ld1_2d_double:
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.d { v0 }[0], [x0]
+; CHECK-NEXT ret
+  %tmp1 = load double* %bar
+  %tmp2 = insertelement <2 x double> %V, double %tmp1, i32 0
+  ret <2 x double> %tmp2
+}
+
 define <1 x i64> @ld1_1d(<1 x i64>* %p) {
-; CHECK: ld1_1d
+; CHECK-LABEL: ld1_1d
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ldr [[REG:d[0-9]+]], [x0]
 ; CHECK-NEXT: ret
@@ -848,6 +868,46 @@ define <1 x i64> @ld1_1d(<1 x i64>* %p)
   ret <1 x i64> %tmp
 }
 
+define <8 x i8> @ld1_8b(<8 x i8> %V, i8* %bar) {
+; CHECK-LABEL: ld1_8b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.b { v0 }[0], [x0]
+; CHECK-NEXT ret
+  %tmp1 = load i8* %bar
+  %tmp2 = insertelement <8 x i8> %V, i8 %tmp1, i32 0
+  ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @ld1_4h(<4 x i16> %V, i16* %bar) {
+; CHECK-LABEL: ld1_4h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.h { v0 }[0], [x0]
+; CHECK-NEXT ret
+  %tmp1 = load i16* %bar
+  %tmp2 = insertelement <4 x i16> %V, i16 %tmp1, i32 0
+  ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @ld1_2s(<2 x i32> %V, i32* %bar) {
+; CHECK-LABEL: ld1_2s:
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.s { v0 }[0], [x0]
+; CHECK-NEXT ret
+  %tmp1 = load i32* %bar
+  %tmp2 = insertelement <2 x i32> %V, i32 %tmp1, i32 0
+  ret <2 x i32> %tmp2
+}
+
+define <2 x float> @ld1_2s_float(<2 x float> %V, float* %bar) {
+; CHECK-LABEL: ld1_2s_float:
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.s { v0 }[0], [x0]
+; CHECK-NEXT ret
+  %tmp1 = load float* %bar
+  %tmp2 = insertelement <2 x float> %V, float %tmp1, i32 0
+  ret <2 x float> %tmp2
+}
+
 
 ; Add rdar://13098923 test case: vld1_dup_u32 doesn't generate ld1r.2s
 define void @ld1r_2s_from_dup(i8* nocapture %a, i8* nocapture %b, i16* nocapture %diff) nounwind ssp {
@@ -882,7 +942,7 @@ entry:
 ; Tests for rdar://11947069: vld1_dup_* and vld1q_dup_* code gen is suboptimal
 define <4 x float> @ld1r_4s_float(float* nocapture %x) {
 entry:
-; CHECK: ld1r_4s_float
+; CHECK-LABEL: ld1r_4s_float
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld1r.4s { v0 }, [x0]
 ; CHECK-NEXT ret
@@ -896,7 +956,7 @@ entry:
 
 define <2 x float> @ld1r_2s_float(float* nocapture %x) {
 entry:
-; CHECK: ld1r_2s_float
+; CHECK-LABEL: ld1r_2s_float
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld1r.2s { v0 }, [x0]
 ; CHECK-NEXT ret
@@ -908,7 +968,7 @@ entry:
 
 define <2 x double> @ld1r_2d_double(double* nocapture %x) {
 entry:
-; CHECK: ld1r_2d_double
+; CHECK-LABEL: ld1r_2d_double
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld1r.2d { v0 }, [x0]
 ; CHECK-NEXT ret
@@ -920,7 +980,7 @@ entry:
 
 define <1 x double> @ld1r_1d_double(double* nocapture %x) {
 entry:
-; CHECK: ld1r_1d_double
+; CHECK-LABEL: ld1r_1d_double
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ldr d0, [x0]
 ; CHECK-NEXT ret
@@ -931,7 +991,7 @@ entry:
 
 define <4 x float> @ld1r_4s_float_shuff(float* nocapture %x) {
 entry:
-; CHECK: ld1r_4s_float_shuff
+; CHECK-LABEL: ld1r_4s_float_shuff
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld1r.4s { v0 }, [x0]
 ; CHECK-NEXT ret
@@ -943,7 +1003,7 @@ entry:
 
 define <2 x float> @ld1r_2s_float_shuff(float* nocapture %x) {
 entry:
-; CHECK: ld1r_2s_float_shuff
+; CHECK-LABEL: ld1r_2s_float_shuff
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld1r.2s { v0 }, [x0]
 ; CHECK-NEXT ret
@@ -955,7 +1015,7 @@ entry:
 
 define <2 x double> @ld1r_2d_double_shuff(double* nocapture %x) {
 entry:
-; CHECK: ld1r_2d_double_shuff
+; CHECK-LABEL: ld1r_2d_double_shuff
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld1r.2d { v0 }, [x0]
 ; CHECK-NEXT ret
@@ -967,7 +1027,7 @@ entry:
 
 define <1 x double> @ld1r_1d_double_shuff(double* nocapture %x) {
 entry:
-; CHECK: ld1r_1d_double_shuff
+; CHECK-LABEL: ld1r_1d_double_shuff
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ldr d0, [x0]
 ; CHECK-NEXT ret

Modified: llvm/trunk/test/CodeGen/ARM64/st1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM64/st1.ll?rev=205294&r1=205293&r2=205294&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM64/st1.ll (original)
+++ llvm/trunk/test/CodeGen/ARM64/st1.ll Tue Apr  1 05:37:09 2014
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
 
 define void @st1lane_16b(<16 x i8> %A, i8* %D) {
-; CHECK: st1lane_16b
+; CHECK-LABEL: st1lane_16b
 ; CHECK: st1.b
   %tmp = extractelement <16 x i8> %A, i32 1
   store i8 %tmp, i8* %D
@@ -9,7 +9,7 @@ define void @st1lane_16b(<16 x i8> %A, i
 }
 
 define void @st1lane_8h(<8 x i16> %A, i16* %D) {
-; CHECK: st1lane_8h
+; CHECK-LABEL: st1lane_8h
 ; CHECK: st1.h
   %tmp = extractelement <8 x i16> %A, i32 1
   store i16 %tmp, i16* %D
@@ -17,44 +17,92 @@ define void @st1lane_8h(<8 x i16> %A, i1
 }
 
 define void @st1lane_4s(<4 x i32> %A, i32* %D) {
-; CHECK: st1lane_4s
+; CHECK-LABEL: st1lane_4s
 ; CHECK: st1.s
   %tmp = extractelement <4 x i32> %A, i32 1
   store i32 %tmp, i32* %D
   ret void
 }
 
+define void @st1lane_4s_float(<4 x float> %A, float* %D) {
+; CHECK-LABEL: st1lane_4s_float
+; CHECK: st1.s
+  %tmp = extractelement <4 x float> %A, i32 1
+  store float %tmp, float* %D
+  ret void
+}
+
 define void @st1lane_2d(<2 x i64> %A, i64* %D) {
-; CHECK: st1lane_2d
+; CHECK-LABEL: st1lane_2d
 ; CHECK: st1.d
   %tmp = extractelement <2 x i64> %A, i32 1
   store i64 %tmp, i64* %D
   ret void
 }
 
+define void @st1lane_2d_double(<2 x double> %A, double* %D) {
+; CHECK-LABEL: st1lane_2d_double
+; CHECK: st1.d
+  %tmp = extractelement <2 x double> %A, i32 1
+  store double %tmp, double* %D
+  ret void
+}
+
+define void @st1lane_8b(<8 x i8> %A, i8* %D) {
+; CHECK-LABEL: st1lane_8b
+; CHECK: st1.b
+  %tmp = extractelement <8 x i8> %A, i32 1
+  store i8 %tmp, i8* %D
+  ret void
+}
+
+define void @st1lane_4h(<4 x i16> %A, i16* %D) {
+; CHECK-LABEL: st1lane_4h
+; CHECK: st1.h
+  %tmp = extractelement <4 x i16> %A, i32 1
+  store i16 %tmp, i16* %D
+  ret void
+}
+
+define void @st1lane_2s(<2 x i32> %A, i32* %D) {
+; CHECK-LABEL: st1lane_2s
+; CHECK: st1.s
+  %tmp = extractelement <2 x i32> %A, i32 1
+  store i32 %tmp, i32* %D
+  ret void
+}
+
+define void @st1lane_2s_float(<2 x float> %A, float* %D) {
+; CHECK-LABEL: st1lane_2s_float
+; CHECK: st1.s
+  %tmp = extractelement <2 x float> %A, i32 1
+  store float %tmp, float* %D
+  ret void
+}
+
 define void @st2lane_16b(<16 x i8> %A, <16 x i8> %B, i8* %D) {
-; CHECK: st2lane_16b
+; CHECK-LABEL: st2lane_16b
 ; CHECK: st2.b
   call void @llvm.arm64.neon.st2lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i64 1, i8* %D)
   ret void
 }
 
 define void @st2lane_8h(<8 x i16> %A, <8 x i16> %B, i16* %D) {
-; CHECK: st2lane_8h
+; CHECK-LABEL: st2lane_8h
 ; CHECK: st2.h
   call void @llvm.arm64.neon.st2lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i64 1, i16* %D)
   ret void
 }
 
 define void @st2lane_4s(<4 x i32> %A, <4 x i32> %B, i32* %D) {
-; CHECK: st2lane_4s
+; CHECK-LABEL: st2lane_4s
 ; CHECK: st2.s
   call void @llvm.arm64.neon.st2lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i64 1, i32* %D)
   ret void
 }
 
 define void @st2lane_2d(<2 x i64> %A, <2 x i64> %B, i64* %D) {
-; CHECK: st2lane_2d
+; CHECK-LABEL: st2lane_2d
 ; CHECK: st2.d
   call void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64 1, i64* %D)
   ret void
@@ -66,28 +114,28 @@ declare void @llvm.arm64.neon.st2lane.v4
 declare void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readnone
 
 define void @st3lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %D) {
-; CHECK: st3lane_16b
+; CHECK-LABEL: st3lane_16b
 ; CHECK: st3.b
   call void @llvm.arm64.neon.st3lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i64 1, i8* %D)
   ret void
 }
 
 define void @st3lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %D) {
-; CHECK: st3lane_8h
+; CHECK-LABEL: st3lane_8h
 ; CHECK: st3.h
   call void @llvm.arm64.neon.st3lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i64 1, i16* %D)
   ret void
 }
 
 define void @st3lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %D) {
-; CHECK: st3lane_4s
+; CHECK-LABEL: st3lane_4s
 ; CHECK: st3.s
   call void @llvm.arm64.neon.st3lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i64 1, i32* %D)
   ret void
 }
 
 define void @st3lane_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %D) {
-; CHECK: st3lane_2d
+; CHECK-LABEL: st3lane_2d
 ; CHECK: st3.d
   call void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64 1, i64* %D)
   ret void
@@ -99,28 +147,28 @@ declare void @llvm.arm64.neon.st3lane.v4
 declare void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readnone
 
 define void @st4lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %E) {
-; CHECK: st4lane_16b
+; CHECK-LABEL: st4lane_16b
 ; CHECK: st4.b
   call void @llvm.arm64.neon.st4lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 1, i8* %E)
   ret void
 }
 
 define void @st4lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %E) {
-; CHECK: st4lane_8h
+; CHECK-LABEL: st4lane_8h
 ; CHECK: st4.h
   call void @llvm.arm64.neon.st4lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 1, i16* %E)
   ret void
 }
 
 define void @st4lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %E) {
-; CHECK: st4lane_4s
+; CHECK-LABEL: st4lane_4s
 ; CHECK: st4.s
   call void @llvm.arm64.neon.st4lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 1, i32* %E)
   ret void
 }
 
 define void @st4lane_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %E) {
-; CHECK: st4lane_2d
+; CHECK-LABEL: st4lane_2d
 ; CHECK: st4.d
   call void @llvm.arm64.neon.st4lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 1, i64* %E)
   ret void
@@ -133,21 +181,21 @@ declare void @llvm.arm64.neon.st4lane.v2
 
 
 define void @st2_8b(<8 x i8> %A, <8 x i8> %B, i8* %P) nounwind {
-; CHECK: st2_8b
+; CHECK-LABEL: st2_8b
 ; CHECK: st2.8b
 	call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, i8* %P)
 	ret void
 }
 
 define void @st3_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %P) nounwind {
-; CHECK: st3_8b
+; CHECK-LABEL: st3_8b
 ; CHECK: st3.8b
 	call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %P)
 	ret void
 }
 
 define void @st4_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %P) nounwind {
-; CHECK: st4_8b
+; CHECK-LABEL: st4_8b
 ; CHECK: st4.8b
 	call void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %P)
 	ret void
@@ -158,21 +206,21 @@ declare void @llvm.arm64.neon.st3.v8i8.p
 declare void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
 
 define void @st2_16b(<16 x i8> %A, <16 x i8> %B, i8* %P) nounwind {
-; CHECK: st2_16b
+; CHECK-LABEL: st2_16b
 ; CHECK: st2.16b
 	call void @llvm.arm64.neon.st2.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i8* %P)
 	ret void
 }
 
 define void @st3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %P) nounwind {
-; CHECK: st3_16b
+; CHECK-LABEL: st3_16b
 ; CHECK: st3.16b
 	call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %P)
 	ret void
 }
 
 define void @st4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %P) nounwind {
-; CHECK: st4_16b
+; CHECK-LABEL: st4_16b
 ; CHECK: st4.16b
 	call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %P)
 	ret void
@@ -183,21 +231,21 @@ declare void @llvm.arm64.neon.st3.v16i8.
 declare void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
 
 define void @st2_4h(<4 x i16> %A, <4 x i16> %B, i16* %P) nounwind {
-; CHECK: st2_4h
+; CHECK-LABEL: st2_4h
 ; CHECK: st2.4h
 	call void @llvm.arm64.neon.st2.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, i16* %P)
 	ret void
 }
 
 define void @st3_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %P) nounwind {
-; CHECK: st3_4h
+; CHECK-LABEL: st3_4h
 ; CHECK: st3.4h
 	call void @llvm.arm64.neon.st3.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %P)
 	ret void
 }
 
 define void @st4_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %P) nounwind {
-; CHECK: st4_4h
+; CHECK-LABEL: st4_4h
 ; CHECK: st4.4h
 	call void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %P)
 	ret void
@@ -208,21 +256,21 @@ declare void @llvm.arm64.neon.st3.v4i16.
 declare void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
 
 define void @st2_8h(<8 x i16> %A, <8 x i16> %B, i16* %P) nounwind {
-; CHECK: st2_8h
+; CHECK-LABEL: st2_8h
 ; CHECK: st2.8h
 	call void @llvm.arm64.neon.st2.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i16* %P)
 	ret void
 }
 
 define void @st3_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %P) nounwind {
-; CHECK: st3_8h
+; CHECK-LABEL: st3_8h
 ; CHECK: st3.8h
 	call void @llvm.arm64.neon.st3.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %P)
 	ret void
 }
 
 define void @st4_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %P) nounwind {
-; CHECK: st4_8h
+; CHECK-LABEL: st4_8h
 ; CHECK: st4.8h
 	call void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %P)
 	ret void
@@ -233,21 +281,21 @@ declare void @llvm.arm64.neon.st3.v8i16.
 declare void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
 
 define void @st2_2s(<2 x i32> %A, <2 x i32> %B, i32* %P) nounwind {
-; CHECK: st2_2s
+; CHECK-LABEL: st2_2s
 ; CHECK: st2.2s
 	call void @llvm.arm64.neon.st2.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, i32* %P)
 	ret void
 }
 
 define void @st3_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %P) nounwind {
-; CHECK: st3_2s
+; CHECK-LABEL: st3_2s
 ; CHECK: st3.2s
 	call void @llvm.arm64.neon.st3.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %P)
 	ret void
 }
 
 define void @st4_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %P) nounwind {
-; CHECK: st4_2s
+; CHECK-LABEL: st4_2s
 ; CHECK: st4.2s
 	call void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %P)
 	ret void
@@ -258,21 +306,21 @@ declare void @llvm.arm64.neon.st3.v2i32.
 declare void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
 
 define void @st2_4s(<4 x i32> %A, <4 x i32> %B, i32* %P) nounwind {
-; CHECK: st2_4s
+; CHECK-LABEL: st2_4s
 ; CHECK: st2.4s
 	call void @llvm.arm64.neon.st2.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i32* %P)
 	ret void
 }
 
 define void @st3_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %P) nounwind {
-; CHECK: st3_4s
+; CHECK-LABEL: st3_4s
 ; CHECK: st3.4s
 	call void @llvm.arm64.neon.st3.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %P)
 	ret void
 }
 
 define void @st4_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %P) nounwind {
-; CHECK: st4_4s
+; CHECK-LABEL: st4_4s
 ; CHECK: st4.4s
 	call void @llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %P)
 	ret void
@@ -283,21 +331,21 @@ declare void @llvm.arm64.neon.st3.v4i32.
 declare void @llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
 
 define void @st2_1d(<1 x i64> %A, <1 x i64> %B, i64* %P) nounwind {
-; CHECK: st2_1d
+; CHECK-LABEL: st2_1d
 ; CHECK: st1.1d
 	call void @llvm.arm64.neon.st2.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, i64* %P)
 	ret void
 }
 
 define void @st3_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %P) nounwind {
-; CHECK: st3_1d
+; CHECK-LABEL: st3_1d
 ; CHECK: st1.1d
 	call void @llvm.arm64.neon.st3.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %P)
 	ret void
 }
 
 define void @st4_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %P) nounwind {
-; CHECK: st4_1d
+; CHECK-LABEL: st4_1d
 ; CHECK: st1.1d
 	call void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %P)
 	ret void
@@ -308,21 +356,21 @@ declare void @llvm.arm64.neon.st3.v1i64.
 declare void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
 
 define void @st2_2d(<2 x i64> %A, <2 x i64> %B, i64* %P) nounwind {
-; CHECK: st2_2d
+; CHECK-LABEL: st2_2d
 ; CHECK: st2.2d
 	call void @llvm.arm64.neon.st2.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64* %P)
 	ret void
 }
 
 define void @st3_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %P) nounwind {
-; CHECK: st3_2d
+; CHECK-LABEL: st3_2d
 ; CHECK: st3.2d
 	call void @llvm.arm64.neon.st3.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %P)
 	ret void
 }
 
 define void @st4_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %P) nounwind {
-; CHECK: st4_2d
+; CHECK-LABEL: st4_2d
 ; CHECK: st4.2d
 	call void @llvm.arm64.neon.st4.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %P)
 	ret void





More information about the llvm-commits mailing list