[llvm] (WIP) [AArch64] Make use of byte FPR stores for bytes extracted from vectors (PR #134117)
Benjamin Maxwell via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 2 15:25:37 PDT 2025
https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/134117
>From e5cb18b70954c7557b407e3ff874cada5fe864c9 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 4 Mar 2025 16:18:34 +0000
Subject: [PATCH 1/3] [AArch64] Make use of byte FPR stores for bytes extracted
from vectors
This helps avoid some pointless `fmovs` in some cases. Currently, this
is done in ISEL as FPR bytes are problematic in SDAG (as neither GPR
nor FPR bytes are a legal type).
---
llvm/include/llvm/CodeGen/ValueTypes.td | 2 +
llvm/lib/CodeGen/ValueTypes.cpp | 2 +
.../Target/AArch64/AArch64ISelLowering.cpp | 1 +
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 11 +-
.../lib/Target/AArch64/AArch64RegisterInfo.td | 2 +-
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 37 ++++
.../CodeGen/AArch64/aarch64-sve-ldst-one.ll | 159 ++++++++++++++----
llvm/test/CodeGen/AArch64/add.ll | 3 +-
llvm/test/CodeGen/AArch64/andorxor.ll | 9 +-
.../test/CodeGen/AArch64/arm64-collect-loh.ll | 9 +-
llvm/test/CodeGen/AArch64/arm64-st1.ll | 36 ++--
llvm/test/CodeGen/AArch64/bitcast-v2i8.ll | 3 +-
llvm/test/CodeGen/AArch64/ctlz.ll | 3 +-
llvm/test/CodeGen/AArch64/ctpop.ll | 3 +-
llvm/test/CodeGen/AArch64/cttz.ll | 3 +-
.../CodeGen/AArch64/extract-vector-cmp.ll | 7 +-
llvm/test/CodeGen/AArch64/mul.ll | 3 +-
llvm/test/CodeGen/AArch64/neon-truncstore.ll | 6 +-
llvm/test/CodeGen/AArch64/nontemporal-load.ll | 3 +-
llvm/test/CodeGen/AArch64/pr-cf624b2.ll | 6 +-
llvm/test/CodeGen/AArch64/sadd_sat_vec.ll | 5 +-
.../CodeGen/AArch64/setcc-type-mismatch.ll | 3 +-
llvm/test/CodeGen/AArch64/ssub_sat_vec.ll | 5 +-
llvm/test/CodeGen/AArch64/store.ll | 3 +-
llvm/test/CodeGen/AArch64/sub.ll | 3 +-
...-streaming-mode-fixed-length-ld2-alloca.ll | 9 +-
...mode-fixed-length-masked-gather-scatter.ll | 12 +-
llvm/test/CodeGen/AArch64/uadd_sat_vec.ll | 5 +-
llvm/test/CodeGen/AArch64/usub_sat_vec.ll | 5 +-
.../vec-combine-compare-truncate-store.ll | 11 +-
.../AArch64/vec3-loads-ext-trunc-stores.ll | 26 ++-
llvm/test/CodeGen/AArch64/vector-compress.ll | 2 +-
32 files changed, 261 insertions(+), 136 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
index fc1a95e33380b..42c4830e94220 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -338,6 +338,8 @@ def amdgpuBufferFatPointer : ValueType<160, 234>;
// FIXME: Remove this and the getPointerType() override if MVT::i82 is added.
def amdgpuBufferStridedPointer : ValueType<192, 235>;
+def vi8 : ValueType<8, 236>; // 8-bit integer in FPR (AArch64)
+
let isNormalValueType = false in {
def token : ValueType<0, 504>; // TokenTy
def MetadataVT : ValueType<0, 505> { // Metadata
diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp
index 0554b6387c5e6..c769568253b12 100644
--- a/llvm/lib/CodeGen/ValueTypes.cpp
+++ b/llvm/lib/CodeGen/ValueTypes.cpp
@@ -198,6 +198,8 @@ std::string EVT::getEVTString() const {
return "amdgpuBufferFatPointer";
case MVT::amdgpuBufferStridedPointer:
return "amdgpuBufferStridedPointer";
+ case MVT::vi8:
+ return "vi8";
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1c8e3afdfd718..5fec669da9c33 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -401,6 +401,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
if (Subtarget->hasFPARMv8()) {
+ addRegisterClass(MVT::vi8, &AArch64::FPR8RegClass);
addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 6c61e3a613f6f..1c1ff656db910 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -3575,7 +3575,7 @@ defm LDRW : LoadUI<0b10, 0, 0b01, GPR32z, uimm12s4, "ldr",
(load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
let Predicates = [HasFPARMv8] in {
defm LDRB : LoadUI<0b00, 1, 0b01, FPR8Op, uimm12s1, "ldr",
- [(set FPR8Op:$Rt,
+ [(set (i8 FPR8Op:$Rt),
(load (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>;
defm LDRH : LoadUI<0b01, 1, 0b01, FPR16Op, uimm12s2, "ldr",
[(set (f16 FPR16Op:$Rt),
@@ -3763,7 +3763,7 @@ defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32z, "ldur",
(load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
let Predicates = [HasFPARMv8] in {
defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8Op, "ldur",
- [(set FPR8Op:$Rt,
+ [(set (i8 FPR8Op:$Rt),
(load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16Op, "ldur",
[(set (f16 FPR16Op:$Rt),
@@ -4333,7 +4333,7 @@ defm STRW : StoreUIz<0b10, 0, 0b00, GPR32z, uimm12s4, "str",
(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
let Predicates = [HasFPARMv8] in {
defm STRB : StoreUI<0b00, 1, 0b00, FPR8Op, uimm12s1, "str",
- [(store FPR8Op:$Rt,
+ [(store (i8 FPR8Op:$Rt),
(am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>;
defm STRH : StoreUI<0b01, 1, 0b00, FPR16Op, uimm12s2, "str",
[(store (f16 FPR16Op:$Rt),
@@ -4451,6 +4451,8 @@ multiclass VecStoreLane0Pat<ComplexPattern UIAddrMode, SDPatternOperator storeop
}
let AddedComplexity = 19 in {
+ defm : VecStoreLane0Pat<am_indexed8, truncstorei8, v16i8, i32, vi8, bsub, uimm12s2, STRBui>;
+ defm : VecStoreLane0Pat<am_indexed8, truncstorei8, v4i32, i32, vi8, bsub, uimm12s2, STRBui>;
defm : VecStoreLane0Pat<am_indexed16, truncstorei16, v8i16, i32, f16, hsub, uimm12s2, STRHui>;
defm : VecStoreLane0Pat<am_indexed16, store, v8f16, f16, f16, hsub, uimm12s2, STRHui>;
defm : VecStoreLane0Pat<am_indexed32, store, v4i32, i32, i32, ssub, uimm12s4, STRSui>;
@@ -4469,7 +4471,7 @@ defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32z, "stur",
(am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>;
let Predicates = [HasFPARMv8] in {
defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8Op, "stur",
- [(store FPR8Op:$Rt,
+ [(store (i8 FPR8Op:$Rt),
(am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
defm STURH : StoreUnscaled<0b01, 1, 0b00, FPR16Op, "stur",
[(store (f16 FPR16Op:$Rt),
@@ -4598,6 +4600,7 @@ multiclass VecStoreULane0Pat<SDPatternOperator StoreOp,
}
let AddedComplexity = 19 in {
+ defm : VecStoreULane0Pat<truncstorei8, v16i8, i32, vi8, bsub, STURBi>;
defm : VecStoreULane0Pat<truncstorei16, v8i16, i32, f16, hsub, STURHi>;
defm : VecStoreULane0Pat<store, v8f16, f16, f16, hsub, STURHi>;
defm : VecStoreULane0Pat<store, v4i32, i32, i32, ssub, STURSi>;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index fed9b7b173e9c..42ba1451650ed 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -497,7 +497,7 @@ def Q30 : AArch64Reg<30, "q30", [D30, D30_HI], ["v30", ""]>, DwarfRegAlias<B30
def Q31 : AArch64Reg<31, "q31", [D31, D31_HI], ["v31", ""]>, DwarfRegAlias<B31>;
}
-def FPR8 : RegisterClass<"AArch64", [i8], 8, (sequence "B%u", 0, 31)> {
+def FPR8 : RegisterClass<"AArch64", [i8, vi8], 8, (sequence "B%u", 0, 31)> {
let Size = 8;
let DecoderMethod = "DecodeSimpleRegisterClass<AArch64::FPR8RegClassID, 0, 32>";
}
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 3ee71c14c6bd4..1884a90828acb 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1827,6 +1827,43 @@ let Predicates = [HasSVE] in {
defm : adrXtwShiftPat<nxv2i64, nxv2i1, 3>;
} // End HasSVE
+multiclass SVEVecStoreLanePat<ComplexPattern UIAddrMode, SDPatternOperator storeop,
+ ValueType VTy, ValueType STy,
+ ValueType SubRegTy,
+ SubRegIndex SubRegIdx, Operand IndexType,
+ Instruction STR,
+ Instruction DUP, AsmVectorIndexOpnd DUPIdxTy> {
+ let Predicates = [HasSVE_or_SME] in {
+ // Same as Neon VecStoreLane0Pat but without matching VecListOne128.
+ def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))),
+ (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
+ (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)),
+ GPR64sp:$Rn, IndexType:$offset)>;
+ }
+
+ // Non-zero immediate index:
+ def : Pat<(storeop (STy (vector_extract VTy:$Vt, DUPIdxTy:$idx)),
+ (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
+ (STR (SubRegTy (EXTRACT_SUBREG (DUP $Vt, DUPIdxTy:$idx), SubRegIdx)),
+ GPR64sp:$Rn, IndexType:$offset)>;
+}
+
+// Note: Types other than i8 are handled in performSTORECombine -- i8 is tricky
+// to handle before ISEL as it is not really a legal type in many places, nor
+// is its equivalently sized FP variant.
+let AddedComplexity = 19 in {
+ // Lane 0 truncating stores
+ // i32 -> i8
+ defm : SVEVecStoreLanePat<am_indexed8, truncstorei8, nxv4i32, i32, vi8, bsub, uimm12s4, STRBui, DUP_ZZI_S, sve_elm_idx_extdup_s>;
+ defm : SVEVecStoreLanePat<am_unscaled8, truncstorei8, nxv4i32, i32, vi8, bsub, simm9, STURBi, DUP_ZZI_S, sve_elm_idx_extdup_s>;
+ // i64 -> i8
+ defm : SVEVecStoreLanePat<am_indexed8, truncstorei8, nxv2i64, i64, vi8, bsub, uimm12s4, STRBui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+ defm : SVEVecStoreLanePat<am_unscaled8, truncstorei8, nxv2i64, i64, vi8, bsub, simm9, STURBi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
+ // i8 -> i8 (technically a truncate as the extracted type is i32)
+ defm : SVEVecStoreLanePat<am_indexed8, truncstorei8, nxv16i8, i32, vi8, bsub, uimm12s4, STRBui, DUP_ZZI_B, sve_elm_idx_extdup_b>;
+ defm : SVEVecStoreLanePat<am_unscaled8, truncstorei8, nxv16i8, i32, vi8, bsub, simm9, STURBi, DUP_ZZI_B, sve_elm_idx_extdup_b>;
+}
+
let Predicates = [HasSVE_or_SME] in {
defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
index d39c9bf760621..b91cb872a9e0a 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s --check-prefixes=CHECK,CHECK-NONSTREAMING
-; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s --check-prefixes=CHECK,STREAMING-COMPAT
-; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s --check-prefixes=CHECK,STREAMING-COMPAT
+; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
@@ -106,18 +106,11 @@ entry:
}
define void @test_str_lane_s8(ptr %a, <vscale x 16 x i8> %b) {
-; CHECK-NONSTREAMING-LABEL: test_str_lane_s8:
-; CHECK-NONSTREAMING: // %bb.0: // %entry
-; CHECK-NONSTREAMING-NEXT: umov w8, v0.b[7]
-; CHECK-NONSTREAMING-NEXT: strb w8, [x0]
-; CHECK-NONSTREAMING-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane_s8:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: mov z0.b, z0.b[7]
-; STREAMING-COMPAT-NEXT: fmov w8, s0
-; STREAMING-COMPAT-NEXT: strb w8, [x0]
-; STREAMING-COMPAT-NEXT: ret
+; CHECK-LABEL: test_str_lane_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.b, z0.b[7]
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
entry:
%0 = extractelement <vscale x 16 x i8> %b, i32 7
@@ -128,8 +121,7 @@ entry:
define void @test_str_lane0_s8(ptr %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: test_str_lane0_s8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
entry:
@@ -201,6 +193,19 @@ define void @test_str_reduction_i32_to_i16(ptr %ptr, <vscale x 4 x i1> %p0, <vsc
ret void
}
+define void @test_str_reduction_i32_to_i8(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ %trunc = trunc i64 %reduce to i8
+ store i8 %trunc, ptr %ptr, align 1
+ ret void
+}
+
define void @test_str_reduction_i32_to_i32_negative_offset(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
; CHECK-LABEL: test_str_reduction_i32_to_i32_negative_offset:
; CHECK: // %bb.0:
@@ -242,6 +247,20 @@ define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, <vscale x 4
ret void
}
+define void @test_str_reduction_i32_to_i8_negative_offset(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: test_str_reduction_i32_to_i8_negative_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uaddv d0, p0, z0.s
+; CHECK-NEXT: stur b0, [x0, #-8]
+; CHECK-NEXT: ret
+
+ %reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
+ %trunc = trunc i64 %reduce to i8
+ %out_ptr = getelementptr inbounds i8, ptr %ptr, i64 -8
+ store i8 %trunc, ptr %out_ptr, align 1
+ ret void
+}
+
define void @test_str_lane_s32_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: test_str_lane_s32_negative_offset:
; CHECK: // %bb.0: // %entry
@@ -297,18 +316,11 @@ entry:
}
define void @test_str_lane_s8_negative_offset(ptr %a, <vscale x 16 x i8> %b) {
-; CHECK-NONSTREAMING-LABEL: test_str_lane_s8_negative_offset:
-; CHECK-NONSTREAMING: // %bb.0: // %entry
-; CHECK-NONSTREAMING-NEXT: umov w8, v0.b[7]
-; CHECK-NONSTREAMING-NEXT: sturb w8, [x0, #-8]
-; CHECK-NONSTREAMING-NEXT: ret
-;
-; STREAMING-COMPAT-LABEL: test_str_lane_s8_negative_offset:
-; STREAMING-COMPAT: // %bb.0: // %entry
-; STREAMING-COMPAT-NEXT: mov z0.b, z0.b[7]
-; STREAMING-COMPAT-NEXT: fmov w8, s0
-; STREAMING-COMPAT-NEXT: sturb w8, [x0, #-8]
-; STREAMING-COMPAT-NEXT: ret
+; CHECK-LABEL: test_str_lane_s8_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.b, z0.b[7]
+; CHECK-NEXT: stur b0, [x0, #-8]
+; CHECK-NEXT: ret
entry:
%0 = extractelement <vscale x 16 x i8> %b, i32 7
@@ -320,8 +332,7 @@ entry:
define void @test_str_lane0_s8_negative_offset(ptr %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: test_str_lane0_s8_negative_offset:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: sturb w8, [x0, #-8]
+; CHECK-NEXT: stur b0, [x0, #-8]
; CHECK-NEXT: ret
entry:
@@ -385,6 +396,48 @@ entry:
ret void
}
+
+define void @test_str_trunc_lane_s32_to_s8(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_trunc_lane_s32_to_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.s, z0.s[3]
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 3
+ %trunc = trunc i32 %0 to i8
+ store i8 %trunc, ptr %a, align 1
+ ret void
+}
+
+define void @test_str_trunc_lane0_s32_to_s8(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_trunc_lane0_s32_to_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 0
+ %trunc = trunc i32 %0 to i8
+ store i8 %trunc, ptr %a, align 1
+ ret void
+}
+
+define void @test_str_trunc_lane_s64_to_s8(ptr %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_str_trunc_lane_s64_to_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.d, z0.d[3]
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 2 x i64> %b, i32 3
+ %trunc = trunc i64 %0 to i8
+ store i8 %trunc, ptr %a, align 1
+ ret void
+}
+
define void @test_str_trunc_lane_s32_to_s16_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: test_str_trunc_lane_s32_to_s16_negative_offset:
; CHECK: // %bb.0: // %entry
@@ -413,3 +466,47 @@ entry:
store i16 %trunc, ptr %out_ptr, align 2
ret void
}
+
+define void @test_str_trunc_lane_s32_to_s8_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_trunc_lane_s32_to_s8_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.s, z0.s[3]
+; CHECK-NEXT: stur b0, [x0, #-8]
+; CHECK-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 3
+ %trunc = trunc i32 %0 to i8
+ %out_ptr = getelementptr inbounds i8, ptr %a, i64 -8
+ store i8 %trunc, ptr %out_ptr, align 1
+ ret void
+}
+
+define void @test_str_trunc_lane0_s32_to_s8_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_str_trunc_lane0_s32_to_s8_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stur b0, [x0, #-8]
+; CHECK-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 4 x i32> %b, i32 0
+ %trunc = trunc i32 %0 to i8
+ %out_ptr = getelementptr inbounds i8, ptr %a, i64 -8
+ store i8 %trunc, ptr %out_ptr, align 1
+ ret void
+}
+
+define void @test_str_trunc_lane_s64_to_s8_negative_offset(ptr %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_str_trunc_lane_s64_to_s8_negative_offset:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.d, z0.d[3]
+; CHECK-NEXT: stur b0, [x0, #-8]
+; CHECK-NEXT: ret
+
+entry:
+ %0 = extractelement <vscale x 2 x i64> %b, i32 3
+ %trunc = trunc i64 %0 to i8
+ %out_ptr = getelementptr inbounds i8, ptr %a, i64 -8
+ store i8 %trunc, ptr %out_ptr, align 1
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll
index fc0ba336b21cc..cdde359d09d7b 100644
--- a/llvm/test/CodeGen/AArch64/add.ll
+++ b/llvm/test/CodeGen/AArch64/add.ll
@@ -64,8 +64,7 @@ define void @v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
+; CHECK-SD-NEXT: str b0, [x0]
; CHECK-SD-NEXT: strb w8, [x0, #1]
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll
index 24f2549cce785..03c7bad9efc22 100644
--- a/llvm/test/CodeGen/AArch64/andorxor.ll
+++ b/llvm/test/CodeGen/AArch64/andorxor.ll
@@ -184,8 +184,7 @@ define void @and_v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
+; CHECK-SD-NEXT: str b0, [x0]
; CHECK-SD-NEXT: strb w8, [x0, #1]
; CHECK-SD-NEXT: ret
;
@@ -221,8 +220,7 @@ define void @or_v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
+; CHECK-SD-NEXT: str b0, [x0]
; CHECK-SD-NEXT: strb w8, [x0, #1]
; CHECK-SD-NEXT: ret
;
@@ -258,8 +256,7 @@ define void @xor_v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
+; CHECK-SD-NEXT: str b0, [x0]
; CHECK-SD-NEXT: strb w8, [x0, #1]
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
index 2c065e0051cd7..7f2bebf584d8f 100644
--- a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
@@ -615,11 +615,10 @@ define <1 x i8> @getL() {
; CHECK-NEXT: ; kill
; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _L at GOTPAGEOFF]
-; Ultimately we should generate str b0, but right now, we match the vector
-; variant which does not allow to fold the immediate into the store.
-; CHECK-NEXT: st1.b { v0 }[0], [x[[LDRGOT_REG]]]
+; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: str b0, [x[[LDRGOT_REG]]]
; CHECK-NEXT: ret
-; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]]
+; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]]
define void @setL(<1 x i8> %t) {
store <1 x i8> %t, ptr @L, align 4
ret void
@@ -678,6 +677,6 @@ if.end.i:
call void (ptr, ...) @callee(ptr @.str.89, ptr @.str.90, double %sub)
unreachable
}
-declare void @callee(ptr nocapture readonly, ...)
+declare void @callee(ptr nocapture readonly, ...)
attributes #0 = { "target-cpu"="cyclone" }
diff --git a/llvm/test/CodeGen/AArch64/arm64-st1.ll b/llvm/test/CodeGen/AArch64/arm64-st1.ll
index 6f87c66c87345..c63d66c4e7706 100644
--- a/llvm/test/CodeGen/AArch64/arm64-st1.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-st1.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,SD-CHECK
+; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,GI-CHECK
; The instruction latencies of Exynos-M3 trigger the transform we see under the Exynos check.
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs -mcpu=exynos-m3 | FileCheck --check-prefix=EXYNOS %s
@@ -13,8 +13,11 @@ define void @st1lane_16b(<16 x i8> %A, ptr %D) {
}
define void @st1lane0_16b(<16 x i8> %A, ptr %D) {
-; CHECK-LABEL: st1lane0_16b
-; CHECK: st1.b { v0 }[0], [x{{[0-9]+}}]
+; SD-CHECK-LABEL: st1lane0_16b
+; SD-CHECK: str b0, [x{{[0-9]+}}, #1]
+
+; GI-CHECK-LABEL: st1lane0_16b
+; GI-CHECK: st1.b { v0 }[0], [x{{[0-9]+}}]
%ptr = getelementptr i8, ptr %D, i64 1
%tmp = extractelement <16 x i8> %A, i32 0
store i8 %tmp, ptr %ptr
@@ -22,8 +25,11 @@ define void @st1lane0_16b(<16 x i8> %A, ptr %D) {
}
define void @st1lane0u_16b(<16 x i8> %A, ptr %D) {
-; CHECK-LABEL: st1lane0u_16b
-; CHECK: st1.b { v0 }[0], [x{{[0-9]+}}]
+; SD-CHECK-LABEL: st1lane0u_16b
+; SD-CHECK: stur b0, [x{{[0-9]+}}, #-1]
+
+; GI-CHECK-LABEL: st1lane0u_16b
+; GI-CHECK: st1.b { v0 }[0], [x{{[0-9]+}}]
%ptr = getelementptr i8, ptr %D, i64 -1
%tmp = extractelement <16 x i8> %A, i32 0
store i8 %tmp, ptr %ptr
@@ -41,9 +47,12 @@ define void @st1lane_ro_16b(<16 x i8> %A, ptr %D, i64 %offset) {
}
define void @st1lane0_ro_16b(<16 x i8> %A, ptr %D, i64 %offset) {
-; CHECK-LABEL: st1lane0_ro_16b
-; CHECK: add x[[XREG:[0-9]+]], x0, x1
-; CHECK: st1.b { v0 }[0], [x[[XREG]]]
+; SD-CHECK-LABEL: st1lane0_ro_16b
+; SD-CHECK: str b0, [x0, x1]
+
+; GI-CHECK-LABEL: st1lane0_ro_16b
+; GI-CHECK: add x[[XREG:[0-9]+]], x0, x1
+; GI-CHECK: st1.b { v0 }[0], [x[[XREG]]]
%ptr = getelementptr i8, ptr %D, i64 %offset
%tmp = extractelement <16 x i8> %A, i32 0
store i8 %tmp, ptr %ptr
@@ -300,9 +309,12 @@ define void @st1lane_ro_8b(<8 x i8> %A, ptr %D, i64 %offset) {
}
define void @st1lane0_ro_8b(<8 x i8> %A, ptr %D, i64 %offset) {
-; CHECK-LABEL: st1lane0_ro_8b
-; CHECK: add x[[XREG:[0-9]+]], x0, x1
-; CHECK: st1.b { v0 }[0], [x[[XREG]]]
+; SD-CHECK-LABEL: st1lane0_ro_8b
+; SD-CHECK: str b0, [x0, x1]
+
+; GI-CHECK-LABEL: st1lane0_ro_8b
+; GI-CHECK: add x[[XREG:[0-9]+]], x0, x1
+; GI-CHECK: st1.b { v0 }[0], [x[[XREG]]]
%ptr = getelementptr i8, ptr %D, i64 %offset
%tmp = extractelement <8 x i8> %A, i32 0
store i8 %tmp, ptr %ptr
diff --git a/llvm/test/CodeGen/AArch64/bitcast-v2i8.ll b/llvm/test/CodeGen/AArch64/bitcast-v2i8.ll
index aff3ffc70a711..77304aef4385e 100644
--- a/llvm/test/CodeGen/AArch64/bitcast-v2i8.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast-v2i8.ll
@@ -5,9 +5,8 @@
define i16 @test_bitcast_v2i8_to_i16(<2 x i8> %a) {
; CHECK-LABEL: test_bitcast_v2i8_to_i16
; CHECK: mov.s [[WREG_HI:w[0-9]+]], v0[1]
-; CHECK-NEXT: fmov [[WREG_LO:w[0-9]+]], s0
; CHECK-NEXT: strb [[WREG_HI]], [sp, #15]
-; CHECK-NEXT: strb [[WREG_LO]], [sp, #14]
+; CHECK-NEXT: str [[WREG_LO:b[0-9]+]], [sp, #14]
; CHECK-NEXT: ldrh w0, [sp, #14]
%aa = bitcast <2 x i8> %a to i16
diff --git a/llvm/test/CodeGen/AArch64/ctlz.ll b/llvm/test/CodeGen/AArch64/ctlz.ll
index 742433c50d390..79676efebe776 100644
--- a/llvm/test/CodeGen/AArch64/ctlz.ll
+++ b/llvm/test/CodeGen/AArch64/ctlz.ll
@@ -14,8 +14,7 @@ define void @v2i8(ptr %p1) {
; CHECK-SD-NEXT: clz v1.2s, v1.2s
; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s
; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
+; CHECK-SD-NEXT: str b0, [x0]
; CHECK-SD-NEXT: strb w8, [x0, #1]
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll
index c7c378d3e67cd..767b9d28d6215 100644
--- a/llvm/test/CodeGen/AArch64/ctpop.ll
+++ b/llvm/test/CodeGen/AArch64/ctpop.ll
@@ -14,8 +14,7 @@ define void @v2i8(ptr %p1) {
; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b
; CHECK-SD-NEXT: uaddlp v0.2s, v0.4h
; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
+; CHECK-SD-NEXT: str b0, [x0]
; CHECK-SD-NEXT: strb w8, [x0, #1]
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/cttz.ll b/llvm/test/CodeGen/AArch64/cttz.ll
index 41843e03cb81e..97f5a29064c67 100644
--- a/llvm/test/CodeGen/AArch64/cttz.ll
+++ b/llvm/test/CodeGen/AArch64/cttz.ll
@@ -17,8 +17,7 @@ define void @v2i8(ptr %p1) {
; CHECK-SD-NEXT: clz v0.2s, v0.2s
; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s
; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
+; CHECK-SD-NEXT: str b0, [x0]
; CHECK-SD-NEXT: strb w8, [x0, #1]
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
index 8345fdfa46b4c..f076ee12427d8 100644
--- a/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
+++ b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll
@@ -184,17 +184,16 @@ define i1 @extract_icmp_v4i32_splat_rhs_mul_use(<4 x i32> %a, ptr %p) {
; CHECK-LABEL: extract_icmp_v4i32_splat_rhs_mul_use:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.4s, #235
-; CHECK-NEXT: adrp x9, .LCPI8_0
+; CHECK-NEXT: adrp x8, .LCPI8_0
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_0]
; CHECK-NEXT: mov x8, x0
-; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI8_0]
; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
; CHECK-NEXT: xtn v1.4h, v0.4s
; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: umov w9, v1.h[1]
-; CHECK-NEXT: fmov w10, s0
+; CHECK-NEXT: str b0, [x8]
; CHECK-NEXT: and w0, w9, #0x1
-; CHECK-NEXT: strb w10, [x8]
; CHECK-NEXT: ret
%icmp = icmp ult <4 x i32> %a, splat(i32 235)
%ext = extractelement <4 x i1> %icmp, i32 1
diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll
index 500379d1cfdec..0d7a6a7dbcb11 100644
--- a/llvm/test/CodeGen/AArch64/mul.ll
+++ b/llvm/test/CodeGen/AArch64/mul.ll
@@ -76,8 +76,7 @@ define void @v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
+; CHECK-SD-NEXT: str b0, [x0]
; CHECK-SD-NEXT: strb w8, [x0, #1]
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/neon-truncstore.ll b/llvm/test/CodeGen/AArch64/neon-truncstore.ll
index 3d3362d314a99..a070e3d7565ed 100644
--- a/llvm/test/CodeGen/AArch64/neon-truncstore.ll
+++ b/llvm/test/CodeGen/AArch64/neon-truncstore.ll
@@ -90,8 +90,7 @@ define void @v2i32_v2i8(<2 x i32> %a, ptr %result) {
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strb w9, [x0]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: strb w8, [x0, #1]
; CHECK-NEXT: ret
%b = trunc <2 x i32> %a to <2 x i8>
@@ -157,8 +156,7 @@ define void @v2i16_v2i8(<2 x i16> %a, ptr %result) {
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: strb w9, [x0]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: strb w8, [x0, #1]
; CHECK-NEXT: ret
%b = trunc <2 x i16> %a to <2 x i8>
diff --git a/llvm/test/CodeGen/AArch64/nontemporal-load.ll b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
index 959ac7f68e351..28cff55beff9e 100644
--- a/llvm/test/CodeGen/AArch64/nontemporal-load.ll
+++ b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
@@ -449,10 +449,9 @@ define <33 x i8> @test_ldnp_v33i8(ptr %A) {
; CHECK-LABEL: test_ldnp_v33i8:
; CHECK: ; %bb.0:
; CHECK-NEXT: ldnp q0, q1, [x0]
-; CHECK-NEXT: add x9, x8, #32
; CHECK-NEXT: ldr b2, [x0, #32]
; CHECK-NEXT: stp q0, q1, [x8]
-; CHECK-NEXT: st1.b { v2 }[0], [x9]
+; CHECK-NEXT: str b2, [x8, #32]
; CHECK-NEXT: ret
;
; CHECK-BE-LABEL: test_ldnp_v33i8:
diff --git a/llvm/test/CodeGen/AArch64/pr-cf624b2.ll b/llvm/test/CodeGen/AArch64/pr-cf624b2.ll
index ea9588e9e3db7..0b0540e559abd 100644
--- a/llvm/test/CodeGen/AArch64/pr-cf624b2.ll
+++ b/llvm/test/CodeGen/AArch64/pr-cf624b2.ll
@@ -14,9 +14,9 @@ define linkonce_odr void @_ZN1y2beEPiRK1vPmPS1_(<8 x i8> %0, ptr %agg.tmp.i) {
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: str b0, [sp]
; CHECK-NEXT: orr x9, x8, #0xf
; CHECK-NEXT: orr x10, x8, #0xe
-; CHECK-NEXT: st1 { v0.b }[0], [x8]
; CHECK-NEXT: st1 { v0.b }[15], [x9]
; CHECK-NEXT: orr x9, x8, #0xc
; CHECK-NEXT: st1 { v0.b }[12], [x9]
@@ -46,9 +46,9 @@ define linkonce_odr void @_ZN1y2beEPiRK1vPmPS1_(<8 x i8> %0, ptr %agg.tmp.i) {
; CHECK-NEXT: mov w10, #9 // =0x9
; CHECK-NEXT: st1 { v0.b }[10], [x9]
; CHECK-NEXT: orr x9, x8, x10
+; CHECK-NEXT: mov w10, #5 // =0x5
+; CHECK-NEXT: orr x8, x8, x10
; CHECK-NEXT: st1 { v0.b }[9], [x9]
-; CHECK-NEXT: mov w9, #5 // =0x5
-; CHECK-NEXT: orr x8, x8, x9
; CHECK-NEXT: st1 { v0.b }[5], [x8]
; CHECK-NEXT: ldr q0, [sp]
; CHECK-NEXT: stp q0, q1, [x0]
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
index 4d76994be204f..cbb3b06030bae 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -201,8 +201,7 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: ushr v0.2s, v0.2s, #24
; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x2]
+; CHECK-SD-NEXT: str b0, [x2]
; CHECK-SD-NEXT: strb w8, [x2, #1]
; CHECK-SD-NEXT: ret
;
@@ -325,7 +324,7 @@ define void @v1i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-NEXT: ldr b0, [x0]
; CHECK-SD-NEXT: ldr b1, [x1]
; CHECK-SD-NEXT: sqadd v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT: st1 { v0.b }[0], [x2]
+; CHECK-SD-NEXT: str b0, [x2]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v1i8:
diff --git a/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll b/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll
index c0a728014e390..950ac92a8b12f 100644
--- a/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll
+++ b/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll
@@ -12,8 +12,7 @@ define void @test_mismatched_setcc(<4 x i22> %l, <4 x i22> %r, ptr %addr) {
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
%tst = icmp eq <4 x i22> %l, %r
diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
index ae2a16929e254..04b379f455008 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -202,8 +202,7 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: ushr v0.2s, v0.2s, #24
; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x2]
+; CHECK-SD-NEXT: str b0, [x2]
; CHECK-SD-NEXT: strb w8, [x2, #1]
; CHECK-SD-NEXT: ret
;
@@ -326,7 +325,7 @@ define void @v1i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-NEXT: ldr b0, [x0]
; CHECK-SD-NEXT: ldr b1, [x1]
; CHECK-SD-NEXT: sqsub v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT: st1 { v0.b }[0], [x2]
+; CHECK-SD-NEXT: str b0, [x2]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v1i8:
diff --git a/llvm/test/CodeGen/AArch64/store.ll b/llvm/test/CodeGen/AArch64/store.ll
index 37a6ad08d4cb3..7ea957d9d165d 100644
--- a/llvm/test/CodeGen/AArch64/store.ll
+++ b/llvm/test/CodeGen/AArch64/store.ll
@@ -111,8 +111,7 @@ define void @store_v2i8(<2 x i8> %a, ptr %ptr){
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
+; CHECK-SD-NEXT: str b0, [x0]
; CHECK-SD-NEXT: strb w8, [x0, #1]
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll
index 8183a82f21cb5..91a17a89af6e1 100644
--- a/llvm/test/CodeGen/AArch64/sub.ll
+++ b/llvm/test/CodeGen/AArch64/sub.ll
@@ -64,8 +64,7 @@ define void @v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x0]
+; CHECK-SD-NEXT: str b0, [x0]
; CHECK-SD-NEXT: strb w8, [x0, #1]
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
index aa1adfd306a4c..89a06bc9d5b4e 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -73,8 +73,7 @@ define void @alloc_v6i8(ptr %st_ptr) nounwind {
; CHECK-NEXT: zip1 z1.s, z1.s, z0.s
; CHECK-NEXT: st1b { z1.h }, p0, [x8]
; CHECK-NEXT: ld1h { z1.s }, p1/z, [x8]
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strb w8, [x19, #2]
+; CHECK-NEXT: str b0, [x19, #2]
; CHECK-NEXT: str h1, [x19]
; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #32
@@ -119,11 +118,11 @@ define void @alloc_v32i8(ptr %st_ptr) nounwind {
; CHECK-NEXT: mov x0, sp
; CHECK-NEXT: bl def
; CHECK-NEXT: adrp x8, .LCPI2_0
-; CHECK-NEXT: ldp q0, q2, [sp]
+; CHECK-NEXT: ldr q0, [sp]
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
; CHECK-NEXT: tbl z0.b, { z0.b }, z1.b
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: strb w8, [x19, #8]
+; CHECK-NEXT: ldr q1, [sp, #16]
+; CHECK-NEXT: str b1, [x19, #8]
; CHECK-NEXT: str d0, [x19]
; CHECK-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #48
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll
index d9f8482a3c503..b1ac9469c0573 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll
@@ -20,9 +20,8 @@ define <2 x i64> @masked_gather_v2i64(ptr %a, ptr %b) vscale_range(2, 2) {
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: uaddv d0, p0, z0.d
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: strb w8, [sp, #12]
-; CHECK-NEXT: and w8, w8, #0xff
+; CHECK-NEXT: str b0, [sp, #12]
+; CHECK-NEXT: ldrb w8, [sp, #12]
; CHECK-NEXT: tbz w8, #0, .LBB0_2
; CHECK-NEXT: // %bb.1: // %cond.load
; CHECK-NEXT: fmov x9, d1
@@ -109,11 +108,10 @@ define void @masked_scatter_v2i64(ptr %a, ptr %b) vscale_range(2, 2) {
; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: and z1.d, z2.d, z1.d
-; CHECK-NEXT: uaddv d1, p0, z1.d
-; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: uaddv d2, p0, z1.d
; CHECK-NEXT: ldr q1, [x1]
-; CHECK-NEXT: strb w8, [sp, #12]
-; CHECK-NEXT: and w8, w8, #0xff
+; CHECK-NEXT: str b2, [sp, #12]
+; CHECK-NEXT: ldrb w8, [sp, #12]
; CHECK-NEXT: tbnz w8, #0, .LBB1_3
; CHECK-NEXT: // %bb.1: // %else
; CHECK-NEXT: tbnz w8, #1, .LBB1_4
diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
index d0173307bd830..edd96ae4836a4 100644
--- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
@@ -199,8 +199,7 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s
; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x2]
+; CHECK-SD-NEXT: str b0, [x2]
; CHECK-SD-NEXT: strb w8, [x2, #1]
; CHECK-SD-NEXT: ret
;
@@ -324,7 +323,7 @@ define void @v1i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-NEXT: ldr b0, [x0]
; CHECK-SD-NEXT: ldr b1, [x1]
; CHECK-SD-NEXT: uqadd v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT: st1 { v0.b }[0], [x2]
+; CHECK-SD-NEXT: str b0, [x2]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v1i8:
diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
index dc3ebfb0682ca..63ca1b51c2291 100644
--- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
@@ -198,8 +198,7 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-NEXT: mov v1.s[1], w11
; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: strb w9, [x2]
+; CHECK-SD-NEXT: str b0, [x2]
; CHECK-SD-NEXT: strb w8, [x2, #1]
; CHECK-SD-NEXT: ret
;
@@ -321,7 +320,7 @@ define void @v1i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-NEXT: ldr b0, [x0]
; CHECK-SD-NEXT: ldr b1, [x1]
; CHECK-SD-NEXT: uqsub v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT: st1 { v0.b }[0], [x2]
+; CHECK-SD-NEXT: str b0, [x2]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v1i8:
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
index dd7a9c6d7768b..d9b5a42ba98a6 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
@@ -56,8 +56,7 @@ define void @store_4_elements(<4 x i32> %vec, ptr %out) {
; CHECK-NEXT: ldr q1, [x8, lCPI2_0 at PAGEOFF]
; CHECK-NEXT: bic.16b v0, v1, v0
; CHECK-NEXT: addv.4s s0, v0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh5
@@ -99,8 +98,7 @@ define void @add_trunc_compare_before_store(<4 x i32> %vec, ptr %out) {
; CHECK-NEXT: cmlt.4s v0, v0, #0
; CHECK-NEXT: and.16b v0, v0, v1
; CHECK-NEXT: addv.4s s0, v0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh8, Lloh9
@@ -141,7 +139,7 @@ define void @store_8_elements_64_bit_vector(<8 x i8> %vec, ptr %out) {
; CHECK-NEXT: ldr d1, [x8, lCPI6_0 at PAGEOFF]
; CHECK-NEXT: bic.8b v0, v1, v0
; CHECK-NEXT: addv.8b b0, v0
-; CHECK-NEXT: st1.b { v0 }[0], [x0]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh13
@@ -182,8 +180,7 @@ define void @store_2_elements_64_bit_vector(<2 x i32> %vec, ptr %out) {
; CHECK-NEXT: ldr d1, [x8, lCPI8_0 at PAGEOFF]
; CHECK-NEXT: bic.8b v0, v1, v0
; CHECK-NEXT: addp.2s v0, v0, v0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh16, Lloh17
diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
index b52cbfe08156b..8ab8f537398ae 100644
--- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
@@ -403,7 +403,7 @@ define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) {
; CHECK-NEXT: add.4h v0, v0, v1
; CHECK-NEXT: st1.b { v0 }[2], [x8]
; CHECK-NEXT: st1.b { v0 }[4], [x9]
-; CHECK-NEXT: st1.b { v0 }[0], [x1]
+; CHECK-NEXT: str b0, [x1]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh1
;
@@ -592,7 +592,7 @@ define void @shift_trunc_store(ptr %src, ptr %dst) {
; CHECK-NEXT: ushr.4s v0, v0, #16
; CHECK-NEXT: st1.b { v0 }[4], [x8]
; CHECK-NEXT: st1.b { v0 }[8], [x9]
-; CHECK-NEXT: st1.b { v0 }[0], [x1]
+; CHECK-NEXT: str b0, [x1]
; CHECK-NEXT: ret
;
; BE-LABEL: shift_trunc_store:
@@ -626,7 +626,7 @@ define void @shift_trunc_store_default_align(ptr %src, ptr %dst) {
; CHECK-NEXT: ushr.4s v0, v0, #16
; CHECK-NEXT: st1.b { v0 }[4], [x8]
; CHECK-NEXT: st1.b { v0 }[8], [x9]
-; CHECK-NEXT: st1.b { v0 }[0], [x1]
+; CHECK-NEXT: str b0, [x1]
; CHECK-NEXT: ret
;
; BE-LABEL: shift_trunc_store_default_align:
@@ -660,7 +660,7 @@ define void @shift_trunc_store_align_4(ptr %src, ptr %dst) {
; CHECK-NEXT: ushr.4s v0, v0, #16
; CHECK-NEXT: st1.b { v0 }[4], [x8]
; CHECK-NEXT: st1.b { v0 }[8], [x9]
-; CHECK-NEXT: st1.b { v0 }[0], [x1]
+; CHECK-NEXT: str b0, [x1]
; CHECK-NEXT: ret
;
; BE-LABEL: shift_trunc_store_align_4:
@@ -693,9 +693,8 @@ define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) {
; CHECK-NEXT: add x9, x1, #3
; CHECK-NEXT: ushr.4s v0, v0, #16
; CHECK-NEXT: st1.b { v0 }[4], [x8]
-; CHECK-NEXT: add x8, x1, #1
; CHECK-NEXT: st1.b { v0 }[8], [x9]
-; CHECK-NEXT: st1.b { v0 }[0], [x8]
+; CHECK-NEXT: str b0, [x1, #1]
; CHECK-NEXT: ret
;
; BE-LABEL: shift_trunc_store_const_offset_1:
@@ -729,9 +728,8 @@ define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) {
; CHECK-NEXT: add x9, x1, #5
; CHECK-NEXT: ushr.4s v0, v0, #16
; CHECK-NEXT: st1.b { v0 }[4], [x8]
-; CHECK-NEXT: add x8, x1, #3
; CHECK-NEXT: st1.b { v0 }[8], [x9]
-; CHECK-NEXT: st1.b { v0 }[0], [x8]
+; CHECK-NEXT: str b0, [x1, #3]
; CHECK-NEXT: ret
;
; BE-LABEL: shift_trunc_store_const_offset_3:
@@ -807,12 +805,12 @@ define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) {
; CHECK-NEXT: add x8, x0, #2
; CHECK-NEXT: orr w9, w10, w9, lsl #16
; CHECK-NEXT: fmov s0, w9
+; CHECK-NEXT: add x9, x0, #1
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: uaddw.4s v0, v1, v0
; CHECK-NEXT: st1.b { v0 }[8], [x8]
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: st1.b { v0 }[0], [x0]
-; CHECK-NEXT: st1.b { v0 }[4], [x8]
+; CHECK-NEXT: st1.b { v0 }[4], [x9]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh5
;
@@ -860,12 +858,12 @@ define void @load_v3i8_sext_to_3xi32_add_trunc_store(ptr %src) {
; CHECK-NEXT: add x8, x0, #2
; CHECK-NEXT: orr w9, w10, w9, lsl #16
; CHECK-NEXT: fmov s0, w9
+; CHECK-NEXT: add x9, x0, #1
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: uaddw.4s v0, v1, v0
; CHECK-NEXT: st1.b { v0 }[8], [x8]
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: st1.b { v0 }[0], [x0]
-; CHECK-NEXT: st1.b { v0 }[4], [x8]
+; CHECK-NEXT: st1.b { v0 }[4], [x9]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh6, Lloh7
;
diff --git a/llvm/test/CodeGen/AArch64/vector-compress.ll b/llvm/test/CodeGen/AArch64/vector-compress.ll
index 710ea70d678c5..f990bdc2e5615 100644
--- a/llvm/test/CodeGen/AArch64/vector-compress.ll
+++ b/llvm/test/CodeGen/AArch64/vector-compress.ll
@@ -109,7 +109,7 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask) {
; CHECK-NEXT: shl.16b v1, v1, #7
; CHECK-NEXT: mov x12, sp
; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: st1.b { v0 }[0], [x8]
+; CHECK-NEXT: str b0, [sp]
; CHECK-NEXT: mov x13, sp
; CHECK-NEXT: cmlt.16b v1, v1, #0
; CHECK-NEXT: umov.b w9, v1[0]
>From ef964aa5a330f29c1a90c1559213ffbddf42e06f Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 2 Apr 2025 17:01:02 +0000
Subject: [PATCH 2/3] WIP: Attempt vector truncstore
---
.../Target/AArch64/AArch64ISelLowering.cpp | 40 ++++-
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 20 ++-
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 39 +---
.../CodeGen/AArch64/aarch64-sve-ldst-one.ll | 166 ++++++++++++------
llvm/test/CodeGen/AArch64/add.ll | 12 +-
llvm/test/CodeGen/AArch64/andorxor.ll | 36 ++--
.../test/CodeGen/AArch64/arm64-collect-loh.ll | 1 -
.../AArch64/arm64-neon-simd-ldst-one.ll | 31 +++-
llvm/test/CodeGen/AArch64/arm64-st1.ll | 38 ++--
llvm/test/CodeGen/AArch64/bitcast-v2i8.ll | 17 +-
llvm/test/CodeGen/AArch64/ctlz.ll | 12 +-
llvm/test/CodeGen/AArch64/ctpop.ll | 12 +-
llvm/test/CodeGen/AArch64/cttz.ll | 12 +-
llvm/test/CodeGen/AArch64/mul.ll | 12 +-
llvm/test/CodeGen/AArch64/neon-truncstore.ll | 8 +-
llvm/test/CodeGen/AArch64/nontemporal-load.ll | 2 +-
llvm/test/CodeGen/AArch64/pr-cf624b2.ll | 60 +++----
llvm/test/CodeGen/AArch64/sadd_sat_vec.ll | 4 +-
llvm/test/CodeGen/AArch64/ssub_sat_vec.ll | 4 +-
llvm/test/CodeGen/AArch64/store.ll | 12 +-
llvm/test/CodeGen/AArch64/sub.ll | 12 +-
...-streaming-mode-fixed-length-ld2-alloca.ll | 4 +-
llvm/test/CodeGen/AArch64/trunc-to-tbl.ll | 27 +--
llvm/test/CodeGen/AArch64/uadd_sat_vec.ll | 4 +-
llvm/test/CodeGen/AArch64/usub_sat_vec.ll | 4 +-
.../vec-combine-compare-truncate-store.ll | 12 +-
.../AArch64/vec3-loads-ext-trunc-stores.ll | 156 ++++++++--------
llvm/test/CodeGen/AArch64/vec_uaddo.ll | 5 +-
llvm/test/CodeGen/AArch64/vec_umulo.ll | 7 +-
llvm/test/CodeGen/AArch64/vector-compress.ll | 155 ++++++++--------
llvm/test/CodeGen/AArch64/zext-to-tbl.ll | 29 +--
31 files changed, 530 insertions(+), 423 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5fec669da9c33..8dbc64cebef84 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1394,6 +1394,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
}
+ setTruncStoreAction(MVT::v1i64, MVT::v1i8, Legal);
+
for (auto Op :
{ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
ISD::FROUND, ISD::FROUNDEVEN, ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE,
@@ -23989,6 +23991,22 @@ static unsigned getFPSubregForVT(EVT VT) {
}
}
+static EVT get64BitVector(EVT ElVT) {
+ assert(ElVT.isSimple() && "Expected simple VT");
+ switch (ElVT.getSimpleVT().SimpleTy) {
+ case MVT::i8:
+ return MVT::v8i8;
+ case MVT::i16:
+ return MVT::v4i16;
+ case MVT::i32:
+ return MVT::v2i32;
+ case MVT::i64:
+ return MVT::v1i64;
+ default:
+ llvm_unreachable("Unexpected VT!");
+ }
+}
+
static SDValue performSTORECombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG,
@@ -24067,11 +24085,27 @@ static SDValue performSTORECombine(SDNode *N,
SDValue ExtIdx = Value.getOperand(1);
EVT VectorVT = Vector.getValueType();
EVT ElemVT = VectorVT.getVectorElementType();
- if (!ValueVT.isInteger() || ElemVT == MVT::i8 || MemVT == MVT::i8)
+ if (!ValueVT.isInteger())
return SDValue();
if (ValueVT != MemVT && !ST->isTruncatingStore())
return SDValue();
+ if (MemVT == MVT::i8) {
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+ Value.getValueType(), Vector, ExtIdx);
+ EVT VecVT64 = get64BitVector(ElemVT);
+ SDValue ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT64,
+ DAG.getUNDEF(VecVT64), Ext, Zero);
+ SDValue Cast = DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, ExtVector);
+ return DAG.getTruncStore(ST->getChain(), DL, Cast, ST->getBasePtr(),
+ MVT::v1i8, ST->getMemOperand());
+ }
+
+ // TODO: Handle storing i8s to wider types.
+ if (ElemVT == MVT::i8)
+ return SDValue();
+
// Heuristic: If there are other users of integer scalars extracted from
// this vector that won't fold into the store -- abandon folding. Applying
// this fold may extend the vector lifetime and disrupt paired stores.
@@ -28826,6 +28860,10 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
+ // Can be lowered to a bsub store in ISEL.
+ if (VT == MVT::v1i64 && MemVT == MVT::v1i8)
+ return SDValue();
+
if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
EVT TruncVT = ContainerVT.changeVectorElementType(
Store->getMemoryVT().getVectorElementType());
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 1c1ff656db910..322ac34af6dce 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4451,8 +4451,6 @@ multiclass VecStoreLane0Pat<ComplexPattern UIAddrMode, SDPatternOperator storeop
}
let AddedComplexity = 19 in {
- defm : VecStoreLane0Pat<am_indexed8, truncstorei8, v16i8, i32, vi8, bsub, uimm12s2, STRBui>;
- defm : VecStoreLane0Pat<am_indexed8, truncstorei8, v4i32, i32, vi8, bsub, uimm12s2, STRBui>;
defm : VecStoreLane0Pat<am_indexed16, truncstorei16, v8i16, i32, f16, hsub, uimm12s2, STRHui>;
defm : VecStoreLane0Pat<am_indexed16, store, v8f16, f16, f16, hsub, uimm12s2, STRHui>;
defm : VecStoreLane0Pat<am_indexed32, store, v4i32, i32, i32, ssub, uimm12s4, STRSui>;
@@ -4591,6 +4589,18 @@ def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
(STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+// v1i64 -> bsub truncating stores
+// Supporting patterns for lowering f32/f64 -> v8i8
+def : Pat<(v8i8 (vector_insert (v8i8 (undef)), (i32 FPR32:$src), 0)),
+ (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
+def : Pat<(v8i8 (vector_insert (v8i8 (undef)), (i64 FPR64:$src), 0)),
+ (v8i8 (EXTRACT_SUBREG (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub), dsub))>;
+// Lower v1i64 -> v1i8 truncstore to bsub store
+def : Pat<(truncstorevi8 v1i64:$VT, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
+ (STURBi (vi8 (EXTRACT_SUBREG v1i64:$VT, bsub)), GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(truncstorevi8 v1i64:$VT, (am_indexed8 GPR64sp:$Rn, uimm12s4:$offset)),
+ (STRBui (vi8 (EXTRACT_SUBREG v1i64:$VT, bsub)), GPR64sp:$Rn, uimm12s4:$offset)>;
+
// Match stores from lane 0 to the appropriate subreg's store.
multiclass VecStoreULane0Pat<SDPatternOperator StoreOp,
ValueType VTy, ValueType STy,
@@ -4600,7 +4610,6 @@ multiclass VecStoreULane0Pat<SDPatternOperator StoreOp,
}
let AddedComplexity = 19 in {
- defm : VecStoreULane0Pat<truncstorei8, v16i8, i32, vi8, bsub, STURBi>;
defm : VecStoreULane0Pat<truncstorei16, v8i16, i32, f16, hsub, STURHi>;
defm : VecStoreULane0Pat<store, v8f16, f16, f16, hsub, STURHi>;
defm : VecStoreULane0Pat<store, v4i32, i32, i32, ssub, STURSi>;
@@ -7242,6 +7251,11 @@ multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64, ValueType VTSVE
(INS V128:$src, imm:$Immd,
(SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn)>;
+ def : Pat<(VT64 (vector_insert (VT64 (undef)),
+ (VTScal (vector_extract (VT128 V128:$Rn), (i64 0))),
+ (i64 0))),
+ (EXTRACT_SUBREG $Rn, dsub)>;
+
def : Pat<(VT64 (vector_insert V64:$src,
(VTScal (vector_extract (VT128 V128:$Rn), (i64 imm:$Immn))),
(i64 imm:$Immd))),
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 1884a90828acb..8179b253a86de 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1827,43 +1827,6 @@ let Predicates = [HasSVE] in {
defm : adrXtwShiftPat<nxv2i64, nxv2i1, 3>;
} // End HasSVE
-multiclass SVEVecStoreLanePat<ComplexPattern UIAddrMode, SDPatternOperator storeop,
- ValueType VTy, ValueType STy,
- ValueType SubRegTy,
- SubRegIndex SubRegIdx, Operand IndexType,
- Instruction STR,
- Instruction DUP, AsmVectorIndexOpnd DUPIdxTy> {
- let Predicates = [HasSVE_or_SME] in {
- // Same as Neon VecStoreLane0Pat but without matching VecListOne128.
- def : Pat<(storeop (STy (vector_extract VTy:$Vt, (i64 0))),
- (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
- (STR (SubRegTy (EXTRACT_SUBREG $Vt, SubRegIdx)),
- GPR64sp:$Rn, IndexType:$offset)>;
- }
-
- // Non-zero immediate index:
- def : Pat<(storeop (STy (vector_extract VTy:$Vt, DUPIdxTy:$idx)),
- (UIAddrMode GPR64sp:$Rn, IndexType:$offset)),
- (STR (SubRegTy (EXTRACT_SUBREG (DUP $Vt, DUPIdxTy:$idx), SubRegIdx)),
- GPR64sp:$Rn, IndexType:$offset)>;
-}
-
-// Note: Types other than i8 are handled in performSTORECombine -- i8 is tricky
-// to handle before ISEL as it is not really a legal type in many places, nor
-// is its equivalently sized FP variant.
-let AddedComplexity = 19 in {
- // Lane 0 truncating stores
- // i32 -> i8
- defm : SVEVecStoreLanePat<am_indexed8, truncstorei8, nxv4i32, i32, vi8, bsub, uimm12s4, STRBui, DUP_ZZI_S, sve_elm_idx_extdup_s>;
- defm : SVEVecStoreLanePat<am_unscaled8, truncstorei8, nxv4i32, i32, vi8, bsub, simm9, STURBi, DUP_ZZI_S, sve_elm_idx_extdup_s>;
- // i64 -> i8
- defm : SVEVecStoreLanePat<am_indexed8, truncstorei8, nxv2i64, i64, vi8, bsub, uimm12s4, STRBui, DUP_ZZI_D, sve_elm_idx_extdup_d>;
- defm : SVEVecStoreLanePat<am_unscaled8, truncstorei8, nxv2i64, i64, vi8, bsub, simm9, STURBi, DUP_ZZI_D, sve_elm_idx_extdup_d>;
- // i8 -> i8 (technically a truncate as the extracted type is i32)
- defm : SVEVecStoreLanePat<am_indexed8, truncstorei8, nxv16i8, i32, vi8, bsub, uimm12s4, STRBui, DUP_ZZI_B, sve_elm_idx_extdup_b>;
- defm : SVEVecStoreLanePat<am_unscaled8, truncstorei8, nxv16i8, i32, vi8, bsub, simm9, STURBi, DUP_ZZI_B, sve_elm_idx_extdup_b>;
-}
-
let Predicates = [HasSVE_or_SME] in {
defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>;
@@ -3245,6 +3208,8 @@ let Predicates = [HasSVE_or_SME] in {
// Insert scalar into undef[0]
def : Pat<(nxv16i8 (vector_insert (nxv16i8 (undef)), (i32 FPR32:$src), 0)),
(INSERT_SUBREG (nxv16i8 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
+ def : Pat<(nxv16i8 (vector_insert (nxv16i8 (undef)), (i64 FPR64:$src), 0)),
+ (INSERT_SUBREG (nxv16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
def : Pat<(nxv8i16 (vector_insert (nxv8i16 (undef)), (i32 FPR32:$src), 0)),
(INSERT_SUBREG (nxv8i16 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
def : Pat<(nxv4i32 (vector_insert (nxv4i32 (undef)), (i32 FPR32:$src), 0)),
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
index b91cb872a9e0a..598aa69e30fa6 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-ldst-one.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s
-; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s
-; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 | FileCheck %s --check-prefixes=CHECK,CHECK-NONSTREAMING
+; RUN: llc < %s -verify-machineinstrs -mattr=+sme -global-isel=0 -force-streaming | FileCheck %s --check-prefixes=CHECK,STREAMING-COMPAT
+; RUN: llc < %s -verify-machineinstrs -mattr=+sve -global-isel=0 -force-streaming-compatible | FileCheck %s --check-prefixes=CHECK,STREAMING-COMPAT
target triple = "aarch64-unknown-linux-gnu"
@@ -106,12 +106,17 @@ entry:
}
define void @test_str_lane_s8(ptr %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: test_str_lane_s8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.b, z0.b[7]
-; CHECK-NEXT: str b0, [x0]
-; CHECK-NEXT: ret
-
+; CHECK-NONSTREAMING-LABEL: test_str_lane_s8:
+; CHECK-NONSTREAMING: // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT: mov v0.b[0], v0.b[7]
+; CHECK-NONSTREAMING-NEXT: str b0, [x0]
+; CHECK-NONSTREAMING-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s8:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.b, z0.b[7]
+; STREAMING-COMPAT-NEXT: str b0, [x0]
+; STREAMING-COMPAT-NEXT: ret
entry:
%0 = extractelement <vscale x 16 x i8> %b, i32 7
store i8 %0, ptr %a, align 1
@@ -119,11 +124,16 @@ entry:
}
define void @test_str_lane0_s8(ptr %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: test_str_lane0_s8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str b0, [x0]
-; CHECK-NEXT: ret
-
+; CHECK-NONSTREAMING-LABEL: test_str_lane0_s8:
+; CHECK-NONSTREAMING: // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT: mov v0.b[0], v0.b[0]
+; CHECK-NONSTREAMING-NEXT: str b0, [x0]
+; CHECK-NONSTREAMING-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s8:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: str b0, [x0]
+; STREAMING-COMPAT-NEXT: ret
entry:
%0 = extractelement <vscale x 16 x i8> %b, i32 0
store i8 %0, ptr %a, align 1
@@ -194,11 +204,18 @@ define void @test_str_reduction_i32_to_i16(ptr %ptr, <vscale x 4 x i1> %p0, <vsc
}
define void @test_str_reduction_i32_to_i8(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
-; CHECK-LABEL: test_str_reduction_i32_to_i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: str b0, [x0]
-; CHECK-NEXT: ret
+; CHECK-NONSTREAMING-LABEL: test_str_reduction_i32_to_i8:
+; CHECK-NONSTREAMING: // %bb.0:
+; CHECK-NONSTREAMING-NEXT: uaddv d0, p0, z0.s
+; CHECK-NONSTREAMING-NEXT: mov v0.d[0], v0.d[0]
+; CHECK-NONSTREAMING-NEXT: str b0, [x0]
+; CHECK-NONSTREAMING-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i8:
+; STREAMING-COMPAT: // %bb.0:
+; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT: str b0, [x0]
+; STREAMING-COMPAT-NEXT: ret
%reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
%trunc = trunc i64 %reduce to i8
@@ -248,11 +265,18 @@ define void @test_str_reduction_i32_to_i16_negative_offset(ptr %ptr, <vscale x 4
}
define void @test_str_reduction_i32_to_i8_negative_offset(ptr %ptr, <vscale x 4 x i1> %p0, <vscale x 4 x i32> %v) {
-; CHECK-LABEL: test_str_reduction_i32_to_i8_negative_offset:
-; CHECK: // %bb.0:
-; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: stur b0, [x0, #-8]
-; CHECK-NEXT: ret
+; CHECK-NONSTREAMING-LABEL: test_str_reduction_i32_to_i8_negative_offset:
+; CHECK-NONSTREAMING: // %bb.0:
+; CHECK-NONSTREAMING-NEXT: uaddv d0, p0, z0.s
+; CHECK-NONSTREAMING-NEXT: mov v0.d[0], v0.d[0]
+; CHECK-NONSTREAMING-NEXT: stur b0, [x0, #-8]
+; CHECK-NONSTREAMING-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_reduction_i32_to_i8_negative_offset:
+; STREAMING-COMPAT: // %bb.0:
+; STREAMING-COMPAT-NEXT: uaddv d0, p0, z0.s
+; STREAMING-COMPAT-NEXT: stur b0, [x0, #-8]
+; STREAMING-COMPAT-NEXT: ret
%reduce = tail call i64 @llvm.aarch64.sve.uaddv.nxv4i32(<vscale x 4 x i1> %p0, <vscale x 4 x i32> %v)
%trunc = trunc i64 %reduce to i8
@@ -316,12 +340,17 @@ entry:
}
define void @test_str_lane_s8_negative_offset(ptr %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: test_str_lane_s8_negative_offset:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.b, z0.b[7]
-; CHECK-NEXT: stur b0, [x0, #-8]
-; CHECK-NEXT: ret
-
+; CHECK-NONSTREAMING-LABEL: test_str_lane_s8_negative_offset:
+; CHECK-NONSTREAMING: // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT: mov v0.b[0], v0.b[7]
+; CHECK-NONSTREAMING-NEXT: stur b0, [x0, #-8]
+; CHECK-NONSTREAMING-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane_s8_negative_offset:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.b, z0.b[7]
+; STREAMING-COMPAT-NEXT: stur b0, [x0, #-8]
+; STREAMING-COMPAT-NEXT: ret
entry:
%0 = extractelement <vscale x 16 x i8> %b, i32 7
%out_ptr = getelementptr inbounds i8, ptr %a, i64 -8
@@ -330,11 +359,16 @@ entry:
}
define void @test_str_lane0_s8_negative_offset(ptr %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: test_str_lane0_s8_negative_offset:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stur b0, [x0, #-8]
-; CHECK-NEXT: ret
-
+; CHECK-NONSTREAMING-LABEL: test_str_lane0_s8_negative_offset:
+; CHECK-NONSTREAMING: // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT: mov v0.b[0], v0.b[0]
+; CHECK-NONSTREAMING-NEXT: stur b0, [x0, #-8]
+; CHECK-NONSTREAMING-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_lane0_s8_negative_offset:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: stur b0, [x0, #-8]
+; STREAMING-COMPAT-NEXT: ret
entry:
%0 = extractelement <vscale x 16 x i8> %b, i32 0
%out_ptr = getelementptr inbounds i8, ptr %a, i64 -8
@@ -398,12 +432,17 @@ entry:
define void @test_str_trunc_lane_s32_to_s8(ptr %a, <vscale x 4 x i32> %b) {
-; CHECK-LABEL: test_str_trunc_lane_s32_to_s8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.s, z0.s[3]
-; CHECK-NEXT: str b0, [x0]
-; CHECK-NEXT: ret
-
+; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s8:
+; CHECK-NONSTREAMING: // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT: mov v0.s[0], v0.s[3]
+; CHECK-NONSTREAMING-NEXT: str b0, [x0]
+; CHECK-NONSTREAMING-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s8:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3]
+; STREAMING-COMPAT-NEXT: str b0, [x0]
+; STREAMING-COMPAT-NEXT: ret
entry:
%0 = extractelement <vscale x 4 x i32> %b, i32 3
%trunc = trunc i32 %0 to i8
@@ -412,10 +451,16 @@ entry:
}
define void @test_str_trunc_lane0_s32_to_s8(ptr %a, <vscale x 4 x i32> %b) {
-; CHECK-LABEL: test_str_trunc_lane0_s32_to_s8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str b0, [x0]
-; CHECK-NEXT: ret
+; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane0_s32_to_s8:
+; CHECK-NONSTREAMING: // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT: mov v0.s[0], v0.s[0]
+; CHECK-NONSTREAMING-NEXT: str b0, [x0]
+; CHECK-NONSTREAMING-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_trunc_lane0_s32_to_s8:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: str b0, [x0]
+; STREAMING-COMPAT-NEXT: ret
entry:
%0 = extractelement <vscale x 4 x i32> %b, i32 0
@@ -468,12 +513,17 @@ entry:
}
define void @test_str_trunc_lane_s32_to_s8_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
-; CHECK-LABEL: test_str_trunc_lane_s32_to_s8_negative_offset:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.s, z0.s[3]
-; CHECK-NEXT: stur b0, [x0, #-8]
-; CHECK-NEXT: ret
-
+; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane_s32_to_s8_negative_offset:
+; CHECK-NONSTREAMING: // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT: mov v0.s[0], v0.s[3]
+; CHECK-NONSTREAMING-NEXT: stur b0, [x0, #-8]
+; CHECK-NONSTREAMING-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_trunc_lane_s32_to_s8_negative_offset:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: mov z0.s, z0.s[3]
+; STREAMING-COMPAT-NEXT: stur b0, [x0, #-8]
+; STREAMING-COMPAT-NEXT: ret
entry:
%0 = extractelement <vscale x 4 x i32> %b, i32 3
%trunc = trunc i32 %0 to i8
@@ -483,10 +533,16 @@ entry:
}
define void @test_str_trunc_lane0_s32_to_s8_negative_offset(ptr %a, <vscale x 4 x i32> %b) {
-; CHECK-LABEL: test_str_trunc_lane0_s32_to_s8_negative_offset:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stur b0, [x0, #-8]
-; CHECK-NEXT: ret
+; CHECK-NONSTREAMING-LABEL: test_str_trunc_lane0_s32_to_s8_negative_offset:
+; CHECK-NONSTREAMING: // %bb.0: // %entry
+; CHECK-NONSTREAMING-NEXT: mov v0.s[0], v0.s[0]
+; CHECK-NONSTREAMING-NEXT: stur b0, [x0, #-8]
+; CHECK-NONSTREAMING-NEXT: ret
+;
+; STREAMING-COMPAT-LABEL: test_str_trunc_lane0_s32_to_s8_negative_offset:
+; STREAMING-COMPAT: // %bb.0: // %entry
+; STREAMING-COMPAT-NEXT: stur b0, [x0, #-8]
+; STREAMING-COMPAT-NEXT: ret
entry:
%0 = extractelement <vscale x 4 x i32> %b, i32 0
diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll
index cdde359d09d7b..ea5dbc03ca174 100644
--- a/llvm/test/CodeGen/AArch64/add.ll
+++ b/llvm/test/CodeGen/AArch64/add.ll
@@ -63,9 +63,9 @@ define void @v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: mov v1.s[0], v0.s[1]
; CHECK-SD-NEXT: str b0, [x0]
-; CHECK-SD-NEXT: strb w8, [x0, #1]
+; CHECK-SD-NEXT: stur b1, [x0, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i8:
@@ -100,11 +100,11 @@ define void @v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b
; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: umov w8, v0.h[2]
+; CHECK-SD-NEXT: mov v0.h[0], v0.h[2]
; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w9, [sp, #12]
-; CHECK-SD-NEXT: strb w8, [x0, #2]
-; CHECK-SD-NEXT: strh w9, [x0]
+; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: stur b0, [x0, #2]
+; CHECK-SD-NEXT: strh w8, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll
index 03c7bad9efc22..adfcf26f85ba4 100644
--- a/llvm/test/CodeGen/AArch64/andorxor.ll
+++ b/llvm/test/CodeGen/AArch64/andorxor.ll
@@ -183,9 +183,9 @@ define void @and_v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: mov v1.s[0], v0.s[1]
; CHECK-SD-NEXT: str b0, [x0]
-; CHECK-SD-NEXT: strb w8, [x0, #1]
+; CHECK-SD-NEXT: stur b1, [x0, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: and_v2i8:
@@ -219,9 +219,9 @@ define void @or_v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: mov v1.s[0], v0.s[1]
; CHECK-SD-NEXT: str b0, [x0]
-; CHECK-SD-NEXT: strb w8, [x0, #1]
+; CHECK-SD-NEXT: stur b1, [x0, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: or_v2i8:
@@ -255,9 +255,9 @@ define void @xor_v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b
-; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: mov v1.s[0], v0.s[1]
; CHECK-SD-NEXT: str b0, [x0]
-; CHECK-SD-NEXT: strb w8, [x0, #1]
+; CHECK-SD-NEXT: stur b1, [x0, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: xor_v2i8:
@@ -292,11 +292,11 @@ define void @and_v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: umov w8, v0.h[2]
+; CHECK-SD-NEXT: mov v0.h[0], v0.h[2]
; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w9, [sp, #12]
-; CHECK-SD-NEXT: strb w8, [x0, #2]
-; CHECK-SD-NEXT: strh w9, [x0]
+; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: stur b0, [x0, #2]
+; CHECK-SD-NEXT: strh w8, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -340,11 +340,11 @@ define void @or_v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b
; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: umov w8, v0.h[2]
+; CHECK-SD-NEXT: mov v0.h[0], v0.h[2]
; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w9, [sp, #12]
-; CHECK-SD-NEXT: strb w8, [x0, #2]
-; CHECK-SD-NEXT: strh w9, [x0]
+; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: stur b0, [x0, #2]
+; CHECK-SD-NEXT: strh w8, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
@@ -388,11 +388,11 @@ define void @xor_v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b
; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: umov w8, v0.h[2]
+; CHECK-SD-NEXT: mov v0.h[0], v0.h[2]
; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w9, [sp, #12]
-; CHECK-SD-NEXT: strb w8, [x0, #2]
-; CHECK-SD-NEXT: strh w9, [x0]
+; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: stur b0, [x0, #2]
+; CHECK-SD-NEXT: strh w8, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
index 7f2bebf584d8f..246fbbdb80715 100644
--- a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
@@ -612,7 +612,6 @@ define <1 x i8> @getL() {
; CHECK-LABEL: _setL
; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _L at GOTPAGE
-; CHECK-NEXT: ; kill
; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _L at GOTPAGEOFF]
; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]:
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll b/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
index 7d87be0ce8e1c..2ad567a79d6a5 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
@@ -483,10 +483,16 @@ entry:
}
define void @test_vst1q_lane_s8(ptr %a, <16 x i8> %b) {
-; CHECK-LABEL: test_vst1q_lane_s8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: st1 { v0.b }[15], [x0]
-; CHECK-NEXT: ret
+; CHECK-GI-LABEL: test_vst1q_lane_s8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov v0.b[0], v0.b[15]
+; CHECK-GI-NEXT: str b0, [x0]
+; CHECK-GI-NEXT: ret
+;
+; CHECK-SD-LABEL: test_vst1q_lane_s8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: st1 { v0.b }[15], [x0]
+; CHECK-SD-NEXT: ret
entry:
%0 = extractelement <16 x i8> %b, i32 15
store i8 %0, ptr %a, align 1
@@ -604,11 +610,18 @@ entry:
}
define void @test_vst1_lane_s8(ptr %a, <8 x i8> %b) {
-; CHECK-LABEL: test_vst1_lane_s8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: st1 { v0.b }[7], [x0]
-; CHECK-NEXT: ret
+; CHECK-GI-LABEL: test_vst1_lane_s8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov v0.b[0], v0.b[7]
+; CHECK-GI-NEXT: str b0, [x0]
+; CHECK-GI-NEXT: ret
+;
+; CHECK-SD-LABEL: test_vst1_lane_s8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: st1 { v0.b }[7], [x0]
+; CHECK-SD-NEXT: ret
entry:
%0 = extractelement <8 x i8> %b, i32 7
store i8 %0, ptr %a, align 1
diff --git a/llvm/test/CodeGen/AArch64/arm64-st1.ll b/llvm/test/CodeGen/AArch64/arm64-st1.ll
index c63d66c4e7706..259d26590527c 100644
--- a/llvm/test/CodeGen/AArch64/arm64-st1.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-st1.ll
@@ -4,8 +4,12 @@
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs -mcpu=exynos-m3 | FileCheck --check-prefix=EXYNOS %s
define void @st1lane_16b(<16 x i8> %A, ptr %D) {
-; CHECK-LABEL: st1lane_16b
-; CHECK: st1.b { v0 }[1], [x{{[0-9]+}}]
+; SD-CHECK-LABEL: st1lane_16b
+; SD-CHECK: mov.b v0[0], v0[1]
+; SD-CHECK: stur b0, [x{{[0-9]+}}, #1]
+
+; GI-CHECK-LABEL: st1lane_16b
+; GI-CHECK: st1.b { v0 }[1], [x{{[0-9]+}}]
%ptr = getelementptr i8, ptr %D, i64 1
%tmp = extractelement <16 x i8> %A, i32 1
store i8 %tmp, ptr %ptr
@@ -14,7 +18,7 @@ define void @st1lane_16b(<16 x i8> %A, ptr %D) {
define void @st1lane0_16b(<16 x i8> %A, ptr %D) {
; SD-CHECK-LABEL: st1lane0_16b
-; SD-CHECK: str b0, [x{{[0-9]+}}, #1]
+; SD-CHECK: stur b0, [x{{[0-9]+}}, #1]
; GI-CHECK-LABEL: st1lane0_16b
; GI-CHECK: st1.b { v0 }[0], [x{{[0-9]+}}]
@@ -37,9 +41,13 @@ define void @st1lane0u_16b(<16 x i8> %A, ptr %D) {
}
define void @st1lane_ro_16b(<16 x i8> %A, ptr %D, i64 %offset) {
-; CHECK-LABEL: st1lane_ro_16b
-; CHECK: add x[[XREG:[0-9]+]], x0, x1
-; CHECK: st1.b { v0 }[1], [x[[XREG]]]
+; SD-CHECK-LABEL: st1lane_ro_16b
+; SD-CHECK: mov.b v0[0], v0[1]
+; SD-CHECK: str b0, [x0, x1]
+
+; GI-CHECK-LABEL: st1lane_ro_16b
+; GI-CHECK: add x[[XREG:[0-9]+]], x0, x1
+; GI-CHECK: st1.b { v0 }[1], [x[[XREG]]]
%ptr = getelementptr i8, ptr %D, i64 %offset
%tmp = extractelement <16 x i8> %A, i32 1
store i8 %tmp, ptr %ptr
@@ -290,8 +298,12 @@ define void @st1lane0_ro_2d_double(<2 x double> %A, ptr %D, i64 %offset) {
}
define void @st1lane_8b(<8 x i8> %A, ptr %D) {
-; CHECK-LABEL: st1lane_8b
-; CHECK: st1.b { v0 }[1], [x{{[0-9]+}}]
+; SD-CHECK-LABEL: st1lane_8b
+; SD-CHECK: mov.b v0[0], v0[1]
+; SD-CHECK: stur b0, [x0, #1]
+
+; GI-CHECK-LABEL: st1lane_8b
+; GI-CHECK: st1.b { v0 }[1], [x{{[0-9]+}}]
%ptr = getelementptr i8, ptr %D, i64 1
%tmp = extractelement <8 x i8> %A, i32 1
store i8 %tmp, ptr %ptr
@@ -299,9 +311,13 @@ define void @st1lane_8b(<8 x i8> %A, ptr %D) {
}
define void @st1lane_ro_8b(<8 x i8> %A, ptr %D, i64 %offset) {
-; CHECK-LABEL: st1lane_ro_8b
-; CHECK: add x[[XREG:[0-9]+]], x0, x1
-; CHECK: st1.b { v0 }[1], [x[[XREG]]]
+; SD-CHECK-LABEL: st1lane_ro_8b
+; SD-CHECK: mov.b v0[0], v0[1]
+; SD-CHECK: str b0, [x0, x1]
+
+; GI-CHECK-LABEL: st1lane_ro_8b
+; GI-CHECK: add x[[XREG:[0-9]+]], x0, x1
+; GI-CHECK: st1.b { v0 }[1], [x[[XREG]]]
%ptr = getelementptr i8, ptr %D, i64 %offset
%tmp = extractelement <8 x i8> %A, i32 1
store i8 %tmp, ptr %ptr
diff --git a/llvm/test/CodeGen/AArch64/bitcast-v2i8.ll b/llvm/test/CodeGen/AArch64/bitcast-v2i8.ll
index 77304aef4385e..05f66e4b03ed2 100644
--- a/llvm/test/CodeGen/AArch64/bitcast-v2i8.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast-v2i8.ll
@@ -1,13 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=aarch64-apple-ios -disable-post-ra | FileCheck %s
; Part of PR21549: going through the stack isn't ideal but is correct.
define i16 @test_bitcast_v2i8_to_i16(<2 x i8> %a) {
-; CHECK-LABEL: test_bitcast_v2i8_to_i16
-; CHECK: mov.s [[WREG_HI:w[0-9]+]], v0[1]
-; CHECK-NEXT: strb [[WREG_HI]], [sp, #15]
-; CHECK-NEXT: str [[WREG_LO:b[0-9]+]], [sp, #14]
-; CHECK-NEXT: ldrh w0, [sp, #14]
+; CHECK-LABEL: test_bitcast_v2i8_to_i16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov.s v1[0], v0[1]
+; CHECK-NEXT: str b0, [sp, #14]
+; CHECK-NEXT: stur b1, [sp, #15]
+; CHECK-NEXT: ldrh w0, [sp, #14]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
%aa = bitcast <2 x i8> %a to i16
ret i16 %aa
diff --git a/llvm/test/CodeGen/AArch64/ctlz.ll b/llvm/test/CodeGen/AArch64/ctlz.ll
index 79676efebe776..f795050e568e6 100644
--- a/llvm/test/CodeGen/AArch64/ctlz.ll
+++ b/llvm/test/CodeGen/AArch64/ctlz.ll
@@ -13,9 +13,9 @@ define void @v2i8(ptr %p1) {
; CHECK-SD-NEXT: mov v1.s[1], w9
; CHECK-SD-NEXT: clz v1.2s, v1.2s
; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: mov v1.s[0], v0.s[1]
; CHECK-SD-NEXT: str b0, [x0]
-; CHECK-SD-NEXT: strb w8, [x0, #1]
+; CHECK-SD-NEXT: stur b1, [x0, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i8:
@@ -48,11 +48,11 @@ define void @v3i8(ptr %p1) {
; CHECK-SD-NEXT: clz v1.4h, v1.4h
; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: umov w8, v0.h[2]
+; CHECK-SD-NEXT: mov v0.h[0], v0.h[2]
; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w9, [sp, #12]
-; CHECK-SD-NEXT: strb w8, [x0, #2]
-; CHECK-SD-NEXT: strh w9, [x0]
+; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: stur b0, [x0, #2]
+; CHECK-SD-NEXT: strh w8, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll
index 767b9d28d6215..d9cbac7a4c691 100644
--- a/llvm/test/CodeGen/AArch64/ctpop.ll
+++ b/llvm/test/CodeGen/AArch64/ctpop.ll
@@ -13,9 +13,9 @@ define void @v2i8(ptr %p1) {
; CHECK-SD-NEXT: cnt v0.8b, v0.8b
; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b
; CHECK-SD-NEXT: uaddlp v0.2s, v0.4h
-; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: mov v1.s[0], v0.s[1]
; CHECK-SD-NEXT: str b0, [x0]
-; CHECK-SD-NEXT: strb w8, [x0, #1]
+; CHECK-SD-NEXT: stur b1, [x0, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i8:
@@ -47,11 +47,11 @@ define void @v3i8(ptr %p1) {
; CHECK-SD-NEXT: cnt v0.8b, v0.8b
; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: umov w8, v0.h[2]
+; CHECK-SD-NEXT: mov v0.h[0], v0.h[2]
; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w9, [sp, #12]
-; CHECK-SD-NEXT: strb w8, [x0, #2]
-; CHECK-SD-NEXT: strh w9, [x0]
+; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: stur b0, [x0, #2]
+; CHECK-SD-NEXT: strh w8, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/cttz.ll b/llvm/test/CodeGen/AArch64/cttz.ll
index 97f5a29064c67..1d9af77eb4a05 100644
--- a/llvm/test/CodeGen/AArch64/cttz.ll
+++ b/llvm/test/CodeGen/AArch64/cttz.ll
@@ -16,9 +16,9 @@ define void @v2i8(ptr %p1) {
; CHECK-SD-NEXT: movi v1.2s, #32
; CHECK-SD-NEXT: clz v0.2s, v0.2s
; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: mov v1.s[0], v0.s[1]
; CHECK-SD-NEXT: str b0, [x0]
-; CHECK-SD-NEXT: strb w8, [x0, #1]
+; CHECK-SD-NEXT: stur b1, [x0, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i8:
@@ -58,11 +58,11 @@ define void @v3i8(ptr %p1) {
; CHECK-SD-NEXT: clz v0.4h, v0.4h
; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: umov w8, v0.h[2]
+; CHECK-SD-NEXT: mov v0.h[0], v0.h[2]
; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w9, [sp, #12]
-; CHECK-SD-NEXT: strb w8, [x0, #2]
-; CHECK-SD-NEXT: strh w9, [x0]
+; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: stur b0, [x0, #2]
+; CHECK-SD-NEXT: strh w8, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll
index 0d7a6a7dbcb11..0270083ad1d06 100644
--- a/llvm/test/CodeGen/AArch64/mul.ll
+++ b/llvm/test/CodeGen/AArch64/mul.ll
@@ -75,9 +75,9 @@ define void @v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: mov v1.s[0], v0.s[1]
; CHECK-SD-NEXT: str b0, [x0]
-; CHECK-SD-NEXT: strb w8, [x0, #1]
+; CHECK-SD-NEXT: stur b1, [x0, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i8:
@@ -112,11 +112,11 @@ define void @v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b
; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.4h
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: umov w8, v0.h[2]
+; CHECK-SD-NEXT: mov v0.h[0], v0.h[2]
; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w9, [sp, #12]
-; CHECK-SD-NEXT: strb w8, [x0, #2]
-; CHECK-SD-NEXT: strh w9, [x0]
+; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: stur b0, [x0, #2]
+; CHECK-SD-NEXT: strh w8, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/neon-truncstore.ll b/llvm/test/CodeGen/AArch64/neon-truncstore.ll
index a070e3d7565ed..c501faa1c567a 100644
--- a/llvm/test/CodeGen/AArch64/neon-truncstore.ll
+++ b/llvm/test/CodeGen/AArch64/neon-truncstore.ll
@@ -89,9 +89,9 @@ define void @v2i32_v2i8(<2 x i32> %a, ptr %result) {
; CHECK-LABEL: v2i32_v2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: mov v1.s[0], v0.s[1]
; CHECK-NEXT: str b0, [x0]
-; CHECK-NEXT: strb w8, [x0, #1]
+; CHECK-NEXT: stur b1, [x0, #1]
; CHECK-NEXT: ret
%b = trunc <2 x i32> %a to <2 x i8>
store <2 x i8> %b, ptr %result
@@ -155,9 +155,9 @@ define void @v2i16_v2i8(<2 x i16> %a, ptr %result) {
; CHECK-LABEL: v2i16_v2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: mov v1.s[0], v0.s[1]
; CHECK-NEXT: str b0, [x0]
-; CHECK-NEXT: strb w8, [x0, #1]
+; CHECK-NEXT: stur b1, [x0, #1]
; CHECK-NEXT: ret
%b = trunc <2 x i16> %a to <2 x i8>
store <2 x i8> %b, ptr %result
diff --git a/llvm/test/CodeGen/AArch64/nontemporal-load.ll b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
index 28cff55beff9e..adb209c0c6348 100644
--- a/llvm/test/CodeGen/AArch64/nontemporal-load.ll
+++ b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
@@ -451,7 +451,7 @@ define <33 x i8> @test_ldnp_v33i8(ptr %A) {
; CHECK-NEXT: ldnp q0, q1, [x0]
; CHECK-NEXT: ldr b2, [x0, #32]
; CHECK-NEXT: stp q0, q1, [x8]
-; CHECK-NEXT: str b2, [x8, #32]
+; CHECK-NEXT: stur b2, [x8, #32]
; CHECK-NEXT: ret
;
; CHECK-BE-LABEL: test_ldnp_v33i8:
diff --git a/llvm/test/CodeGen/AArch64/pr-cf624b2.ll b/llvm/test/CodeGen/AArch64/pr-cf624b2.ll
index 0b0540e559abd..f17570837515c 100644
--- a/llvm/test/CodeGen/AArch64/pr-cf624b2.ll
+++ b/llvm/test/CodeGen/AArch64/pr-cf624b2.ll
@@ -11,45 +11,31 @@ define linkonce_odr void @_ZN1y2beEPiRK1vPmPS1_(<8 x i8> %0, ptr %agg.tmp.i) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: mov v1.b[0], v0.b[7]
+; CHECK-NEXT: mov v2.b[0], v0.b[6]
+; CHECK-NEXT: stur b0, [sp, #15]
+; CHECK-NEXT: stur b0, [sp, #14]
+; CHECK-NEXT: stur b0, [sp, #13]
+; CHECK-NEXT: stur b0, [sp, #12]
+; CHECK-NEXT: stur b1, [sp, #7]
+; CHECK-NEXT: mov v1.b[0], v0.b[5]
+; CHECK-NEXT: stur b2, [sp, #6]
+; CHECK-NEXT: mov v2.b[0], v0.b[4]
+; CHECK-NEXT: stur b0, [sp, #11]
+; CHECK-NEXT: stur b0, [sp, #10]
+; CHECK-NEXT: stur b1, [sp, #5]
+; CHECK-NEXT: mov v1.b[0], v0.b[3]
+; CHECK-NEXT: stur b0, [sp, #9]
+; CHECK-NEXT: stur b2, [sp, #4]
+; CHECK-NEXT: mov v2.b[0], v0.b[2]
; CHECK-NEXT: str b0, [sp]
-; CHECK-NEXT: orr x9, x8, #0xf
-; CHECK-NEXT: orr x10, x8, #0xe
-; CHECK-NEXT: st1 { v0.b }[15], [x9]
-; CHECK-NEXT: orr x9, x8, #0xc
-; CHECK-NEXT: st1 { v0.b }[12], [x9]
-; CHECK-NEXT: orr x9, x8, #0x8
-; CHECK-NEXT: st1 { v0.b }[8], [x9]
-; CHECK-NEXT: orr x9, x8, #0x7
-; CHECK-NEXT: st1 { v0.b }[7], [x9]
-; CHECK-NEXT: orr x9, x8, #0x6
-; CHECK-NEXT: st1 { v0.b }[6], [x9]
-; CHECK-NEXT: orr x9, x8, #0x4
-; CHECK-NEXT: st1 { v0.b }[4], [x9]
-; CHECK-NEXT: orr x9, x8, #0x3
-; CHECK-NEXT: st1 { v0.b }[3], [x9]
-; CHECK-NEXT: orr x9, x8, #0x2
-; CHECK-NEXT: st1 { v0.b }[14], [x10]
-; CHECK-NEXT: mov w10, #13 // =0xd
-; CHECK-NEXT: st1 { v0.b }[2], [x9]
-; CHECK-NEXT: orr x9, x8, #0x1
-; CHECK-NEXT: st1 { v0.b }[1], [x9]
-; CHECK-NEXT: orr x9, x8, x10
-; CHECK-NEXT: mov w10, #11 // =0xb
-; CHECK-NEXT: st1 { v0.b }[13], [x9]
-; CHECK-NEXT: orr x9, x8, x10
-; CHECK-NEXT: mov w10, #10 // =0xa
-; CHECK-NEXT: st1 { v0.b }[11], [x9]
-; CHECK-NEXT: orr x9, x8, x10
-; CHECK-NEXT: mov w10, #9 // =0x9
-; CHECK-NEXT: st1 { v0.b }[10], [x9]
-; CHECK-NEXT: orr x9, x8, x10
-; CHECK-NEXT: mov w10, #5 // =0x5
-; CHECK-NEXT: orr x8, x8, x10
-; CHECK-NEXT: st1 { v0.b }[9], [x9]
-; CHECK-NEXT: st1 { v0.b }[5], [x8]
+; CHECK-NEXT: mov v0.b[0], v0.b[1]
+; CHECK-NEXT: stur b1, [sp, #3]
+; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: stur b2, [sp, #2]
+; CHECK-NEXT: stur b0, [sp, #8]
+; CHECK-NEXT: stur b0, [sp, #1]
; CHECK-NEXT: ldr q0, [sp]
; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: add sp, sp, #16
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
index cbb3b06030bae..18457d2b27781 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -200,9 +200,9 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24
; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: ushr v0.2s, v0.2s, #24
-; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: mov v1.s[0], v0.s[1]
; CHECK-SD-NEXT: str b0, [x2]
-; CHECK-SD-NEXT: strb w8, [x2, #1]
+; CHECK-SD-NEXT: stur b1, [x2, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i8:
diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
index 04b379f455008..257d2a1c1ebda 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -201,9 +201,9 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24
; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: ushr v0.2s, v0.2s, #24
-; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: mov v1.s[0], v0.s[1]
; CHECK-SD-NEXT: str b0, [x2]
-; CHECK-SD-NEXT: strb w8, [x2, #1]
+; CHECK-SD-NEXT: stur b1, [x2, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i8:
diff --git a/llvm/test/CodeGen/AArch64/store.ll b/llvm/test/CodeGen/AArch64/store.ll
index 7ea957d9d165d..296b860be2a76 100644
--- a/llvm/test/CodeGen/AArch64/store.ll
+++ b/llvm/test/CodeGen/AArch64/store.ll
@@ -110,9 +110,9 @@ define void @store_v2i8(<2 x i8> %a, ptr %ptr){
; CHECK-SD-LABEL: store_v2i8:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: mov v1.s[0], v0.s[1]
; CHECK-SD-NEXT: str b0, [x0]
-; CHECK-SD-NEXT: strb w8, [x0, #1]
+; CHECK-SD-NEXT: stur b1, [x0, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: store_v2i8:
@@ -230,12 +230,12 @@ define void @store_v3i8(<3 x i8> %a, ptr %ptr){
define void @store_v7i8(<7 x i8> %a, ptr %ptr){
; CHECK-SD-LABEL: store_v7i8:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: add x8, x0, #6
-; CHECK-SD-NEXT: add x9, x0, #4
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: mov v1.b[0], v0.b[6]
+; CHECK-SD-NEXT: add x8, x0, #4
; CHECK-SD-NEXT: str s0, [x0]
-; CHECK-SD-NEXT: st1 { v0.b }[6], [x8]
-; CHECK-SD-NEXT: st1 { v0.h }[2], [x9]
+; CHECK-SD-NEXT: st1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT: stur b1, [x0, #6]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: store_v7i8:
diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll
index 91a17a89af6e1..c3cc6169f3969 100644
--- a/llvm/test/CodeGen/AArch64/sub.ll
+++ b/llvm/test/CodeGen/AArch64/sub.ll
@@ -63,9 +63,9 @@ define void @v2i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: mov v1.s[0], v0.s[1]
; CHECK-SD-NEXT: str b0, [x0]
-; CHECK-SD-NEXT: strb w8, [x0, #1]
+; CHECK-SD-NEXT: stur b1, [x0, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i8:
@@ -100,11 +100,11 @@ define void @v3i8(ptr %p1, ptr %p2) {
; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v0.8b
; CHECK-SD-NEXT: sub v0.4h, v0.4h, v1.4h
; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; CHECK-SD-NEXT: umov w8, v0.h[2]
+; CHECK-SD-NEXT: mov v0.h[0], v0.h[2]
; CHECK-SD-NEXT: str s1, [sp, #12]
-; CHECK-SD-NEXT: ldrh w9, [sp, #12]
-; CHECK-SD-NEXT: strb w8, [x0, #2]
-; CHECK-SD-NEXT: strh w9, [x0]
+; CHECK-SD-NEXT: ldrh w8, [sp, #12]
+; CHECK-SD-NEXT: stur b0, [x0, #2]
+; CHECK-SD-NEXT: strh w8, [x0]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
index 89a06bc9d5b4e..27aa5019fb259 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -73,7 +73,7 @@ define void @alloc_v6i8(ptr %st_ptr) nounwind {
; CHECK-NEXT: zip1 z1.s, z1.s, z0.s
; CHECK-NEXT: st1b { z1.h }, p0, [x8]
; CHECK-NEXT: ld1h { z1.s }, p1/z, [x8]
-; CHECK-NEXT: str b0, [x19, #2]
+; CHECK-NEXT: stur b0, [x19, #2]
; CHECK-NEXT: str h1, [x19]
; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #32
@@ -122,7 +122,7 @@ define void @alloc_v32i8(ptr %st_ptr) nounwind {
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
; CHECK-NEXT: tbl z0.b, { z0.b }, z1.b
; CHECK-NEXT: ldr q1, [sp, #16]
-; CHECK-NEXT: str b1, [x19, #8]
+; CHECK-NEXT: stur b1, [x19, #8]
; CHECK-NEXT: str d0, [x19]
; CHECK-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #48
diff --git a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
index 184e8fff154b9..f2389b3e94846 100644
--- a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
@@ -706,7 +706,7 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) {
; CHECK-NEXT: LBB6_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldp q4, q0, [x0, #48]
-; CHECK-NEXT: add x9, x1, #10
+; CHECK-NEXT: add x9, x1, #8
; CHECK-NEXT: ldr d1, [x0, #80]
; CHECK-NEXT: ldp q3, q2, [x0]
; CHECK-NEXT: ldr q5, [x0, #32]
@@ -719,10 +719,11 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) {
; CHECK-NEXT: uzp1.8h v1, v2, v1
; CHECK-NEXT: uzp1.8b v2, v0, v0
; CHECK-NEXT: uzp1.16b v0, v1, v0
-; CHECK-NEXT: st1.b { v2 }[2], [x9]
-; CHECK-NEXT: add x9, x1, #8
+; CHECK-NEXT: mov.b v1[0], v2[2]
+; CHECK-NEXT: str d0, [x1]
; CHECK-NEXT: st1.h { v0 }[4], [x9]
-; CHECK-NEXT: str d0, [x1], #16
+; CHECK-NEXT: stur b1, [x1, #10]
+; CHECK-NEXT: add x1, x1, #16
; CHECK-NEXT: b.eq LBB6_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
@@ -742,7 +743,7 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) {
; CHECK-BE-NEXT: ld1 { v2.2d }, [x9]
; CHECK-BE-NEXT: ldr d5, [x0, #80]
; CHECK-BE-NEXT: ld1 { v4.2d }, [x10]
-; CHECK-BE-NEXT: add x9, x1, #10
+; CHECK-BE-NEXT: add x9, x1, #8
; CHECK-BE-NEXT: subs x8, x8, #1
; CHECK-BE-NEXT: uzp1 v1.4s, v3.4s, v1.4s
; CHECK-BE-NEXT: uzp1 v0.4s, v0.4s, v5.4s
@@ -754,10 +755,11 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) {
; CHECK-BE-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-BE-NEXT: rev16 v2.16b, v1.16b
; CHECK-BE-NEXT: rev64 v1.16b, v1.16b
-; CHECK-BE-NEXT: st1 { v0.b }[2], [x9]
-; CHECK-BE-NEXT: add x9, x1, #8
+; CHECK-BE-NEXT: mov v0.b[0], v0.b[2]
+; CHECK-BE-NEXT: str d1, [x1]
+; CHECK-BE-NEXT: stur b0, [x1, #10]
+; CHECK-BE-NEXT: add x1, x1, #16
; CHECK-BE-NEXT: st1 { v2.h }[4], [x9]
-; CHECK-BE-NEXT: str d1, [x1], #16
; CHECK-BE-NEXT: b.eq .LBB6_1
; CHECK-BE-NEXT: // %bb.2: // %exit
; CHECK-BE-NEXT: ret
@@ -777,7 +779,7 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) {
; CHECK-DISABLE-NEXT: ld1 { v2.2d }, [x9]
; CHECK-DISABLE-NEXT: ldr d5, [x0, #80]
; CHECK-DISABLE-NEXT: ld1 { v4.2d }, [x10]
-; CHECK-DISABLE-NEXT: add x9, x1, #10
+; CHECK-DISABLE-NEXT: add x9, x1, #8
; CHECK-DISABLE-NEXT: subs x8, x8, #1
; CHECK-DISABLE-NEXT: uzp1 v1.4s, v3.4s, v1.4s
; CHECK-DISABLE-NEXT: uzp1 v0.4s, v0.4s, v5.4s
@@ -789,10 +791,11 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) {
; CHECK-DISABLE-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-DISABLE-NEXT: rev16 v2.16b, v1.16b
; CHECK-DISABLE-NEXT: rev64 v1.16b, v1.16b
-; CHECK-DISABLE-NEXT: st1 { v0.b }[2], [x9]
-; CHECK-DISABLE-NEXT: add x9, x1, #8
+; CHECK-DISABLE-NEXT: mov v0.b[0], v0.b[2]
+; CHECK-DISABLE-NEXT: str d1, [x1]
+; CHECK-DISABLE-NEXT: stur b0, [x1, #10]
+; CHECK-DISABLE-NEXT: add x1, x1, #16
; CHECK-DISABLE-NEXT: st1 { v2.h }[4], [x9]
-; CHECK-DISABLE-NEXT: str d1, [x1], #16
; CHECK-DISABLE-NEXT: b.eq .LBB6_1
; CHECK-DISABLE-NEXT: // %bb.2: // %exit
; CHECK-DISABLE-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
index edd96ae4836a4..19178964710cd 100644
--- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
@@ -198,9 +198,9 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-NEXT: mov v1.s[1], w11
; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: mov v1.s[0], v0.s[1]
; CHECK-SD-NEXT: str b0, [x2]
-; CHECK-SD-NEXT: strb w8, [x2, #1]
+; CHECK-SD-NEXT: stur b1, [x2, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i8:
diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
index 63ca1b51c2291..443bd46bb71da 100644
--- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
@@ -197,9 +197,9 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
; CHECK-SD-NEXT: mov v0.s[1], w10
; CHECK-SD-NEXT: mov v1.s[1], w11
; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s
-; CHECK-SD-NEXT: mov w8, v0.s[1]
+; CHECK-SD-NEXT: mov v1.s[0], v0.s[1]
; CHECK-SD-NEXT: str b0, [x2]
-; CHECK-SD-NEXT: strb w8, [x2, #1]
+; CHECK-SD-NEXT: stur b1, [x2, #1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i8:
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
index d9b5a42ba98a6..3c42079dc8d8a 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
@@ -35,8 +35,7 @@ define void @store_8_elements(<8 x i16> %vec, ptr %out) {
; CHECK-NEXT: ldr q1, [x8, lCPI1_0 at PAGEOFF]
; CHECK-NEXT: bic.16b v0, v1, v0
; CHECK-NEXT: addv.8h h0, v0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh3
@@ -76,8 +75,7 @@ define void @store_2_elements(<2 x i64> %vec, ptr %out) {
; CHECK-NEXT: ldr q1, [x8, lCPI3_0 at PAGEOFF]
; CHECK-NEXT: bic.16b v0, v1, v0
; CHECK-NEXT: addp.2d d0, v0
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh6, Lloh7
@@ -119,8 +117,7 @@ define void @add_trunc_mask_unknown_vector_type(<4 x i1> %vec, ptr %out) {
; CHECK-NEXT: cmlt.4h v0, v0, #0
; CHECK-NEXT: and.8b v0, v0, v1
; CHECK-NEXT: addv.4h h0, v0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh11
@@ -159,8 +156,7 @@ define void @store_4_elements_64_bit_vector(<4 x i16> %vec, ptr %out) {
; CHECK-NEXT: ldr d1, [x8, lCPI7_0 at PAGEOFF]
; CHECK-NEXT: bic.8b v0, v1, v0
; CHECK-NEXT: addv.4h h0, v0
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh14, Lloh15
diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
index 8ab8f537398ae..b87dc25478b78 100644
--- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
@@ -355,14 +355,14 @@ define <3 x i32> @load_v3i8_sext_to_3xi32(ptr %src) {
define void @store_trunc_from_64bits(ptr %src, ptr %dst) {
; CHECK-LABEL: store_trunc_from_64bits:
; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: add x8, x0, #4
+; CHECK-NEXT: ld1r.4h { v0 }, [x8]
; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: add x9, x0, #4
-; CHECK-NEXT: ld1r.4h { v0 }, [x9]
; CHECK-NEXT: lsr w9, w8, #16
; CHECK-NEXT: strb w8, [x1]
-; CHECK-NEXT: add x8, x1, #2
+; CHECK-NEXT: mov.b v0[0], v0[4]
; CHECK-NEXT: strb w9, [x1, #1]
-; CHECK-NEXT: st1.b { v0 }[4], [x8]
+; CHECK-NEXT: stur b0, [x1, #2]
; CHECK-NEXT: ret
;
; BE-LABEL: store_trunc_from_64bits:
@@ -397,13 +397,13 @@ define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) {
; CHECK-NEXT: adrp x8, lCPI11_0 at PAGE
; CHECK-NEXT: Lloh1:
; CHECK-NEXT: ldr d1, [x8, lCPI11_0 at PAGEOFF]
-; CHECK-NEXT: add x8, x1, #1
; CHECK-NEXT: ld1.h { v0 }[2], [x9]
-; CHECK-NEXT: add x9, x1, #2
; CHECK-NEXT: add.4h v0, v0, v1
-; CHECK-NEXT: st1.b { v0 }[2], [x8]
-; CHECK-NEXT: st1.b { v0 }[4], [x9]
+; CHECK-NEXT: mov.b v1[0], v0[2]
+; CHECK-NEXT: mov.b v2[0], v0[4]
; CHECK-NEXT: str b0, [x1]
+; CHECK-NEXT: stur b1, [x1, #1]
+; CHECK-NEXT: stur b2, [x1, #2]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh1
;
@@ -420,12 +420,12 @@ define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) {
; BE-NEXT: ld1 { v1.4h }, [x8]
; BE-NEXT: add v0.4h, v0.4h, v1.4h
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: mov v0.h[0], v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w9, [sp, #12]
-; BE-NEXT: strb w8, [x1, #2]
-; BE-NEXT: strh w9, [x1]
+; BE-NEXT: ldrh w8, [sp, #12]
+; BE-NEXT: stur b0, [x1, #2]
+; BE-NEXT: strh w8, [x1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
entry:
@@ -587,12 +587,12 @@ define void @shift_trunc_store(ptr %src, ptr %dst) {
; CHECK-LABEL: shift_trunc_store:
; CHECK: ; %bb.0:
; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: add x8, x1, #1
-; CHECK-NEXT: add x9, x1, #2
; CHECK-NEXT: ushr.4s v0, v0, #16
-; CHECK-NEXT: st1.b { v0 }[4], [x8]
-; CHECK-NEXT: st1.b { v0 }[8], [x9]
+; CHECK-NEXT: mov.b v1[0], v0[4]
+; CHECK-NEXT: mov.b v2[0], v0[8]
; CHECK-NEXT: str b0, [x1]
+; CHECK-NEXT: stur b1, [x1, #1]
+; CHECK-NEXT: stur b2, [x1, #2]
; CHECK-NEXT: ret
;
; BE-LABEL: shift_trunc_store:
@@ -602,12 +602,12 @@ define void @shift_trunc_store(ptr %src, ptr %dst) {
; BE-NEXT: ld1 { v0.4s }, [x0]
; BE-NEXT: shrn v0.4h, v0.4s, #16
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: mov v0.h[0], v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w9, [sp, #12]
-; BE-NEXT: strb w8, [x1, #2]
-; BE-NEXT: strh w9, [x1]
+; BE-NEXT: ldrh w8, [sp, #12]
+; BE-NEXT: stur b0, [x1, #2]
+; BE-NEXT: strh w8, [x1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i32>, ptr %src
@@ -621,12 +621,12 @@ define void @shift_trunc_store_default_align(ptr %src, ptr %dst) {
; CHECK-LABEL: shift_trunc_store_default_align:
; CHECK: ; %bb.0:
; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: add x8, x1, #1
-; CHECK-NEXT: add x9, x1, #2
; CHECK-NEXT: ushr.4s v0, v0, #16
-; CHECK-NEXT: st1.b { v0 }[4], [x8]
-; CHECK-NEXT: st1.b { v0 }[8], [x9]
+; CHECK-NEXT: mov.b v1[0], v0[4]
+; CHECK-NEXT: mov.b v2[0], v0[8]
; CHECK-NEXT: str b0, [x1]
+; CHECK-NEXT: stur b1, [x1, #1]
+; CHECK-NEXT: stur b2, [x1, #2]
; CHECK-NEXT: ret
;
; BE-LABEL: shift_trunc_store_default_align:
@@ -636,12 +636,12 @@ define void @shift_trunc_store_default_align(ptr %src, ptr %dst) {
; BE-NEXT: ld1 { v0.4s }, [x0]
; BE-NEXT: shrn v0.4h, v0.4s, #16
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: mov v0.h[0], v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w9, [sp, #12]
-; BE-NEXT: strb w8, [x1, #2]
-; BE-NEXT: strh w9, [x1]
+; BE-NEXT: ldrh w8, [sp, #12]
+; BE-NEXT: stur b0, [x1, #2]
+; BE-NEXT: strh w8, [x1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i32>, ptr %src
@@ -655,12 +655,12 @@ define void @shift_trunc_store_align_4(ptr %src, ptr %dst) {
; CHECK-LABEL: shift_trunc_store_align_4:
; CHECK: ; %bb.0:
; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: add x8, x1, #1
-; CHECK-NEXT: add x9, x1, #2
; CHECK-NEXT: ushr.4s v0, v0, #16
-; CHECK-NEXT: st1.b { v0 }[4], [x8]
-; CHECK-NEXT: st1.b { v0 }[8], [x9]
+; CHECK-NEXT: mov.b v1[0], v0[4]
+; CHECK-NEXT: mov.b v2[0], v0[8]
; CHECK-NEXT: str b0, [x1]
+; CHECK-NEXT: stur b1, [x1, #1]
+; CHECK-NEXT: stur b2, [x1, #2]
; CHECK-NEXT: ret
;
; BE-LABEL: shift_trunc_store_align_4:
@@ -670,12 +670,12 @@ define void @shift_trunc_store_align_4(ptr %src, ptr %dst) {
; BE-NEXT: ld1 { v0.4s }, [x0]
; BE-NEXT: shrn v0.4h, v0.4s, #16
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: mov v0.h[0], v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w9, [sp, #12]
-; BE-NEXT: strb w8, [x1, #2]
-; BE-NEXT: strh w9, [x1]
+; BE-NEXT: ldrh w8, [sp, #12]
+; BE-NEXT: stur b0, [x1, #2]
+; BE-NEXT: strh w8, [x1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i32>, ptr %src
@@ -689,12 +689,12 @@ define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) {
; CHECK-LABEL: shift_trunc_store_const_offset_1:
; CHECK: ; %bb.0:
; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: add x8, x1, #2
-; CHECK-NEXT: add x9, x1, #3
; CHECK-NEXT: ushr.4s v0, v0, #16
-; CHECK-NEXT: st1.b { v0 }[4], [x8]
-; CHECK-NEXT: st1.b { v0 }[8], [x9]
-; CHECK-NEXT: str b0, [x1, #1]
+; CHECK-NEXT: mov.b v1[0], v0[4]
+; CHECK-NEXT: mov.b v2[0], v0[8]
+; CHECK-NEXT: stur b0, [x1, #1]
+; CHECK-NEXT: stur b1, [x1, #2]
+; CHECK-NEXT: stur b2, [x1, #3]
; CHECK-NEXT: ret
;
; BE-LABEL: shift_trunc_store_const_offset_1:
@@ -704,12 +704,12 @@ define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) {
; BE-NEXT: ld1 { v0.4s }, [x0]
; BE-NEXT: shrn v0.4h, v0.4s, #16
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: mov v0.h[0], v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w9, [sp, #12]
-; BE-NEXT: strb w8, [x1, #3]
-; BE-NEXT: sturh w9, [x1, #1]
+; BE-NEXT: ldrh w8, [sp, #12]
+; BE-NEXT: stur b0, [x1, #3]
+; BE-NEXT: sturh w8, [x1, #1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i32>, ptr %src
@@ -724,12 +724,12 @@ define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) {
; CHECK-LABEL: shift_trunc_store_const_offset_3:
; CHECK: ; %bb.0:
; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: add x8, x1, #4
-; CHECK-NEXT: add x9, x1, #5
; CHECK-NEXT: ushr.4s v0, v0, #16
-; CHECK-NEXT: st1.b { v0 }[4], [x8]
-; CHECK-NEXT: st1.b { v0 }[8], [x9]
-; CHECK-NEXT: str b0, [x1, #3]
+; CHECK-NEXT: mov.b v1[0], v0[4]
+; CHECK-NEXT: mov.b v2[0], v0[8]
+; CHECK-NEXT: stur b0, [x1, #3]
+; CHECK-NEXT: stur b1, [x1, #4]
+; CHECK-NEXT: stur b2, [x1, #5]
; CHECK-NEXT: ret
;
; BE-LABEL: shift_trunc_store_const_offset_3:
@@ -739,12 +739,12 @@ define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) {
; BE-NEXT: ld1 { v0.4s }, [x0]
; BE-NEXT: shrn v0.4h, v0.4s, #16
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: mov v0.h[0], v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w9, [sp, #12]
-; BE-NEXT: strb w8, [x1, #5]
-; BE-NEXT: sturh w9, [x1, #3]
+; BE-NEXT: ldrh w8, [sp, #12]
+; BE-NEXT: stur b0, [x1, #5]
+; BE-NEXT: sturh w8, [x1, #3]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i32>, ptr %src
@@ -763,11 +763,11 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) {
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: shrn.4h v0, v0, #16
; CHECK-NEXT: uzp1.8b v1, v0, v0
-; CHECK-NEXT: umov.h w8, v0[2]
+; CHECK-NEXT: mov.h v0[0], v0[2]
; CHECK-NEXT: str s1, [sp, #12]
-; CHECK-NEXT: ldrh w9, [sp, #12]
-; CHECK-NEXT: strb w8, [x1, #2]
-; CHECK-NEXT: strh w9, [x1]
+; CHECK-NEXT: ldrh w8, [sp, #12]
+; CHECK-NEXT: stur b0, [x1, #2]
+; CHECK-NEXT: strh w8, [x1]
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
@@ -778,12 +778,12 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) {
; BE-NEXT: ld1 { v0.4s }, [x0]
; BE-NEXT: shrn v0.4h, v0.4s, #16
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: mov v0.h[0], v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #12]
-; BE-NEXT: ldrh w9, [sp, #12]
-; BE-NEXT: strb w8, [x1, #2]
-; BE-NEXT: strh w9, [x1]
+; BE-NEXT: ldrh w8, [sp, #12]
+; BE-NEXT: stur b0, [x1, #2]
+; BE-NEXT: strh w8, [x1]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i32>, ptr %src
@@ -802,15 +802,15 @@ define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) {
; CHECK-NEXT: adrp x8, lCPI22_0 at PAGE
; CHECK-NEXT: Lloh5:
; CHECK-NEXT: ldr q1, [x8, lCPI22_0 at PAGEOFF]
-; CHECK-NEXT: add x8, x0, #2
; CHECK-NEXT: orr w9, w10, w9, lsl #16
; CHECK-NEXT: fmov s0, w9
-; CHECK-NEXT: add x9, x0, #1
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: uaddw.4s v0, v1, v0
-; CHECK-NEXT: st1.b { v0 }[8], [x8]
-; CHECK-NEXT: st1.b { v0 }[4], [x9]
+; CHECK-NEXT: mov.b v1[0], v0[8]
+; CHECK-NEXT: mov.b v2[0], v0[4]
; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: stur b1, [x0, #2]
+; CHECK-NEXT: stur b2, [x0, #1]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh5
;
@@ -830,12 +830,12 @@ define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) {
; BE-NEXT: ld1 { v0.b }[4], [x9]
; BE-NEXT: add v0.4h, v0.4h, v1.4h
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: mov v0.h[0], v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #8]
-; BE-NEXT: ldrh w9, [sp, #8]
-; BE-NEXT: strb w8, [x0, #2]
-; BE-NEXT: strh w9, [x0]
+; BE-NEXT: ldrh w8, [sp, #8]
+; BE-NEXT: stur b0, [x0, #2]
+; BE-NEXT: strh w8, [x0]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i8>, ptr %src, align 1
@@ -855,15 +855,15 @@ define void @load_v3i8_sext_to_3xi32_add_trunc_store(ptr %src) {
; CHECK-NEXT: adrp x8, lCPI23_0 at PAGE
; CHECK-NEXT: Lloh7:
; CHECK-NEXT: ldr q1, [x8, lCPI23_0 at PAGEOFF]
-; CHECK-NEXT: add x8, x0, #2
; CHECK-NEXT: orr w9, w10, w9, lsl #16
; CHECK-NEXT: fmov s0, w9
-; CHECK-NEXT: add x9, x0, #1
; CHECK-NEXT: zip1.8b v0, v0, v0
; CHECK-NEXT: uaddw.4s v0, v1, v0
-; CHECK-NEXT: st1.b { v0 }[8], [x8]
-; CHECK-NEXT: st1.b { v0 }[4], [x9]
+; CHECK-NEXT: mov.b v1[0], v0[8]
+; CHECK-NEXT: mov.b v2[0], v0[4]
; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: stur b1, [x0, #2]
+; CHECK-NEXT: stur b2, [x0, #1]
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh6, Lloh7
;
@@ -883,12 +883,12 @@ define void @load_v3i8_sext_to_3xi32_add_trunc_store(ptr %src) {
; BE-NEXT: ld1 { v0.b }[4], [x9]
; BE-NEXT: add v0.4h, v0.4h, v1.4h
; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b
-; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: mov v0.h[0], v0.h[2]
; BE-NEXT: rev32 v1.16b, v1.16b
; BE-NEXT: str s1, [sp, #8]
-; BE-NEXT: ldrh w9, [sp, #8]
-; BE-NEXT: strb w8, [x0, #2]
-; BE-NEXT: strh w9, [x0]
+; BE-NEXT: ldrh w8, [sp, #8]
+; BE-NEXT: stur b0, [x0, #2]
+; BE-NEXT: strh w8, [x0]
; BE-NEXT: add sp, sp, #16
; BE-NEXT: ret
%l = load <3 x i8>, ptr %src, align 1
diff --git a/llvm/test/CodeGen/AArch64/vec_uaddo.ll b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
index 37c6374215d81..09662aef7e423 100644
--- a/llvm/test/CodeGen/AArch64/vec_uaddo.ll
+++ b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
@@ -249,15 +249,14 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-NEXT: adrp x8, .LCPI10_0
; CHECK-NEXT: shl v1.4h, v2.4h, #15
-; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI10_0]
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI10_0]
; CHECK-NEXT: cmlt v1.4h, v1.4h, #0
; CHECK-NEXT: shl v0.4s, v0.4s, #31
; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
; CHECK-NEXT: addv h1, v1.4h
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: str b1, [x0]
; CHECK-NEXT: ret
%t = call {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
%val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
diff --git a/llvm/test/CodeGen/AArch64/vec_umulo.ll b/llvm/test/CodeGen/AArch64/vec_umulo.ll
index 3a481efd9785a..7803c095b77c2 100644
--- a/llvm/test/CodeGen/AArch64/vec_umulo.ll
+++ b/llvm/test/CodeGen/AArch64/vec_umulo.ll
@@ -299,11 +299,10 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI10_0]
; CHECK-NEXT: shl v0.4h, v0.4h, #15
; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
-; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: addv h1, v0.4h
+; CHECK-NEXT: and v1.8b, v0.8b, v1.8b
; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: addv h1, v1.4h
+; CHECK-NEXT: str b1, [x0]
; CHECK-NEXT: ret
%t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
%val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
diff --git a/llvm/test/CodeGen/AArch64/vector-compress.ll b/llvm/test/CodeGen/AArch64/vector-compress.ll
index f990bdc2e5615..0979a80f7f22e 100644
--- a/llvm/test/CodeGen/AArch64/vector-compress.ll
+++ b/llvm/test/CodeGen/AArch64/vector-compress.ll
@@ -107,94 +107,109 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask) {
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: shl.16b v1, v1, #7
-; CHECK-NEXT: mov x12, sp
-; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: mov.b v2[0], v0[1]
+; CHECK-NEXT: mov x10, sp
; CHECK-NEXT: str b0, [sp]
-; CHECK-NEXT: mov x13, sp
+; CHECK-NEXT: mov.b v3[0], v0[2]
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: mov.b v4[0], v0[3]
+; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: cmlt.16b v1, v1, #0
-; CHECK-NEXT: umov.b w9, v1[0]
-; CHECK-NEXT: umov.b w10, v1[1]
-; CHECK-NEXT: umov.b w11, v1[2]
+; CHECK-NEXT: umov.b w11, v1[0]
+; CHECK-NEXT: umov.b w12, v1[1]
+; CHECK-NEXT: umov.b w13, v1[2]
; CHECK-NEXT: umov.b w14, v1[3]
-; CHECK-NEXT: bfxil x12, x9, #0, #1
-; CHECK-NEXT: and x10, x10, #0x1
-; CHECK-NEXT: and x9, x9, #0x1
-; CHECK-NEXT: add x9, x9, x10
-; CHECK-NEXT: umov.b w10, v1[4]
+; CHECK-NEXT: bfxil x10, x11, #0, #1
; CHECK-NEXT: and x11, x11, #0x1
-; CHECK-NEXT: st1.b { v0 }[1], [x12]
-; CHECK-NEXT: orr x12, x8, x9
-; CHECK-NEXT: add x9, x9, x11
-; CHECK-NEXT: umov.b w11, v1[5]
+; CHECK-NEXT: and x13, x13, #0x1
; CHECK-NEXT: and x14, x14, #0x1
-; CHECK-NEXT: st1.b { v0 }[2], [x12]
-; CHECK-NEXT: add x14, x9, x14
-; CHECK-NEXT: umov.b w12, v1[6]
-; CHECK-NEXT: orr x9, x8, x9
-; CHECK-NEXT: and x10, x10, #0x1
-; CHECK-NEXT: st1.b { v0 }[3], [x9]
-; CHECK-NEXT: orr x9, x8, x14
-; CHECK-NEXT: add x10, x14, x10
-; CHECK-NEXT: umov.b w14, v1[7]
-; CHECK-NEXT: st1.b { v0 }[4], [x9]
-; CHECK-NEXT: and x11, x11, #0x1
-; CHECK-NEXT: bfxil x13, x10, #0, #4
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: add x10, x10, x11
-; CHECK-NEXT: umov.b w11, v1[8]
+; CHECK-NEXT: str b2, [x10]
+; CHECK-NEXT: and x10, x12, #0x1
+; CHECK-NEXT: umov.b w12, v1[4]
+; CHECK-NEXT: mov.b v2[0], v0[4]
+; CHECK-NEXT: add x10, x11, x10
+; CHECK-NEXT: umov.b w11, v1[5]
+; CHECK-NEXT: add x13, x10, x13
+; CHECK-NEXT: orr x10, x8, x10
+; CHECK-NEXT: str b3, [x10]
+; CHECK-NEXT: orr x10, x8, x13
+; CHECK-NEXT: add x13, x13, x14
; CHECK-NEXT: and x12, x12, #0x1
-; CHECK-NEXT: bfxil x9, x10, #0, #4
-; CHECK-NEXT: st1.b { v0 }[5], [x13]
-; CHECK-NEXT: umov.b w13, v1[9]
-; CHECK-NEXT: add x10, x10, x12
-; CHECK-NEXT: mov x12, sp
+; CHECK-NEXT: umov.b w14, v1[6]
+; CHECK-NEXT: str b4, [x10]
+; CHECK-NEXT: add x12, x13, x12
+; CHECK-NEXT: orr x13, x8, x13
+; CHECK-NEXT: mov.b v3[0], v0[5]
+; CHECK-NEXT: str b2, [x13]
+; CHECK-NEXT: umov.b w13, v1[7]
+; CHECK-NEXT: and x11, x11, #0x1
+; CHECK-NEXT: bfxil x9, x12, #0, #4
+; CHECK-NEXT: add x11, x12, x11
+; CHECK-NEXT: umov.b w12, v1[8]
+; CHECK-NEXT: mov.b v4[0], v0[6]
+; CHECK-NEXT: mov x10, sp
; CHECK-NEXT: and x14, x14, #0x1
-; CHECK-NEXT: st1.b { v0 }[6], [x9]
-; CHECK-NEXT: umov.b w9, v1[10]
-; CHECK-NEXT: bfxil x12, x10, #0, #4
-; CHECK-NEXT: add x10, x10, x14
+; CHECK-NEXT: mov.b v2[0], v0[7]
+; CHECK-NEXT: bfxil x10, x11, #0, #4
+; CHECK-NEXT: add x11, x11, x14
; CHECK-NEXT: mov x14, sp
-; CHECK-NEXT: and x11, x11, #0x1
-; CHECK-NEXT: bfxil x14, x10, #0, #4
-; CHECK-NEXT: add x10, x10, x11
-; CHECK-NEXT: mov x11, sp
+; CHECK-NEXT: str b3, [x9]
+; CHECK-NEXT: umov.b w9, v1[9]
; CHECK-NEXT: and x13, x13, #0x1
-; CHECK-NEXT: st1.b { v0 }[7], [x12]
-; CHECK-NEXT: mov x12, sp
-; CHECK-NEXT: bfxil x11, x10, #0, #4
-; CHECK-NEXT: add x10, x10, x13
-; CHECK-NEXT: umov.b w13, v1[11]
-; CHECK-NEXT: st1.b { v0 }[8], [x14]
-; CHECK-NEXT: umov.b w14, v1[12]
+; CHECK-NEXT: mov.b v3[0], v0[8]
+; CHECK-NEXT: bfxil x14, x11, #0, #4
+; CHECK-NEXT: add x11, x11, x13
+; CHECK-NEXT: mov x13, sp
+; CHECK-NEXT: and x12, x12, #0x1
+; CHECK-NEXT: str b4, [x10]
+; CHECK-NEXT: bfxil x13, x11, #0, #4
+; CHECK-NEXT: add x10, x11, x12
+; CHECK-NEXT: umov.b w12, v1[10]
+; CHECK-NEXT: str b2, [x14]
+; CHECK-NEXT: mov.b v2[0], v0[9]
+; CHECK-NEXT: mov x11, sp
; CHECK-NEXT: and x9, x9, #0x1
-; CHECK-NEXT: bfxil x12, x10, #0, #4
+; CHECK-NEXT: str b3, [x13]
+; CHECK-NEXT: mov.b v3[0], v0[10]
+; CHECK-NEXT: umov.b w13, v1[11]
+; CHECK-NEXT: bfxil x11, x10, #0, #4
; CHECK-NEXT: add x9, x10, x9
; CHECK-NEXT: mov x10, sp
-; CHECK-NEXT: st1.b { v0 }[9], [x11]
-; CHECK-NEXT: umov.b w11, v1[13]
+; CHECK-NEXT: mov.b v4[0], v0[11]
; CHECK-NEXT: bfxil x10, x9, #0, #4
-; CHECK-NEXT: st1.b { v0 }[10], [x12]
-; CHECK-NEXT: umov.b w12, v1[14]
-; CHECK-NEXT: and x13, x13, #0x1
-; CHECK-NEXT: and x14, x14, #0x1
-; CHECK-NEXT: add x9, x9, x13
-; CHECK-NEXT: st1.b { v0 }[11], [x10]
+; CHECK-NEXT: and x12, x12, #0x1
+; CHECK-NEXT: umov.b w14, v1[12]
+; CHECK-NEXT: add x9, x9, x12
+; CHECK-NEXT: mov x12, sp
+; CHECK-NEXT: str b2, [x11]
+; CHECK-NEXT: umov.b w11, v1[13]
+; CHECK-NEXT: bfxil x12, x9, #0, #4
+; CHECK-NEXT: str b3, [x10]
+; CHECK-NEXT: and x10, x13, #0x1
+; CHECK-NEXT: umov.b w13, v1[14]
+; CHECK-NEXT: mov.b v1[0], v0[12]
+; CHECK-NEXT: str b4, [x12]
+; CHECK-NEXT: add x9, x9, x10
; CHECK-NEXT: mov x10, sp
-; CHECK-NEXT: add x13, x9, x14
-; CHECK-NEXT: mov x14, sp
+; CHECK-NEXT: and x12, x14, #0x1
; CHECK-NEXT: bfxil x10, x9, #0, #4
-; CHECK-NEXT: and x9, x11, #0x1
+; CHECK-NEXT: mov.b v2[0], v0[13]
+; CHECK-NEXT: add x9, x9, x12
+; CHECK-NEXT: mov x12, sp
+; CHECK-NEXT: and x11, x11, #0x1
+; CHECK-NEXT: bfxil x12, x9, #0, #4
+; CHECK-NEXT: add x9, x9, x11
+; CHECK-NEXT: mov.b v3[0], v0[14]
; CHECK-NEXT: mov x11, sp
-; CHECK-NEXT: add x9, x13, x9
-; CHECK-NEXT: and w12, w12, #0x1
-; CHECK-NEXT: bfxil x14, x13, #0, #4
+; CHECK-NEXT: and w13, w13, #0x1
+; CHECK-NEXT: mov.b v0[0], v0[15]
; CHECK-NEXT: bfxil x11, x9, #0, #4
-; CHECK-NEXT: add w9, w9, w12
-; CHECK-NEXT: st1.b { v0 }[12], [x10]
+; CHECK-NEXT: add w9, w9, w13
+; CHECK-NEXT: str b1, [x10]
; CHECK-NEXT: bfxil x8, x9, #0, #4
-; CHECK-NEXT: st1.b { v0 }[13], [x14]
-; CHECK-NEXT: st1.b { v0 }[14], [x11]
-; CHECK-NEXT: st1.b { v0 }[15], [x8]
+; CHECK-NEXT: str b2, [x12]
+; CHECK-NEXT: str b3, [x11]
+; CHECK-NEXT: str b0, [x8]
; CHECK-NEXT: ldr q0, [sp], #16
; CHECK-NEXT: ret
%out = call <16 x i8> @llvm.experimental.vector.compress(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> undef)
diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
index 6536f0c355b47..e3c4fe44d201d 100644
--- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -2702,28 +2702,29 @@ define void @zext_v8i8_to_v8i33_in_loop(ptr %src, ptr %dst) {
; CHECK-BE-NEXT: ushll2 v1.2d, v1.4s, #0
; CHECK-BE-NEXT: ushll2 v3.2d, v0.4s, #0
; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-BE-NEXT: mov x9, v2.d[1]
; CHECK-BE-NEXT: mov x10, v1.d[1]
+; CHECK-BE-NEXT: mov x9, v2.d[1]
; CHECK-BE-NEXT: fmov x13, d1
; CHECK-BE-NEXT: mov x11, v3.d[1]
; CHECK-BE-NEXT: mov x12, v0.d[1]
-; CHECK-BE-NEXT: fmov x14, d2
-; CHECK-BE-NEXT: fmov x15, d3
+; CHECK-BE-NEXT: mov v1.d[0], v1.d[1]
+; CHECK-BE-NEXT: orr x10, x10, x13, lsl #33
+; CHECK-BE-NEXT: fmov x13, d2
; CHECK-BE-NEXT: lsl x9, x9, #2
-; CHECK-BE-NEXT: orr x13, x10, x13, lsl #33
-; CHECK-BE-NEXT: strb w10, [x1, #32]
; CHECK-BE-NEXT: lsl x11, x11, #4
; CHECK-BE-NEXT: lsl x12, x12, #6
-; CHECK-BE-NEXT: orr x14, x9, x14, lsl #35
-; CHECK-BE-NEXT: extr x9, x9, x13, #8
+; CHECK-BE-NEXT: stur b1, [x1, #32]
+; CHECK-BE-NEXT: orr x13, x9, x13, lsl #35
+; CHECK-BE-NEXT: extr x9, x9, x10, #8
+; CHECK-BE-NEXT: fmov x10, d3
+; CHECK-BE-NEXT: orr x10, x11, x10, lsl #37
+; CHECK-BE-NEXT: extr x11, x11, x13, #8
; CHECK-BE-NEXT: fmov x13, d0
-; CHECK-BE-NEXT: orr x15, x11, x15, lsl #37
-; CHECK-BE-NEXT: extr x10, x11, x14, #8
-; CHECK-BE-NEXT: orr x11, x12, x13, lsl #39
-; CHECK-BE-NEXT: extr x12, x12, x15, #8
-; CHECK-BE-NEXT: stp x10, x9, [x1, #16]
-; CHECK-BE-NEXT: lsr x9, x11, #8
-; CHECK-BE-NEXT: stp x9, x12, [x1], #128
+; CHECK-BE-NEXT: stp x11, x9, [x1, #16]
+; CHECK-BE-NEXT: extr x9, x12, x10, #8
+; CHECK-BE-NEXT: orr x13, x12, x13, lsl #39
+; CHECK-BE-NEXT: lsr x10, x13, #8
+; CHECK-BE-NEXT: stp x10, x9, [x1], #128
; CHECK-BE-NEXT: b.ne .LBB22_1
; CHECK-BE-NEXT: // %bb.2: // %exit
; CHECK-BE-NEXT: ret
>From 46dff6b891325d5f81e194e17fec0eb27f273d5d Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 2 Apr 2025 22:24:31 +0000
Subject: [PATCH 3/3] Prefer st1.b in some cases
---
.../Target/AArch64/AArch64ISelLowering.cpp | 9 +
.../AArch64/arm64-neon-simd-ldst-one.ll | 31 +---
llvm/test/CodeGen/AArch64/vector-compress.ll | 155 ++++++++----------
3 files changed, 88 insertions(+), 107 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8dbc64cebef84..96a99097d9339 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -24091,6 +24091,15 @@ static SDValue performSTORECombine(SDNode *N,
return SDValue();
if (MemVT == MVT::i8) {
+ auto *ExtCst = dyn_cast<ConstantSDNode>(ExtIdx);
+ if (Subtarget->isNeonAvailable() &&
+ (VectorVT == MVT::v8i8 || VectorVT == MVT::v16i8) && ExtCst &&
+ !ExtCst->isZero() && ST->getBasePtr().getOpcode() != ISD::ADD) {
+ // These can lower to st1.b, which is preferable if we're unlikely to
+ // fold the addressing into the store.
+ return SDValue();
+ }
+
SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
Value.getValueType(), Vector, ExtIdx);
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll b/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
index 2ad567a79d6a5..7d87be0ce8e1c 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
@@ -483,16 +483,10 @@ entry:
}
define void @test_vst1q_lane_s8(ptr %a, <16 x i8> %b) {
-; CHECK-GI-LABEL: test_vst1q_lane_s8:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov v0.b[0], v0.b[15]
-; CHECK-GI-NEXT: str b0, [x0]
-; CHECK-GI-NEXT: ret
-;
-; CHECK-SD-LABEL: test_vst1q_lane_s8:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: st1 { v0.b }[15], [x0]
-; CHECK-SD-NEXT: ret
+; CHECK-LABEL: test_vst1q_lane_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: st1 { v0.b }[15], [x0]
+; CHECK-NEXT: ret
entry:
%0 = extractelement <16 x i8> %b, i32 15
store i8 %0, ptr %a, align 1
@@ -610,18 +604,11 @@ entry:
}
define void @test_vst1_lane_s8(ptr %a, <8 x i8> %b) {
-; CHECK-GI-LABEL: test_vst1_lane_s8:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov v0.b[0], v0.b[7]
-; CHECK-GI-NEXT: str b0, [x0]
-; CHECK-GI-NEXT: ret
-;
-; CHECK-SD-LABEL: test_vst1_lane_s8:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: st1 { v0.b }[7], [x0]
-; CHECK-SD-NEXT: ret
+; CHECK-LABEL: test_vst1_lane_s8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: st1 { v0.b }[7], [x0]
+; CHECK-NEXT: ret
entry:
%0 = extractelement <8 x i8> %b, i32 7
store i8 %0, ptr %a, align 1
diff --git a/llvm/test/CodeGen/AArch64/vector-compress.ll b/llvm/test/CodeGen/AArch64/vector-compress.ll
index 0979a80f7f22e..f990bdc2e5615 100644
--- a/llvm/test/CodeGen/AArch64/vector-compress.ll
+++ b/llvm/test/CodeGen/AArch64/vector-compress.ll
@@ -107,109 +107,94 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask) {
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: shl.16b v1, v1, #7
-; CHECK-NEXT: mov.b v2[0], v0[1]
-; CHECK-NEXT: mov x10, sp
-; CHECK-NEXT: str b0, [sp]
-; CHECK-NEXT: mov.b v3[0], v0[2]
+; CHECK-NEXT: mov x12, sp
; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: mov.b v4[0], v0[3]
-; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str b0, [sp]
+; CHECK-NEXT: mov x13, sp
; CHECK-NEXT: cmlt.16b v1, v1, #0
-; CHECK-NEXT: umov.b w11, v1[0]
-; CHECK-NEXT: umov.b w12, v1[1]
-; CHECK-NEXT: umov.b w13, v1[2]
+; CHECK-NEXT: umov.b w9, v1[0]
+; CHECK-NEXT: umov.b w10, v1[1]
+; CHECK-NEXT: umov.b w11, v1[2]
; CHECK-NEXT: umov.b w14, v1[3]
-; CHECK-NEXT: bfxil x10, x11, #0, #1
+; CHECK-NEXT: bfxil x12, x9, #0, #1
+; CHECK-NEXT: and x10, x10, #0x1
+; CHECK-NEXT: and x9, x9, #0x1
+; CHECK-NEXT: add x9, x9, x10
+; CHECK-NEXT: umov.b w10, v1[4]
; CHECK-NEXT: and x11, x11, #0x1
-; CHECK-NEXT: and x13, x13, #0x1
-; CHECK-NEXT: and x14, x14, #0x1
-; CHECK-NEXT: str b2, [x10]
-; CHECK-NEXT: and x10, x12, #0x1
-; CHECK-NEXT: umov.b w12, v1[4]
-; CHECK-NEXT: mov.b v2[0], v0[4]
-; CHECK-NEXT: add x10, x11, x10
+; CHECK-NEXT: st1.b { v0 }[1], [x12]
+; CHECK-NEXT: orr x12, x8, x9
+; CHECK-NEXT: add x9, x9, x11
; CHECK-NEXT: umov.b w11, v1[5]
-; CHECK-NEXT: add x13, x10, x13
-; CHECK-NEXT: orr x10, x8, x10
-; CHECK-NEXT: str b3, [x10]
-; CHECK-NEXT: orr x10, x8, x13
-; CHECK-NEXT: add x13, x13, x14
-; CHECK-NEXT: and x12, x12, #0x1
-; CHECK-NEXT: umov.b w14, v1[6]
-; CHECK-NEXT: str b4, [x10]
-; CHECK-NEXT: add x12, x13, x12
-; CHECK-NEXT: orr x13, x8, x13
-; CHECK-NEXT: mov.b v3[0], v0[5]
-; CHECK-NEXT: str b2, [x13]
-; CHECK-NEXT: umov.b w13, v1[7]
+; CHECK-NEXT: and x14, x14, #0x1
+; CHECK-NEXT: st1.b { v0 }[2], [x12]
+; CHECK-NEXT: add x14, x9, x14
+; CHECK-NEXT: umov.b w12, v1[6]
+; CHECK-NEXT: orr x9, x8, x9
+; CHECK-NEXT: and x10, x10, #0x1
+; CHECK-NEXT: st1.b { v0 }[3], [x9]
+; CHECK-NEXT: orr x9, x8, x14
+; CHECK-NEXT: add x10, x14, x10
+; CHECK-NEXT: umov.b w14, v1[7]
+; CHECK-NEXT: st1.b { v0 }[4], [x9]
; CHECK-NEXT: and x11, x11, #0x1
-; CHECK-NEXT: bfxil x9, x12, #0, #4
-; CHECK-NEXT: add x11, x12, x11
-; CHECK-NEXT: umov.b w12, v1[8]
-; CHECK-NEXT: mov.b v4[0], v0[6]
-; CHECK-NEXT: mov x10, sp
+; CHECK-NEXT: bfxil x13, x10, #0, #4
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: add x10, x10, x11
+; CHECK-NEXT: umov.b w11, v1[8]
+; CHECK-NEXT: and x12, x12, #0x1
+; CHECK-NEXT: bfxil x9, x10, #0, #4
+; CHECK-NEXT: st1.b { v0 }[5], [x13]
+; CHECK-NEXT: umov.b w13, v1[9]
+; CHECK-NEXT: add x10, x10, x12
+; CHECK-NEXT: mov x12, sp
; CHECK-NEXT: and x14, x14, #0x1
-; CHECK-NEXT: mov.b v2[0], v0[7]
-; CHECK-NEXT: bfxil x10, x11, #0, #4
-; CHECK-NEXT: add x11, x11, x14
+; CHECK-NEXT: st1.b { v0 }[6], [x9]
+; CHECK-NEXT: umov.b w9, v1[10]
+; CHECK-NEXT: bfxil x12, x10, #0, #4
+; CHECK-NEXT: add x10, x10, x14
; CHECK-NEXT: mov x14, sp
-; CHECK-NEXT: str b3, [x9]
-; CHECK-NEXT: umov.b w9, v1[9]
-; CHECK-NEXT: and x13, x13, #0x1
-; CHECK-NEXT: mov.b v3[0], v0[8]
-; CHECK-NEXT: bfxil x14, x11, #0, #4
-; CHECK-NEXT: add x11, x11, x13
-; CHECK-NEXT: mov x13, sp
-; CHECK-NEXT: and x12, x12, #0x1
-; CHECK-NEXT: str b4, [x10]
-; CHECK-NEXT: bfxil x13, x11, #0, #4
-; CHECK-NEXT: add x10, x11, x12
-; CHECK-NEXT: umov.b w12, v1[10]
-; CHECK-NEXT: str b2, [x14]
-; CHECK-NEXT: mov.b v2[0], v0[9]
+; CHECK-NEXT: and x11, x11, #0x1
+; CHECK-NEXT: bfxil x14, x10, #0, #4
+; CHECK-NEXT: add x10, x10, x11
; CHECK-NEXT: mov x11, sp
-; CHECK-NEXT: and x9, x9, #0x1
-; CHECK-NEXT: str b3, [x13]
-; CHECK-NEXT: mov.b v3[0], v0[10]
-; CHECK-NEXT: umov.b w13, v1[11]
+; CHECK-NEXT: and x13, x13, #0x1
+; CHECK-NEXT: st1.b { v0 }[7], [x12]
+; CHECK-NEXT: mov x12, sp
; CHECK-NEXT: bfxil x11, x10, #0, #4
+; CHECK-NEXT: add x10, x10, x13
+; CHECK-NEXT: umov.b w13, v1[11]
+; CHECK-NEXT: st1.b { v0 }[8], [x14]
+; CHECK-NEXT: umov.b w14, v1[12]
+; CHECK-NEXT: and x9, x9, #0x1
+; CHECK-NEXT: bfxil x12, x10, #0, #4
; CHECK-NEXT: add x9, x10, x9
; CHECK-NEXT: mov x10, sp
-; CHECK-NEXT: mov.b v4[0], v0[11]
-; CHECK-NEXT: bfxil x10, x9, #0, #4
-; CHECK-NEXT: and x12, x12, #0x1
-; CHECK-NEXT: umov.b w14, v1[12]
-; CHECK-NEXT: add x9, x9, x12
-; CHECK-NEXT: mov x12, sp
-; CHECK-NEXT: str b2, [x11]
+; CHECK-NEXT: st1.b { v0 }[9], [x11]
; CHECK-NEXT: umov.b w11, v1[13]
-; CHECK-NEXT: bfxil x12, x9, #0, #4
-; CHECK-NEXT: str b3, [x10]
-; CHECK-NEXT: and x10, x13, #0x1
-; CHECK-NEXT: umov.b w13, v1[14]
-; CHECK-NEXT: mov.b v1[0], v0[12]
-; CHECK-NEXT: str b4, [x12]
-; CHECK-NEXT: add x9, x9, x10
+; CHECK-NEXT: bfxil x10, x9, #0, #4
+; CHECK-NEXT: st1.b { v0 }[10], [x12]
+; CHECK-NEXT: umov.b w12, v1[14]
+; CHECK-NEXT: and x13, x13, #0x1
+; CHECK-NEXT: and x14, x14, #0x1
+; CHECK-NEXT: add x9, x9, x13
+; CHECK-NEXT: st1.b { v0 }[11], [x10]
; CHECK-NEXT: mov x10, sp
-; CHECK-NEXT: and x12, x14, #0x1
+; CHECK-NEXT: add x13, x9, x14
+; CHECK-NEXT: mov x14, sp
; CHECK-NEXT: bfxil x10, x9, #0, #4
-; CHECK-NEXT: mov.b v2[0], v0[13]
-; CHECK-NEXT: add x9, x9, x12
-; CHECK-NEXT: mov x12, sp
-; CHECK-NEXT: and x11, x11, #0x1
-; CHECK-NEXT: bfxil x12, x9, #0, #4
-; CHECK-NEXT: add x9, x9, x11
-; CHECK-NEXT: mov.b v3[0], v0[14]
+; CHECK-NEXT: and x9, x11, #0x1
; CHECK-NEXT: mov x11, sp
-; CHECK-NEXT: and w13, w13, #0x1
-; CHECK-NEXT: mov.b v0[0], v0[15]
+; CHECK-NEXT: add x9, x13, x9
+; CHECK-NEXT: and w12, w12, #0x1
+; CHECK-NEXT: bfxil x14, x13, #0, #4
; CHECK-NEXT: bfxil x11, x9, #0, #4
-; CHECK-NEXT: add w9, w9, w13
-; CHECK-NEXT: str b1, [x10]
+; CHECK-NEXT: add w9, w9, w12
+; CHECK-NEXT: st1.b { v0 }[12], [x10]
; CHECK-NEXT: bfxil x8, x9, #0, #4
-; CHECK-NEXT: str b2, [x12]
-; CHECK-NEXT: str b3, [x11]
-; CHECK-NEXT: str b0, [x8]
+; CHECK-NEXT: st1.b { v0 }[13], [x14]
+; CHECK-NEXT: st1.b { v0 }[14], [x11]
+; CHECK-NEXT: st1.b { v0 }[15], [x8]
; CHECK-NEXT: ldr q0, [sp], #16
; CHECK-NEXT: ret
%out = call <16 x i8> @llvm.experimental.vector.compress(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> undef)
More information about the llvm-commits
mailing list