[llvm] 83bbd3f - [AArch64] Load into zero vector patterns
David Green via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 1 05:54:09 PST 2023
Author: David Green
Date: 2023-03-01T13:54:03Z
New Revision: 83bbd3fdbd75295669cf97967c38810d427c5c25
URL: https://github.com/llvm/llvm-project/commit/83bbd3fdbd75295669cf97967c38810d427c5c25
DIFF: https://github.com/llvm/llvm-project/commit/83bbd3fdbd75295669cf97967c38810d427c5c25.diff
LOG: [AArch64] Load into zero vector patterns
A LDR will implicitly zero the rest of the vector, so vector_insert(zeros,
load, 0) can use a single load. This adds tablegen patterns for both scaled and
unscaled loads, detecting where we are inserting a load into the lower element
of a zero vector.
Differential Revision: https://reviews.llvm.org/D144086
Added:
Modified:
llvm/lib/Target/AArch64/AArch64InstrInfo.td
llvm/test/CodeGen/AArch64/load-insert-zero.ll
llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 993f6494114a..c91e7b7c64fd 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -3320,6 +3320,48 @@ def : InstAlias<"ldrsh $Rt, [$Rn, $offset]",
def : InstAlias<"ldrsw $Rt, [$Rn, $offset]",
(LDURSWi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+// A LDR will implicitly zero the rest of the vector, so vector_insert(zeros,
+// load, 0) can use a single load.
+multiclass LoadInsertZeroPatterns<SDPatternOperator LoadOp, ValueType VT, ValueType HVT, ValueType ScalarVT,
+ Instruction LoadInst, Instruction UnscaledLoadInst,
+ ComplexPattern Addr, ComplexPattern UnscaledAddr, Operand AddrImm,
+ SubRegIndex SubReg> {
+ // Scaled
+ def : Pat <(vector_insert (VT immAllZerosV),
+ (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)),
+ (SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>;
+ // Unscaled
+ def : Pat <(vector_insert (VT immAllZerosV),
+ (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)),
+ (SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>;
+
+ // Half-vector patterns
+ def : Pat <(vector_insert (HVT immAllZerosV),
+ (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)),
+ (SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>;
+ // Unscaled
+ def : Pat <(vector_insert (HVT immAllZerosV),
+ (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)),
+ (SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>;
+}
+
+defm : LoadInsertZeroPatterns<extloadi8, v16i8, v8i8, i32, LDRBui, LDURBi,
+ am_indexed8, am_unscaled8, uimm12s1, bsub>;
+defm : LoadInsertZeroPatterns<extloadi16, v8i16, v4i16, i32, LDRHui, LDURHi,
+ am_indexed16, am_unscaled16, uimm12s2, hsub>;
+defm : LoadInsertZeroPatterns<load, v4i32, v2i32, i32, LDRSui, LDURSi,
+ am_indexed32, am_unscaled32, uimm12s4, ssub>;
+defm : LoadInsertZeroPatterns<load, v2i64, v1i64, i64, LDRDui, LDURDi,
+ am_indexed64, am_unscaled64, uimm12s8, dsub>;
+defm : LoadInsertZeroPatterns<load, v8f16, v4f16, f16, LDRHui, LDURHi,
+ am_indexed16, am_unscaled16, uimm12s2, hsub>;
+defm : LoadInsertZeroPatterns<load, v8bf16, v4bf16, bf16, LDRHui, LDURHi,
+ am_indexed16, am_unscaled16, uimm12s2, hsub>;
+defm : LoadInsertZeroPatterns<load, v4f32, v2f32, f32, LDRSui, LDURSi,
+ am_indexed32, am_unscaled32, uimm12s4, ssub>;
+defm : LoadInsertZeroPatterns<load, v2f64, v1f64, f64, LDRDui, LDURDi,
+ am_indexed64, am_unscaled64, uimm12s8, dsub>;
+
// Pre-fetch.
defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",
[(AArch64Prefetch timm:$Rt,
diff --git a/llvm/test/CodeGen/AArch64/load-insert-zero.ll b/llvm/test/CodeGen/AArch64/load-insert-zero.ll
index 4d28bfe2f1fb..79eaf2c5b07f 100644
--- a/llvm/test/CodeGen/AArch64/load-insert-zero.ll
+++ b/llvm/test/CodeGen/AArch64/load-insert-zero.ll
@@ -4,9 +4,7 @@
define <8 x i8> @loadv8i8(ptr %p) {
; CHECK-LABEL: loadv8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: ld1 { v0.b }[0], [x0]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ldr b0, [x0]
; CHECK-NEXT: ret
%l = load i8, ptr %p
%v = insertelement <8 x i8> zeroinitializer, i8 %l, i32 0
@@ -16,8 +14,7 @@ define <8 x i8> @loadv8i8(ptr %p) {
define <16 x i8> @loadv16i8(ptr %p) {
; CHECK-LABEL: loadv16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: ld1 { v0.b }[0], [x0]
+; CHECK-NEXT: ldr b0, [x0]
; CHECK-NEXT: ret
%l = load i8, ptr %p
%v = insertelement <16 x i8> zeroinitializer, i8 %l, i32 0
@@ -27,9 +24,7 @@ define <16 x i8> @loadv16i8(ptr %p) {
define <4 x i16> @loadv4i16(ptr %p) {
; CHECK-LABEL: loadv4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: ret
%l = load i16, ptr %p
%v = insertelement <4 x i16> zeroinitializer, i16 %l, i32 0
@@ -39,8 +34,7 @@ define <4 x i16> @loadv4i16(ptr %p) {
define <8 x i16> @loadv8i16(ptr %p) {
; CHECK-LABEL: loadv8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: ld1 { v0.h }[0], [x0]
+; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: ret
%l = load i16, ptr %p
%v = insertelement <8 x i16> zeroinitializer, i16 %l, i32 0
@@ -50,9 +44,7 @@ define <8 x i16> @loadv8i16(ptr %p) {
define <2 x i32> @loadv2i32(ptr %p) {
; CHECK-LABEL: loadv2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: ld1 { v0.s }[0], [x0]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ret
%l = load i32, ptr %p
%v = insertelement <2 x i32> zeroinitializer, i32 %l, i32 0
@@ -62,8 +54,7 @@ define <2 x i32> @loadv2i32(ptr %p) {
define <4 x i32> @loadv4i32(ptr %p) {
; CHECK-LABEL: loadv4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: ld1 { v0.s }[0], [x0]
+; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ret
%l = load i32, ptr %p
%v = insertelement <4 x i32> zeroinitializer, i32 %l, i32 0
@@ -73,8 +64,7 @@ define <4 x i32> @loadv4i32(ptr %p) {
define <2 x i64> @loadv2i64(ptr %p) {
; CHECK-LABEL: loadv2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: ld1 { v0.d }[0], [x0]
+; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ret
%l = load i64, ptr %p
%v = insertelement <2 x i64> zeroinitializer, i64 %l, i32 0
@@ -85,9 +75,7 @@ define <2 x i64> @loadv2i64(ptr %p) {
define <4 x half> @loadv4f16(ptr %p) {
; CHECK-LABEL: loadv4f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi d0, #0000000000000000
-; CHECK-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: ret
%l = load half, ptr %p
%v = insertelement <4 x half> zeroinitializer, half %l, i32 0
@@ -97,8 +85,7 @@ define <4 x half> @loadv4f16(ptr %p) {
define <8 x half> @loadv8f16(ptr %p) {
; CHECK-LABEL: loadv8f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: ld1 { v0.h }[0], [x0]
+; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: ret
%l = load half, ptr %p
%v = insertelement <8 x half> zeroinitializer, half %l, i32 0
@@ -108,9 +95,7 @@ define <8 x half> @loadv8f16(ptr %p) {
define <4 x bfloat> @loadv4bf16(ptr %p) {
; CHECK-LABEL: loadv4bf16:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi d0, #0000000000000000
-; CHECK-NEXT: ld1 { v0.h }[0], [x0]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: ret
%l = load bfloat, ptr %p
%v = insertelement <4 x bfloat> zeroinitializer, bfloat %l, i32 0
@@ -120,8 +105,7 @@ define <4 x bfloat> @loadv4bf16(ptr %p) {
define <8 x bfloat> @loadv8bf16(ptr %p) {
; CHECK-LABEL: loadv8bf16:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: ld1 { v0.h }[0], [x0]
+; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: ret
%l = load bfloat, ptr %p
%v = insertelement <8 x bfloat> zeroinitializer, bfloat %l, i32 0
@@ -131,9 +115,7 @@ define <8 x bfloat> @loadv8bf16(ptr %p) {
define <2 x float> @loadv2f32(ptr %p) {
; CHECK-LABEL: loadv2f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi d0, #0000000000000000
-; CHECK-NEXT: ld1 { v0.s }[0], [x0]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ret
%l = load float, ptr %p
%v = insertelement <2 x float> zeroinitializer, float %l, i32 0
@@ -143,8 +125,7 @@ define <2 x float> @loadv2f32(ptr %p) {
define <4 x float> @loadv4f32(ptr %p) {
; CHECK-LABEL: loadv4f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: ld1 { v0.s }[0], [x0]
+; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ret
%l = load float, ptr %p
%v = insertelement <4 x float> zeroinitializer, float %l, i32 0
@@ -154,8 +135,7 @@ define <4 x float> @loadv4f32(ptr %p) {
define <2 x double> @loadv2f64(ptr %p) {
; CHECK-LABEL: loadv2f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: ld1 { v0.d }[0], [x0]
+; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ret
%l = load double, ptr %p
%v = insertelement <2 x double> zeroinitializer, double %l, i32 0
@@ -168,10 +148,7 @@ define <2 x double> @loadv2f64(ptr %p) {
define <8 x i8> @loadv8i8_offset(ptr %p) {
; CHECK-LABEL: loadv8i8_offset:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: ld1 { v0.b }[0], [x8]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ldr b0, [x0, #1]
; CHECK-NEXT: ret
%g = getelementptr inbounds i8, ptr %p, i64 1
%l = load i8, ptr %g
@@ -182,9 +159,7 @@ define <8 x i8> @loadv8i8_offset(ptr %p) {
define <16 x i8> @loadv16i8_offset(ptr %p) {
; CHECK-LABEL: loadv16i8_offset:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: ld1 { v0.b }[0], [x8]
+; CHECK-NEXT: ldr b0, [x0, #1]
; CHECK-NEXT: ret
%g = getelementptr inbounds i8, ptr %p, i64 1
%l = load i8, ptr %g
@@ -195,10 +170,7 @@ define <16 x i8> @loadv16i8_offset(ptr %p) {
define <4 x i16> @loadv4i16_offset(ptr %p) {
; CHECK-LABEL: loadv4i16_offset:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: ld1 { v0.h }[0], [x8]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ldur h0, [x0, #1]
; CHECK-NEXT: ret
%g = getelementptr inbounds i8, ptr %p, i64 1
%l = load i16, ptr %g
@@ -209,9 +181,7 @@ define <4 x i16> @loadv4i16_offset(ptr %p) {
define <8 x i16> @loadv8i16_offset(ptr %p) {
; CHECK-LABEL: loadv8i16_offset:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: ld1 { v0.h }[0], [x8]
+; CHECK-NEXT: ldur h0, [x0, #1]
; CHECK-NEXT: ret
%g = getelementptr inbounds i8, ptr %p, i64 1
%l = load i16, ptr %g
@@ -222,10 +192,7 @@ define <8 x i16> @loadv8i16_offset(ptr %p) {
define <2 x i32> @loadv2i32_offset(ptr %p) {
; CHECK-LABEL: loadv2i32_offset:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: ld1 { v0.s }[0], [x8]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ldur s0, [x0, #1]
; CHECK-NEXT: ret
%g = getelementptr inbounds i8, ptr %p, i64 1
%l = load i32, ptr %g
@@ -236,9 +203,7 @@ define <2 x i32> @loadv2i32_offset(ptr %p) {
define <4 x i32> @loadv4i32_offset(ptr %p) {
; CHECK-LABEL: loadv4i32_offset:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: ld1 { v0.s }[0], [x8]
+; CHECK-NEXT: ldur s0, [x0, #1]
; CHECK-NEXT: ret
%g = getelementptr inbounds i8, ptr %p, i64 1
%l = load i32, ptr %g
@@ -249,9 +214,7 @@ define <4 x i32> @loadv4i32_offset(ptr %p) {
define <2 x i64> @loadv2i64_offset(ptr %p) {
; CHECK-LABEL: loadv2i64_offset:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: ld1 { v0.d }[0], [x8]
+; CHECK-NEXT: ldur d0, [x0, #1]
; CHECK-NEXT: ret
%g = getelementptr inbounds i8, ptr %p, i64 1
%l = load i64, ptr %g
@@ -263,10 +226,7 @@ define <2 x i64> @loadv2i64_offset(ptr %p) {
define <4 x half> @loadv4f16_offset(ptr %p) {
; CHECK-LABEL: loadv4f16_offset:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi d0, #0000000000000000
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: ld1 { v0.h }[0], [x8]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ldur h0, [x0, #1]
; CHECK-NEXT: ret
%g = getelementptr inbounds i8, ptr %p, i64 1
%l = load half, ptr %g
@@ -277,9 +237,7 @@ define <4 x half> @loadv4f16_offset(ptr %p) {
define <8 x half> @loadv8f16_offset(ptr %p) {
; CHECK-LABEL: loadv8f16_offset:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: ld1 { v0.h }[0], [x8]
+; CHECK-NEXT: ldur h0, [x0, #1]
; CHECK-NEXT: ret
%g = getelementptr inbounds i8, ptr %p, i64 1
%l = load half, ptr %g
@@ -290,10 +248,7 @@ define <8 x half> @loadv8f16_offset(ptr %p) {
define <4 x bfloat> @loadv4bf16_offset(ptr %p) {
; CHECK-LABEL: loadv4bf16_offset:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi d0, #0000000000000000
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: ld1 { v0.h }[0], [x8]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ldur h0, [x0, #1]
; CHECK-NEXT: ret
%g = getelementptr inbounds i8, ptr %p, i64 1
%l = load bfloat, ptr %g
@@ -304,9 +259,7 @@ define <4 x bfloat> @loadv4bf16_offset(ptr %p) {
define <8 x bfloat> @loadv8bf16_offset(ptr %p) {
; CHECK-LABEL: loadv8bf16_offset:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: ld1 { v0.h }[0], [x8]
+; CHECK-NEXT: ldur h0, [x0, #1]
; CHECK-NEXT: ret
%g = getelementptr inbounds i8, ptr %p, i64 1
%l = load bfloat, ptr %g
@@ -317,10 +270,7 @@ define <8 x bfloat> @loadv8bf16_offset(ptr %p) {
define <2 x float> @loadv2f32_offset(ptr %p) {
; CHECK-LABEL: loadv2f32_offset:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi d0, #0000000000000000
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: ld1 { v0.s }[0], [x8]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ldur s0, [x0, #1]
; CHECK-NEXT: ret
%g = getelementptr inbounds i8, ptr %p, i64 1
%l = load float, ptr %g
@@ -331,9 +281,7 @@ define <2 x float> @loadv2f32_offset(ptr %p) {
define <4 x float> @loadv4f32_offset(ptr %p) {
; CHECK-LABEL: loadv4f32_offset:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: ld1 { v0.s }[0], [x8]
+; CHECK-NEXT: ldur s0, [x0, #1]
; CHECK-NEXT: ret
%g = getelementptr inbounds i8, ptr %p, i64 1
%l = load float, ptr %g
@@ -344,9 +292,7 @@ define <4 x float> @loadv4f32_offset(ptr %p) {
define <2 x double> @loadv2f64_offset(ptr %p) {
; CHECK-LABEL: loadv2f64_offset:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: ld1 { v0.d }[0], [x8]
+; CHECK-NEXT: ldur d0, [x0, #1]
; CHECK-NEXT: ret
%g = getelementptr inbounds i8, ptr %p, i64 1
%l = load double, ptr %g
@@ -360,27 +306,24 @@ define void @predictor_4x4_neon(ptr nocapture noundef writeonly %0, i64 noundef
; CHECK: // %bb.0:
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: ldur w9, [x2, #2]
-; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: add x8, x2, #1
-; CHECK-NEXT: movi v2.2d, #0000000000000000
-; CHECK-NEXT: ld1 { v0.s }[0], [x2]
-; CHECK-NEXT: mov v1.s[0], w9
-; CHECK-NEXT: ld1 { v2.s }[0], [x8]
-; CHECK-NEXT: lsr w8, w9, #24
-; CHECK-NEXT: uaddl v1.8h, v1.8b, v0.8b
-; CHECK-NEXT: dup v4.8b, w8
-; CHECK-NEXT: ushll v3.8h, v2.8b, #1
+; CHECK-NEXT: ldr s1, [x2]
; CHECK-NEXT: lsl x8, x1, #1
-; CHECK-NEXT: urhadd v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: ldur s2, [x2, #1]
+; CHECK-NEXT: mov v0.s[0], w9
+; CHECK-NEXT: lsr w9, w9, #24
+; CHECK-NEXT: ushll v3.8h, v2.8b, #1
+; CHECK-NEXT: dup v4.8b, w9
; CHECK-NEXT: add x9, x8, x1
-; CHECK-NEXT: add v1.8h, v1.8h, v3.8h
-; CHECK-NEXT: zip1 v2.2s, v0.2s, v4.2s
-; CHECK-NEXT: str s0, [x0]
-; CHECK-NEXT: rshrn v1.8b, v1.8h, #2
-; CHECK-NEXT: str s1, [x0, x1]
-; CHECK-NEXT: zip1 v3.2s, v1.2s, v4.2s
-; CHECK-NEXT: ext v0.8b, v2.8b, v0.8b, #1
-; CHECK-NEXT: str s0, [x0, x8]
+; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: urhadd v1.8b, v1.8b, v2.8b
+; CHECK-NEXT: str s1, [x0]
+; CHECK-NEXT: add v0.8h, v0.8h, v3.8h
+; CHECK-NEXT: zip1 v2.2s, v1.2s, v4.2s
+; CHECK-NEXT: rshrn v0.8b, v0.8h, #2
+; CHECK-NEXT: str s0, [x0, x1]
+; CHECK-NEXT: zip1 v3.2s, v0.2s, v4.2s
+; CHECK-NEXT: ext v1.8b, v2.8b, v0.8b, #1
+; CHECK-NEXT: str s1, [x0, x8]
; CHECK-NEXT: ext v2.8b, v3.8b, v0.8b, #1
; CHECK-NEXT: str s2, [x0, x9]
; CHECK-NEXT: ret
@@ -437,32 +380,25 @@ define void @predictor_4x4_neon(ptr nocapture noundef writeonly %0, i64 noundef
define void @predictor_4x4_neon_new(ptr nocapture noundef writeonly %0, i64 noundef %1, ptr nocapture noundef readonly %2, ptr nocapture noundef readnone %3) {
; CHECK-LABEL: predictor_4x4_neon_new:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: add x8, x2, #1
-; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: add x9, x2, #2
-; CHECK-NEXT: movi v2.2d, #0000000000000000
-; CHECK-NEXT: add x10, x2, #3
-; CHECK-NEXT: movi v3.2d, #0000000000000000
-; CHECK-NEXT: ld1 { v0.s }[0], [x2]
-; CHECK-NEXT: ld1 { v1.s }[0], [x8]
+; CHECK-NEXT: ldr s0, [x2]
; CHECK-NEXT: lsl x8, x1, #1
-; CHECK-NEXT: ld1 { v2.s }[0], [x9]
+; CHECK-NEXT: ldur s1, [x2, #1]
; CHECK-NEXT: add x9, x8, x1
-; CHECK-NEXT: ld1 { v3.s }[0], [x10]
+; CHECK-NEXT: ldur s2, [x2, #2]
+; CHECK-NEXT: ldur s3, [x2, #3]
; CHECK-NEXT: uaddl v4.8h, v1.8b, v0.8b
; CHECK-NEXT: urhadd v0.8b, v0.8b, v1.8b
; CHECK-NEXT: uaddl v5.8h, v2.8b, v1.8b
; CHECK-NEXT: uaddl v3.8h, v3.8b, v2.8b
; CHECK-NEXT: str s0, [x0]
-; CHECK-NEXT: urhadd v1.8b, v1.8b, v2.8b
; CHECK-NEXT: add v4.8h, v4.8h, v5.8h
; CHECK-NEXT: add v3.8h, v3.8h, v5.8h
-; CHECK-NEXT: rshrn v4.8b, v4.8h, #2
-; CHECK-NEXT: rshrn v0.8b, v3.8h, #2
-; CHECK-NEXT: str s4, [x0, x1]
-; CHECK-NEXT: str s1, [x0, x8]
-; CHECK-NEXT: str s0, [x0, x9]
+; CHECK-NEXT: rshrn v0.8b, v4.8h, #2
+; CHECK-NEXT: str s0, [x0, x1]
+; CHECK-NEXT: urhadd v0.8b, v1.8b, v2.8b
+; CHECK-NEXT: rshrn v1.8b, v3.8h, #2
+; CHECK-NEXT: str s0, [x0, x8]
+; CHECK-NEXT: str s1, [x0, x9]
; CHECK-NEXT: ret
%5 = load i32, ptr %2, align 4
%6 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %5, i64 0
diff --git a/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll b/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll
index e56b9b4820da..4bae08c0890e 100644
--- a/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll
+++ b/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll
@@ -125,11 +125,10 @@ entry:
; CHECK-LABEL: f_usedefvectorload
; CHECK: cmp sp, #0
; CHECK-NEXT: csetm x16, ne
-; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: and x1, x1, x16
; CHECK-NEXT: csdb
; CHECK-NEXT: mov [[TMPREG:x[0-9]+]], sp
-; CHECK-NEXT: ld1 { v0.d }[0], [x1]
+; CHECK-NEXT: ldr d0, [x1]
; CHECK-NEXT: and [[TMPREG]], [[TMPREG]], x16
; CHECK-NEXT: mov sp, [[TMPREG]]
; CHECK-NEXT: ret
More information about the llvm-commits
mailing list