[llvm-branch-commits] [llvm] ca372df - [AArch64] Fix arm neon vstx lane memVT size
Tobias Hieta via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sun Sep 3 23:30:35 PDT 2023
Author: hstk30
Date: 2023-09-04T08:26:32+02:00
New Revision: ca372df48a05534d42130e6ecb4f6b275e003a08
URL: https://github.com/llvm/llvm-project/commit/ca372df48a05534d42130e6ecb4f6b275e003a08
DIFF: https://github.com/llvm/llvm-project/commit/ca372df48a05534d42130e6ecb4f6b275e003a08.diff
LOG: [AArch64] Fix arm neon vstx lane memVT size
The memVT for the stN lane intrinsics was set too large, which caused alias analysis to produce incorrect results.
Fixes https://github.com/llvm/llvm-project/issues/64696
Differential Revision: https://reviews.llvm.org/D158611
(cherry picked from commit db8f6c009e5a17d304be7404e50eb20b2dd0c75b)
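For context, the miscompile is visible from C source along the lines of the
reduced test added below. This is a sketch reconstructed from the linked issue
and the new arm64-neon-st-lane-aa.ll test, not the exact reproducer; the
function and variable names are illustrative:

  #include <arm_neon.h>
  #include <stdint.h>

  // vst2_lane_u8 stores exactly two bytes: lane 6 of each input vector.
  // Before this fix, the intrinsic's memVT claimed a much larger store, and
  // BasicAA could let one of the loads below move above the store, reading
  // stale stack memory.
  int test_vst2_lane_u8(uint8x8x2_t vectors) {
    uint8_t temp[2];
    vst2_lane_u8(temp, vectors, 6);
    return temp[0] != vget_lane_u8(vectors.val[0], 6) ||
           temp[1] != vget_lane_u8(vectors.val[1], 6);
  }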
Added:
llvm/test/CodeGen/AArch64/arm64-neon-st-lane-aa.ll
llvm/test/CodeGen/AArch64/multi-vector-load-size.ll
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/multi-vector-store-size.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0605dfa6379399c..c7a6dd7deb45b3a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13840,7 +13840,17 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::aarch64_neon_ld4:
case Intrinsic::aarch64_neon_ld1x2:
case Intrinsic::aarch64_neon_ld1x3:
- case Intrinsic::aarch64_neon_ld1x4:
+ case Intrinsic::aarch64_neon_ld1x4: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
+ Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+ Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
+ Info.offset = 0;
+ Info.align.reset();
+ // volatile loads with NEON intrinsics not supported
+ Info.flags = MachineMemOperand::MOLoad;
+ return true;
+ }
case Intrinsic::aarch64_neon_ld2lane:
case Intrinsic::aarch64_neon_ld3lane:
case Intrinsic::aarch64_neon_ld4lane:
@@ -13848,9 +13858,13 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::aarch64_neon_ld3r:
case Intrinsic::aarch64_neon_ld4r: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
- // Conservatively set memVT to the entire set of vectors loaded.
- uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
- Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+ // The ldN intrinsics return a struct whose vector elements all have the same type.
+ Type *RetTy = I.getType();
+ auto *StructTy = cast<StructType>(RetTy);
+ unsigned NumElts = StructTy->getNumElements();
+ Type *VecTy = StructTy->getElementType(0);
+ MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
+ Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
Info.offset = 0;
Info.align.reset();
@@ -13863,20 +13877,40 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::aarch64_neon_st4:
case Intrinsic::aarch64_neon_st1x2:
case Intrinsic::aarch64_neon_st1x3:
- case Intrinsic::aarch64_neon_st1x4:
+ case Intrinsic::aarch64_neon_st1x4: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ unsigned NumElts = 0;
+ for (const Value *Arg : I.args()) {
+ Type *ArgTy = Arg->getType();
+ if (!ArgTy->isVectorTy())
+ break;
+ NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
+ }
+ Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+ Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
+ Info.offset = 0;
+ Info.align.reset();
+ // volatile stores with NEON intrinsics not supported
+ Info.flags = MachineMemOperand::MOStore;
+ return true;
+ }
case Intrinsic::aarch64_neon_st2lane:
case Intrinsic::aarch64_neon_st3lane:
case Intrinsic::aarch64_neon_st4lane: {
Info.opc = ISD::INTRINSIC_VOID;
- // Conservatively set memVT to the entire set of vectors stored.
unsigned NumElts = 0;
+ // All of the vector arguments have the same type.
+ Type *VecTy = I.getArgOperand(0)->getType();
+ MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
+
for (const Value *Arg : I.args()) {
Type *ArgTy = Arg->getType();
if (!ArgTy->isVectorTy())
break;
- NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
+ NumElts += 1;
}
- Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+
+ Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
Info.offset = 0;
Info.align.reset();
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-st-lane-aa.ll b/llvm/test/CodeGen/AArch64/arm64-neon-st-lane-aa.ll
new file mode 100644
index 000000000000000..7642597c91f2bd6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-st-lane-aa.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -mtriple=arm64-none-linux-gnu -mattr=+neon -O2 | FileCheck %s
+
+; The st2 must be scheduled before the two ldrb instructions.
+; With the previous, oversized memVT for st2lane, BasicAA went wrong and
+; one of the ldrb loads could be hoisted above the st2.
+
+define dso_local i32 @test_vst2_lane_u8([2 x <8 x i8>] %vectors.coerce) local_unnamed_addr {
+; CHECK-LABEL: test_vst2_lane_u8:
+; CHECK: st2 { v[[V1:[0-9]+]].b, v[[V2:[0-9]+]].b }[6], [x8]
+; CHECK-NEXT: umov w[[W1:[0-9]+]], v[[V12:[0-9]+]].b[6]
+; CHECK-NEXT: ldrb w[[W2:[0-9]+]], [sp, #12]
+; CHECK-NEXT: ldrb w[[W2:[0-9]+]], [sp, #13]
+entry:
+ %temp = alloca [2 x i8], align 4
+ %vectors.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %vectors.coerce, 0
+ %vectors.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %vectors.coerce, 1
+ call void @llvm.lifetime.start.p0(i64 2, ptr nonnull %temp) #4
+ call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> %vectors.coerce.fca.0.extract, <8 x i8> %vectors.coerce.fca.1.extract, i64 6, ptr nonnull %temp)
+ %0 = load i8, ptr %temp, align 4
+ %vget_lane = extractelement <8 x i8> %vectors.coerce.fca.0.extract, i64 6
+ %cmp8.not = icmp ne i8 %0, %vget_lane
+ %arrayidx3.1 = getelementptr inbounds [2 x i8], ptr %temp, i64 0, i64 1
+ %1 = load i8, ptr %arrayidx3.1, align 1
+ %vget_lane.1 = extractelement <8 x i8> %vectors.coerce.fca.1.extract, i64 6
+ %cmp8.not.1 = icmp ne i8 %1, %vget_lane.1
+ %or.cond = select i1 %cmp8.not, i1 true, i1 %cmp8.not.1
+ %cmp.lcssa = zext i1 %or.cond to i32
+ call void @llvm.lifetime.end.p0(i64 2, ptr nonnull %temp) #4
+ ret i32 %cmp.lcssa
+}
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #2
+declare void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8>, <8 x i8>, i64, ptr nocapture) #2
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #2
diff --git a/llvm/test/CodeGen/AArch64/multi-vector-load-size.ll b/llvm/test/CodeGen/AArch64/multi-vector-load-size.ll
new file mode 100644
index 000000000000000..ecb953366a88ebe
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/multi-vector-load-size.ll
@@ -0,0 +1,106 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -stop-after=instruction-select < %s | FileCheck %s
+
+%struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }
+%struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }
+%struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }
+
+declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2.v2f32.p0f32(float*)
+declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3.v2f32.p0f32(float*)
+declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4.v2f32.p0f32(float*)
+
+declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float*)
+declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float*)
+declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float*)
+
+declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2r.v2f32.p0f32(float*)
+declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3r.v2f32.p0f32(float*)
+declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4r.v2f32.p0f32(float*)
+
+declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2lane.v2f32.p0f32(<2 x float>, <2 x float>, i64, float*)
+declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, i64, float*)
+declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, i64, float*)
+
+
+define %struct.__neon_float32x2x2_t @test_ld2(float* %addr) {
+ ; CHECK-LABEL: name: test_ld2
+ ; CHECK: LD2Twov2s {{.*}} :: (load (s128) {{.*}})
+ %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2.v2f32.p0f32(float* %addr)
+ ret %struct.__neon_float32x2x2_t %val
+}
+
+define %struct.__neon_float32x2x3_t @test_ld3(float* %addr) {
+ ; CHECK-LABEL: name: test_ld3
+ ; CHECK: LD3Threev2s {{.*}} :: (load (s192) {{.*}})
+ %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3.v2f32.p0f32(float* %addr)
+ ret %struct.__neon_float32x2x3_t %val
+}
+
+define %struct.__neon_float32x2x4_t @test_ld4(float* %addr) {
+ ; CHECK-LABEL: name: test_ld4
+ ; CHECK: LD4Fourv2s {{.*}} :: (load (s256) {{.*}})
+ %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4.v2f32.p0f32(float* %addr)
+ ret %struct.__neon_float32x2x4_t %val
+}
+
+define %struct.__neon_float32x2x2_t @test_ld1x2(float* %addr) {
+ ; CHECK-LABEL: name: test_ld1x2
+ ; CHECK: LD1Twov2s {{.*}} :: (load (s128) {{.*}})
+ %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float* %addr)
+ ret %struct.__neon_float32x2x2_t %val
+}
+
+define %struct.__neon_float32x2x3_t @test_ld1x3(float* %addr) {
+ ; CHECK-LABEL: name: test_ld1x3
+ ; CHECK: LD1Threev2s {{.*}} :: (load (s192) {{.*}})
+ %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float* %addr)
+ ret %struct.__neon_float32x2x3_t %val
+}
+
+define %struct.__neon_float32x2x4_t @test_ld1x4(float* %addr) {
+ ; CHECK-LABEL: name: test_ld1x4
+ ; CHECK: LD1Fourv2s {{.*}} :: (load (s256) {{.*}})
+ %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float* %addr)
+ ret %struct.__neon_float32x2x4_t %val
+}
+
+define %struct.__neon_float32x2x2_t @test_ld2r(float* %addr) {
+ ; CHECK-LABEL: name: test_ld2r
+ ; CHECK: LD2Rv2s {{.*}} :: (load (s64) {{.*}})
+ %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2r.v2f32.p0f32(float* %addr)
+ ret %struct.__neon_float32x2x2_t %val
+}
+
+define %struct.__neon_float32x2x3_t @test_ld3r(float* %addr) {
+ ; CHECK-LABEL: name: test_ld3r
+ ; CHECK: LD3Rv2s {{.*}} :: (load (s96) {{.*}})
+ %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3r.v2f32.p0f32(float* %addr)
+ ret %struct.__neon_float32x2x3_t %val
+}
+
+define %struct.__neon_float32x2x4_t @test_ld4r(float* %addr) {
+ ; CHECK-LABEL: name: test_ld4r
+ ; CHECK: LD4Rv2s {{.*}} :: (load (s128) {{.*}})
+ %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4r.v2f32.p0f32(float* %addr)
+ ret %struct.__neon_float32x2x4_t %val
+}
+
+define %struct.__neon_float32x2x2_t @test_ld2lane(<2 x float> %a, <2 x float> %b, float* %addr) {
+ ; CHECK-LABEL: name: test_ld2lane
+ ; CHECK: {{.*}} LD2i32 {{.*}}
+ %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2lane.v2f32.p0f32(<2 x float> %a, <2 x float> %b, i64 1, float* %addr)
+ ret %struct.__neon_float32x2x2_t %val
+}
+
+define %struct.__neon_float32x2x3_t @test_ld3lane(<2 x float> %a, <2 x float> %b, <2 x float> %c, float* %addr) {
+ ; CHECK-LABEL: name: test_ld3lane
+ ; CHECK: {{.*}} LD3i32 {{.*}}
+ %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3lane.v2f32.p0f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, i64 1, float* %addr)
+ ret %struct.__neon_float32x2x3_t %val
+}
+
+define %struct.__neon_float32x2x4_t @test_ld4lane(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d, float* %addr) {
+ ; CHECK-LABEL: name: test_ld4lane
+ ; CHECK: {{.*}} LD4i32 {{.*}}
+ %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4lane.v2f32.p0f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d, i64 1, float* %addr)
+ ret %struct.__neon_float32x2x4_t %val
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AArch64/multi-vector-store-size.ll b/llvm/test/CodeGen/AArch64/multi-vector-store-size.ll
index 5763ec61667f2a4..3710db9c47ff633 100644
--- a/llvm/test/CodeGen/AArch64/multi-vector-store-size.ll
+++ b/llvm/test/CodeGen/AArch64/multi-vector-store-size.ll
@@ -23,8 +23,6 @@ define void @addstx(ptr %res, ptr %a, ptr %b, ptr %c, ptr %d) {
%cr = fadd <4 x float> %cl, %dl
%dr = fadd <4 x float> %dl, %al
-; The sizes below are conservative. AArch64TargetLowering
-; conservatively assumes the entire vector is stored.
tail call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> %ar, <4 x float> %br, ptr %res)
; CHECK: ST2Twov4s {{.*}} :: (store (s256) {{.*}})
tail call void @llvm.aarch64.neon.st3.v4f32.p0(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, ptr %res)
@@ -46,8 +44,6 @@ define void @addst1x(ptr %res, ptr %a, ptr %b, ptr %c, ptr %d) {
%cr = fadd <4 x float> %cl, %dl
%dr = fadd <4 x float> %dl, %al
-; The sizes below are conservative. AArch64TargetLowering
-; conservatively assumes the entire vector is stored.
tail call void @llvm.aarch64.neon.st1x2.v4f32.p0(<4 x float> %ar, <4 x float> %br, ptr %res)
; CHECK: ST1Twov4s {{.*}} :: (store (s256) {{.*}})
tail call void @llvm.aarch64.neon.st1x3.v4f32.p0(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, ptr %res)
@@ -69,14 +65,12 @@ define void @addstxlane(ptr %res, ptr %a, ptr %b, ptr %c, ptr %d) {
%cr = fadd <4 x float> %cl, %dl
%dr = fadd <4 x float> %dl, %al
-; The sizes below are conservative. AArch64TargetLowering
-; conservatively assumes the entire vector is stored.
tail call void @llvm.aarch64.neon.st2lane.v4f32.p0(<4 x float> %ar, <4 x float> %br, i64 1, ptr %res)
-; CHECK: ST2i32 {{.*}} :: (store (s256) {{.*}})
+; CHECK: ST2i32 {{.*}} :: (store (s64) {{.*}})
tail call void @llvm.aarch64.neon.st3lane.v4f32.p0(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, i64 1, ptr %res)
-; CHECK: ST3i32 {{.*}} :: (store (s384) {{.*}})
+; CHECK: ST3i32 {{.*}} :: (store (s96) {{.*}})
tail call void @llvm.aarch64.neon.st4lane.v4f32.p0(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, <4 x float> %dr, i64 1, ptr %res)
-; CHECK: ST4i32 {{.*}} :: (store (s512) {{.*}})
+; CHECK: ST4i32 {{.*}} :: (store (s128) {{.*}})
ret void
}
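Net effect: for the lane and replicate forms, memVT now covers one lane (or one
replicated element) per vector instead of the full vectors, as the updated
(store (sNN)) and (load (sNN)) checks above show. Below is a minimal standalone
sketch of the size arithmetic; it is a simplified model for illustration only,
and the helper names are made up rather than LLVM APIs:

  #include <cassert>
  #include <cstdint>

  // Simplified model of the access size (in bits) that getTgtMemIntrinsic
  // now reports for the NEON multi-vector intrinsics.
  uint64_t stN_bits(unsigned NumVectors, uint64_t VectorBits) {
    return NumVectors * VectorBits;   // stN/st1xN: full vectors are stored.
  }
  uint64_t stN_lane_bits(unsigned NumVectors, uint64_t ElementBits) {
    return NumVectors * ElementBits;  // stN-lane: one lane per vector.
  }
  uint64_t ldNr_bits(unsigned NumVectors, uint64_t ElementBits) {
    return NumVectors * ElementBits;  // ldNr: one replicated element per vector.
  }

  int main() {
    // st2lane.v4f32: 2 x f32 lanes = 64 bits  -> (store (s64)).
    assert(stN_lane_bits(2, 32) == 64);
    // st3lane.v4f32: 3 x f32 lanes = 96 bits  -> (store (s96)).
    assert(stN_lane_bits(3, 32) == 96);
    // st4lane.v4f32: 4 x f32 lanes = 128 bits -> (store (s128)).
    assert(stN_lane_bits(4, 32) == 128);
    // st2.v4f32 is unchanged: two full vectors = 256 bits -> (store (s256)).
    assert(stN_bits(2, 128) == 256);
    // ld3r.v2f32: 3 x f32 elements = 96 bits  -> (load (s96)).
    assert(ldNr_bits(3, 32) == 96);
    return 0;
  }

Running this on any host checks exactly the sizes the MIR tests above expect.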