[llvm] r286168 - [AArch64] Transfer memory operands when lowering vector load/store intrinsics
Sanjin Sijaric via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 7 14:39:02 PST 2016
Author: ssijaric
Date: Mon Nov 7 16:39:02 2016
New Revision: 286168
URL: http://llvm.org/viewvc/llvm-project?rev=286168&view=rev
Log:
[AArch64] Transfer memory operands when lowering vector load/store intrinsics
Summary:
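Some vector loads and stores generated from AArch64 intrinsics are treated as
aliasing each other unnecessarily, because their memory operands are not carried
over to the selected machine nodes; this prevents better scheduling. The fix is
simply to transfer the memory operands during lowering.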
Reviewers: mcrosier, t.p.northover, jmolloy
Subscribers: aemerson, rengolin, llvm-commits
Differential Revision: https://reviews.llvm.org/D26313
Added:
llvm/trunk/test/CodeGen/AArch64/sched-past-vector-ldst.ll
Modified:
llvm/trunk/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
llvm/trunk/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
Modified: llvm/trunk/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp?rev=286168&r1=286167&r2=286168&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp Mon Nov 7 16:39:02 2016
@@ -1154,6 +1154,12 @@ void AArch64DAGToDAGISel::SelectLoad(SDN
CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
+
+ // Transfer memoperands.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1);
+
CurDAG->RemoveDeadNode(N);
}
@@ -1202,6 +1208,11 @@ void AArch64DAGToDAGISel::SelectStore(SD
SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), N->getOperand(0)};
SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
+ // Transfer memoperands.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+
ReplaceNode(N, St);
}
Modified: llvm/trunk/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/arm64-misched-basic-A53.ll?rev=286168&r1=286167&r2=286168&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-misched-basic-A53.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/arm64-misched-basic-A53.ll Mon Nov 7 16:39:02 2016
@@ -1,6 +1,6 @@
; REQUIRES: asserts
-; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
-; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - -misched-limit=2 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -disable-machine-dce -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -disable-machine-dce -o - -misched-limit=2 2>&1 > /dev/null | FileCheck %s
;
; The Cortex-A53 machine model will cause the MADD instruction to be scheduled
; much higher than the ADD instructions in order to hide latency. When not
Added: llvm/trunk/test/CodeGen/AArch64/sched-past-vector-ldst.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/sched-past-vector-ldst.ll?rev=286168&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/sched-past-vector-ldst.ll (added)
+++ llvm/trunk/test/CodeGen/AArch64/sched-past-vector-ldst.ll Mon Nov 7 16:39:02 2016
@@ -0,0 +1,60 @@
+; RUN: llc < %s -mcpu=cortex-a53 -enable-post-misched=false -enable-aa-sched-mi | FileCheck %s
+
+; Check that the vector store intrinsic does not prevent fmla instructions from
+; being scheduled together. Since the vector loads and stores generated from
+; the intrinsics do not alias each other, the store can be pushed past the load.
+; This allows fmla instructions to be scheduled together.
+
+
+; CHECK: fmla
+; CHECK-NEXT: fmla
+; CHECK-NEXT: fmla
+; CHECK-NEXT: fmla
+target datalayout = "e-m:e-i64:64-i128:128-n8:16:32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+%Struct = type { i64*, [9 x double], [16 x {float, float}], [16 x {float, float}], i32, i32 }
+
+; Function Attrs: nounwind
+define linkonce_odr void @func(%Struct* nocapture %this) unnamed_addr #0 align 2 {
+entry:
+ %0 = insertelement <4 x float> undef, float undef, i32 0
+ %1 = insertelement <4 x float> %0, float undef, i32 1
+ %2 = insertelement <4 x float> %1, float undef, i32 2
+ %3 = insertelement <4 x float> %2, float undef, i32 3
+ %scevgep = getelementptr %Struct, %Struct* %this, i64 0, i32 2, i64 8, i32 0
+ %struct_ptr = bitcast float* %scevgep to i8*
+ %vec1 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0i8(i8* %struct_ptr)
+ %ev1 = extractvalue { <4 x float>, <4 x float> } %vec1, 1
+ %fm1 = fmul <4 x float> %0, %ev1
+ %av1 = fadd <4 x float> %1, %fm1
+ %ev2 = extractvalue { <4 x float>, <4 x float> } %vec1, 0
+ %fm2 = fmul <4 x float> %2, %ev2
+ %av2 = fadd <4 x float> %3, %fm2
+ %scevgep2 = getelementptr %Struct, %Struct* %this, i64 0, i32 3, i64 8, i32 0
+ %struct_ptr2 = bitcast float* %scevgep2 to i8*
+ tail call void @llvm.aarch64.neon.st2.v4f32.p0i8(<4 x float> %av2, <4 x float> %av1, i8* %struct_ptr2)
+ %scevgep3 = getelementptr %Struct, %Struct* %this, i64 0, i32 2, i64 12, i32 0
+ %struct_ptr3 = bitcast float* %scevgep3 to i8*
+ %vec2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0i8(i8* %struct_ptr3)
+ %ev3 = extractvalue { <4 x float>, <4 x float> } %vec2, 1
+ %fm3 = fmul <4 x float> %0, %ev3
+ %av3 = fadd <4 x float> %1, %fm3
+ %ev4 = extractvalue { <4 x float>, <4 x float> } %vec2, 0
+ %fm4 = fmul <4 x float> %2, %ev4
+ %av4 = fadd <4 x float> %3, %fm4
+ %scevgep4 = getelementptr %Struct, %Struct* %this, i64 0, i32 3, i64 12, i32 0
+ %struct_ptr4 = bitcast float* %scevgep4 to i8*
+ tail call void @llvm.aarch64.neon.st2.v4f32.p0i8(<4 x float> %av4, <4 x float> %av3, i8* %struct_ptr4)
+ ret void
+}
+
+; Function Attrs: nounwind readonly
+declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0i8(i8*) #2
+
+; Function Attrs: nounwind
+declare void @llvm.aarch64.neon.st2.v4f32.p0i8(<4 x float>, <4 x float>, i8* nocapture) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind readonly }
More information about the llvm-commits mailing list