[llvm] r264988 - [PowerPC] Load two floats directly instead of using one 64-bit integer load
Hal Finkel via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 30 19:56:05 PDT 2016
Author: hfinkel
Date: Wed Mar 30 21:56:05 2016
New Revision: 264988
URL: http://llvm.org/viewvc/llvm-project?rev=264988&view=rev
Log:
[PowerPC] Load two floats directly instead of using one 64-bit integer load
When dealing with complex<float>, and similar structures with two
single-precision floating-point numbers, especially when such things are being
passed around by value, we'll sometimes end up loading both float values by
extracting them from one 64-bit integer load. It looks like this:
t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64
t16: i64 = srl t13, Constant:i32<32>
t17: i32 = truncate t16
t18: f32 = bitcast t17
t19: i32 = truncate t13
t20: f32 = bitcast t19
The problem, especially before the P8 where those bitcasts aren't legal (and
get expanded via the stack), is that it would have been better to use two
floating-point loads directly. Here we add a target-specific DAGCombine to do
just that. In short, we turn:
ld 3, 0(5)
stw 3, -8(1)
rldicl 3, 3, 32, 32
stw 3, -4(1)
lfs 3, -4(1)
lfs 0, -8(1)
into:
lfs 3, 4(5)
lfs 0, 0(5)
Added:
llvm/trunk/test/CodeGen/PowerPC/load-two-flts.ll
Modified:
llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
Modified: llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp?rev=264988&r1=264987&r2=264988&view=diff
==============================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp Wed Mar 30 21:56:05 2016
@@ -10268,6 +10268,111 @@ SDValue PPCTargetLowering::PerformDAGCom
return expandVSXLoadForLE(N, DCI);
}
+ // We sometimes end up with a 64-bit integer load, from which we extract
+ // two single-precision floating-point numbers. This happens with
+ // std::complex<float>, and other similar structures, because of the way we
+ // canonicalize structure copies. However, if we lack direct moves,
+ // then the final bitcasts from the extracted integer values to the
+ // floating-point numbers turn into store/load pairs. Even with direct moves,
+ // just loading the two floating-point numbers is likely better.
+ auto ReplaceTwoFloatLoad = [&]() {
+ if (VT != MVT::i64)
+ return false;
+
+ if (LD->getExtensionType() != ISD::NON_EXTLOAD ||
+ LD->isVolatile())
+ return false;
+
+ // We're looking for a sequence like this:
+ // t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64
+ // t16: i64 = srl t13, Constant:i32<32>
+ // t17: i32 = truncate t16
+ // t18: f32 = bitcast t17
+ // t19: i32 = truncate t13
+ // t20: f32 = bitcast t19
+
+ if (!LD->hasNUsesOfValue(2, 0))
+ return false;
+
+ auto UI = LD->use_begin();
+ while (UI.getUse().getResNo() != 0) ++UI;
+ SDNode *Trunc = *UI++;
+ while (UI.getUse().getResNo() != 0) ++UI;
+ SDNode *RightShift = *UI;
+ if (Trunc->getOpcode() != ISD::TRUNCATE)
+ std::swap(Trunc, RightShift);
+
+ if (Trunc->getOpcode() != ISD::TRUNCATE ||
+ Trunc->getValueType(0) != MVT::i32 ||
+ !Trunc->hasOneUse())
+ return false;
+ if (RightShift->getOpcode() != ISD::SRL ||
+ !isa<ConstantSDNode>(RightShift->getOperand(1)) ||
+ RightShift->getConstantOperandVal(1) != 32 ||
+ !RightShift->hasOneUse())
+ return false;
+
+ SDNode *Trunc2 = *RightShift->use_begin();
+ if (Trunc2->getOpcode() != ISD::TRUNCATE ||
+ Trunc2->getValueType(0) != MVT::i32 ||
+ !Trunc2->hasOneUse())
+ return false;
+
+ SDNode *Bitcast = *Trunc->use_begin();
+ SDNode *Bitcast2 = *Trunc2->use_begin();
+
+ if (Bitcast->getOpcode() != ISD::BITCAST ||
+ Bitcast->getValueType(0) != MVT::f32)
+ return false;
+ if (Bitcast2->getOpcode() != ISD::BITCAST ||
+ Bitcast2->getValueType(0) != MVT::f32)
+ return false;
+
+ if (Subtarget.isLittleEndian())
+ std::swap(Bitcast, Bitcast2);
+
+ // Bitcast has the second float (in memory-layout order) and Bitcast2
+ // has the first one.
+
+ SDValue BasePtr = LD->getBasePtr();
+ if (LD->isIndexed()) {
+ assert(LD->getAddressingMode() == ISD::PRE_INC &&
+ "Non-pre-inc AM on PPC?");
+ BasePtr =
+ DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+ LD->getOffset());
+ }
+
+ SDValue FloatLoad =
+ DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr,
+ LD->getPointerInfo(), false, LD->isNonTemporal(),
+ LD->isInvariant(), LD->getAlignment(), LD->getAAInfo());
+ SDValue AddPtr =
+ DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
+ BasePtr, DAG.getIntPtrConstant(4, dl));
+ SDValue FloatLoad2 =
+ DAG.getLoad(MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr,
+ LD->getPointerInfo().getWithOffset(4), false,
+ LD->isNonTemporal(), LD->isInvariant(),
+ MinAlign(LD->getAlignment(), 4), LD->getAAInfo());
+
+ if (LD->isIndexed()) {
+ // Note that DAGCombine should re-form any pre-increment load(s) from
+ // what is produced here if that makes sense.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr);
+ }
+
+ DCI.CombineTo(Bitcast2, FloatLoad);
+ DCI.CombineTo(Bitcast, FloatLoad2);
+
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 2 : 1),
+ SDValue(FloatLoad2.getNode(), 1));
+ return true;
+ };
+
+ if (ReplaceTwoFloatLoad())
+ return SDValue(N, 0);
+
EVT MemVT = LD->getMemoryVT();
Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
Added: llvm/trunk/test/CodeGen/PowerPC/load-two-flts.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/PowerPC/load-two-flts.ll?rev=264988&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/PowerPC/load-two-flts.ll (added)
+++ llvm/trunk/test/CodeGen/PowerPC/load-two-flts.ll Wed Mar 30 21:56:05 2016
@@ -0,0 +1,60 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+define void @_Z4testSt7complexIfE(float %v0, float %v1, i64* %ref.tmp, float* %_M_value.realp.i.i, float* %_M_value.imagp.i.i) {
+entry:
+ %v2 = load i64, i64* %ref.tmp, align 8
+ %v3 = lshr i64 %v2, 32
+ %v4 = trunc i64 %v3 to i32
+ %v5 = bitcast i32 %v4 to float
+ %v6 = trunc i64 %v2 to i32
+ %v7 = bitcast i32 %v6 to float
+ %mul_ad.i.i = fmul fast float %v5, %v1
+ %mul_bc.i.i = fmul fast float %v7, %v0
+ %mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
+ %mul_ac.i.i = fmul fast float %v5, %v0
+ %mul_bd.i.i = fmul fast float %v7, %v1
+ %mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
+ store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
+ store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
+ ret void
+
+; CHECK-LABEL: @_Z4testSt7complexIfE
+; CHECK-NOT: ld {{[0-9]+}}, 0(5)
+; CHECK-NOT: stw
+; CHECK-NOT: rldicl
+; CHECK-DAG: lfs {{[0-9]+}}, 4(5)
+; CHECK-DAG: lfs {{[0-9]+}}, 0(5)
+; CHECK: blr
+}
+
+define i64* @_Z4testSt7complexIfE_idx(float %v0, float %v1, i64* %ref.tmp, float* %_M_value.realp.i.i, float* %_M_value.imagp.i.i) {
+entry:
+ %r = getelementptr i64, i64* %ref.tmp, i64 1
+ %v2 = load i64, i64* %r, align 8
+ %v3 = lshr i64 %v2, 32
+ %v4 = trunc i64 %v3 to i32
+ %v5 = bitcast i32 %v4 to float
+ %v6 = trunc i64 %v2 to i32
+ %v7 = bitcast i32 %v6 to float
+ %mul_ad.i.i = fmul fast float %v5, %v1
+ %mul_bc.i.i = fmul fast float %v7, %v0
+ %mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
+ %mul_ac.i.i = fmul fast float %v5, %v0
+ %mul_bd.i.i = fmul fast float %v7, %v1
+ %mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
+ store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
+ store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
+ ret i64* %r
+
+; CHECK-LABEL: @_Z4testSt7complexIfE_idx
+; CHECK-NOT: ld {{[0-9]+}}, 8(5)
+; CHECK-NOT: ldu {{[0-9]+}}, 8(5)
+; CHECK-NOT: stw
+; CHECK-NOT: rldicl
+; CHECK-DAG: lfsu {{[0-9]+}}, 8(5)
+; CHECK-DAG: lfs {{[0-9]+}}, 4(5)
+; CHECK: blr
+}
+
More information about the llvm-commits
mailing list