[llvm] 2240409 - [SVE] Restrict SVE fixed length extload/truncstore combine to float and double types.
Paul Walker via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 13 05:18:34 PST 2023
Author: Paul Walker
Date: 2023-01-13T13:16:42Z
New Revision: 22404099c48cff1c118cd4f6fd9e1acf7ff0fefb
URL: https://github.com/llvm/llvm-project/commit/22404099c48cff1c118cd4f6fd9e1acf7ff0fefb
DIFF: https://github.com/llvm/llvm-project/commit/22404099c48cff1c118cd4f6fd9e1acf7ff0fefb.diff
LOG: [SVE] Restrict SVE fixed length extload/truncstore combine to float and double types.
Prior to this patch we would create floating-point extending load
and truncating store operations involving fp128 types, which we
cannot lower.
Fixes #58530
Differential Revision: https://reviews.llvm.org/D140318
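For reference, the kind of IR that previously triggered the bad combine is
the fp128 fpext/fptrunc pattern exercised by the new test. A minimal sketch
(function and value names are illustrative; attributes written inline rather
than via an attribute group as in the test):

    target triple = "aarch64-unknown-linux-gnu"

    define void @fpext_v4f64_to_v4f128(ptr %a, ptr %b) vscale_range(2,0) "target-features"="+sve" {
      ; load <4 x double> and widen each element to fp128
      %op = load <4 x double>, ptr %a
      %ext = fpext <4 x double> %op to <4 x fp128>
      store <4 x fp128> %ext, ptr %b
      ret void
    }

With SVE fixed-length lowering enabled, performFPExtendCombine would fold the
load+fpext into an extending load whose result element type is fp128, and
performSTORECombine would similarly fold an fptrunc+store into a truncating
store whose source element type is fp128; neither can be lowered. Both
combines now bail out unless the wider element type is f32 or f64.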
Added:
llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 24c72f95e0103..1bfce0d993fd2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19315,6 +19315,12 @@ static SDValue performSTORECombine(SDNode *N,
SDValue Chain = ST->getChain();
SDValue Value = ST->getValue();
SDValue Ptr = ST->getBasePtr();
+ EVT ValueVT = Value.getValueType();
+
+ auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
+ EVT EltVT = VT.getVectorElementType();
+ return EltVT == MVT::f32 || EltVT == MVT::f64;
+ };
// If this is an FP_ROUND followed by a store, fold this into a truncating
// store. We can do this even if this is already a truncstore.
@@ -19323,9 +19329,9 @@ static SDValue performSTORECombine(SDNode *N,
if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
Value.getNode()->hasOneUse() && ST->isUnindexed() &&
Subtarget->useSVEForFixedLengthVectors() &&
- Value.getValueType().isFixedLengthVector() &&
- Value.getValueType().getFixedSizeInBits() >=
- Subtarget->getMinSVEVectorSizeInBits())
+ ValueVT.isFixedLengthVector() &&
+ ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
+ hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
ST->getMemoryVT(), ST->getMemOperand());
@@ -21238,12 +21244,17 @@ static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
return SDValue();
+ auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
+ EVT EltVT = VT.getVectorElementType();
+ return EltVT == MVT::f32 || EltVT == MVT::f64;
+ };
+
// fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
// We purposefully don't care about legality of the nodes here as we know
// they can be split down into something legal.
if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
- VT.isFixedLengthVector() &&
+ VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll
new file mode 100644
index 0000000000000..8e7c6c6703a62
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Ensure we don't attempt to combine into an extending fp128 load.
+define void @fcvt_v4f64_v4f128(ptr %a, ptr %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: fcvt_v4f64_v4f128:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: sub sp, sp, #48
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: add x8, sp, #48
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: str z0, [x8, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: bl __extenddftf2
+; CHECK-NEXT: add x8, sp, #48
+; CHECK-NEXT: ldr z1, [x8] // 16-byte Folded Reload
+; CHECK-NEXT: str q0, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: mov d1, v1.d[1]
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: bl __extenddftf2
+; CHECK-NEXT: add x8, sp, #48
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: ldr z0, [x8, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: bl __extenddftf2
+; CHECK-NEXT: add x8, sp, #48
+; CHECK-NEXT: ldr z1, [x8, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: mov d1, v1.d[1]
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: bl __extenddftf2
+; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: stp q1, q0, [x19]
+; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: stp q0, q2, [x19, #32]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %op1 = load <4 x double>, ptr %a
+ %res = fpext <4 x double> %op1 to <4 x fp128>
+ store <4 x fp128> %res, ptr %b
+ ret void
+}
+
+; Ensure we don't attempt to combine into a truncating fp128 store.
+define void @fcvt_v4f128_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
+; CHECK-LABEL: fcvt_v4f128_v4f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: sub sp, sp, #128
+; CHECK-NEXT: ldr q1, [x0, #64]
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: ldr q0, [x0, #80]
+; CHECK-NEXT: stp q0, q1, [sp, #96] // 32-byte Folded Spill
+; CHECK-NEXT: ldr q1, [x0, #96]
+; CHECK-NEXT: ldr q0, [x0, #112]
+; CHECK-NEXT: stp q0, q1, [sp, #64] // 32-byte Folded Spill
+; CHECK-NEXT: ldr q1, [x0]
+; CHECK-NEXT: ldr q0, [x0, #16]
+; CHECK-NEXT: stp q0, q1, [sp, #32] // 32-byte Folded Spill
+; CHECK-NEXT: ldr q0, [x0, #32]
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [x0, #48]
+; CHECK-NEXT: bl __trunctfdf2
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: bl __trunctfdf2
+; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: add x8, sp, #128
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: str z0, [x8, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: bl __trunctfdf2
+; CHECK-NEXT: str q0, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: bl __trunctfdf2
+; CHECK-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: add x8, sp, #128
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: ldr z1, [x8, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
+; CHECK-NEXT: str z0, [x8, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: bl __trunctfdf2
+; CHECK-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: bl __trunctfdf2
+; CHECK-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: add x8, sp, #128
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT: bl __trunctfdf2
+; CHECK-NEXT: str q0, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload
+; CHECK-NEXT: bl __trunctfdf2
+; CHECK-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT: add x9, sp, #128
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: mov x8, #4
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: ldr z1, [x9] // 16-byte Folded Reload
+; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: st1d { z0.d }, p0, [x19, x8, lsl #3]
+; CHECK-NEXT: add x8, sp, #128
+; CHECK-NEXT: ldr z0, [x8, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: st1d { z0.d }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: add sp, sp, #128
+; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %op1 = load <8 x fp128>, ptr %a
+ %res = fptrunc <8 x fp128> %op1 to <8 x double>
+ store <8 x double> %res, ptr %b
+ ret void
+}
+
+attributes #0 = { nounwind "target-features"="+sve" }