[llvm] 6180806 - [AArch64][SVE] Mark fixed-type FP extending/truncating loads/stores as custom
Bradley Smith via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 29 03:57:29 PST 2021
Author: Bradley Smith
Date: 2021-11-29T11:56:07Z
New Revision: 61808066325ff0828bab7f016e8798b78d2e6b49
URL: https://github.com/llvm/llvm-project/commit/61808066325ff0828bab7f016e8798b78d2e6b49
DIFF: https://github.com/llvm/llvm-project/commit/61808066325ff0828bab7f016e8798b78d2e6b49.diff
LOG: [AArch64][SVE] Mark fixed-type FP extending/truncating loads/stores as custom
This allows the generic DAG combine to fold fp_extend/fp_trunc into
loads/stores, which we can then lower into an integer extending
load/truncating store plus an FP_EXTEND/FP_ROUND.
The nuance here is that fixed-type FP_EXTEND/FP_ROUND require unpacked
types, so lowering them introduces an unpack/zip. By allowing these
nodes to be combined with loads/stores, we make it much easier for our
custom lowering to fold that unpack/zip into the load/store itself.
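For example, for an fpext of a loaded <8 x half> value (taken from the
updated sve-fixed-length-fp-extend-trunc.ll test):
  %op1 = load <8 x half>, <8 x half>* %a
  %res = fpext <8 x half> %op1 to <8 x float>
  store <8 x float> %res, <8 x float>* %b
we previously emitted an unpredicated load followed by an unpack:
  ldr q0, [x0]
  ptrue p0.s, vl8
  uunpklo z0.s, z0.h
  fcvt z0.s, p0/m, z0.h
  st1w { z0.s }, p0, [x1]
and now emit a single predicated extending load instead:
  ptrue p0.s, vl8
  ld1sh { z0.s }, p0/z, [x0]
  fcvt z0.s, p0/m, z0.h
  st1w { z0.s }, p0, [x1]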
Differential Revision: https://reviews.llvm.org/D114580
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ce400ea43f29..9aaa1c8e7b3b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15007,7 +15007,7 @@ SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
// fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
- TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
+ TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, VT, N0.getValueType())) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
LN0->getChain(),
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3204e7a67083..d1c58086161e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1545,7 +1545,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
setCondCodeAction(ISD::SETUNE, VT, Expand);
}
- // Mark integer truncating stores as having custom lowering
+ // Mark integer truncating stores/extending loads as having custom lowering
if (VT.isInteger()) {
MVT InnerVT = VT.changeVectorElementType(MVT::i8);
while (InnerVT != VT) {
@@ -1557,6 +1557,18 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
}
}
+ // Mark floating-point truncating stores/extending loads as having custom
+ // lowering
+ if (VT.isFloatingPoint()) {
+ MVT InnerVT = VT.changeVectorElementType(MVT::f16);
+ while (InnerVT != VT) {
+ setTruncStoreAction(VT, InnerVT, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom);
+ InnerVT = InnerVT.changeVectorElementType(
+ MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits()));
+ }
+ }
+
// Lower fixed length vector operations to scalable equivalents.
setOperationAction(ISD::ABS, VT, Custom);
setOperationAction(ISD::ADD, VT, Custom);
@@ -18658,12 +18670,29 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
SDLoc DL(Op);
EVT VT = Op.getValueType();
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+ EVT LoadVT = ContainerVT;
+ EVT MemVT = Load->getMemoryVT();
+
+ auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
+
+ if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
+ LoadVT = ContainerVT.changeTypeToInteger();
+ MemVT = MemVT.changeTypeToInteger();
+ }
auto NewLoad = DAG.getMaskedLoad(
- ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
- getPredicateForFixedLengthVector(DAG, DL, VT), DAG.getUNDEF(ContainerVT),
- Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(),
- Load->getExtensionType());
+ LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
+ DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
+ Load->getAddressingMode(), Load->getExtensionType());
+
+ if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
+ EVT ExtendVT = ContainerVT.changeVectorElementType(
+ Load->getMemoryVT().getVectorElementType());
+
+ NewLoad = getSVESafeBitCast(ExtendVT, NewLoad, DAG);
+ NewLoad = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
+ Pg, NewLoad, DAG.getUNDEF(ContainerVT));
+ }
auto Result = convertFromScalableVector(DAG, VT, NewLoad);
SDValue MergedValues[2] = {Result, Load->getChain()};
@@ -18738,13 +18767,26 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
SDLoc DL(Op);
EVT VT = Store->getValue().getValueType();
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+ EVT MemVT = Store->getMemoryVT();
+ auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
- return DAG.getMaskedStore(
- Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
- getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(),
- Store->getMemOperand(), Store->getAddressingMode(),
- Store->isTruncatingStore());
+
+ if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
+ EVT TruncVT = ContainerVT.changeVectorElementType(
+ Store->getMemoryVT().getVectorElementType());
+ MemVT = MemVT.changeTypeToInteger();
+ NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
+ NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
+ DAG.getUNDEF(TruncVT));
+ NewValue =
+ getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
+ }
+
+ return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
+ Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
+ Store->getMemOperand(), Store->getAddressingMode(),
+ Store->isTruncatingStore());
}
SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
index f9d913570a3f..76260b32ff7f 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
@@ -48,9 +48,8 @@ define <4 x float> @fcvt_v4f16_v4f32(<4 x half> %op1) #0 {
define void @fcvt_v8f16_v8f32(<8 x half>* %a, <8 x float>* %b) #0 {
; CHECK-LABEL: fcvt_v8f16_v8f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ptrue p0.s, vl8
-; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.s, p0/m, z0.h
; CHECK-NEXT: st1w { z0.s }, p0, [x1]
; CHECK-NEXT: ret
@@ -76,16 +75,15 @@ define void @fcvt_v16f16_v16f32(<16 x half>* %a, <16 x float>* %b) #0 {
; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1]
; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_EQ_256-NEXT: ret
-
+;
; VBITS_GE_512-LABEL: fcvt_v16f16_v16f32:
; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.h, vl16
-; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: ld1sh { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: fcvt z0.s, p0/m, z0.h
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT: ret
+
%op1 = load <16 x half>, <16 x half>* %a
%res = fpext <16 x half> %op1 to <16 x float>
store <16 x float> %res, <16 x float>* %b
@@ -95,10 +93,8 @@ define void @fcvt_v16f16_v16f32(<16 x half>* %a, <16 x float>* %b) #0 {
define void @fcvt_v32f16_v32f32(<32 x half>* %a, <32 x float>* %b) #0 {
; VBITS_GE_1024-LABEL: fcvt_v32f16_v32f32:
; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT: ld1sh { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fcvt z0.s, p0/m, z0.h
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
@@ -111,10 +107,8 @@ define void @fcvt_v32f16_v32f32(<32 x half>* %a, <32 x float>* %b) #0 {
define void @fcvt_v64f16_v64f32(<64 x half>* %a, <64 x float>* %b) #0 {
; VBITS_GE_2048-LABEL: fcvt_v64f16_v64f32:
; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fcvt z0.s, p0/m, z0.h
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
@@ -156,10 +150,8 @@ define <2 x double> @fcvt_v2f16_v2f64(<2 x half> %op1) #0 {
define void @fcvt_v4f16_v4f64(<4 x half>* %a, <4 x double>* %b) #0 {
; CHECK-LABEL: fcvt_v4f16_v4f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ptrue p0.d, vl4
-; CHECK-NEXT: uunpklo z0.s, z0.h
-; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.d, p0/m, z0.h
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
@@ -170,7 +162,6 @@ define void @fcvt_v4f16_v4f64(<4 x half>* %a, <4 x double>* %b) #0 {
}
define void @fcvt_v8f16_v8f64(<8 x half>* %a, <8 x double>* %b) #0 {
-; Ensure sensible type legalisation.
; VBITS_EQ_256-LABEL: fcvt_v8f16_v8f64:
; VBITS_EQ_256: // %bb.0:
; VBITS_EQ_256-NEXT: ldr q0, [x0]
@@ -186,16 +177,15 @@ define void @fcvt_v8f16_v8f64(<8 x half>* %a, <8 x double>* %b) #0 {
; VBITS_EQ_256-NEXT: fcvt z1.d, p0/m, z1.h
; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
; VBITS_EQ_256-NEXT: ret
-
+;
; VBITS_GE_512-LABEL: fcvt_v8f16_v8f64:
; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ldr q0, [x0]
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT: ld1sh { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: fcvt z0.d, p0/m, z0.h
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT: ret
+
%op1 = load <8 x half>, <8 x half>* %a
%res = fpext <8 x half> %op1 to <8 x double>
store <8 x double> %res, <8 x double>* %b
@@ -205,11 +195,8 @@ define void @fcvt_v8f16_v8f64(<8 x half>* %a, <8 x double>* %b) #0 {
define void @fcvt_v16f16_v16f64(<16 x half>* %a, <16 x double>* %b) #0 {
; VBITS_GE_1024-LABEL: fcvt_v16f16_v16f64:
; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT: ld1sh { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fcvt z0.d, p0/m, z0.h
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
@@ -222,11 +209,8 @@ define void @fcvt_v16f16_v16f64(<16 x half>* %a, <16 x double>* %b) #0 {
define void @fcvt_v32f16_v32f64(<32 x half>* %a, <32 x double>* %b) #0 {
; VBITS_GE_2048-LABEL: fcvt_v32f16_v32f64:
; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_2048-NEXT: ld1sh { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fcvt z0.d, p0/m, z0.h
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
@@ -264,9 +248,8 @@ define <2 x double> @fcvt_v2f32_v2f64(<2 x float> %op1) #0 {
define void @fcvt_v4f32_v4f64(<4 x float>* %a, <4 x double>* %b) #0 {
; CHECK-LABEL: fcvt_v4f32_v4f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ptrue p0.d, vl4
-; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.d, p0/m, z0.s
; CHECK-NEXT: st1d { z0.d }, p0, [x1]
; CHECK-NEXT: ret
@@ -292,16 +275,15 @@ define void @fcvt_v8f32_v8f64(<8 x float>* %a, <8 x double>* %b) #0 {
; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1]
; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_EQ_256-NEXT: ret
-
+;
; VBITS_GE_512-LABEL: fcvt_v8f32_v8f64:
; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.s, vl8
-; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT: ld1sw { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: fcvt z0.d, p0/m, z0.s
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT: ret
+
%op1 = load <8 x float>, <8 x float>* %a
%res = fpext <8 x float> %op1 to <8 x double>
store <8 x double> %res, <8 x double>* %b
@@ -311,10 +293,8 @@ define void @fcvt_v8f32_v8f64(<8 x float>* %a, <8 x double>* %b) #0 {
define void @fcvt_v16f32_v16f64(<16 x float>* %a, <16 x double>* %b) #0 {
; VBITS_GE_1024-LABEL: fcvt_v16f32_v16f64:
; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT: ld1sw { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: fcvt z0.d, p0/m, z0.s
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
@@ -327,10 +307,8 @@ define void @fcvt_v16f32_v16f64(<16 x float>* %a, <16 x double>* %b) #0 {
define void @fcvt_v32f32_v32f64(<32 x float>* %a, <32 x double>* %b) #0 {
; VBITS_GE_2048-LABEL: fcvt_v32f32_v32f64:
; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: fcvt z0.d, p0/m, z0.s
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
@@ -403,11 +381,8 @@ define void @fcvt_v16f32_v16f16(<16 x float>* %a, <16 x half>* %b) #0 {
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT: ptrue p0.s
; VBITS_GE_512-NEXT: fcvt z0.h, p0/m, z0.s
-; VBITS_GE_512-NEXT: ptrue p0.h, vl16
-; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
+; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%res = fptrunc <16 x float> %op1 to <16 x half>
@@ -420,11 +395,8 @@ define void @fcvt_v32f32_v32f16(<32 x float>* %a, <32 x half>* %b) #0 {
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.s
; VBITS_GE_1024-NEXT: fcvt z0.h, p0/m, z0.s
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
+; VBITS_GE_1024-NEXT: st1h { z0.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
%res = fptrunc <32 x float> %op1 to <32 x half>
@@ -437,11 +409,8 @@ define void @fcvt_v64f32_v64f16(<64 x float>* %a, <64 x half>* %b) #0 {
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.s
; VBITS_GE_2048-NEXT: fcvt z0.h, p0/m, z0.s
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
-; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
+; VBITS_GE_2048-NEXT: st1h { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
%res = fptrunc <64 x float> %op1 to <64 x half>
@@ -533,12 +502,8 @@ define void @fcvt_v16f64_v16f16(<16 x double>* %a, <16 x half>* %b) #0 {
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.d
; VBITS_GE_1024-NEXT: fcvt z0.h, p0/m, z0.d
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
+; VBITS_GE_1024-NEXT: st1h { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%res = fptrunc <16 x double> %op1 to <16 x half>
@@ -551,12 +516,8 @@ define void @fcvt_v32f64_v32f16(<32 x double>* %a, <32 x half>* %b) #0 {
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.d
; VBITS_GE_2048-NEXT: fcvt z0.h, p0/m, z0.d
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
+; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%res = fptrunc <32 x double> %op1 to <32 x half>
@@ -627,11 +588,8 @@ define void @fcvt_v8f64_v8f32(<8 x double>* %a, <8 x float>* %b) #0 {
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT: ptrue p0.d
; VBITS_GE_512-NEXT: fcvt z0.s, p0/m, z0.d
-; VBITS_GE_512-NEXT: ptrue p0.s, vl8
-; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
+; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a
%res = fptrunc <8 x double> %op1 to <8 x float>
@@ -644,11 +602,8 @@ define void @fcvt_v16f64_v16f32(<16 x double>* %a, <16 x float>* %b) #0 {
; VBITS_GE_1024: // %bb.0:
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_1024-NEXT: ptrue p0.d
; VBITS_GE_1024-NEXT: fcvt z0.s, p0/m, z0.d
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
+; VBITS_GE_1024-NEXT: st1w { z0.d }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
%res = fptrunc <16 x double> %op1 to <16 x float>
@@ -661,11 +616,8 @@ define void @fcvt_v32f64_v32f32(<32 x double>* %a, <32 x float>* %b) #0 {
; VBITS_GE_2048: // %bb.0:
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_2048-NEXT: ptrue p0.d
; VBITS_GE_2048-NEXT: fcvt z0.s, p0/m, z0.d
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
+; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
%res = fptrunc <32 x double> %op1 to <32 x float>