[llvm] 11b4dce - [SVE] Lower fixed-length floating point loads and stores to integer variants.

Paul Walker via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 26 03:14:42 PDT 2022


Author: Paul Walker
Date: 2022-08-26T11:10:23+01:00
New Revision: 11b4dce7d37f9b88d4b060c52dfd0a1f15493aa9

URL: https://github.com/llvm/llvm-project/commit/11b4dce7d37f9b88d4b060c52dfd0a1f15493aa9
DIFF: https://github.com/llvm/llvm-project/commit/11b4dce7d37f9b88d4b060c52dfd0a1f15493aa9.diff

LOG: [SVE] Lower fixed-length floating point loads and stores to integer variants.

There's no advantage to emitting floating point scalable accesses;
by lowering them to integer variants we can instead benefit from
several combines that seek to replace explicit extends/truncates
with extending/truncating accesses.
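
As an illustration of the combine this enables (a minimal sketch, not part
of the patch; it simply mirrors fcvtzu_v32f16_v32i32 from the updated
sve-fixed-length-fp-to-int.ll test), consider IR of the form:

    ; Illustrative only: fixed-length half -> i32 conversion with +sve.
    define void @fcvtzu_v32f16_v32i32(<32 x half>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
      %op1 = load <32 x half>, <32 x half>* %a
      %res = fptoui <32 x half> %op1 to <32 x i32>
      store <32 x i32> %res, <32 x i32>* %b
      ret void
    }

    attributes #0 = { "target-features"="+sve" }

Previously the load was emitted as a floating point ld1h { z0.h } under a
ptrue p0.h predicate and then widened with uunpklo before the fcvtzu. With
the load lowered as an integer access it folds into a single extending
ld1h { z0.s }, as the updated CHECK lines below show.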

Differential Revision: https://reviews.llvm.org/D132393

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3eb07a069502b..f9e141fc893da 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21259,7 +21259,7 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
 
   auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
 
-  if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
+  if (VT.isFloatingPoint()) {
     LoadVT = ContainerVT.changeTypeToInteger();
     MemVT = MemVT.changeTypeToInteger();
   }
@@ -21277,6 +21277,8 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
     Result = getSVESafeBitCast(ExtendVT, Result, DAG);
     Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
                          Pg, Result, DAG.getUNDEF(ContainerVT));
+  } else if (VT.isFloatingPoint()) {
+    Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
   }
 
   Result = convertFromScalableVector(DAG, VT, Result);
@@ -21367,6 +21369,10 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
                            DAG.getUNDEF(TruncVT));
     NewValue =
         getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
+  } else if (VT.isFloatingPoint()) {
+    MemVT = MemVT.changeTypeToInteger();
+    NewValue =
+        getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
   }
 
   return DAG.getMaskedStore(Store->getChain(), DL, NewValue,

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
index 49c929d564140..ae378281f5dc2 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
@@ -161,10 +161,8 @@ define void @fcvtzu_v16f16_v16i32(<16 x half>* %a, <16 x i32>* %b) #0 {
 ;
 ; VBITS_GE_512-LABEL: fcvtzu_v16f16_v16i32:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    fcvtzu z0.s, p0/m, z0.h
 ; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
 ; VBITS_GE_512-NEXT:    ret
@@ -177,10 +175,8 @@ define void @fcvtzu_v16f16_v16i32(<16 x half>* %a, <16 x i32>* %b) #0 {
 define void @fcvtzu_v32f16_v32i32(<32 x half>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fcvtzu_v32f16_v32i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h, vl32
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl32
-; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.h
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
@@ -193,10 +189,8 @@ define void @fcvtzu_v32f16_v32i32(<32 x half>* %a, <32 x i32>* %b) vscale_range(
 define void @fcvtzu_v64f16_v64i32(<64 x half>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fcvtzu_v64f16_v64i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h, vl64
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl64
-; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.h
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
@@ -287,11 +281,8 @@ define void @fcvtzu_v8f16_v8i64(<8 x half>* %a, <8 x i64>* %b) #0 {
 define void @fcvtzu_v16f16_v16i64(<16 x half>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fcvtzu_v16f16_v16i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h, vl16
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl16
-; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.h
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
@@ -304,11 +295,8 @@ define void @fcvtzu_v16f16_v16i64(<16 x half>* %a, <16 x i64>* %b) vscale_range(
 define void @fcvtzu_v32f16_v32i64(<32 x half>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fcvtzu_v32f16_v32i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h, vl32
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl32
-; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.h
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
@@ -579,10 +567,8 @@ define void @fcvtzu_v8f32_v8i64(<8 x float>* %a, <8 x i64>* %b) #0 {
 ;
 ; VBITS_GE_512-LABEL: fcvtzu_v8f32_v8i64:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT:    ld1w { z0.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    fcvtzu z0.d, p0/m, z0.s
 ; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_512-NEXT:    ret
@@ -595,10 +581,8 @@ define void @fcvtzu_v8f32_v8i64(<8 x float>* %a, <8 x i64>* %b) #0 {
 define void @fcvtzu_v16f32_v16i64(<16 x float>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fcvtzu_v16f32_v16i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s, vl16
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl16
-; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
@@ -611,10 +595,8 @@ define void @fcvtzu_v16f32_v16i64(<16 x float>* %a, <16 x i64>* %b) vscale_range
 define void @fcvtzu_v32f32_v32i64(<32 x float>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fcvtzu_v32f32_v32i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s, vl32
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl32
-; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
@@ -1087,10 +1069,8 @@ define void @fcvtzs_v16f16_v16i32(<16 x half>* %a, <16 x i32>* %b) #0 {
 ;
 ; VBITS_GE_512-LABEL: fcvtzs_v16f16_v16i32:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    fcvtzs z0.s, p0/m, z0.h
 ; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
 ; VBITS_GE_512-NEXT:    ret
@@ -1103,10 +1083,8 @@ define void @fcvtzs_v16f16_v16i32(<16 x half>* %a, <16 x i32>* %b) #0 {
 define void @fcvtzs_v32f16_v32i32(<32 x half>* %a, <32 x i32>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fcvtzs_v32f16_v32i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h, vl32
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl32
-; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.h
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
@@ -1119,10 +1097,8 @@ define void @fcvtzs_v32f16_v32i32(<32 x half>* %a, <32 x i32>* %b) vscale_range(
 define void @fcvtzs_v64f16_v64i32(<64 x half>* %a, <64 x i32>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fcvtzs_v64f16_v64i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h, vl64
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl64
-; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.h
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
@@ -1213,11 +1189,8 @@ define void @fcvtzs_v8f16_v8i64(<8 x half>* %a, <8 x i64>* %b) #0 {
 define void @fcvtzs_v16f16_v16i64(<16 x half>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fcvtzs_v16f16_v16i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h, vl16
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl16
-; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.h
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
@@ -1230,11 +1203,8 @@ define void @fcvtzs_v16f16_v16i64(<16 x half>* %a, <16 x i64>* %b) vscale_range(
 define void @fcvtzs_v32f16_v32i64(<32 x half>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fcvtzs_v32f16_v32i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h, vl32
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl32
-; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.h
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
@@ -1505,10 +1475,8 @@ define void @fcvtzs_v8f32_v8i64(<8 x float>* %a, <8 x i64>* %b) #0 {
 ;
 ; VBITS_GE_512-LABEL: fcvtzs_v8f32_v8i64:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT:    ld1w { z0.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    fcvtzs z0.d, p0/m, z0.s
 ; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_512-NEXT:    ret
@@ -1521,10 +1489,8 @@ define void @fcvtzs_v8f32_v8i64(<8 x float>* %a, <8 x i64>* %b) #0 {
 define void @fcvtzs_v16f32_v16i64(<16 x float>* %a, <16 x i64>* %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: fcvtzs_v16f32_v16i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s, vl16
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl16
-; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
@@ -1537,10 +1503,8 @@ define void @fcvtzs_v16f32_v16i64(<16 x float>* %a, <16 x i64>* %b) vscale_range
 define void @fcvtzs_v32f32_v32i64(<32 x float>* %a, <32 x i64>* %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: fcvtzs_v32f32_v32i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s, vl32
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl32
-; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
index 150fc0acaa78a..f4e1a8285d6f2 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
@@ -696,10 +696,9 @@ define void @ucvtf_v16i64_v16f16(<16 x i64>* %a, <16 x half>* %b) vscale_range(8
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.d
-; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ptrue p0.s, vl16
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %res = uitofp <16 x i64> %op1 to <16 x half>
@@ -714,10 +713,9 @@ define void @ucvtf_v32i64_v32f16(<32 x i64>* %a, <32 x half>* %b) vscale_range(1
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.d
-; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ptrue p0.s, vl32
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %res = uitofp <32 x i64> %op1 to <32 x half>
@@ -1638,10 +1636,9 @@ define void @scvtf_v16i64_v16f16(<16 x i64>* %a, <16 x half>* %b) vscale_range(8
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    scvtf z0.h, p0/m, z0.d
-; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ptrue p0.s, vl16
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, <16 x i64>* %a
   %res = sitofp <16 x i64> %op1 to <16 x half>
@@ -1656,10 +1653,9 @@ define void @scvtf_v32i64_v32f16(<32 x i64>* %a, <32 x half>* %b) vscale_range(1
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    scvtf z0.h, p0/m, z0.d
-; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ptrue p0.s, vl32
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
+; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, <32 x i64>* %a
   %res = sitofp <32 x i64> %op1 to <32 x half>

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
index e6dc79b85cdd6..6a2e40a3d6179 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
@@ -677,13 +677,11 @@ define void @masked_gather_v16f16(<16 x half>* %a, <16 x half*>* %b) vscale_rang
 ; CHECK-NEXT:    ptrue p1.d, vl16
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, #0.0
-; CHECK-NEXT:    punpklo p1.h, p1.b
-; CHECK-NEXT:    punpklo p1.h, p1.b
-; CHECK-NEXT:    ld1h { z0.d }, p1/z, [z1.d]
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z1.d]
+; CHECK-NEXT:    st1h { z0.d }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cval = load <16 x half>, <16 x half>* %a
   %ptrs = load <16 x half*>, <16 x half*>* %b
@@ -700,13 +698,11 @@ define void @masked_gather_v32f16(<32 x half>* %a, <32 x half*>* %b) vscale_rang
 ; CHECK-NEXT:    ptrue p1.d, vl32
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, #0.0
-; CHECK-NEXT:    punpklo p1.h, p1.b
-; CHECK-NEXT:    punpklo p1.h, p1.b
-; CHECK-NEXT:    ld1h { z0.d }, p1/z, [z1.d]
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z1.d]
+; CHECK-NEXT:    st1h { z0.d }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cval = load <32 x half>, <32 x half>* %a
   %ptrs = load <32 x half*>, <32 x half*>* %b
@@ -793,11 +789,10 @@ define void @masked_gather_v8f32(<8 x float>* %a, <8 x float*>* %b) #0 {
 ; VBITS_GE_512-NEXT:    ptrue p1.d, vl8
 ; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_512-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
-; VBITS_GE_512-NEXT:    punpklo p1.h, p1.b
-; VBITS_GE_512-NEXT:    ld1w { z0.d }, p1/z, [z1.d]
-; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; VBITS_GE_512-NEXT:    punpklo p0.h, p0.b
+; VBITS_GE_512-NEXT:    ld1w { z0.d }, p0/z, [z1.d]
+; VBITS_GE_512-NEXT:    st1w { z0.d }, p1, [x0]
 ; VBITS_GE_512-NEXT:    ret
   %cval = load <8 x float>, <8 x float>* %a
   %ptrs = load <8 x float*>, <8 x float*>* %b
@@ -814,11 +809,10 @@ define void @masked_gather_v16f32(<16 x float>* %a, <16 x float*>* %b) vscale_ra
 ; CHECK-NEXT:    ptrue p1.d, vl16
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
-; CHECK-NEXT:    punpklo p1.h, p1.b
-; CHECK-NEXT:    ld1w { z0.d }, p1/z, [z1.d]
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z1.d]
+; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cval = load <16 x float>, <16 x float>* %a
   %ptrs = load <16 x float*>, <16 x float*>* %b
@@ -835,11 +829,10 @@ define void @masked_gather_v32f32(<32 x float>* %a, <32 x float*>* %b) vscale_ra
 ; CHECK-NEXT:    ptrue p1.d, vl32
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
-; CHECK-NEXT:    punpklo p1.h, p1.b
-; CHECK-NEXT:    ld1w { z0.d }, p1/z, [z1.d]
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z1.d]
+; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cval = load <32 x float>, <32 x float>* %a
   %ptrs = load <32 x float*>, <32 x float*>* %b
@@ -993,11 +986,10 @@ define void @masked_gather_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b,
 ; CHECK-NEXT:    ptrue p1.s, vl32
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
-; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, #0.0
-; CHECK-NEXT:    punpklo p1.h, p1.b
-; CHECK-NEXT:    ld1h { z0.s }, p1/z, [x2, z1.s, sxtw #1]
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x2, z1.s, sxtw #1]
+; CHECK-NEXT:    st1h { z0.s }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cvals = load <32 x half>, <32 x half>* %a
   %idxs = load <32 x i32>, <32 x i32>* %b
@@ -1056,11 +1048,10 @@ define void @masked_gather_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half
 ; CHECK-NEXT:    ptrue p1.s, vl32
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
-; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, #0.0
-; CHECK-NEXT:    punpklo p1.h, p1.b
-; CHECK-NEXT:    ld1h { z0.s }, p1/z, [x2, z1.s, uxtw #1]
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x2, z1.s, uxtw #1]
+; CHECK-NEXT:    st1h { z0.s }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cvals = load <32 x half>, <32 x half>* %a
   %idxs = load <32 x i32>, <32 x i32>* %b
@@ -1079,11 +1070,10 @@ define void @masked_gather_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8
 ; CHECK-NEXT:    ptrue p1.s, vl32
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
-; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, #0.0
-; CHECK-NEXT:    punpklo p1.h, p1.b
-; CHECK-NEXT:    ld1h { z0.s }, p1/z, [x2, z1.s, sxtw]
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x2, z1.s, sxtw]
+; CHECK-NEXT:    st1h { z0.s }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cvals = load <32 x half>, <32 x half>* %a
   %idxs = load <32 x i32>, <32 x i32>* %b
@@ -1103,11 +1093,10 @@ define void @masked_gather_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8
 ; CHECK-NEXT:    ptrue p1.s, vl32
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
-; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, #0.0
-; CHECK-NEXT:    punpklo p1.h, p1.b
-; CHECK-NEXT:    ld1h { z0.s }, p1/z, [x2, z1.s, uxtw]
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x2, z1.s, uxtw]
+; CHECK-NEXT:    st1h { z0.s }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cvals = load <32 x half>, <32 x half>* %a
   %idxs = load <32 x i32>, <32 x i32>* %b
@@ -1127,11 +1116,10 @@ define void @masked_gather_64b_scaled(<32 x float>* %a, <32 x i64>* %b, float* %
 ; CHECK-NEXT:    ptrue p1.d, vl32
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
-; CHECK-NEXT:    punpklo p1.h, p1.b
-; CHECK-NEXT:    ld1w { z0.d }, p1/z, [x2, z1.d, lsl #2]
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x2, z1.d, lsl #2]
+; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cvals = load <32 x float>, <32 x float>* %a
   %idxs = load <32 x i64>, <32 x i64>* %b
@@ -1149,11 +1137,10 @@ define void @masked_gather_64b_unscaled(<32 x float>* %a, <32 x i64>* %b, i8* %b
 ; CHECK-NEXT:    ptrue p1.d, vl32
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
-; CHECK-NEXT:    punpklo p1.h, p1.b
-; CHECK-NEXT:    ld1w { z0.d }, p1/z, [x2, z1.d]
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x2, z1.d]
+; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cvals = load <32 x float>, <32 x float>* %a
   %idxs = load <32 x i64>, <32 x i64>* %b
@@ -1172,11 +1159,10 @@ define void @masked_gather_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %o
 ; CHECK-NEXT:    ptrue p1.d, vl32
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
-; CHECK-NEXT:    punpklo p1.h, p1.b
-; CHECK-NEXT:    ld1w { z0.d }, p1/z, [x2, z1.d]
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x2, z1.d]
+; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cvals = load <32 x float>, <32 x float>* %a
   %bases = load <32 x i8*>, <32 x i8*>* %b
@@ -1195,11 +1181,10 @@ define void @masked_gather_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) vscale
 ; CHECK-NEXT:    ptrue p1.d, vl32
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
-; CHECK-NEXT:    punpklo p1.h, p1.b
-; CHECK-NEXT:    ld1w { z0.d }, p1/z, [z1.d, #4]
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z1.d, #4]
+; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cvals = load <32 x float>, <32 x float>* %a
   %bases = load <32 x i8*>, <32 x i8*>* %b
@@ -1242,11 +1227,10 @@ define void @masked_gather_passthru_0(<32 x float>* %a, <32 x float*>* %b) vscal
 ; CHECK-NEXT:    ptrue p1.d, vl32
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
-; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
-; CHECK-NEXT:    punpklo p1.h, p1.b
-; CHECK-NEXT:    ld1w { z0.d }, p1/z, [z1.d]
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z1.d]
+; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cvals = load <32 x float>, <32 x float>* %a
   %ptrs = load <32 x float*>, <32 x float*>* %b

More information about the llvm-commits mailing list