[llvm] 43b2df0 - [LegalizeTypes][AArch64] Use scalar_to_vector to eliminate bitcast
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 14 08:34:37 PDT 2023
Author: zhongyunde
Date: 2023-06-14T23:33:02+08:00
New Revision: 43b2df03e842f109121ff60e6b62097b5a6c41ea
URL: https://github.com/llvm/llvm-project/commit/43b2df03e842f109121ff60e6b62097b5a6c41ea
DIFF: https://github.com/llvm/llvm-project/commit/43b2df03e842f109121ff60e6b62097b5a6c41ea.diff
LOG: [LegalizeTypes][AArch64] Use scalar_to_vector to eliminate bitcast
```
Legalize t3: v2i16 = bitcast i32
with (v2i16 extract_subvector (v4i16 bitcast (v2i32 scalar_to_vector (i32 in))), 0)
```
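For illustration, the IR pattern this transform targets is a plain scalar-to-short-vector bitcast, as in the tests added below. A minimal standalone sketch (the RUN line is illustrative; the checked-in tests carry the authoritative ones):
```
; RUN: llc -mtriple=aarch64 -o - %s
define <2 x i16> @bitcast_i32_to_v2i16(i32 %word) {
  ; Before this patch such bitcasts could be legalized through a stack
  ; slot (see the str/ldr pair removed from aarch64-load-ext.ll below);
  ; now lane zero is materialized with fmov via scalar_to_vector.
  %ret = bitcast i32 %word to <2 x i16>
  ret <2 x i16> %ret
}
```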
Fix https://github.com/llvm/llvm-project/issues/61638
NOTE: We don't change getPreferredVectorAction the way X86 does, as that
would affect too many existing test cases.
Reviewed By: dmgreen, paulwalker-arm, efriedma
Differential Revision: https://reviews.llvm.org/D147678
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
llvm/test/CodeGen/AArch64/neon-bitcast.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8bf75e4aee5f0..41b8e991e75ab 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1228,6 +1228,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITCAST, MVT::i8, Custom);
setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+ setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
+ setOperationAction(ISD::BITCAST, MVT::v2i16, Custom);
+ setOperationAction(ISD::BITCAST, MVT::v4i8, Custom);
+
setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
@@ -23019,6 +23023,23 @@ static void replaceBoolVectorBitcast(SDNode *N,
Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
}
+static void CustomNonLegalBITCASTResults(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG, EVT ExtendVT,
+ EVT CastVT) {
+ SDLoc DL(N);
+ SDValue Op = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // Use SCALAR_TO_VECTOR for lane zero
+ SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
+ SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
+ SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
+ Results.push_back(
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
+ return;
+}
+
void AArch64TargetLowering::ReplaceBITCASTResults(
SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
SDLoc DL(N);
@@ -23026,6 +23047,21 @@ void AArch64TargetLowering::ReplaceBITCASTResults(
EVT VT = N->getValueType(0);
EVT SrcVT = Op.getValueType();
+ if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
+ CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
+ return;
+ }
+
+ if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
+ CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
+ return;
+ }
+
+ if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
+ CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
+ return;
+ }
+
if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
"Expected fp->int bitcast!");
diff --git a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
index bc95cfd7d28d9..804c1e7cfc363 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
@@ -446,25 +446,16 @@ define <4 x i32> @anyext_v4i32(ptr %a, ptr %b) {
define <4 x i8> @bitcast(i32 %0) {
; CHECK-LE-LABEL: bitcast:
; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: sub sp, sp, #16
-; CHECK-LE-NEXT: .cfi_def_cfa_offset 16
-; CHECK-LE-NEXT: str w0, [sp, #12]
-; CHECK-LE-NEXT: ldr s0, [sp, #12]
-; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-LE-NEXT: add sp, sp, #16
+; CHECK-LE-NEXT: fmov s0, w0
+; CHECK-LE-NEXT: zip1 v0.8b, v0.8b, v0.8b
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: bitcast:
; CHECK-BE: // %bb.0:
-; CHECK-BE-NEXT: sub sp, sp, #16
-; CHECK-BE-NEXT: .cfi_def_cfa_offset 16
-; CHECK-BE-NEXT: str w0, [sp, #12]
-; CHECK-BE-NEXT: ldr s0, [sp, #12]
+; CHECK-BE-NEXT: fmov s0, w0
; CHECK-BE-NEXT: rev32 v0.8b, v0.8b
-; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-BE-NEXT: rev64 v0.4h, v0.4h
-; CHECK-BE-NEXT: add sp, sp, #16
+; CHECK-BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; CHECK-BE-NEXT: rev64 v0.8b, v0.8b
; CHECK-BE-NEXT: ret
%2 = bitcast i32 %0 to <4 x i8>
ret <4 x i8> %2
diff --git a/llvm/test/CodeGen/AArch64/neon-bitcast.ll b/llvm/test/CodeGen/AArch64/neon-bitcast.ll
index 2b060f436c79a..bfd59f3d813c8 100644
--- a/llvm/test/CodeGen/AArch64/neon-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitcast.ll
@@ -514,3 +514,66 @@ define <16 x i8> @test_v2f64_to_v16i8(<2 x double> %in) nounwind{
ret <16 x i8> %val
}
+define <2 x i16> @bitcast_i32_to_v2i16(i32 %word) {
+; CHECK-LE-LABEL: bitcast_i32_to_v2i16:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: fmov s0, w0
+; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: bitcast_i32_to_v2i16:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: fmov s0, w0
+; CHECK-BE-NEXT: rev32 v0.4h, v0.4h
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT: ret
+ %ret = bitcast i32 %word to <2 x i16>
+ ret <2 x i16> %ret
+}
+
+define <4 x i8> @bitcast_i32_to_v4i8(i32 %word) {
+; CHECK-LE-LABEL: bitcast_i32_to_v4i8:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: fmov s0, w0
+; CHECK-LE-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: bitcast_i32_to_v4i8:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: fmov s0, w0
+; CHECK-BE-NEXT: rev32 v0.8b, v0.8b
+; CHECK-BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; CHECK-BE-NEXT: rev64 v0.8b, v0.8b
+; CHECK-BE-NEXT: ret
+ %ret = bitcast i32 %word to <4 x i8>
+ ret <4 x i8> %ret
+}
+
+; TODO: Eliminate redundant moving back and forth between gpr and vectors
+define <2 x i8> @bitcast_i16_to_v2i8(i16 %word) {
+; CHECK-LE-LABEL: bitcast_i16_to_v2i8:
+; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: fmov s0, w0
+; CHECK-LE-NEXT: umov w8, v0.b[0]
+; CHECK-LE-NEXT: umov w9, v0.b[1]
+; CHECK-LE-NEXT: fmov s0, w8
+; CHECK-LE-NEXT: mov v0.s[1], w9
+; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: bitcast_i16_to_v2i8:
+; CHECK-BE: // %bb.0:
+; CHECK-BE-NEXT: fmov s0, w0
+; CHECK-BE-NEXT: rev16 v0.16b, v0.16b
+; CHECK-BE-NEXT: umov w8, v0.b[0]
+; CHECK-BE-NEXT: umov w9, v0.b[1]
+; CHECK-BE-NEXT: fmov s0, w8
+; CHECK-BE-NEXT: mov v0.s[1], w9
+; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT: ret
+ %ret = bitcast i16 %word to <2 x i8>
+ ret <2 x i8> %ret
+}
+
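As the TODO in bitcast_i16_to_v2i8 above notes, the i16 -> v2i8 case still bounces between GPRs and vector registers. A standalone reproducer, in case someone wants to pick that up (RUN line again illustrative):
```
; RUN: llc -mtriple=aarch64 -o - %s
define <2 x i8> @bitcast_i16_to_v2i8(i16 %word) {
  ; Currently lowered as fmov + two umov lane reads + fmov/mov re-insert;
  ; ideally the umov/fmov round trip would fold away.
  %ret = bitcast i16 %word to <2 x i8>
  ret <2 x i8> %ret
}
```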