[llvm] 3d65f82 - [SVE] Expand scalable vector ISD::BITCASTs when targeting big-endian.

Paul Walker via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 10 04:03:13 PDT 2023


Author: Paul Walker
Date: 2023-08-10T11:02:01Z
New Revision: 3d65f8211f8d2ebf584bc2ac08a6a2f098130d79

URL: https://github.com/llvm/llvm-project/commit/3d65f8211f8d2ebf584bc2ac08a6a2f098130d79
DIFF: https://github.com/llvm/llvm-project/commit/3d65f8211f8d2ebf584bc2ac08a6a2f098130d79.diff

LOG: [SVE] Expand scalable vector ISD::BITCASTs when targeting big-endian.

Whilst sub-optimal, this is better than the current selection failure.

Fixes: #64406

Differential Revision: https://reviews.llvm.org/D157406
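
For reference, the kind of IR that previously crashed the big-endian
selector is a plain scalable-vector bitcast between legal SVE types,
for example (taken from the updated test below, minus the function
attribute):

    define <vscale x 16 x i8> @bitcast_nxv8i16_to_nxv16i8(<vscale x 8 x i16> %v) {
      %bc = bitcast <vscale x 8 x i16> %v to <vscale x 16 x i8>
      ret <vscale x 16 x i8> %bc
    }

Marking ISD::BITCAST as Expand lets the generic legalizer lower the
node through a stack temporary (roughly: store the value with its
source element type, reload it with the destination element type),
which matches the st1/ld1 sequences the new CHECK_BE lines verify.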

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/sve-bitcast.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a04188903748fc..4d322cecaf8df2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1337,6 +1337,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::AVGFLOORU, VT, Custom);
       setOperationAction(ISD::AVGCEILS, VT, Custom);
       setOperationAction(ISD::AVGCEILU, VT, Custom);
+
+      if (!Subtarget->isLittleEndian())
+        setOperationAction(ISD::BITCAST, VT, Expand);
     }
 
     // Illegal unpacked integer vector types.
@@ -1486,6 +1489,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setCondCodeAction(ISD::SETUGT, VT, Expand);
       setCondCodeAction(ISD::SETUEQ, VT, Expand);
       setCondCodeAction(ISD::SETONE, VT, Expand);
+
+      if (!Subtarget->isLittleEndian())
+        setOperationAction(ISD::BITCAST, VT, Expand);
     }
 
     for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
@@ -1495,6 +1501,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::MLOAD, VT, Custom);
       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
       setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
+
+      if (!Subtarget->isLittleEndian())
+        setOperationAction(ISD::BITCAST, VT, Expand);
     }
 
     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);

diff  --git a/llvm/test/CodeGen/AArch64/sve-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-bitcast.ll
index 40d57e4a451ca8..5f8fcb3d56e4b2 100644
--- a/llvm/test/CodeGen/AArch64/sve-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/sve-bitcast.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s
-; RUN: not --crash llc -mtriple=aarch64_be < %s
+; RUN: llc -mtriple=aarch64_be < %s | FileCheck %s --check-prefix=CHECK_BE
 
 ;
 ; bitcast to nxv16i8
@@ -10,6 +10,18 @@ define <vscale x 16 x i8> @bitcast_nxv8i16_to_nxv16i8(<vscale x 8 x i16> %v) #0
 ; CHECK-LABEL: bitcast_nxv8i16_to_nxv16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8i16_to_nxv16i8:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    ld1b { z0.b }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x i16> %v to <vscale x 16 x i8>
   ret <vscale x 16 x i8> %bc
 }
@@ -18,6 +30,18 @@ define <vscale x 16 x i8> @bitcast_nxv4i32_to_nxv16i8(<vscale x 4 x i32> %v) #0
 ; CHECK-LABEL: bitcast_nxv4i32_to_nxv16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4i32_to_nxv16i8:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    ld1b { z0.b }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x i32> %v to <vscale x 16 x i8>
   ret <vscale x 16 x i8> %bc
 }
@@ -26,6 +50,18 @@ define <vscale x 16 x i8> @bitcast_nxv2i64_to_nxv16i8(<vscale x 2 x i64> %v) #0
 ; CHECK-LABEL: bitcast_nxv2i64_to_nxv16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2i64_to_nxv16i8:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    ld1b { z0.b }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x i64> %v to <vscale x 16 x i8>
   ret <vscale x 16 x i8> %bc
 }
@@ -34,6 +70,18 @@ define <vscale x 16 x i8> @bitcast_nxv8f16_to_nxv16i8(<vscale x 8 x half> %v) #0
 ; CHECK-LABEL: bitcast_nxv8f16_to_nxv16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8f16_to_nxv16i8:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    ld1b { z0.b }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x half> %v to <vscale x 16 x i8>
   ret <vscale x 16 x i8> %bc
 }
@@ -42,6 +90,18 @@ define <vscale x 16 x i8> @bitcast_nxv4f32_to_nxv16i8(<vscale x 4 x float> %v) #
 ; CHECK-LABEL: bitcast_nxv4f32_to_nxv16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4f32_to_nxv16i8:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    ld1b { z0.b }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x float> %v to <vscale x 16 x i8>
   ret <vscale x 16 x i8> %bc
 }
@@ -50,6 +110,18 @@ define <vscale x 16 x i8> @bitcast_nxv2f64_to_nxv16i8(<vscale x 2 x double> %v)
 ; CHECK-LABEL: bitcast_nxv2f64_to_nxv16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2f64_to_nxv16i8:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    ld1b { z0.b }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x double> %v to <vscale x 16 x i8>
   ret <vscale x 16 x i8> %bc
 }
@@ -58,6 +130,18 @@ define <vscale x 16 x i8> @bitcast_nxv8bf16_to_nxv16i8(<vscale x 8 x bfloat> %v)
 ; CHECK-LABEL: bitcast_nxv8bf16_to_nxv16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8bf16_to_nxv16i8:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    ld1b { z0.b }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x bfloat> %v to <vscale x 16 x i8>
   ret <vscale x 16 x i8> %bc
 }
@@ -70,6 +154,18 @@ define <vscale x 8 x i16> @bitcast_nxv16i8_to_nxv8i16(<vscale x 16 x i8> %v) #0
 ; CHECK-LABEL: bitcast_nxv16i8_to_nxv8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv16i8_to_nxv8i16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 16 x i8> %v to <vscale x 8 x i16>
   ret <vscale x 8 x i16> %bc
 }
@@ -78,6 +174,18 @@ define <vscale x 8 x i16> @bitcast_nxv4i32_to_nxv8i16(<vscale x 4 x i32> %v) #0
 ; CHECK-LABEL: bitcast_nxv4i32_to_nxv8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4i32_to_nxv8i16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x i32> %v to <vscale x 8 x i16>
   ret <vscale x 8 x i16> %bc
 }
@@ -86,6 +194,18 @@ define <vscale x 8 x i16> @bitcast_nxv2i64_to_nxv8i16(<vscale x 2 x i64> %v) #0
 ; CHECK-LABEL: bitcast_nxv2i64_to_nxv8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2i64_to_nxv8i16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x i64> %v to <vscale x 8 x i16>
   ret <vscale x 8 x i16> %bc
 }
@@ -94,6 +214,17 @@ define <vscale x 8 x i16> @bitcast_nxv8f16_to_nxv8i16(<vscale x 8 x half> %v) #0
 ; CHECK-LABEL: bitcast_nxv8f16_to_nxv8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8f16_to_nxv8i16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x half> %v to <vscale x 8 x i16>
   ret <vscale x 8 x i16> %bc
 }
@@ -102,6 +233,18 @@ define <vscale x 8 x i16> @bitcast_nxv4f32_to_nxv8i16(<vscale x 4 x float> %v) #
 ; CHECK-LABEL: bitcast_nxv4f32_to_nxv8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4f32_to_nxv8i16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x float> %v to <vscale x 8 x i16>
   ret <vscale x 8 x i16> %bc
 }
@@ -110,6 +253,18 @@ define <vscale x 8 x i16> @bitcast_nxv2f64_to_nxv8i16(<vscale x 2 x double> %v)
 ; CHECK-LABEL: bitcast_nxv2f64_to_nxv8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2f64_to_nxv8i16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x double> %v to <vscale x 8 x i16>
   ret <vscale x 8 x i16> %bc
 }
@@ -118,6 +273,17 @@ define <vscale x 8 x i16> @bitcast_nxv8bf16_to_nxv8i16(<vscale x 8 x bfloat> %v)
 ; CHECK-LABEL: bitcast_nxv8bf16_to_nxv8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8bf16_to_nxv8i16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x bfloat> %v to <vscale x 8 x i16>
   ret <vscale x 8 x i16> %bc
 }
@@ -130,6 +296,18 @@ define <vscale x 4 x i32> @bitcast_nxv16i8_to_nxv4i32(<vscale x 16 x i8> %v) #0
 ; CHECK-LABEL: bitcast_nxv16i8_to_nxv4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv16i8_to_nxv4i32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 16 x i8> %v to <vscale x 4 x i32>
   ret <vscale x 4 x i32> %bc
 }
@@ -138,6 +316,18 @@ define <vscale x 4 x i32> @bitcast_nxv8i16_to_nxv4i32(<vscale x 8 x i16> %v) #0
 ; CHECK-LABEL: bitcast_nxv8i16_to_nxv4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8i16_to_nxv4i32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x i16> %v to <vscale x 4 x i32>
   ret <vscale x 4 x i32> %bc
 }
@@ -146,6 +336,18 @@ define <vscale x 4 x i32> @bitcast_nxv2i64_to_nxv4i32(<vscale x 2 x i64> %v) #0
 ; CHECK-LABEL: bitcast_nxv2i64_to_nxv4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2i64_to_nxv4i32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x i64> %v to <vscale x 4 x i32>
   ret <vscale x 4 x i32> %bc
 }
@@ -154,6 +356,18 @@ define <vscale x 4 x i32> @bitcast_nxv8f16_to_nxv4i32(<vscale x 8 x half> %v) #0
 ; CHECK-LABEL: bitcast_nxv8f16_to_nxv4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8f16_to_nxv4i32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x half> %v to <vscale x 4 x i32>
   ret <vscale x 4 x i32> %bc
 }
@@ -162,6 +376,17 @@ define <vscale x 4 x i32> @bitcast_nxv4f32_to_nxv4i32(<vscale x 4 x float> %v) #
 ; CHECK-LABEL: bitcast_nxv4f32_to_nxv4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4f32_to_nxv4i32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x float> %v to <vscale x 4 x i32>
   ret <vscale x 4 x i32> %bc
 }
@@ -170,6 +395,18 @@ define <vscale x 4 x i32> @bitcast_nxv2f64_to_nxv4i32(<vscale x 2 x double> %v)
 ; CHECK-LABEL: bitcast_nxv2f64_to_nxv4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2f64_to_nxv4i32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x double> %v to <vscale x 4 x i32>
   ret <vscale x 4 x i32> %bc
 }
@@ -178,6 +415,18 @@ define <vscale x 4 x i32> @bitcast_nxv8bf16_to_nxv4i32(<vscale x 8 x bfloat> %v)
 ; CHECK-LABEL: bitcast_nxv8bf16_to_nxv4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8bf16_to_nxv4i32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x bfloat> %v to <vscale x 4 x i32>
   ret <vscale x 4 x i32> %bc
 }
@@ -190,6 +439,18 @@ define <vscale x 2 x i64> @bitcast_nxv16i8_to_nxv2i64(<vscale x 16 x i8> %v) #0
 ; CHECK-LABEL: bitcast_nxv16i8_to_nxv2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv16i8_to_nxv2i64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 16 x i8> %v to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %bc
 }
@@ -198,6 +459,18 @@ define <vscale x 2 x i64> @bitcast_nxv8i16_to_nxv2i64(<vscale x 8 x i16> %v) #0
 ; CHECK-LABEL: bitcast_nxv8i16_to_nxv2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8i16_to_nxv2i64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x i16> %v to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %bc
 }
@@ -206,6 +479,18 @@ define <vscale x 2 x i64> @bitcast_nxv4i32_to_nxv2i64(<vscale x 4 x i32> %v) #0
 ; CHECK-LABEL: bitcast_nxv4i32_to_nxv2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4i32_to_nxv2i64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x i32> %v to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %bc
 }
@@ -214,6 +499,18 @@ define <vscale x 2 x i64> @bitcast_nxv8f16_to_nxv2i64(<vscale x 8 x half> %v) #0
 ; CHECK-LABEL: bitcast_nxv8f16_to_nxv2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8f16_to_nxv2i64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x half> %v to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %bc
 }
@@ -222,6 +519,18 @@ define <vscale x 2 x i64> @bitcast_nxv4f32_to_nxv2i64(<vscale x 4 x float> %v) #
 ; CHECK-LABEL: bitcast_nxv4f32_to_nxv2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4f32_to_nxv2i64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x float> %v to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %bc
 }
@@ -230,6 +539,17 @@ define <vscale x 2 x i64> @bitcast_nxv2f64_to_nxv2i64(<vscale x 2 x double> %v)
 ; CHECK-LABEL: bitcast_nxv2f64_to_nxv2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2f64_to_nxv2i64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x double> %v to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %bc
 }
@@ -238,6 +558,18 @@ define <vscale x 2 x i64> @bitcast_nxv8bf16_to_nxv2i64(<vscale x 8 x bfloat> %v)
 ; CHECK-LABEL: bitcast_nxv8bf16_to_nxv2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8bf16_to_nxv2i64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x bfloat> %v to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %bc
 }
@@ -250,6 +582,18 @@ define <vscale x 8 x half> @bitcast_nxv16i8_to_nxv8f16(<vscale x 16 x i8> %v) #0
 ; CHECK-LABEL: bitcast_nxv16i8_to_nxv8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv16i8_to_nxv8f16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 16 x i8> %v to <vscale x 8 x half>
   ret <vscale x 8 x half> %bc
 }
@@ -258,6 +602,17 @@ define <vscale x 8 x half> @bitcast_nxv8i16_to_nxv8f16(<vscale x 8 x i16> %v) #0
 ; CHECK-LABEL: bitcast_nxv8i16_to_nxv8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8i16_to_nxv8f16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x i16> %v to <vscale x 8 x half>
   ret <vscale x 8 x half> %bc
 }
@@ -266,6 +621,18 @@ define <vscale x 8 x half> @bitcast_nxv4i32_to_nxv8f16(<vscale x 4 x i32> %v) #0
 ; CHECK-LABEL: bitcast_nxv4i32_to_nxv8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4i32_to_nxv8f16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x i32> %v to <vscale x 8 x half>
   ret <vscale x 8 x half> %bc
 }
@@ -274,6 +641,18 @@ define <vscale x 8 x half> @bitcast_nxv2i64_to_nxv8f16(<vscale x 2 x i64> %v) #0
 ; CHECK-LABEL: bitcast_nxv2i64_to_nxv8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2i64_to_nxv8f16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x i64> %v to <vscale x 8 x half>
   ret <vscale x 8 x half> %bc
 }
@@ -282,6 +661,18 @@ define <vscale x 8 x half> @bitcast_nxv4f32_to_nxv8f16(<vscale x 4 x float> %v)
 ; CHECK-LABEL: bitcast_nxv4f32_to_nxv8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4f32_to_nxv8f16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x float> %v to <vscale x 8 x half>
   ret <vscale x 8 x half> %bc
 }
@@ -290,6 +681,18 @@ define <vscale x 8 x half> @bitcast_nxv2f64_to_nxv8f16(<vscale x 2 x double> %v)
 ; CHECK-LABEL: bitcast_nxv2f64_to_nxv8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2f64_to_nxv8f16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x double> %v to <vscale x 8 x half>
   ret <vscale x 8 x half> %bc
 }
@@ -298,6 +701,17 @@ define <vscale x 8 x half> @bitcast_nxv8bf16_to_nxv8f16(<vscale x 8 x bfloat> %v
 ; CHECK-LABEL: bitcast_nxv8bf16_to_nxv8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8bf16_to_nxv8f16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x bfloat> %v to <vscale x 8 x half>
   ret <vscale x 8 x half> %bc
 }
@@ -310,6 +724,18 @@ define <vscale x 4 x float> @bitcast_nxv16i8_to_nxv4f32(<vscale x 16 x i8> %v) #
 ; CHECK-LABEL: bitcast_nxv16i8_to_nxv4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv16i8_to_nxv4f32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 16 x i8> %v to <vscale x 4 x float>
   ret <vscale x 4 x float> %bc
 }
@@ -318,6 +744,18 @@ define <vscale x 4 x float> @bitcast_nxv8i16_to_nxv4f32(<vscale x 8 x i16> %v) #
 ; CHECK-LABEL: bitcast_nxv8i16_to_nxv4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8i16_to_nxv4f32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x i16> %v to <vscale x 4 x float>
   ret <vscale x 4 x float> %bc
 }
@@ -326,6 +764,17 @@ define <vscale x 4 x float> @bitcast_nxv4i32_to_nxv4f32(<vscale x 4 x i32> %v) #
 ; CHECK-LABEL: bitcast_nxv4i32_to_nxv4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4i32_to_nxv4f32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x i32> %v to <vscale x 4 x float>
   ret <vscale x 4 x float> %bc
 }
@@ -334,6 +783,18 @@ define <vscale x 4 x float> @bitcast_nxv2i64_to_nxv4f32(<vscale x 2 x i64> %v) #
 ; CHECK-LABEL: bitcast_nxv2i64_to_nxv4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2i64_to_nxv4f32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x i64> %v to <vscale x 4 x float>
   ret <vscale x 4 x float> %bc
 }
@@ -342,6 +803,18 @@ define <vscale x 4 x float> @bitcast_nxv8f16_to_nxv4f32(<vscale x 8 x half> %v)
 ; CHECK-LABEL: bitcast_nxv8f16_to_nxv4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8f16_to_nxv4f32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x half> %v to <vscale x 4 x float>
   ret <vscale x 4 x float> %bc
 }
@@ -350,6 +823,18 @@ define <vscale x 4 x float> @bitcast_nxv2f64_to_nxv4f32(<vscale x 2 x double> %v
 ; CHECK-LABEL: bitcast_nxv2f64_to_nxv4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2f64_to_nxv4f32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x double> %v to <vscale x 4 x float>
   ret <vscale x 4 x float> %bc
 }
@@ -358,6 +843,18 @@ define <vscale x 4 x float> @bitcast_nxv8bf16_to_nxv4f32(<vscale x 8 x bfloat> %
 ; CHECK-LABEL: bitcast_nxv8bf16_to_nxv4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8bf16_to_nxv4f32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x bfloat> %v to <vscale x 4 x float>
   ret <vscale x 4 x float> %bc
 }
@@ -370,6 +867,18 @@ define <vscale x 2 x double> @bitcast_nxv16i8_to_nxv2f64(<vscale x 16 x i8> %v)
 ; CHECK-LABEL: bitcast_nxv16i8_to_nxv2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv16i8_to_nxv2f64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 16 x i8> %v to <vscale x 2 x double>
   ret <vscale x 2 x double> %bc
 }
@@ -378,6 +887,18 @@ define <vscale x 2 x double> @bitcast_nxv8i16_to_nxv2f64(<vscale x 8 x i16> %v)
 ; CHECK-LABEL: bitcast_nxv8i16_to_nxv2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8i16_to_nxv2f64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x i16> %v to <vscale x 2 x double>
   ret <vscale x 2 x double> %bc
 }
@@ -386,6 +907,18 @@ define <vscale x 2 x double> @bitcast_nxv4i32_to_nxv2f64(<vscale x 4 x i32> %v)
 ; CHECK-LABEL: bitcast_nxv4i32_to_nxv2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4i32_to_nxv2f64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x i32> %v to <vscale x 2 x double>
   ret <vscale x 2 x double> %bc
 }
@@ -394,6 +927,17 @@ define <vscale x 2 x double> @bitcast_nxv2i64_to_nxv2f64(<vscale x 2 x i64> %v)
 ; CHECK-LABEL: bitcast_nxv2i64_to_nxv2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2i64_to_nxv2f64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x i64> %v to <vscale x 2 x double>
   ret <vscale x 2 x double> %bc
 }
@@ -402,6 +946,18 @@ define <vscale x 2 x double> @bitcast_nxv8f16_to_nxv2f64(<vscale x 8 x half> %v)
 ; CHECK-LABEL: bitcast_nxv8f16_to_nxv2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8f16_to_nxv2f64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x half> %v to <vscale x 2 x double>
   ret <vscale x 2 x double> %bc
 }
@@ -410,6 +966,18 @@ define <vscale x 2 x double> @bitcast_nxv4f32_to_nxv2f64(<vscale x 4 x float> %v
 ; CHECK-LABEL: bitcast_nxv4f32_to_nxv2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4f32_to_nxv2f64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x float> %v to <vscale x 2 x double>
   ret <vscale x 2 x double> %bc
 }
@@ -418,6 +986,18 @@ define <vscale x 2 x double> @bitcast_nxv8bf16_to_nxv2f64(<vscale x 8 x bfloat>
 ; CHECK-LABEL: bitcast_nxv8bf16_to_nxv2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8bf16_to_nxv2f64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x bfloat> %v to <vscale x 2 x double>
   ret <vscale x 2 x double> %bc
 }
@@ -430,6 +1010,18 @@ define <vscale x 8 x bfloat> @bitcast_nxv16i8_to_nxv8bf16(<vscale x 16 x i8> %v)
 ; CHECK-LABEL: bitcast_nxv16i8_to_nxv8bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv16i8_to_nxv8bf16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 16 x i8> %v to <vscale x 8 x bfloat>
   ret <vscale x 8 x bfloat> %bc
 }
@@ -438,6 +1030,17 @@ define <vscale x 8 x bfloat> @bitcast_nxv8i16_to_nxv8bf16(<vscale x 8 x i16> %v)
 ; CHECK-LABEL: bitcast_nxv8i16_to_nxv8bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8i16_to_nxv8bf16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x i16> %v to <vscale x 8 x bfloat>
   ret <vscale x 8 x bfloat> %bc
 }
@@ -446,6 +1049,18 @@ define <vscale x 8 x bfloat> @bitcast_nxv4i32_to_nxv8bf16(<vscale x 4 x i32> %v)
 ; CHECK-LABEL: bitcast_nxv4i32_to_nxv8bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4i32_to_nxv8bf16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x i32> %v to <vscale x 8 x bfloat>
   ret <vscale x 8 x bfloat> %bc
 }
@@ -454,6 +1069,18 @@ define <vscale x 8 x bfloat> @bitcast_nxv2i64_to_nxv8bf16(<vscale x 2 x i64> %v)
 ; CHECK-LABEL: bitcast_nxv2i64_to_nxv8bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2i64_to_nxv8bf16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x i64> %v to <vscale x 8 x bfloat>
   ret <vscale x 8 x bfloat> %bc
 }
@@ -462,6 +1089,17 @@ define <vscale x 8 x bfloat> @bitcast_nxv8f16_to_nxv8bf16(<vscale x 8 x half> %v
 ; CHECK-LABEL: bitcast_nxv8f16_to_nxv8bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8f16_to_nxv8bf16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x half> %v to <vscale x 8 x bfloat>
   ret <vscale x 8 x bfloat> %bc
 }
@@ -470,6 +1108,18 @@ define <vscale x 8 x bfloat> @bitcast_nxv4f32_to_nxv8bf16(<vscale x 4 x float> %
 ; CHECK-LABEL: bitcast_nxv4f32_to_nxv8bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4f32_to_nxv8bf16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x float> %v to <vscale x 8 x bfloat>
   ret <vscale x 8 x bfloat> %bc
 }
@@ -478,6 +1128,18 @@ define <vscale x 8 x bfloat> @bitcast_nxv2f64_to_nxv8bf16(<vscale x 2 x double>
 ; CHECK-LABEL: bitcast_nxv2f64_to_nxv8bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2f64_to_nxv8bf16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x double> %v to <vscale x 8 x bfloat>
   ret <vscale x 8 x bfloat> %bc
 }
@@ -498,6 +1160,18 @@ define <vscale x 8 x i8> @bitcast_nxv4i16_to_nxv8i8(<vscale x 4 x i16> %v) #0 {
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4i16_to_nxv8i8:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1b { z0.h }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x i16> %v to <vscale x 8 x i8>
   ret <vscale x 8 x i8> %bc
 }
@@ -514,6 +1188,18 @@ define <vscale x 8 x i8> @bitcast_nxv2i32_to_nxv8i8(<vscale x 2 x i32> %v) #0 {
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2i32_to_nxv8i8:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1b { z0.h }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x i32> %v to <vscale x 8 x i8>
   ret <vscale x 8 x i8> %bc
 }
@@ -523,6 +1209,19 @@ define <vscale x 8 x i8> @bitcast_nxv1i64_to_nxv8i8(<vscale x 1 x i64> %v) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv1i64_to_nxv8i8:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    ld1b { z0.b }, p0/z, [sp]
+; CHECK_BE-NEXT:    uunpklo z0.h, z0.b
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 1 x i64> %v to <vscale x 8 x i8>
   ret <vscale x 8 x i8> %bc
 }
@@ -539,6 +1238,18 @@ define <vscale x 8 x i8> @bitcast_nxv4f16_to_nxv8i8(<vscale x 4 x half> %v) #0 {
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4f16_to_nxv8i8:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1b { z0.h }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x half> %v to <vscale x 8 x i8>
   ret <vscale x 8 x i8> %bc
 }
@@ -555,6 +1266,18 @@ define <vscale x 8 x i8> @bitcast_nxv2f32_to_nxv8i8(<vscale x 2 x float> %v) #0
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2f32_to_nxv8i8:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1b { z0.h }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x float> %v to <vscale x 8 x i8>
   ret <vscale x 8 x i8> %bc
 }
@@ -564,6 +1287,19 @@ define <vscale x 8 x i8> @bitcast_nxv1f64_to_nxv8i8(<vscale x 1 x double> %v) #0
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv1f64_to_nxv8i8:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    ld1b { z0.b }, p0/z, [sp]
+; CHECK_BE-NEXT:    uunpklo z0.h, z0.b
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 1 x double> %v to <vscale x 8 x i8>
   ret <vscale x 8 x i8> %bc
 }
@@ -580,6 +1316,18 @@ define <vscale x 8 x i8> @bitcast_nxv4bf16_to_nxv8i8(<vscale x 4 x bfloat> %v) #
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4bf16_to_nxv8i8:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1b { z0.h }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x bfloat> %v to <vscale x 8 x i8>
   ret <vscale x 8 x i8> %bc
 }
@@ -600,6 +1348,18 @@ define <vscale x 4 x i16> @bitcast_nxv8i8_to_nxv4i16(<vscale x 8 x i8> %v) #0 {
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8i8_to_nxv4i16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x i8> %v to <vscale x 4 x i16>
   ret <vscale x 4 x i16> %bc
 }
@@ -616,6 +1376,18 @@ define <vscale x 4 x i16> @bitcast_nxv2i32_to_nxv4i16(<vscale x 2 x i32> %v) #0
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2i32_to_nxv4i16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x i32> %v to <vscale x 4 x i16>
   ret <vscale x 4 x i16> %bc
 }
@@ -625,6 +1397,19 @@ define <vscale x 4 x i16> @bitcast_nxv1i64_to_nxv4i16(<vscale x 1 x i64> %v) #0
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv1i64_to_nxv4i16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    uunpklo z0.s, z0.h
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 1 x i64> %v to <vscale x 4 x i16>
   ret <vscale x 4 x i16> %bc
 }
@@ -633,6 +1418,18 @@ define <vscale x 4 x i16> @bitcast_nxv4f16_to_nxv4i16(<vscale x 4 x half> %v) #0
 ; CHECK-LABEL: bitcast_nxv4f16_to_nxv4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4f16_to_nxv4i16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x half> %v to <vscale x 4 x i16>
   ret <vscale x 4 x i16> %bc
 }
@@ -649,6 +1446,18 @@ define <vscale x 4 x i16> @bitcast_nxv2f32_to_nxv4i16(<vscale x 2 x float> %v) #
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2f32_to_nxv4i16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x float> %v to <vscale x 4 x i16>
   ret <vscale x 4 x i16> %bc
 }
@@ -658,6 +1467,19 @@ define <vscale x 4 x i16> @bitcast_nxv1f64_to_nxv4i16(<vscale x 1 x double> %v)
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv1f64_to_nxv4i16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    uunpklo z0.s, z0.h
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 1 x double> %v to <vscale x 4 x i16>
   ret <vscale x 4 x i16> %bc
 }
@@ -666,6 +1488,18 @@ define <vscale x 4 x i16> @bitcast_nxv4bf16_to_nxv4i16(<vscale x 4 x bfloat> %v)
 ; CHECK-LABEL: bitcast_nxv4bf16_to_nxv4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4bf16_to_nxv4i16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x bfloat> %v to <vscale x 4 x i16>
   ret <vscale x 4 x i16> %bc
 }
@@ -686,6 +1520,18 @@ define <vscale x 2 x i32> @bitcast_nxv8i8_to_nxv2i32(<vscale x 8 x i8> %v) #0 {
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8i8_to_nxv2i32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x i8> %v to <vscale x 2 x i32>
   ret <vscale x 2 x i32> %bc
 }
@@ -702,6 +1548,18 @@ define <vscale x 2 x i32> @bitcast_nxv4i16_to_nxv2i32(<vscale x 4 x i16> %v) #0
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4i16_to_nxv2i32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x i16> %v to <vscale x 2 x i32>
   ret <vscale x 2 x i32> %bc
 }
@@ -711,6 +1569,19 @@ define <vscale x 2 x i32> @bitcast_nxv1i64_to_nxv2i32(<vscale x 1 x i64> %v) #0
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv1i64_to_nxv2i32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    uunpklo z0.d, z0.s
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 1 x i64> %v to <vscale x 2 x i32>
   ret <vscale x 2 x i32> %bc
 }
@@ -727,6 +1598,18 @@ define <vscale x 2 x i32> @bitcast_nxv4f16_to_nxv2i32(<vscale x 4 x half> %v) #0
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4f16_to_nxv2i32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x half> %v to <vscale x 2 x i32>
   ret <vscale x 2 x i32> %bc
 }
@@ -735,6 +1618,18 @@ define <vscale x 2 x i32> @bitcast_nxv2f32_to_nxv2i32(<vscale x 2 x float> %v) #
 ; CHECK-LABEL: bitcast_nxv2f32_to_nxv2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2f32_to_nxv2i32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x float> %v to <vscale x 2 x i32>
   ret <vscale x 2 x i32> %bc
 }
@@ -744,6 +1639,19 @@ define <vscale x 2 x i32> @bitcast_nxv1f64_to_nxv2i32(<vscale x 1 x double> %v)
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv1f64_to_nxv2i32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    uunpklo z0.d, z0.s
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 1 x double> %v to <vscale x 2 x i32>
   ret <vscale x 2 x i32> %bc
 }
@@ -760,6 +1668,18 @@ define <vscale x 2 x i32> @bitcast_nxv4bf16_to_nxv2i32(<vscale x 4 x bfloat> %v)
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4bf16_to_nxv2i32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x bfloat> %v to <vscale x 2 x i32>
   ret <vscale x 2 x i32> %bc
 }
@@ -773,6 +1693,19 @@ define <vscale x 1 x i64> @bitcast_nxv8i8_to_nxv1i64(<vscale x 8 x i8> %v) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8i8_to_nxv1i64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK_BE-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x i8> %v to <vscale x 1 x i64>
   ret <vscale x 1 x i64> %bc
 }
@@ -782,6 +1715,19 @@ define <vscale x 1 x i64> @bitcast_nxv4i16_to_nxv1i64(<vscale x 4 x i16> %v) #0
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4i16_to_nxv1i64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x i16> %v to <vscale x 1 x i64>
   ret <vscale x 1 x i64> %bc
 }
@@ -791,6 +1737,19 @@ define <vscale x 1 x i64> @bitcast_nxv2i32_to_nxv1i64(<vscale x 2 x i32> %v) #0
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2i32_to_nxv1i64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x i32> %v to <vscale x 1 x i64>
   ret <vscale x 1 x i64> %bc
 }
@@ -800,6 +1759,24 @@ define <vscale x 1 x i64> @bitcast_nxv4f16_to_nxv1i64(<vscale x 4 x half> %v) #0
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4f16_to_nxv1i64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-3
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.s
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
+; CHECK_BE-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp, #2, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp, #2, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #3
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x half> %v to <vscale x 1 x i64>
   ret <vscale x 1 x i64> %bc
 }
@@ -809,6 +1786,23 @@ define <vscale x 1 x i64> @bitcast_nxv2f32_to_nxv1i64(<vscale x 2 x float> %v) #
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2f32_to_nxv1i64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-3
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.d
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp]
+; CHECK_BE-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp, #2, mul vl]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp, #2, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #3
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x float> %v to <vscale x 1 x i64>
   ret <vscale x 1 x i64> %bc
 }
@@ -817,6 +1811,17 @@ define <vscale x 1 x i64> @bitcast_nxv1f64_to_nxv1i64(<vscale x 1 x double> %v)
 ; CHECK-LABEL: bitcast_nxv1f64_to_nxv1i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv1f64_to_nxv1i64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 1 x double> %v to <vscale x 1 x i64>
   ret <vscale x 1 x i64> %bc
 }
@@ -826,6 +1831,24 @@ define <vscale x 1 x i64> @bitcast_nxv4bf16_to_nxv1i64(<vscale x 4 x bfloat> %v)
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4bf16_to_nxv1i64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-3
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.s
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
+; CHECK_BE-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp, #2, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp, #2, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #3
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x bfloat> %v to <vscale x 1 x i64>
   ret <vscale x 1 x i64> %bc
 }
@@ -846,6 +1869,18 @@ define <vscale x 4 x half> @bitcast_nxv8i8_to_nxv4f16(<vscale x 8 x i8> %v) #0 {
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8i8_to_nxv4f16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x i8> %v to <vscale x 4 x half>
   ret <vscale x 4 x half> %bc
 }
@@ -854,6 +1889,18 @@ define <vscale x 4 x half> @bitcast_nxv4i16_to_nxv4f16(<vscale x 4 x i16> %v) #0
 ; CHECK-LABEL: bitcast_nxv4i16_to_nxv4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4i16_to_nxv4f16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x i16> %v to <vscale x 4 x half>
   ret <vscale x 4 x half> %bc
 }
@@ -870,6 +1917,18 @@ define <vscale x 4 x half> @bitcast_nxv2i32_to_nxv4f16(<vscale x 2 x i32> %v) #0
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2i32_to_nxv4f16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x i32> %v to <vscale x 4 x half>
   ret <vscale x 4 x half> %bc
 }
@@ -879,6 +1938,19 @@ define <vscale x 4 x half> @bitcast_nxv1i64_to_nxv4f16(<vscale x 1 x i64> %v) #0
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv1i64_to_nxv4f16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    uunpklo z0.s, z0.h
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 1 x i64> %v to <vscale x 4 x half>
   ret <vscale x 4 x half> %bc
 }
@@ -895,6 +1967,18 @@ define <vscale x 4 x half> @bitcast_nxv2f32_to_nxv4f16(<vscale x 2 x float> %v)
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2f32_to_nxv4f16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x float> %v to <vscale x 4 x half>
   ret <vscale x 4 x half> %bc
 }
@@ -904,6 +1988,19 @@ define <vscale x 4 x half> @bitcast_nxv1f64_to_nxv4f16(<vscale x 1 x double> %v)
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv1f64_to_nxv4f16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    uunpklo z0.s, z0.h
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 1 x double> %v to <vscale x 4 x half>
   ret <vscale x 4 x half> %bc
 }
@@ -912,6 +2009,17 @@ define <vscale x 4 x half> @bitcast_nxv4bf16_to_nxv4f16(<vscale x 4 x bfloat> %v
 ; CHECK-LABEL: bitcast_nxv4bf16_to_nxv4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4bf16_to_nxv4f16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x bfloat> %v to <vscale x 4 x half>
   ret <vscale x 4 x half> %bc
 }
@@ -932,6 +2040,18 @@ define <vscale x 2 x float> @bitcast_nxv8i8_to_nxv2f32(<vscale x 8 x i8> %v) #0
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8i8_to_nxv2f32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x i8> %v to <vscale x 2 x float>
   ret <vscale x 2 x float> %bc
 }
@@ -948,6 +2068,18 @@ define <vscale x 2 x float> @bitcast_nxv4i16_to_nxv2f32(<vscale x 4 x i16> %v) #
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4i16_to_nxv2f32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x i16> %v to <vscale x 2 x float>
   ret <vscale x 2 x float> %bc
 }
@@ -956,6 +2088,18 @@ define <vscale x 2 x float> @bitcast_nxv2i32_to_nxv2f32(<vscale x 2 x i32> %v) #
 ; CHECK-LABEL: bitcast_nxv2i32_to_nxv2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2i32_to_nxv2f32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x i32> %v to <vscale x 2 x float>
   ret <vscale x 2 x float> %bc
 }
@@ -965,6 +2109,19 @@ define <vscale x 2 x float> @bitcast_nxv1i64_to_nxv2f32(<vscale x 1 x i64> %v) #
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv1i64_to_nxv2f32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    uunpklo z0.d, z0.s
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 1 x i64> %v to <vscale x 2 x float>
   ret <vscale x 2 x float> %bc
 }
@@ -981,6 +2138,18 @@ define <vscale x 2 x float> @bitcast_nxv4f16_to_nxv2f32(<vscale x 4 x half> %v)
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4f16_to_nxv2f32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x half> %v to <vscale x 2 x float>
   ret <vscale x 2 x float> %bc
 }
@@ -990,6 +2159,19 @@ define <vscale x 2 x float> @bitcast_nxv1f64_to_nxv2f32(<vscale x 1 x double> %v
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv1f64_to_nxv2f32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    uunpklo z0.d, z0.s
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 1 x double> %v to <vscale x 2 x float>
   ret <vscale x 2 x float> %bc
 }
@@ -1006,6 +2188,18 @@ define <vscale x 2 x float> @bitcast_nxv4bf16_to_nxv2f32(<vscale x 4 x bfloat> %
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4bf16_to_nxv2f32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x bfloat> %v to <vscale x 2 x float>
   ret <vscale x 2 x float> %bc
 }
@@ -1019,6 +2213,19 @@ define <vscale x 1 x double> @bitcast_nxv8i8_to_nxv1f64(<vscale x 8 x i8> %v) #0
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8i8_to_nxv1f64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK_BE-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x i8> %v to <vscale x 1 x double>
   ret <vscale x 1 x double> %bc
 }
@@ -1028,6 +2235,19 @@ define <vscale x 1 x double> @bitcast_nxv4i16_to_nxv1f64(<vscale x 4 x i16> %v)
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4i16_to_nxv1f64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x i16> %v to <vscale x 1 x double>
   ret <vscale x 1 x double> %bc
 }
@@ -1037,6 +2257,19 @@ define <vscale x 1 x double> @bitcast_nxv2i32_to_nxv1f64(<vscale x 2 x i32> %v)
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2i32_to_nxv1f64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x i32> %v to <vscale x 1 x double>
   ret <vscale x 1 x double> %bc
 }
@@ -1045,6 +2278,17 @@ define <vscale x 1 x double> @bitcast_nxv1i64_to_nxv1f64(<vscale x 1 x i64> %v)
 ; CHECK-LABEL: bitcast_nxv1i64_to_nxv1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv1i64_to_nxv1f64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 1 x i64> %v to <vscale x 1 x double>
   ret <vscale x 1 x double> %bc
 }
@@ -1054,6 +2298,24 @@ define <vscale x 1 x double> @bitcast_nxv4f16_to_nxv1f64(<vscale x 4 x half> %v)
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4f16_to_nxv1f64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-3
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.s
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
+; CHECK_BE-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp, #2, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp, #2, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #3
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x half> %v to <vscale x 1 x double>
   ret <vscale x 1 x double> %bc
 }
@@ -1063,6 +2325,23 @@ define <vscale x 1 x double> @bitcast_nxv2f32_to_nxv1f64(<vscale x 2 x float> %v
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2f32_to_nxv1f64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-3
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.d
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp]
+; CHECK_BE-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp, #2, mul vl]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp, #2, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #3
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x float> %v to <vscale x 1 x double>
   ret <vscale x 1 x double> %bc
 }
@@ -1072,6 +2351,24 @@ define <vscale x 1 x double> @bitcast_nxv4bf16_to_nxv1f64(<vscale x 4 x bfloat>
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4bf16_to_nxv1f64:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-3
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.s
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
+; CHECK_BE-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp, #2, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp, #2, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #3
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x bfloat> %v to <vscale x 1 x double>
   ret <vscale x 1 x double> %bc
 }
@@ -1092,6 +2389,18 @@ define <vscale x 4 x bfloat> @bitcast_nxv8i8_to_nxv4bf16(<vscale x 8 x i8> %v) #
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv8i8_to_nxv4bf16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 8 x i8> %v to <vscale x 4 x bfloat>
   ret <vscale x 4 x bfloat> %bc
 }
@@ -1100,6 +2409,18 @@ define <vscale x 4 x bfloat> @bitcast_nxv4i16_to_nxv4bf16(<vscale x 4 x i16> %v)
 ; CHECK-LABEL: bitcast_nxv4i16_to_nxv4bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4i16_to_nxv4bf16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x i16> %v to <vscale x 4 x bfloat>
   ret <vscale x 4 x bfloat> %bc
 }
@@ -1116,6 +2437,18 @@ define <vscale x 4 x bfloat> @bitcast_nxv2i32_to_nxv4bf16(<vscale x 2 x i32> %v)
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2i32_to_nxv4bf16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x i32> %v to <vscale x 4 x bfloat>
   ret <vscale x 4 x bfloat> %bc
 }
@@ -1125,6 +2458,19 @@ define <vscale x 4 x bfloat> @bitcast_nxv1i64_to_nxv4bf16(<vscale x 1 x i64> %v)
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv1i64_to_nxv4bf16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    uunpklo z0.s, z0.h
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 1 x i64> %v to <vscale x 4 x bfloat>
   ret <vscale x 4 x bfloat> %bc
 }
@@ -1133,6 +2479,17 @@ define <vscale x 4 x bfloat> @bitcast_nxv4f16_to_nxv4bf16(<vscale x 4 x half> %v
 ; CHECK-LABEL: bitcast_nxv4f16_to_nxv4bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4f16_to_nxv4bf16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x half> %v to <vscale x 4 x bfloat>
   ret <vscale x 4 x bfloat> %bc
 }
@@ -1149,6 +2506,18 @@ define <vscale x 4 x bfloat> @bitcast_nxv2f32_to_nxv4bf16(<vscale x 2 x float> %
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2f32_to_nxv4bf16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x float> %v to <vscale x 4 x bfloat>
   ret <vscale x 4 x bfloat> %bc
 }
@@ -1158,6 +2527,19 @@ define <vscale x 4 x bfloat> @bitcast_nxv1f64_to_nxv4bf16(<vscale x 1 x double>
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv1f64_to_nxv4bf16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    uunpklo z0.s, z0.h
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 1 x double> %v to <vscale x 4 x bfloat>
   ret <vscale x 4 x bfloat> %bc
 }
@@ -1178,6 +2560,18 @@ define <vscale x 4 x i8> @bitcast_nxv2i16_to_nxv4i8(<vscale x 2 x i16> %v) #0 {
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2i16_to_nxv4i8:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1h { z0.d }, p0, [sp, #3, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1b { z0.s }, p0/z, [sp, #3, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x i16> %v to <vscale x 4 x i8>
   ret <vscale x 4 x i8> %bc
 }
@@ -1188,6 +2582,20 @@ define <vscale x 4 x i8> @bitcast_nxv1i32_to_nxv4i8(<vscale x 1 x i32> %v) #0 {
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv1i32_to_nxv4i8:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    ld1b { z0.b }, p0/z, [sp]
+; CHECK_BE-NEXT:    uunpklo z0.h, z0.b
+; CHECK_BE-NEXT:    uunpklo z0.s, z0.h
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 1 x i32> %v to <vscale x 4 x i8>
   ret <vscale x 4 x i8> %bc
 }
@@ -1204,6 +2612,18 @@ define <vscale x 4 x i8> @bitcast_nxv2f16_to_nxv4i8(<vscale x 2 x half> %v) #0 {
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2f16_to_nxv4i8:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1h { z0.d }, p0, [sp, #3, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1b { z0.s }, p0/z, [sp, #3, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x half> %v to <vscale x 4 x i8>
   ret <vscale x 4 x i8> %bc
 }
@@ -1222,6 +2642,18 @@ define <vscale x 4 x i8> @bitcast_nxv2bf16_to_nxv4i8(<vscale x 2 x bfloat> %v) #
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2bf16_to_nxv4i8:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1h { z0.d }, p0, [sp, #3, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1b { z0.s }, p0/z, [sp, #3, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x bfloat> %v to <vscale x 4 x i8>
   ret <vscale x 4 x i8> %bc
 }
@@ -1242,6 +2674,18 @@ define <vscale x 2 x i16> @bitcast_nxv4i8_to_nxv2i16(<vscale x 4 x i8> %v) #0 {
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4i8_to_nxv2i16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1b { z0.s }, p0, [sp, #3, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1h { z0.d }, p0/z, [sp, #3, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x i8> %v to <vscale x 2 x i16>
   ret <vscale x 2 x i16> %bc
 }
@@ -1252,6 +2696,20 @@ define <vscale x 2 x i16> @bitcast_nxv1i32_to_nxv2i16(<vscale x 1 x i32> %v) #0
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv1i32_to_nxv2i16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    uunpklo z0.s, z0.h
+; CHECK_BE-NEXT:    uunpklo z0.d, z0.s
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 1 x i32> %v to <vscale x 2 x i16>
   ret <vscale x 2 x i16> %bc
 }
@@ -1260,6 +2718,18 @@ define <vscale x 2 x i16> @bitcast_nxv2f16_to_nxv2i16(<vscale x 2 x half> %v) #0
 ; CHECK-LABEL: bitcast_nxv2f16_to_nxv2i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2f16_to_nxv2i16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x half> %v to <vscale x 2 x i16>
   ret <vscale x 2 x i16> %bc
 }
@@ -1270,6 +2740,18 @@ define <vscale x 2 x i16> @bitcast_nxv2bf16_to_nxv2i16(<vscale x 2 x bfloat> %v)
 ; CHECK-LABEL: bitcast_nxv2bf16_to_nxv2i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2bf16_to_nxv2i16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x bfloat> %v to <vscale x 2 x i16>
   ret <vscale x 2 x i16> %bc
 }
@@ -1284,6 +2766,20 @@ define <vscale x 1 x i32> @bitcast_nxv4i8_to_nxv1i32(<vscale x 4 x i8> %v) #0 {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4i8_to_nxv1i32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK_BE-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x i8> %v to <vscale x 1 x i32>
   ret <vscale x 1 x i32> %bc
 }
@@ -1294,6 +2790,20 @@ define <vscale x 1 x i32> @bitcast_nxv2i16_to_nxv1i32(<vscale x 2 x i16> %v) #0
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2i16_to_nxv1i32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x i16> %v to <vscale x 1 x i32>
   ret <vscale x 1 x i32> %bc
 }
@@ -1310,6 +2820,21 @@ define <vscale x 1 x i32> @bitcast_nxv2f16_to_nxv1i32(<vscale x 2 x half> %v) #0
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2f16_to_nxv1i32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-2
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1h { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #2
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x half> %v to <vscale x 1 x i32>
   ret <vscale x 1 x i32> %bc
 }
@@ -1328,6 +2853,21 @@ define <vscale x 1 x i32> @bitcast_nxv2bf16_to_nxv1i32(<vscale x 2 x bfloat> %v)
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2bf16_to_nxv1i32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-2
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1h { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #2
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x bfloat> %v to <vscale x 1 x i32>
   ret <vscale x 1 x i32> %bc
 }
@@ -1348,6 +2888,18 @@ define <vscale x 2 x half> @bitcast_nxv4i8_to_nxv2f16(<vscale x 4 x i8> %v) #0 {
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4i8_to_nxv2f16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1b { z0.s }, p0, [sp, #3, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1h { z0.d }, p0/z, [sp, #3, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x i8> %v to <vscale x 2 x half>
   ret <vscale x 2 x half> %bc
 }
@@ -1356,6 +2908,18 @@ define <vscale x 2 x half> @bitcast_nxv2i16_to_nxv2f16(<vscale x 2 x i16> %v) #0
 ; CHECK-LABEL: bitcast_nxv2i16_to_nxv2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2i16_to_nxv2f16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x i16> %v to <vscale x 2 x half>
   ret <vscale x 2 x half> %bc
 }
@@ -1367,6 +2931,17 @@ define <vscale x 2 x half> @bitcast_nxv2bf16_to_nxv2f16(<vscale x 2 x bfloat> %v
 ; CHECK-LABEL: bitcast_nxv2bf16_to_nxv2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2bf16_to_nxv2f16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1h { z0.d }, p0, [sp, #3, mul vl]
+; CHECK_BE-NEXT:    ld1h { z0.d }, p0/z, [sp, #3, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x bfloat> %v to <vscale x 2 x half>
   ret <vscale x 2 x half> %bc
 }
@@ -1397,6 +2972,18 @@ define <vscale x 2 x bfloat> @bitcast_nxv4i8_to_nxv2bf16(<vscale x 4 x i8> %v) #
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv4i8_to_nxv2bf16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    st1b { z0.s }, p0, [sp, #3, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1h { z0.d }, p0/z, [sp, #3, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 4 x i8> %v to <vscale x 2 x bfloat>
   ret <vscale x 2 x bfloat> %bc
 }
@@ -1405,6 +2992,18 @@ define <vscale x 2 x bfloat> @bitcast_nxv2i16_to_nxv2bf16(<vscale x 2 x i16> %v)
 ; CHECK-LABEL: bitcast_nxv2i16_to_nxv2bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2i16_to_nxv2bf16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x i16> %v to <vscale x 2 x bfloat>
   ret <vscale x 2 x bfloat> %bc
 }
@@ -1415,6 +3014,17 @@ define <vscale x 2 x bfloat> @bitcast_nxv2f16_to_nxv2bf16(<vscale x 2 x half> %v
 ; CHECK-LABEL: bitcast_nxv2f16_to_nxv2bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2f16_to_nxv2bf16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    st1h { z0.d }, p0, [sp, #3, mul vl]
+; CHECK_BE-NEXT:    ld1h { z0.d }, p0/z, [sp, #3, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x half> %v to <vscale x 2 x bfloat>
   ret <vscale x 2 x bfloat> %bc
 }
@@ -1432,6 +3042,21 @@ define <vscale x 2 x i8> @bitcast_nxv1i16_to_nxv2i8(<vscale x 1 x i16> %v) #0 {
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv1i16_to_nxv2i8:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    ld1b { z0.b }, p0/z, [sp]
+; CHECK_BE-NEXT:    uunpklo z0.h, z0.b
+; CHECK_BE-NEXT:    uunpklo z0.s, z0.h
+; CHECK_BE-NEXT:    uunpklo z0.d, z0.s
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 1 x i16> %v to <vscale x 2 x i8>
   ret <vscale x 2 x i8> %bc
 }
@@ -1450,6 +3075,21 @@ define <vscale x 1 x i16> @bitcast_nxv2i8_to_nxv1i16(<vscale x 2 x i8> %v) #0 {
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_nxv2i8_to_nxv1i16:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK_BE-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK_BE-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %bc = bitcast <vscale x 2 x i8> %v to <vscale x 1 x i16>
   ret <vscale x 1 x i16> %bc
 }
@@ -1483,6 +3123,19 @@ define <vscale x 2 x i32> @bitcast_short_float_to_i32(<vscale x 2 x double> %v)
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.d
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_short_float_to_i32:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.s
+; CHECK_BE-NEXT:    fcvt z0.s, p0/m, z0.d
+; CHECK_BE-NEXT:    st1w { z0.s }, p1, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %trunc = fptrunc <vscale x 2 x double> %v to <vscale x 2 x float>
   %bitcast = bitcast <vscale x 2 x float> %trunc to <vscale x 2 x i32>
   ret <vscale x 2 x i32> %bitcast
@@ -1494,6 +3147,19 @@ define <vscale x 2 x double> @bitcast_short_i32_to_float(<vscale x 2 x i64> %v)
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z0.s
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_short_i32_to_float:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.s
+; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
+; CHECK_BE-NEXT:    fcvt z0.d, p0/m, z0.s
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %trunc = trunc <vscale x 2 x i64> %v to <vscale x 2 x i32>
   %bitcast = bitcast <vscale x 2 x i32> %trunc to <vscale x 2 x float>
   %extended = fpext <vscale x 2 x float> %bitcast to <vscale x 2 x double>
@@ -1513,6 +3179,19 @@ define <vscale x 2 x float> @bitcast_short_half_to_float(<vscale x 4 x half> %v)
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK_BE-LABEL: bitcast_short_half_to_float:
+; CHECK_BE:       // %bb.0:
+; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK_BE-NEXT:    addvl sp, sp, #-1
+; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    fadd z0.h, p0/m, z0.h, z0.h
+; CHECK_BE-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    addvl sp, sp, #1
+; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK_BE-NEXT:    ret
   %add = fadd <vscale x 4 x half> %v, %v
   %bitcast = bitcast <vscale x 4 x half> %add to <vscale x 2 x float>
   ret <vscale x 2 x float> %bitcast



