[llvm] 3ddc9f0 - [AArch64] Additional shuffle subvector-extract cost tests. NFC

Thu Jan 2 02:13:55 PST 2025

Author: David Green
Date: 2025-01-02T10:13:51Z
New Revision: 3ddc9f06ae61e916b333b096cef3560f0f5c6272

URL: https://github.com/llvm/llvm-project/commit/3ddc9f06ae61e916b333b096cef3560f0f5c6272
DIFF: https://github.com/llvm/llvm-project/commit/3ddc9f06ae61e916b333b096cef3560f0f5c6272.diff

LOG: [AArch64] Additional shuffle subvector-extract cost tests. NFC

A Phase Ordering test for intrinsic shuffles is also added, showing a recent
regression from vector combining.

Added: 
    llvm/test/Analysis/CostModel/AArch64/shuffle-extract.ll
    llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll

Modified: 
    llvm/test/CodeGen/AArch64/shuffle-select.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-extract.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-extract.ll
new file mode 100644
index 00000000000000..50356196b83810

--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-extract.ll
@@ -0,0 +1,174 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=aarch64--linux-gnu -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+define void @extract_half() {
+; CHECK-LABEL: 'extract_half'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i8_lo = shufflevector <2 x i8> poison, <2 x i8> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_hi = shufflevector <2 x i8> poison, <2 x i8> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i8_lo = shufflevector <4 x i8> poison, <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_mi = shufflevector <4 x i8> poison, <4 x i8> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_hi = shufflevector <4 x i8> poison, <4 x i8> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i8_lo = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_mi = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_hi = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i8_lo = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i16_lo = shufflevector <2 x i16> poison, <2 x i16> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_hi = shufflevector <2 x i16> poison, <2 x i16> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i16_lo = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_mi = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_hi = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i16_lo = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i16_lo = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i32_lo = shufflevector <2 x i32> poison, <2 x i32> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_hi = shufflevector <2 x i32> poison, <2 x i32> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i32_lo = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i32_lo = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i32_lo = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i64_lo = shufflevector <2 x i64> poison, <2 x i64> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_hi = shufflevector <2 x i64> poison, <2 x i64> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i64_lo = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i64_lo = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_hi = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; Each shuffle below extracts a contiguous half-length subvector: _lo starts at element 0, _mi a quarter of the way in, _hi at the midpoint.
+  %v2i8_lo = shufflevector <2 x i8> poison, <2 x i8> poison, <1 x i32> <i32 0>
+  %v2i8_hi = shufflevector <2 x i8> poison, <2 x i8> poison, <1 x i32> <i32 1>
+  %v4i8_lo = shufflevector <4 x i8> poison, <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+  %v4i8_mi = shufflevector <4 x i8> poison, <4 x i8> poison, <2 x i32> <i32 1, i32 2>
+  %v4i8_hi = shufflevector <4 x i8> poison, <4 x i8> poison, <2 x i32> <i32 2, i32 3>
+  %v8i8_lo = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v8i8_mi = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %v8i8_hi = shufflevector <8 x i8> poison, <8 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v16i8_lo = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ; Same lo/mi/hi extracts with i16 elements.
+  %v2i16_lo = shufflevector <2 x i16> poison, <2 x i16> poison, <1 x i32> <i32 0>
+  %v2i16_hi = shufflevector <2 x i16> poison, <2 x i16> poison, <1 x i32> <i32 1>
+  %v4i16_lo = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+  %v4i16_mi = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> <i32 1, i32 2>
+  %v4i16_hi = shufflevector <4 x i16> poison, <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+  %v8i16_lo = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v16i16_lo = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ; Same lo/mi/hi extracts with i32 elements.
+  %v2i32_lo = shufflevector <2 x i32> poison, <2 x i32> poison, <1 x i32> <i32 0>
+  %v2i32_hi = shufflevector <2 x i32> poison, <2 x i32> poison, <1 x i32> <i32 1>
+  %v4i32_lo = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+  %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+  %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+  %v8i32_lo = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v16i32_lo = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ; Same lo/mi/hi extracts with i64 elements (this group has no 16-element source).
+  %v2i64_lo = shufflevector <2 x i64> poison, <2 x i64> poison, <1 x i32> <i32 0>
+  %v2i64_hi = shufflevector <2 x i64> poison, <2 x i64> poison, <1 x i32> <i32 1>
+  %v4i64_lo = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 0, i32 1>
+  %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 1, i32 2>
+  %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <2 x i32> <i32 2, i32 3>
+  %v8i64_lo = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %v8i64_hi = shufflevector <8 x i64> poison, <8 x i64> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+  ret void
+}
+
+define void @extract_qtr() {
+; CHECK-LABEL: 'extract_qtr'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i8_lo = shufflevector <4 x i8> poison, <4 x i8> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_mi = shufflevector <4 x i8> poison, <4 x i8> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_hi = shufflevector <4 x i8> poison, <4 x i8> poison, <1 x i32> <i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i8_lo = shufflevector <8 x i8> poison, <8 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_mi = shufflevector <8 x i8> poison, <8 x i8> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_hi = shufflevector <8 x i8> poison, <8 x i8> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i8_lo = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i16_lo = shufflevector <4 x i16> poison, <4 x i16> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_mi = shufflevector <4 x i16> poison, <4 x i16> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_hi = shufflevector <4 x i16> poison, <4 x i16> poison, <1 x i32> <i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i16_lo = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i16_lo = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i32_lo = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i32_lo = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16i32_lo = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4i64_lo = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8i64_lo = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_hi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; Each shuffle below extracts a contiguous quarter-length subvector: _lo is the first quarter, _mi the second, _hi the third (the last quarter is not exercised).
+  %v4i8_lo = shufflevector <4 x i8> poison, <4 x i8> poison, <1 x i32> <i32 0>
+  %v4i8_mi = shufflevector <4 x i8> poison, <4 x i8> poison, <1 x i32> <i32 1>
+  %v4i8_hi = shufflevector <4 x i8> poison, <4 x i8> poison, <1 x i32> <i32 2>
+  %v8i8_lo = shufflevector <8 x i8> poison, <8 x i8> poison, <2 x i32> <i32 0, i32 1>
+  %v8i8_mi = shufflevector <8 x i8> poison, <8 x i8> poison, <2 x i32> <i32 2, i32 3>
+  %v8i8_hi = shufflevector <8 x i8> poison, <8 x i8> poison, <2 x i32> <i32 4, i32 5>
+  %v16i8_lo = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v16i8_mi = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v16i8_hi = shufflevector <16 x i8> poison, <16 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  ; Same first/second/third-quarter extracts with i16 elements.
+  %v4i16_lo = shufflevector <4 x i16> poison, <4 x i16> poison, <1 x i32> <i32 0>
+  %v4i16_mi = shufflevector <4 x i16> poison, <4 x i16> poison, <1 x i32> <i32 1>
+  %v4i16_hi = shufflevector <4 x i16> poison, <4 x i16> poison, <1 x i32> <i32 2>
+  %v8i16_lo = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+  %v8i16_mi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+  %v8i16_hi = shufflevector <8 x i16> poison, <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+  %v16i16_lo = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v16i16_mi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v16i16_hi = shufflevector <16 x i16> poison, <16 x i16> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  ; Same first/second/third-quarter extracts with i32 elements.
+  %v4i32_lo = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 0>
+  %v4i32_mi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 1>
+  %v4i32_hi = shufflevector <4 x i32> poison, <4 x i32> poison, <1 x i32> <i32 2>
+  %v8i32_lo = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 0, i32 1>
+  %v8i32_mi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 2, i32 3>
+  %v8i32_hi = shufflevector <8 x i32> poison, <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+  %v16i32_lo = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v16i32_mi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v16i32_hi = shufflevector <16 x i32> poison, <16 x i32> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  ; Same first/second/third-quarter extracts with i64 elements (this group has no 16-element source).
+  %v4i64_lo = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 0>
+  %v4i64_mi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 1>
+  %v4i64_hi = shufflevector <4 x i64> poison, <4 x i64> poison, <1 x i32> <i32 2>
+  %v8i64_lo = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 0, i32 1>
+  %v8i64_mi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 2, i32 3>
+  %v8i64_hi = shufflevector <8 x i64> poison, <8 x i64> poison, <2 x i32> <i32 4, i32 5>
+
+  ret void
+}

diff  --git a/llvm/test/CodeGen/AArch64/shuffle-select.ll b/llvm/test/CodeGen/AArch64/shuffle-select.ll
index 25a935f067bd6c..eeccaa170397df 100644
--- a/llvm/test/CodeGen/AArch64/shuffle-select.ll
+++ b/llvm/test/CodeGen/AArch64/shuffle-select.ll
@@ -28,6 +28,32 @@ define <16 x i8> @sel_v16i8(<16 x i8> %v0, <16 x i8> %v1) {
   ret <16 x i8> %tmp0
 }
 
+define <16 x i8> @sel_v16i8_poison(<16 x i8> %v0, <16 x i8> %v1) {
+; CHECK-LABEL: sel_v16i8_poison:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI2_0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI2_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 22, i32 23, i32 24, i32 25, i32 26, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> ; lanes 0-5 from %v0, lanes 6-10 from the same lanes of %v1 (mask index = 16 + lane), lanes 11-15 poison
+  ret <16 x i8> %tmp0
+}
+
+define <16 x i8> @sel_v16i8_unregular(<16 x i8> %v0, <16 x i8> %v1) {
+; CHECK-LABEL: sel_v16i8_unregular:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI3_0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-NEXT:    ret
+  %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 27, i32 28, i32 29, i32 30, i32 31> ; lanes 0-10 from %v0, lanes 11-15 from the same lanes of %v1 (mask index = 16 + lane)
+  ret <16 x i8> %tmp0
+}
+
 define <4 x i16> @sel_v4i16(<4 x i16> %v0, <4 x i16> %v1) {
 ; CHECK-LABEL: sel_v4i16:
 ; CHECK:       // %bb.0:
@@ -41,9 +67,9 @@ define <4 x i16> @sel_v4i16(<4 x i16> %v0, <4 x i16> %v1) {
 define <8 x i16> @sel_v8i16(<8 x i16> %v0, <8 x i16> %v1) {
 ; CHECK-LABEL: sel_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI3_0
+; CHECK-NEXT:    adrp x8, .LCPI5_0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI5_0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret
@@ -95,9 +121,9 @@ define <4 x half> @sel_v4f16(<4 x half> %v0, <4 x half> %v1) {
 define <8 x half> @sel_v8f16(<8 x half> %v0, <8 x half> %v1) {
 ; CHECK-LABEL: sel_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI8_0
+; CHECK-NEXT:    adrp x8, .LCPI10_0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI8_0]
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI10_0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll
new file mode 100644
index 00000000000000..7d9524420286d6
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll
@@ -0,0 +1,804 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='default<O3>' -S %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64"
+
+%struct.cmplx_int16_t = type { i16, i16 } ; 4-byte pair of i16; field 0 is read through a value named %re later in this file (presumably real part then imaginary part - confirm against the original C++)
+%struct.compressed_data_8bit = type { i8, [24 x i8] } ; field 0 is read through %exp and field 1 through %mantissa (three 8-byte vector loads) in the function below
+
+define dso_local noundef i32 @_Z33block_scaling_decompr_8bitjPK27compressed_data_8bitP20cmplx_int16_tPKS2_(i32 noundef %n_prb, ptr noundef %src, ptr noundef %dst, ptr noundef %scale) #0 {
+; CHECK-LABEL: define dso_local noundef i32 @_Z33block_scaling_decompr_8bitjPK27compressed_data_8bitP20cmplx_int16_tPKS2_(
+; CHECK-SAME: i32 noundef [[N_PRB:%.*]], ptr nocapture noundef readonly [[SRC:%.*]], ptr nocapture noundef writeonly [[DST:%.*]], ptr noundef readonly [[SCALE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP47_NOT:%.*]] = icmp eq i32 [[N_PRB]], 0
+; CHECK-NEXT:    br i1 [[CMP47_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY_LR_PH:.*]]
+; CHECK:       [[FOR_BODY_LR_PH]]:
+; CHECK-NEXT:    [[CMP31_NOT:%.*]] = icmp eq ptr [[SCALE]], null
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT58:%.*]] = zext i32 [[N_PRB]] to i64
+; CHECK-NEXT:    br i1 [[CMP31_NOT]], label %[[FOR_BODY_US:.*]], label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY_US]]:
+; CHECK-NEXT:    [[INDVARS_IV55:%.*]] = phi i64 [ [[INDVARS_IV_NEXT56:%.*]], %[[FOR_BODY_US]] ], [ 0, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[DST_ADDR_052_US:%.*]] = phi ptr [ [[DST_ADDR_1_US:%.*]], %[[FOR_BODY_US]] ], [ [[DST]], %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds nuw [[STRUCT_COMPRESSED_DATA_8BIT:%.*]], ptr [[SRC]], i64 [[INDVARS_IV55]]
+; CHECK-NEXT:    [[MANTISSA_US:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[MANTISSA_US]], align 1
+; CHECK-NEXT:    [[VMOVL_I59_US:%.*]] = sext <8 x i8> [[TMP0]] to <8 x i16>
+; CHECK-NEXT:    [[ARRAYIDX7_US:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 9
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX7_US]], align 1
+; CHECK-NEXT:    [[VMOVL_I56_US:%.*]] = sext <8 x i8> [[TMP1]] to <8 x i16>
+; CHECK-NEXT:    [[ARRAYIDX15_US:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 17
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX15_US]], align 1
+; CHECK-NEXT:    [[VMOVL_I_US:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX_US]], align 1
+; CHECK-NEXT:    [[CONV_US:%.*]] = sext i8 [[TMP3]] to i16
+; CHECK-NEXT:    [[MUL_US:%.*]] = shl nsw i16 [[CONV_US]], 1
+; CHECK-NEXT:    [[VECINIT_I79_US:%.*]] = insertelement <8 x i16> poison, i16 [[MUL_US]], i64 0
+; CHECK-NEXT:    [[VECINIT7_I86_US:%.*]] = shufflevector <8 x i16> [[VECINIT_I79_US]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[MUL_I87_US:%.*]] = mul <8 x i16> [[VECINIT7_I86_US]], [[VMOVL_I59_US]]
+; CHECK-NEXT:    [[MUL_I74_US:%.*]] = mul <8 x i16> [[VECINIT7_I86_US]], [[VMOVL_I56_US]]
+; CHECK-NEXT:    [[MUL_I_US:%.*]] = mul <8 x i16> [[VECINIT7_I86_US]], [[VMOVL_I_US]]
+; CHECK-NEXT:    store <8 x i16> [[MUL_I87_US]], ptr [[DST_ADDR_052_US]], align 2
+; CHECK-NEXT:    [[ADD_PTR47_US:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052_US]], i64 16
+; CHECK-NEXT:    store <8 x i16> [[MUL_I74_US]], ptr [[ADD_PTR47_US]], align 2
+; CHECK-NEXT:    [[ADD_PTR50_US:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052_US]], i64 32
+; CHECK-NEXT:    store <8 x i16> [[MUL_I_US]], ptr [[ADD_PTR50_US]], align 2
+; CHECK-NEXT:    [[DST_ADDR_1_US]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052_US]], i64 48
+; CHECK-NEXT:    [[INDVARS_IV_NEXT56]] = add nuw nsw i64 [[INDVARS_IV55]], 1
+; CHECK-NEXT:    [[EXITCOND59_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT56]], [[WIDE_TRIP_COUNT58]]
+; CHECK-NEXT:    br i1 [[EXITCOND59_NOT]], label %[[FOR_END]], label %[[FOR_BODY_US]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[DST_ADDR_052:%.*]] = phi ptr [ [[DST_ADDR_1:%.*]], %[[FOR_BODY]] ], [ [[DST]], %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[AGG_TMP_COERCE_050:%.*]] = phi i64 [ [[AGG_TMP_COERCE_0_INSERT_INSERT:%.*]], %[[FOR_BODY]] ], [ undef, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[AGG_TMP42_COERCE_049:%.*]] = phi i64 [ [[AGG_TMP42_COERCE_0_INSERT_INSERT:%.*]], %[[FOR_BODY]] ], [ undef, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[AGG_TMP37_COERCE_048:%.*]] = phi i64 [ [[AGG_TMP37_COERCE_0_INSERT_INSERT:%.*]], %[[FOR_BODY]] ], [ undef, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [[STRUCT_COMPRESSED_DATA_8BIT]], ptr [[SRC]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[MANTISSA:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, ptr [[MANTISSA]], align 1
+; CHECK-NEXT:    [[VMOVL_I59:%.*]] = sext <8 x i8> [[TMP4]] to <8 x i16>
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 9
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX7]], align 1
+; CHECK-NEXT:    [[VMOVL_I56:%.*]] = sext <8 x i8> [[TMP5]] to <8 x i16>
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 17
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX15]], align 1
+; CHECK-NEXT:    [[VMOVL_I:%.*]] = sext <8 x i8> [[TMP6]] to <8 x i16>
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[TMP7]] to i16
+; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i16 [[CONV]], 1
+; CHECK-NEXT:    [[VECINIT_I79:%.*]] = insertelement <8 x i16> poison, i16 [[MUL]], i64 0
+; CHECK-NEXT:    [[VECINIT7_I86:%.*]] = shufflevector <8 x i16> [[VECINIT_I79]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[MUL_I87:%.*]] = mul <8 x i16> [[VECINIT7_I86]], [[VMOVL_I59]]
+; CHECK-NEXT:    [[MUL_I74:%.*]] = mul <8 x i16> [[VECINIT7_I86]], [[VMOVL_I56]]
+; CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[VECINIT7_I86]], [[VMOVL_I]]
+; CHECK-NEXT:    [[AGG_TMP_SROA_0_0_COPYLOAD:%.*]] = load i32, ptr [[SCALE]], align 2
+; CHECK-NEXT:    [[AGG_TMP_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[AGG_TMP_SROA_0_0_COPYLOAD]] to i64
+; CHECK-NEXT:    [[AGG_TMP_COERCE_0_INSERT_MASK:%.*]] = and i64 [[AGG_TMP_COERCE_050]], -4294967296
+; CHECK-NEXT:    [[AGG_TMP_COERCE_0_INSERT_INSERT]] = or disjoint i64 [[AGG_TMP_COERCE_0_INSERT_MASK]], [[AGG_TMP_COERCE_0_INSERT_EXT]]
+; CHECK-NEXT:    [[CALL33:%.*]] = tail call fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef [[MUL_I87]], i64 [[AGG_TMP_COERCE_0_INSERT_INSERT]])
+; CHECK-NEXT:    store <8 x i16> [[CALL33]], ptr [[DST_ADDR_052]], align 2
+; CHECK-NEXT:    [[AGG_TMP37_SROA_0_0_COPYLOAD:%.*]] = load i32, ptr [[SCALE]], align 2
+; CHECK-NEXT:    [[AGG_TMP37_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[AGG_TMP37_SROA_0_0_COPYLOAD]] to i64
+; CHECK-NEXT:    [[AGG_TMP37_COERCE_0_INSERT_MASK:%.*]] = and i64 [[AGG_TMP37_COERCE_048]], -4294967296
+; CHECK-NEXT:    [[AGG_TMP37_COERCE_0_INSERT_INSERT]] = or disjoint i64 [[AGG_TMP37_COERCE_0_INSERT_MASK]], [[AGG_TMP37_COERCE_0_INSERT_EXT]]
+; CHECK-NEXT:    [[CALL38:%.*]] = tail call fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef [[MUL_I74]], i64 [[AGG_TMP37_COERCE_0_INSERT_INSERT]])
+; CHECK-NEXT:    [[ARRAYIDX39:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052]], i64 16
+; CHECK-NEXT:    store <8 x i16> [[CALL38]], ptr [[ARRAYIDX39]], align 2
+; CHECK-NEXT:    [[AGG_TMP42_SROA_0_0_COPYLOAD:%.*]] = load i32, ptr [[SCALE]], align 2
+; CHECK-NEXT:    [[AGG_TMP42_COERCE_0_INSERT_EXT:%.*]] = zext i32 [[AGG_TMP42_SROA_0_0_COPYLOAD]] to i64
+; CHECK-NEXT:    [[AGG_TMP42_COERCE_0_INSERT_MASK:%.*]] = and i64 [[AGG_TMP42_COERCE_049]], -4294967296
+; CHECK-NEXT:    [[AGG_TMP42_COERCE_0_INSERT_INSERT]] = or disjoint i64 [[AGG_TMP42_COERCE_0_INSERT_MASK]], [[AGG_TMP42_COERCE_0_INSERT_EXT]]
+; CHECK-NEXT:    [[CALL43:%.*]] = tail call fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef [[MUL_I]], i64 [[AGG_TMP42_COERCE_0_INSERT_INSERT]])
+; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052]], i64 32
+; CHECK-NEXT:    store <8 x i16> [[CALL43]], ptr [[ARRAYIDX44]], align 2
+; CHECK-NEXT:    [[DST_ADDR_1]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052]], i64 48
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT58]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %__p0.addr.i75 = alloca <8 x i16>, align 16
+  %__p1.addr.i76 = alloca i16, align 2
+  %__ret.i77 = alloca <8 x i16>, align 16
+  %.compoundliteral.i78 = alloca <8 x i16>, align 16
+  %__p0.addr.i62 = alloca <8 x i16>, align 16
+  %__p1.addr.i63 = alloca i16, align 2
+  %__ret.i64 = alloca <8 x i16>, align 16
+  %.compoundliteral.i65 = alloca <8 x i16>, align 16
+  %__p0.addr.i60 = alloca <8 x i16>, align 16
+  %__p1.addr.i = alloca i16, align 2
+  %__ret.i61 = alloca <8 x i16>, align 16
+  %.compoundliteral.i = alloca <8 x i16>, align 16
+  %__p0.addr.i57 = alloca <8 x i8>, align 8
+  %__ret.i58 = alloca <8 x i16>, align 16
+  %__p0.addr.i54 = alloca <8 x i8>, align 8
+  %__ret.i55 = alloca <8 x i16>, align 16
+  %__p0.addr.i = alloca <8 x i8>, align 8
+  %__ret.i = alloca <8 x i16>, align 16
+  %n_prb.addr = alloca i32, align 4
+  %src.addr = alloca ptr, align 8
+  %dst.addr = alloca ptr, align 8
+  %scale.addr = alloca ptr, align 8
+  %i = alloca i32, align 4
+  %prb_comp_in = alloca [3 x <8 x i16>], align 16
+  %__ret = alloca <8 x i8>, align 8
+  %tmp = alloca <8 x i8>, align 8
+  %__ret3 = alloca <8 x i8>, align 8
+  %tmp8 = alloca <8 x i8>, align 8
+  %__ret11 = alloca <8 x i8>, align 8
+  %tmp16 = alloca <8 x i8>, align 8
+  %prb_decomp = alloca [3 x <8 x i16>], align 16
+  %scaling_factor = alloca i16, align 2
+  %__s1 = alloca <8 x i16>, align 16
+  %agg.tmp = alloca %struct.cmplx_int16_t, align 2
+  %agg.tmp.coerce = alloca i64, align 8
+  %__s135 = alloca <8 x i16>, align 16
+  %agg.tmp37 = alloca %struct.cmplx_int16_t, align 2
+  %agg.tmp37.coerce = alloca i64, align 8
+  %__s140 = alloca <8 x i16>, align 16
+  %agg.tmp42 = alloca %struct.cmplx_int16_t, align 2
+  %agg.tmp42.coerce = alloca i64, align 8
+  %__s145 = alloca <8 x i16>, align 16
+  %__s148 = alloca <8 x i16>, align 16
+  %__s151 = alloca <8 x i16>, align 16
+  store i32 %n_prb, ptr %n_prb.addr, align 4 ; the four arguments are spilled to stack slots and reloaded at every use
+  store ptr %src, ptr %src.addr, align 8
+  store ptr %dst, ptr %dst.addr, align 8
+  store ptr %scale, ptr %scale.addr, align 8
+  store i32 0, ptr %i, align 4 ; loop counter i = 0
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, ptr %i, align 4
+  %1 = load i32, ptr %n_prb.addr, align 4
+  %cmp = icmp ult i32 %0, %1 ; keep looping while i < n_prb (unsigned compare)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %2 = load ptr, ptr %src.addr, align 8
+  %3 = load i32, ptr %i, align 4
+  %idxprom = zext i32 %3 to i64
+  %arrayidx = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %2, i64 %idxprom ; address of src[i]
+  %mantissa = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx, i32 0, i32 1 ; struct field 1, the [24 x i8] array
+  %arrayidx1 = getelementptr inbounds [24 x i8], ptr %mantissa, i64 0, i64 0 ; array element 0: first group of 8 bytes
+  %4 = load <8 x i8>, ptr %arrayidx1, align 1
+  store <8 x i8> %4, ptr %__ret, align 8
+  %5 = load <8 x i8>, ptr %__ret, align 8
+  store <8 x i8> %5, ptr %tmp, align 8
+  %6 = load <8 x i8>, ptr %tmp, align 8
+  store <8 x i8> %6, ptr %__p0.addr.i57, align 8
+  %7 = load <8 x i8>, ptr %__p0.addr.i57, align 8
+  %vmovl.i59 = sext <8 x i8> %7 to <8 x i16> ; sign-extend each i8 lane to i16
+  store <8 x i16> %vmovl.i59, ptr %__ret.i58, align 16
+  %8 = load <8 x i16>, ptr %__ret.i58, align 16
+  %arrayidx2 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 0
+  store <8 x i16> %8, ptr %arrayidx2, align 16 ; prb_comp_in[0] = widened first group
+  %9 = load ptr, ptr %src.addr, align 8
+  %10 = load i32, ptr %i, align 4
+  %idxprom4 = zext i32 %10 to i64
+  %arrayidx5 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %9, i64 %idxprom4
+  %mantissa6 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx5, i32 0, i32 1
+  %arrayidx7 = getelementptr inbounds [24 x i8], ptr %mantissa6, i64 0, i64 8 ; array element 8: second group of 8 bytes
+  %11 = load <8 x i8>, ptr %arrayidx7, align 1
+  store <8 x i8> %11, ptr %__ret3, align 8
+  %12 = load <8 x i8>, ptr %__ret3, align 8
+  store <8 x i8> %12, ptr %tmp8, align 8
+  %13 = load <8 x i8>, ptr %tmp8, align 8
+  store <8 x i8> %13, ptr %__p0.addr.i54, align 8
+  %14 = load <8 x i8>, ptr %__p0.addr.i54, align 8
+  %vmovl.i56 = sext <8 x i8> %14 to <8 x i16>
+  store <8 x i16> %vmovl.i56, ptr %__ret.i55, align 16
+  %15 = load <8 x i16>, ptr %__ret.i55, align 16
+  %arrayidx10 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 1
+  store <8 x i16> %15, ptr %arrayidx10, align 16 ; prb_comp_in[1] = widened second group
+  %16 = load ptr, ptr %src.addr, align 8
+  %17 = load i32, ptr %i, align 4
+  %idxprom12 = zext i32 %17 to i64
+  %arrayidx13 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %16, i64 %idxprom12
+  %mantissa14 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx13, i32 0, i32 1
+  %arrayidx15 = getelementptr inbounds [24 x i8], ptr %mantissa14, i64 0, i64 16 ; array element 16: third group of 8 bytes
+  %18 = load <8 x i8>, ptr %arrayidx15, align 1
+  store <8 x i8> %18, ptr %__ret11, align 8
+  %19 = load <8 x i8>, ptr %__ret11, align 8
+  store <8 x i8> %19, ptr %tmp16, align 8
+  %20 = load <8 x i8>, ptr %tmp16, align 8
+  store <8 x i8> %20, ptr %__p0.addr.i, align 8
+  %21 = load <8 x i8>, ptr %__p0.addr.i, align 8
+  %vmovl.i = sext <8 x i8> %21 to <8 x i16>
+  store <8 x i16> %vmovl.i, ptr %__ret.i, align 16
+  %22 = load <8 x i16>, ptr %__ret.i, align 16
+  %arrayidx18 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 2
+  store <8 x i16> %22, ptr %arrayidx18, align 16 ; prb_comp_in[2] = widened third group
+  %23 = load ptr, ptr %src.addr, align 8
+  %24 = load i32, ptr %i, align 4
+  %idxprom19 = zext i32 %24 to i64
+  %arrayidx20 = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %23, i64 %idxprom19
+  %exp = getelementptr inbounds nuw %struct.compressed_data_8bit, ptr %arrayidx20, i32 0, i32 0 ; struct field 0, the leading i8
+  %25 = load i8, ptr %exp, align 1
+  %conv = sext i8 %25 to i32
+  %mul = mul nsw i32 %conv, 2 ; double the sign-extended byte in i32 ...
+  %conv21 = trunc i32 %mul to i16 ; ... then narrow to i16
+  store i16 %conv21, ptr %scaling_factor, align 2
+  %arrayidx22 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 0
+  %26 = load <8 x i16>, ptr %arrayidx22, align 16
+  %27 = load i16, ptr %scaling_factor, align 2
+  store <8 x i16> %26, ptr %__p0.addr.i75, align 16
+  store i16 %27, ptr %__p1.addr.i76, align 2
+  %28 = load <8 x i16>, ptr %__p0.addr.i75, align 16
+  %29 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit.i79 = insertelement <8 x i16> poison, i16 %29, i32 0 ; this and the next seven insertelements place the same i16 in all 8 lanes
+  %30 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit1.i80 = insertelement <8 x i16> %vecinit.i79, i16 %30, i32 1
+  %31 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit2.i81 = insertelement <8 x i16> %vecinit1.i80, i16 %31, i32 2
+  %32 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit3.i82 = insertelement <8 x i16> %vecinit2.i81, i16 %32, i32 3
+  %33 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit4.i83 = insertelement <8 x i16> %vecinit3.i82, i16 %33, i32 4
+  %34 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit5.i84 = insertelement <8 x i16> %vecinit4.i83, i16 %34, i32 5
+  %35 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit6.i85 = insertelement <8 x i16> %vecinit5.i84, i16 %35, i32 6
+  %36 = load i16, ptr %__p1.addr.i76, align 2
+  %vecinit7.i86 = insertelement <8 x i16> %vecinit6.i85, i16 %36, i32 7
+  store <8 x i16> %vecinit7.i86, ptr %.compoundliteral.i78, align 16
+  %37 = load <8 x i16>, ptr %.compoundliteral.i78, align 16
+  %mul.i87 = mul <8 x i16> %28, %37 ; lane-wise product of prb_comp_in[0] and the splatted scaling factor
+  store <8 x i16> %mul.i87, ptr %__ret.i77, align 16
+  %38 = load <8 x i16>, ptr %__ret.i77, align 16
+  %arrayidx24 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 0
+  store <8 x i16> %38, ptr %arrayidx24, align 16 ; prb_decomp[0]
+  %arrayidx25 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 1
+  %39 = load <8 x i16>, ptr %arrayidx25, align 16
+  %40 = load i16, ptr %scaling_factor, align 2
+  store <8 x i16> %39, ptr %__p0.addr.i62, align 16
+  store i16 %40, ptr %__p1.addr.i63, align 2
+  %41 = load <8 x i16>, ptr %__p0.addr.i62, align 16
+  %42 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit.i66 = insertelement <8 x i16> poison, i16 %42, i32 0
+  %43 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit1.i67 = insertelement <8 x i16> %vecinit.i66, i16 %43, i32 1
+  %44 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit2.i68 = insertelement <8 x i16> %vecinit1.i67, i16 %44, i32 2
+  %45 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit3.i69 = insertelement <8 x i16> %vecinit2.i68, i16 %45, i32 3
+  %46 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit4.i70 = insertelement <8 x i16> %vecinit3.i69, i16 %46, i32 4
+  %47 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit5.i71 = insertelement <8 x i16> %vecinit4.i70, i16 %47, i32 5
+  %48 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit6.i72 = insertelement <8 x i16> %vecinit5.i71, i16 %48, i32 6
+  %49 = load i16, ptr %__p1.addr.i63, align 2
+  %vecinit7.i73 = insertelement <8 x i16> %vecinit6.i72, i16 %49, i32 7
+  store <8 x i16> %vecinit7.i73, ptr %.compoundliteral.i65, align 16
+  %50 = load <8 x i16>, ptr %.compoundliteral.i65, align 16
+  %mul.i74 = mul <8 x i16> %41, %50
+  store <8 x i16> %mul.i74, ptr %__ret.i64, align 16
+  %51 = load <8 x i16>, ptr %__ret.i64, align 16
+  %arrayidx27 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 1
+  store <8 x i16> %51, ptr %arrayidx27, align 16 ; prb_decomp[1]
+  %arrayidx28 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_comp_in, i64 0, i64 2
+  %52 = load <8 x i16>, ptr %arrayidx28, align 16
+  %53 = load i16, ptr %scaling_factor, align 2
+  store <8 x i16> %52, ptr %__p0.addr.i60, align 16
+  store i16 %53, ptr %__p1.addr.i, align 2
+  %54 = load <8 x i16>, ptr %__p0.addr.i60, align 16
+  %55 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit.i = insertelement <8 x i16> poison, i16 %55, i32 0
+  %56 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %56, i32 1
+  %57 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %57, i32 2
+  %58 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %58, i32 3
+  %59 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %59, i32 4
+  %60 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %60, i32 5
+  %61 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %61, i32 6
+  %62 = load i16, ptr %__p1.addr.i, align 2
+  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %62, i32 7
+  store <8 x i16> %vecinit7.i, ptr %.compoundliteral.i, align 16
+  %63 = load <8 x i16>, ptr %.compoundliteral.i, align 16
+  %mul.i = mul <8 x i16> %54, %63
+  store <8 x i16> %mul.i, ptr %__ret.i61, align 16
+  %64 = load <8 x i16>, ptr %__ret.i61, align 16
+  %arrayidx30 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 2
+  store <8 x i16> %64, ptr %arrayidx30, align 16 ; prb_decomp[2]
+  %65 = load ptr, ptr %scale.addr, align 8
+  %cmp31 = icmp ne ptr %65, null ; a non-null scale pointer selects the path that calls the complex-multiply helper
+  br i1 %cmp31, label %if.then, label %if.else
+
+if.then:                                          ; preds = %for.body
+  %arrayidx32 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 0
+  %66 = load <8 x i16>, ptr %arrayidx32, align 16
+  %67 = load ptr, ptr %scale.addr, align 8
+  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %agg.tmp, ptr align 2 %67, i64 4, i1 false) ; copy the 4-byte struct that scale points to
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %agg.tmp.coerce, ptr align 2 %agg.tmp, i64 4, i1 false) ; fills only the low 4 bytes of the 8-byte slot
+  %68 = load i64, ptr %agg.tmp.coerce, align 8 ; reads all 8 bytes although the memcpy above wrote only 4
+  %call33 = call noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %66, i64 %68)
+  store <8 x i16> %call33, ptr %__s1, align 16
+  %69 = load ptr, ptr %dst.addr, align 8
+  %arrayidx34 = getelementptr inbounds %struct.cmplx_int16_t, ptr %69, i64 0 ; dst + 0 elements
+  %70 = load <8 x i16>, ptr %__s1, align 16
+  %71 = bitcast <8 x i16> %70 to <16 x i8>
+  %72 = bitcast <16 x i8> %71 to <8 x i16> ; bitcast round trip back to the original vector type
+  store <8 x i16> %72, ptr %arrayidx34, align 2
+  %arrayidx36 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 1
+  %73 = load <8 x i16>, ptr %arrayidx36, align 16
+  %74 = load ptr, ptr %scale.addr, align 8
+  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %agg.tmp37, ptr align 2 %74, i64 4, i1 false)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %agg.tmp37.coerce, ptr align 2 %agg.tmp37, i64 4, i1 false)
+  %75 = load i64, ptr %agg.tmp37.coerce, align 8
+  %call38 = call noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %73, i64 %75)
+  store <8 x i16> %call38, ptr %__s135, align 16
+  %76 = load ptr, ptr %dst.addr, align 8
+  %arrayidx39 = getelementptr inbounds %struct.cmplx_int16_t, ptr %76, i64 4 ; dst + 4 elements (16 bytes)
+  %77 = load <8 x i16>, ptr %__s135, align 16
+  %78 = bitcast <8 x i16> %77 to <16 x i8>
+  %79 = bitcast <16 x i8> %78 to <8 x i16>
+  store <8 x i16> %79, ptr %arrayidx39, align 2
+  %arrayidx41 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 2
+  %80 = load <8 x i16>, ptr %arrayidx41, align 16
+  %81 = load ptr, ptr %scale.addr, align 8
+  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %agg.tmp42, ptr align 2 %81, i64 4, i1 false)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %agg.tmp42.coerce, ptr align 2 %agg.tmp42, i64 4, i1 false)
+  %82 = load i64, ptr %agg.tmp42.coerce, align 8
+  %call43 = call noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %80, i64 %82)
+  store <8 x i16> %call43, ptr %__s140, align 16
+  %83 = load ptr, ptr %dst.addr, align 8
+  %arrayidx44 = getelementptr inbounds %struct.cmplx_int16_t, ptr %83, i64 8 ; dst + 8 elements (32 bytes)
+  %84 = load <8 x i16>, ptr %__s140, align 16
+  %85 = bitcast <8 x i16> %84 to <16 x i8>
+  %86 = bitcast <16 x i8> %85 to <8 x i16>
+  store <8 x i16> %86, ptr %arrayidx44, align 2
+  %87 = load ptr, ptr %dst.addr, align 8
+  %add.ptr = getelementptr inbounds %struct.cmplx_int16_t, ptr %87, i64 12 ; on this path dst advances once, by 12 elements (48 bytes)
+  store ptr %add.ptr, ptr %dst.addr, align 8
+  br label %if.end
+
+if.else:                                          ; preds = %for.body
+  %arrayidx46 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 0
+  %88 = load <8 x i16>, ptr %arrayidx46, align 16
+  store <8 x i16> %88, ptr %__s145, align 16
+  %89 = load ptr, ptr %dst.addr, align 8
+  %90 = load <8 x i16>, ptr %__s145, align 16
+  %91 = bitcast <8 x i16> %90 to <16 x i8>
+  %92 = bitcast <16 x i8> %91 to <8 x i16>
+  store <8 x i16> %92, ptr %89, align 2 ; store prb_decomp[0] unmodified at the current dst
+  %93 = load ptr, ptr %dst.addr, align 8
+  %add.ptr47 = getelementptr inbounds %struct.cmplx_int16_t, ptr %93, i64 4 ; on this path dst advances by 4 elements after each of the three stores
+  store ptr %add.ptr47, ptr %dst.addr, align 8
+  %arrayidx49 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 1
+  %94 = load <8 x i16>, ptr %arrayidx49, align 16
+  store <8 x i16> %94, ptr %__s148, align 16
+  %95 = load ptr, ptr %dst.addr, align 8
+  %96 = load <8 x i16>, ptr %__s148, align 16
+  %97 = bitcast <8 x i16> %96 to <16 x i8>
+  %98 = bitcast <16 x i8> %97 to <8 x i16>
+  store <8 x i16> %98, ptr %95, align 2
+  %99 = load ptr, ptr %dst.addr, align 8
+  %add.ptr50 = getelementptr inbounds %struct.cmplx_int16_t, ptr %99, i64 4
+  store ptr %add.ptr50, ptr %dst.addr, align 8
+  %arrayidx52 = getelementptr inbounds [3 x <8 x i16>], ptr %prb_decomp, i64 0, i64 2
+  %100 = load <8 x i16>, ptr %arrayidx52, align 16
+  store <8 x i16> %100, ptr %__s151, align 16
+  %101 = load ptr, ptr %dst.addr, align 8
+  %102 = load <8 x i16>, ptr %__s151, align 16
+  %103 = bitcast <8 x i16> %102 to <16 x i8>
+  %104 = bitcast <16 x i8> %103 to <8 x i16>
+  store <8 x i16> %104, ptr %101, align 2
+  %105 = load ptr, ptr %dst.addr, align 8
+  %add.ptr53 = getelementptr inbounds %struct.cmplx_int16_t, ptr %105, i64 4
+  store ptr %add.ptr53, ptr %dst.addr, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %106 = load i32, ptr %i, align 4
+  %inc = add i32 %106, 1 ; i = i + 1
+  store i32 %inc, ptr %i, align 4
+  br label %for.cond, !llvm.loop !4
+
+for.end:                                          ; preds = %for.cond
+  ret i32 0
+}
+
+define internal noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(<8 x i16> noundef %a, i64 %scale.coerce) #0 {
+; CHECK-LABEL: define internal fastcc noundef <8 x i16> @_ZL24cmplx_mul_combined_re_im11__Int16x8_t20cmplx_int16_t(
+; CHECK-SAME: <8 x i16> noundef [[A:%.*]], i64 [[SCALE_COERCE:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SCALE_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[SCALE_COERCE]] to i16
+; CHECK-NEXT:    [[SCALE_SROA_2_0_EXTRACT_SHIFT36:%.*]] = lshr i64 [[SCALE_COERCE]], 16
+; CHECK-NEXT:    [[SCALE_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[SCALE_SROA_2_0_EXTRACT_SHIFT36]] to i16
+; CHECK-NEXT:    [[VECINIT_I19:%.*]] = insertelement <8 x i16> poison, i16 [[SCALE_SROA_0_0_EXTRACT_TRUNC]], i64 0
+; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[SCALE_SROA_2_0_EXTRACT_TRUNC]], i64 0
+; CHECK-NEXT:    [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[VQNEGQ_V1_I:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> [[VECINIT7_I]])
+; CHECK-NEXT:    [[SHUFFLE_I85:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[SHUFFLE_I82:%.*]] = shufflevector <8 x i16> [[VECINIT_I19]], <8 x i16> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[VQDMULL_V2_I72:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I85]], <4 x i16> [[SHUFFLE_I82]])
+; CHECK-NEXT:    [[SHUFFLE_I97:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[VQDMULL_V2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I97]], <4 x i16> [[SHUFFLE_I82]])
+; CHECK-NEXT:    [[SHUFFLE_I79:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[SHUFFLE_I76:%.*]] = shufflevector <8 x i16> [[VQNEGQ_V1_I]], <8 x i16> [[VECINIT_I]], <4 x i32> <i32 0, i32 8, i32 2, i32 8>
+; CHECK-NEXT:    [[VQDMLAL2_I106:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I79]], <4 x i16> [[SHUFFLE_I76]])
+; CHECK-NEXT:    [[VQDMLAL_V3_I107:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMULL_V2_I72]], <4 x i32> [[VQDMLAL2_I106]])
+; CHECK-NEXT:    [[SHUFFLE_I91:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEXT:    [[SHUFFLE_I88:%.*]] = shufflevector <8 x i16> [[VQNEGQ_V1_I]], <8 x i16> [[VECINIT_I]], <4 x i32> <i32 4, i32 8, i32 6, i32 8>
+; CHECK-NEXT:    [[VQDMLAL2_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I91]], <4 x i16> [[SHUFFLE_I88]])
+; CHECK-NEXT:    [[VQDMLAL_V3_I:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMULL_V2_I]], <4 x i32> [[VQDMLAL2_I]])
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[VQDMLAL_V3_I107]] to <8 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VQDMLAL_V3_I]] to <8 x i16>
+; CHECK-NEXT:    [[SHUFFLE_I61:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    ret <8 x i16> [[SHUFFLE_I61]]
+;
+entry:
+  %__p0.addr.i102 = alloca <4 x i32>, align 16
+  %__p1.addr.i103 = alloca <4 x i16>, align 8
+  %__p2.addr.i104 = alloca <4 x i16>, align 8
+  %__ret.i105 = alloca <4 x i32>, align 16
+  %__p0.addr.i98 = alloca <4 x i32>, align 16
+  %__p1.addr.i99 = alloca <4 x i16>, align 8
+  %__p2.addr.i100 = alloca <4 x i16>, align 8
+  %__ret.i101 = alloca <4 x i32>, align 16
+  %__p0.addr.i95 = alloca <8 x i16>, align 16
+  %__ret.i96 = alloca <4 x i16>, align 8
+  %__p0.addr.i92 = alloca <8 x i16>, align 16
+  %__ret.i93 = alloca <4 x i16>, align 8
+  %__p0.addr.i89 = alloca <8 x i16>, align 16
+  %__ret.i90 = alloca <4 x i16>, align 8
+  %__p0.addr.i86 = alloca <8 x i16>, align 16
+  %__ret.i87 = alloca <4 x i16>, align 8
+  %__p0.addr.i83 = alloca <8 x i16>, align 16
+  %__ret.i84 = alloca <4 x i16>, align 8
+  %__p0.addr.i80 = alloca <8 x i16>, align 16
+  %__ret.i81 = alloca <4 x i16>, align 8
+  %__p0.addr.i77 = alloca <8 x i16>, align 16
+  %__ret.i78 = alloca <4 x i16>, align 8
+  %__p0.addr.i74 = alloca <8 x i16>, align 16
+  %__ret.i75 = alloca <4 x i16>, align 8
+  %__p0.addr.i69 = alloca <4 x i16>, align 8
+  %__p1.addr.i70 = alloca <4 x i16>, align 8
+  %__ret.i71 = alloca <4 x i32>, align 16
+  %__p0.addr.i66 = alloca <4 x i16>, align 8
+  %__p1.addr.i67 = alloca <4 x i16>, align 8
+  %__ret.i68 = alloca <4 x i32>, align 16
+  %__p0.addr.i64 = alloca <4 x i32>, align 16
+  %__ret.i65 = alloca <8 x i16>, align 16
+  %__p0.addr.i62 = alloca <4 x i32>, align 16
+  %__ret.i63 = alloca <8 x i16>, align 16
+  %__p0.addr.i58 = alloca <8 x i16>, align 16
+  %__p1.addr.i59 = alloca <8 x i16>, align 16
+  %__ret.i60 = alloca <8 x i16>, align 16
+  %__p0.addr.i51 = alloca <4 x i32>, align 16
+  %__p1.addr.i52 = alloca <8 x i16>, align 16
+  %__p2.addr.i53 = alloca <8 x i16>, align 16
+  %__ret.i54 = alloca <4 x i32>, align 16
+  %a.addr.i46 = alloca <4 x i32>, align 16
+  %b.addr.i47 = alloca <8 x i16>, align 16
+  %c.addr.i = alloca <8 x i16>, align 16
+  %__p0.addr.i40 = alloca <8 x i16>, align 16
+  %__p1.addr.i41 = alloca <8 x i16>, align 16
+  %__ret.i42 = alloca <4 x i32>, align 16
+  %a.addr.i = alloca <8 x i16>, align 16
+  %b.addr.i = alloca <8 x i16>, align 16
+  %__p0.addr.i38 = alloca <8 x i16>, align 16
+  %__ret.i39 = alloca <8 x i16>, align 16
+  %__p0.addr.i36 = alloca <8 x i16>, align 16
+  %__p1.addr.i = alloca <8 x i16>, align 16
+  %__p2.addr.i = alloca <8 x i16>, align 16
+  %__ret.i37 = alloca <8 x i16>, align 16
+  %__p0.addr.i29 = alloca i32, align 4
+  %__ret.i30 = alloca <4 x i32>, align 16
+  %.compoundliteral.i31 = alloca <4 x i32>, align 16
+  %__p0.addr.i27 = alloca <4 x i32>, align 16
+  %__ret.i28 = alloca <8 x i16>, align 16
+  %__p0.addr.i16 = alloca i16, align 2
+  %__ret.i17 = alloca <8 x i16>, align 16
+  %.compoundliteral.i18 = alloca <8 x i16>, align 16
+  %__p0.addr.i14 = alloca i16, align 2
+  %__ret.i15 = alloca <8 x i16>, align 16
+  %.compoundliteral.i = alloca <8 x i16>, align 16
+  %__p0.addr.i = alloca <8 x i16>, align 16
+  %__ret.i = alloca <8 x i16>, align 16
+  %scale = alloca %struct.cmplx_int16_t, align 2
+  %a.addr = alloca <8 x i16>, align 16
+  %a_rev = alloca <8 x i16>, align 16
+  %cc = alloca <8 x i16>, align 16
+  %dd = alloca <8 x i16>, align 16
+  %mult_mask = alloca <8 x i16>, align 16
+  %lo32 = alloca <4 x i32>, align 16
+  %hi32 = alloca <4 x i32>, align 16
+  %coerce.val.ii = trunc i64 %scale.coerce to i32
+  store i32 %coerce.val.ii, ptr %scale, align 2
+  store <8 x i16> %a, ptr %a.addr, align 16
+  %0 = load <8 x i16>, ptr %a.addr, align 16
+  store <8 x i16> %0, ptr %__p0.addr.i, align 16
+  %1 = load <8 x i16>, ptr %__p0.addr.i, align 16
+  %2 = load <8 x i16>, ptr %__p0.addr.i, align 16
+  %shuffle.i = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  store <8 x i16> %shuffle.i, ptr %__ret.i, align 16
+  %3 = load <8 x i16>, ptr %__ret.i, align 16
+  store <8 x i16> %3, ptr %a_rev, align 16
+  %re = getelementptr inbounds nuw %struct.cmplx_int16_t, ptr %scale, i32 0, i32 0
+  %4 = load i16, ptr %re, align 2
+  store i16 %4, ptr %__p0.addr.i16, align 2
+  %5 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit.i19 = insertelement <8 x i16> poison, i16 %5, i32 0
+  %6 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit1.i20 = insertelement <8 x i16> %vecinit.i19, i16 %6, i32 1
+  %7 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit2.i21 = insertelement <8 x i16> %vecinit1.i20, i16 %7, i32 2
+  %8 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit3.i22 = insertelement <8 x i16> %vecinit2.i21, i16 %8, i32 3
+  %9 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit4.i23 = insertelement <8 x i16> %vecinit3.i22, i16 %9, i32 4
+  %10 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit5.i24 = insertelement <8 x i16> %vecinit4.i23, i16 %10, i32 5
+  %11 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit6.i25 = insertelement <8 x i16> %vecinit5.i24, i16 %11, i32 6
+  %12 = load i16, ptr %__p0.addr.i16, align 2
+  %vecinit7.i26 = insertelement <8 x i16> %vecinit6.i25, i16 %12, i32 7
+  store <8 x i16> %vecinit7.i26, ptr %.compoundliteral.i18, align 16
+  %13 = load <8 x i16>, ptr %.compoundliteral.i18, align 16
+  store <8 x i16> %13, ptr %__ret.i17, align 16
+  %14 = load <8 x i16>, ptr %__ret.i17, align 16
+  store <8 x i16> %14, ptr %cc, align 16
+  %im = getelementptr inbounds nuw %struct.cmplx_int16_t, ptr %scale, i32 0, i32 1
+  %15 = load i16, ptr %im, align 2
+  store i16 %15, ptr %__p0.addr.i14, align 2
+  %16 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit.i = insertelement <8 x i16> poison, i16 %16, i32 0
+  %17 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %17, i32 1
+  %18 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %18, i32 2
+  %19 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %19, i32 3
+  %20 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %20, i32 4
+  %21 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %21, i32 5
+  %22 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %22, i32 6
+  %23 = load i16, ptr %__p0.addr.i14, align 2
+  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %23, i32 7
+  store <8 x i16> %vecinit7.i, ptr %.compoundliteral.i, align 16
+  %24 = load <8 x i16>, ptr %.compoundliteral.i, align 16
+  store <8 x i16> %24, ptr %__ret.i15, align 16
+  %25 = load <8 x i16>, ptr %__ret.i15, align 16
+  store <8 x i16> %25, ptr %dd, align 16
+  store i32 65535, ptr %__p0.addr.i29, align 4
+  %26 = load i32, ptr %__p0.addr.i29, align 4
+  %vecinit.i32 = insertelement <4 x i32> poison, i32 %26, i32 0
+  %27 = load i32, ptr %__p0.addr.i29, align 4
+  %vecinit1.i33 = insertelement <4 x i32> %vecinit.i32, i32 %27, i32 1
+  %28 = load i32, ptr %__p0.addr.i29, align 4
+  %vecinit2.i34 = insertelement <4 x i32> %vecinit1.i33, i32 %28, i32 2
+  %29 = load i32, ptr %__p0.addr.i29, align 4
+  %vecinit3.i35 = insertelement <4 x i32> %vecinit2.i34, i32 %29, i32 3
+  store <4 x i32> %vecinit3.i35, ptr %.compoundliteral.i31, align 16
+  %30 = load <4 x i32>, ptr %.compoundliteral.i31, align 16
+  store <4 x i32> %30, ptr %__ret.i30, align 16
+  %31 = load <4 x i32>, ptr %__ret.i30, align 16
+  store <4 x i32> %31, ptr %__p0.addr.i27, align 16
+  %32 = load <4 x i32>, ptr %__p0.addr.i27, align 16
+  %33 = bitcast <4 x i32> %32 to <8 x i16>
+  store <8 x i16> %33, ptr %__ret.i28, align 16
+  %34 = load <8 x i16>, ptr %__ret.i28, align 16
+  store <8 x i16> %34, ptr %mult_mask, align 16
+  %35 = load <8 x i16>, ptr %mult_mask, align 16
+  %36 = load <8 x i16>, ptr %dd, align 16
+  store <8 x i16> %36, ptr %__p0.addr.i38, align 16
+  %37 = load <8 x i16>, ptr %__p0.addr.i38, align 16
+  %38 = bitcast <8 x i16> %37 to <16 x i8>
+  %vqnegq_v1.i = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %37)
+  %vqnegq_v2.i = bitcast <8 x i16> %vqnegq_v1.i to <16 x i8>
+  store <8 x i16> %vqnegq_v1.i, ptr %__ret.i39, align 16
+  %39 = load <8 x i16>, ptr %__ret.i39, align 16
+  %40 = load <8 x i16>, ptr %dd, align 16
+  store <8 x i16> %35, ptr %__p0.addr.i36, align 16
+  store <8 x i16> %39, ptr %__p1.addr.i, align 16
+  store <8 x i16> %40, ptr %__p2.addr.i, align 16
+  %41 = load <8 x i16>, ptr %__p0.addr.i36, align 16
+  %42 = bitcast <8 x i16> %41 to <16 x i8>
+  %43 = load <8 x i16>, ptr %__p1.addr.i, align 16
+  %44 = bitcast <8 x i16> %43 to <16 x i8>
+  %45 = load <8 x i16>, ptr %__p2.addr.i, align 16
+  %46 = bitcast <8 x i16> %45 to <16 x i8>
+  %vbsl3.i = and <8 x i16> %41, %43
+  %47 = xor <8 x i16> %41, splat (i16 -1)
+  %vbsl4.i = and <8 x i16> %47, %45
+  %vbsl5.i = or <8 x i16> %vbsl3.i, %vbsl4.i
+  store <8 x i16> %vbsl5.i, ptr %__ret.i37, align 16
+  %48 = load <8 x i16>, ptr %__ret.i37, align 16
+  store <8 x i16> %48, ptr %dd, align 16
+  %49 = load <8 x i16>, ptr %a.addr, align 16
+  %50 = load <8 x i16>, ptr %cc, align 16
+  store <8 x i16> %49, ptr %a.addr.i, align 16
+  store <8 x i16> %50, ptr %b.addr.i, align 16
+  %51 = load <8 x i16>, ptr %a.addr.i, align 16
+  store <8 x i16> %51, ptr %__p0.addr.i83, align 16
+  %52 = load <8 x i16>, ptr %__p0.addr.i83, align 16
+  %53 = load <8 x i16>, ptr %__p0.addr.i83, align 16
+  %shuffle.i85 = shufflevector <8 x i16> %52, <8 x i16> %53, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i16> %shuffle.i85, ptr %__ret.i84, align 8
+  %54 = load <4 x i16>, ptr %__ret.i84, align 8
+  %55 = load <8 x i16>, ptr %b.addr.i, align 16
+  store <8 x i16> %55, ptr %__p0.addr.i80, align 16
+  %56 = load <8 x i16>, ptr %__p0.addr.i80, align 16
+  %57 = load <8 x i16>, ptr %__p0.addr.i80, align 16
+  %shuffle.i82 = shufflevector <8 x i16> %56, <8 x i16> %57, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i16> %shuffle.i82, ptr %__ret.i81, align 8
+  %58 = load <4 x i16>, ptr %__ret.i81, align 8
+  store <4 x i16> %54, ptr %__p0.addr.i69, align 8
+  store <4 x i16> %58, ptr %__p1.addr.i70, align 8
+  %59 = load <4 x i16>, ptr %__p0.addr.i69, align 8
+  %60 = bitcast <4 x i16> %59 to <8 x i8>
+  %61 = load <4 x i16>, ptr %__p1.addr.i70, align 8
+  %62 = bitcast <4 x i16> %61 to <8 x i8>
+  %vqdmull_v2.i72 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %59, <4 x i16> %61)
+  %vqdmull_v3.i73 = bitcast <4 x i32> %vqdmull_v2.i72 to <16 x i8>
+  store <4 x i32> %vqdmull_v2.i72, ptr %__ret.i71, align 16
+  %63 = load <4 x i32>, ptr %__ret.i71, align 16
+  store <4 x i32> %63, ptr %lo32, align 16
+  %64 = load <8 x i16>, ptr %a.addr, align 16
+  %65 = load <8 x i16>, ptr %cc, align 16
+  store <8 x i16> %64, ptr %__p0.addr.i40, align 16
+  store <8 x i16> %65, ptr %__p1.addr.i41, align 16
+  %66 = load <8 x i16>, ptr %__p0.addr.i40, align 16
+  store <8 x i16> %66, ptr %__p0.addr.i95, align 16
+  %67 = load <8 x i16>, ptr %__p0.addr.i95, align 16
+  %68 = load <8 x i16>, ptr %__p0.addr.i95, align 16
+  %shuffle.i97 = shufflevector <8 x i16> %67, <8 x i16> %68, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  store <4 x i16> %shuffle.i97, ptr %__ret.i96, align 8
+  %69 = load <4 x i16>, ptr %__ret.i96, align 8
+  %70 = load <8 x i16>, ptr %__p1.addr.i41, align 16
+  store <8 x i16> %70, ptr %__p0.addr.i92, align 16
+  %71 = load <8 x i16>, ptr %__p0.addr.i92, align 16
+  %72 = load <8 x i16>, ptr %__p0.addr.i92, align 16
+  %shuffle.i94 = shufflevector <8 x i16> %71, <8 x i16> %72, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  store <4 x i16> %shuffle.i94, ptr %__ret.i93, align 8
+  %73 = load <4 x i16>, ptr %__ret.i93, align 8
+  store <4 x i16> %69, ptr %__p0.addr.i66, align 8
+  store <4 x i16> %73, ptr %__p1.addr.i67, align 8
+  %74 = load <4 x i16>, ptr %__p0.addr.i66, align 8
+  %75 = bitcast <4 x i16> %74 to <8 x i8>
+  %76 = load <4 x i16>, ptr %__p1.addr.i67, align 8
+  %77 = bitcast <4 x i16> %76 to <8 x i8>
+  %vqdmull_v2.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %74, <4 x i16> %76)
+  %vqdmull_v3.i = bitcast <4 x i32> %vqdmull_v2.i to <16 x i8>
+  store <4 x i32> %vqdmull_v2.i, ptr %__ret.i68, align 16
+  %78 = load <4 x i32>, ptr %__ret.i68, align 16
+  store <4 x i32> %78, ptr %__ret.i42, align 16
+  %79 = load <4 x i32>, ptr %__ret.i42, align 16
+  store <4 x i32> %79, ptr %hi32, align 16
+  %80 = load <4 x i32>, ptr %lo32, align 16
+  %81 = load <8 x i16>, ptr %a_rev, align 16
+  %82 = load <8 x i16>, ptr %dd, align 16
+  store <4 x i32> %80, ptr %a.addr.i46, align 16
+  store <8 x i16> %81, ptr %b.addr.i47, align 16
+  store <8 x i16> %82, ptr %c.addr.i, align 16
+  %83 = load <4 x i32>, ptr %a.addr.i46, align 16
+  %84 = load <8 x i16>, ptr %b.addr.i47, align 16
+  store <8 x i16> %84, ptr %__p0.addr.i77, align 16
+  %85 = load <8 x i16>, ptr %__p0.addr.i77, align 16
+  %86 = load <8 x i16>, ptr %__p0.addr.i77, align 16
+  %shuffle.i79 = shufflevector <8 x i16> %85, <8 x i16> %86, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i16> %shuffle.i79, ptr %__ret.i78, align 8
+  %87 = load <4 x i16>, ptr %__ret.i78, align 8
+  %88 = load <8 x i16>, ptr %c.addr.i, align 16
+  store <8 x i16> %88, ptr %__p0.addr.i74, align 16
+  %89 = load <8 x i16>, ptr %__p0.addr.i74, align 16
+  %90 = load <8 x i16>, ptr %__p0.addr.i74, align 16
+  %shuffle.i76 = shufflevector <8 x i16> %89, <8 x i16> %90, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i16> %shuffle.i76, ptr %__ret.i75, align 8
+  %91 = load <4 x i16>, ptr %__ret.i75, align 8
+  store <4 x i32> %83, ptr %__p0.addr.i102, align 16
+  store <4 x i16> %87, ptr %__p1.addr.i103, align 8
+  store <4 x i16> %91, ptr %__p2.addr.i104, align 8
+  %92 = load <4 x i32>, ptr %__p0.addr.i102, align 16
+  %93 = bitcast <4 x i32> %92 to <16 x i8>
+  %94 = load <4 x i16>, ptr %__p1.addr.i103, align 8
+  %95 = bitcast <4 x i16> %94 to <8 x i8>
+  %96 = load <4 x i16>, ptr %__p2.addr.i104, align 8
+  %97 = bitcast <4 x i16> %96 to <8 x i8>
+  %vqdmlal2.i106 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %94, <4 x i16> %96)
+  %vqdmlal_v3.i107 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %92, <4 x i32> %vqdmlal2.i106)
+  store <4 x i32> %vqdmlal_v3.i107, ptr %__ret.i105, align 16
+  %98 = load <4 x i32>, ptr %__ret.i105, align 16
+  store <4 x i32> %98, ptr %lo32, align 16
+  %99 = load <4 x i32>, ptr %hi32, align 16
+  %100 = load <8 x i16>, ptr %a_rev, align 16
+  %101 = load <8 x i16>, ptr %dd, align 16
+  store <4 x i32> %99, ptr %__p0.addr.i51, align 16
+  store <8 x i16> %100, ptr %__p1.addr.i52, align 16
+  store <8 x i16> %101, ptr %__p2.addr.i53, align 16
+  %102 = load <4 x i32>, ptr %__p0.addr.i51, align 16
+  %103 = load <8 x i16>, ptr %__p1.addr.i52, align 16
+  store <8 x i16> %103, ptr %__p0.addr.i89, align 16
+  %104 = load <8 x i16>, ptr %__p0.addr.i89, align 16
+  %105 = load <8 x i16>, ptr %__p0.addr.i89, align 16
+  %shuffle.i91 = shufflevector <8 x i16> %104, <8 x i16> %105, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  store <4 x i16> %shuffle.i91, ptr %__ret.i90, align 8
+  %106 = load <4 x i16>, ptr %__ret.i90, align 8
+  %107 = load <8 x i16>, ptr %__p2.addr.i53, align 16
+  store <8 x i16> %107, ptr %__p0.addr.i86, align 16
+  %108 = load <8 x i16>, ptr %__p0.addr.i86, align 16
+  %109 = load <8 x i16>, ptr %__p0.addr.i86, align 16
+  %shuffle.i88 = shufflevector <8 x i16> %108, <8 x i16> %109, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  store <4 x i16> %shuffle.i88, ptr %__ret.i87, align 8
+  %110 = load <4 x i16>, ptr %__ret.i87, align 8
+  store <4 x i32> %102, ptr %__p0.addr.i98, align 16
+  store <4 x i16> %106, ptr %__p1.addr.i99, align 8
+  store <4 x i16> %110, ptr %__p2.addr.i100, align 8
+  %111 = load <4 x i32>, ptr %__p0.addr.i98, align 16
+  %112 = bitcast <4 x i32> %111 to <16 x i8>
+  %113 = load <4 x i16>, ptr %__p1.addr.i99, align 8
+  %114 = bitcast <4 x i16> %113 to <8 x i8>
+  %115 = load <4 x i16>, ptr %__p2.addr.i100, align 8
+  %116 = bitcast <4 x i16> %115 to <8 x i8>
+  %vqdmlal2.i = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %113, <4 x i16> %115)
+  %vqdmlal_v3.i = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %111, <4 x i32> %vqdmlal2.i)
+  store <4 x i32> %vqdmlal_v3.i, ptr %__ret.i101, align 16
+  %117 = load <4 x i32>, ptr %__ret.i101, align 16
+  store <4 x i32> %117, ptr %__ret.i54, align 16
+  %118 = load <4 x i32>, ptr %__ret.i54, align 16
+  store <4 x i32> %118, ptr %hi32, align 16
+  %119 = load <4 x i32>, ptr %lo32, align 16
+  store <4 x i32> %119, ptr %__p0.addr.i64, align 16
+  %120 = load <4 x i32>, ptr %__p0.addr.i64, align 16
+  %121 = bitcast <4 x i32> %120 to <8 x i16>
+  store <8 x i16> %121, ptr %__ret.i65, align 16
+  %122 = load <8 x i16>, ptr %__ret.i65, align 16
+  %123 = load <4 x i32>, ptr %hi32, align 16
+  store <4 x i32> %123, ptr %__p0.addr.i62, align 16
+  %124 = load <4 x i32>, ptr %__p0.addr.i62, align 16
+  %125 = bitcast <4 x i32> %124 to <8 x i16>
+  store <8 x i16> %125, ptr %__ret.i63, align 16
+  %126 = load <8 x i16>, ptr %__ret.i63, align 16
+  store <8 x i16> %122, ptr %__p0.addr.i58, align 16
+  store <8 x i16> %126, ptr %__p1.addr.i59, align 16
+  %127 = load <8 x i16>, ptr %__p0.addr.i58, align 16
+  %128 = load <8 x i16>, ptr %__p1.addr.i59, align 16
+  %shuffle.i61 = shufflevector <8 x i16> %127, <8 x i16> %128, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  store <8 x i16> %shuffle.i61, ptr %__ret.i60, align 16
+  %129 = load <8 x i16>, ptr %__ret.i60, align 16
+  ret <8 x i16> %129
+}
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) #2
+
+attributes #0 = { mustprogress noinline uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+bf16,+bti,+ccidx,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fullfp16,+jsconv,+lse,+neon,+pauth,+predres,+ras,+rcpc,+rdm,+sb,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a,-fmv" }
+attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) }
+
+!llvm.module.flags = !{!0, !1, !2}
+!llvm.ident = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 2}
+!2 = !{i32 7, !"frame-pointer", i32 1}
+!3 = !{!"clang version 20.0.0git"}
+!4 = distinct !{!4, !5}
+!5 = !{!"llvm.loop.mustprogress"}
+;.
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]]}
+; CHECK: [[META5]] = !{!"llvm.loop.mustprogress"}
+;.