[llvm] c4f32b4 - [AArch64] Tests for postinc scheduling write operands. NFC

David Green via llvm-commits llvm-commits at lists.llvm.org
Sat Oct 7 13:31:20 PDT 2023


Author: David Green
Date: 2023-10-07T21:31:08+01:00
New Revision: c4f32b4bda8e536bdd539762785805dbe42f0852

URL: https://github.com/llvm/llvm-project/commit/c4f32b4bda8e536bdd539762785805dbe42f0852
DIFF: https://github.com/llvm/llvm-project/commit/c4f32b4bda8e536bdd539762785805dbe42f0852.diff

LOG: [AArch64] Tests for postinc scheduling write operands. NFC

Add tests similar to those in D159254 for other CPU targets.

Added: 
    llvm/test/tools/llvm-mca/AArch64/Cortex/A510-writeback.s
    llvm/test/tools/llvm-mca/AArch64/Cortex/A53-writeback.s
    llvm/test/tools/llvm-mca/AArch64/Cortex/A55-writeback.s
    llvm/test/tools/llvm-mca/AArch64/Cortex/A57-writeback.s
    llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-writeback.s
    llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s
    llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-writeback.s

Modified: 
    

Removed: 
    


################################################################################
diff  --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-writeback.s
new file mode 100644
index 000000000000000..f9b4509531d35cb
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-writeback.s
@@ -0,0 +1,5285 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a510 --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=1 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN G01
+ld1  { v1.1d }, [x27], #8
+add x0, x27, 1
+ld1  { v1.2d }, [x27], #16
+add x0, x27, 1
+ld1  { v1.2s }, [x27], #8
+add x0, x27, 1
+ld1  { v1.4h }, [x27], #8
+add x0, x27, 1
+ld1  { v1.4s }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G02
+ld1  { v1.8b }, [x27], #8
+add x0, x27, 1
+ld1  { v1.8h }, [x27], #16
+add x0, x27, 1
+ld1  { v1.16b }, [x27], #16
+add x0, x27, 1
+ld1  { v1.1d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G03
+ld1  { v1.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G04
+ld1  { v1.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.1d, v2.1d }, [x27], #16
+add x0, x27, 1
+ld1  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+ld1  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+ld1  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G05
+ld1  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+ld1  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+ld1  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+ld1  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+ld1  { v1.1d, v2.1d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G06
+ld1  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G07
+ld1  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.1d, v2.1d, v3.1d }, [x27], #24
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+ld1  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G08
+ld1  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+ld1  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+ld1  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+ld1  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G09
+ld1  { v1.1d, v2.1d, v3.1d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G10
+ld1  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G11
+ld1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+ld1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+ld1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+ld1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G12
+ld1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+ld1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G13
+ld1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.b }[0], [x27], #1
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G14
+ld1  { v1.b }[8], [x27], #1
+add x0, x27, 1
+ld1  { v1.b }[0], [x27], x28
+add x0, x27, 1
+ld1  { v1.b }[8], [x27], x28
+add x0, x27, 1
+ld1  { v1.h }[0], [x27], #2
+add x0, x27, 1
+ld1  { v1.h }[4], [x27], #2
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G15
+ld1  { v1.h }[0], [x27], x28
+add x0, x27, 1
+ld1  { v1.h }[4], [x27], x28
+add x0, x27, 1
+ld1  { v1.s }[0], [x27], #4
+add x0, x27, 1
+ld1  { v1.s }[0], [x27], x28
+add x0, x27, 1
+ld1  { v1.d }[0], [x27], #8
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G16
+ld1  { v1.d }[0], [x27], x28
+add x0, x27, 1
+ld1r  { v1.1d }, [x27], #8
+add x0, x27, 1
+ld1r  { v1.2d }, [x27], #8
+add x0, x27, 1
+ld1r  { v1.2s }, [x27], #4
+add x0, x27, 1
+ld1r  { v1.4h }, [x27], #2
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G17
+ld1r  { v1.4s }, [x27], #4
+add x0, x27, 1
+ld1r  { v1.8b }, [x27], #1
+add x0, x27, 1
+ld1r  { v1.8h }, [x27], #2
+add x0, x27, 1
+ld1r  { v1.16b }, [x27], #1
+add x0, x27, 1
+ld1r  { v1.1d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G18
+ld1r  { v1.2d }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.2s }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.4h }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.4s }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G19
+ld1r  { v1.8h }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.16b }, [x27], x28
+add x0, x27, 1
+ld2  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+ld2  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+ld2  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G20
+ld2  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+ld2  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+ld2  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+ld2  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+ld2  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G21
+ld2  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+ld2  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+ld2  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+ld2  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+ld2  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G22
+ld2  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+ld2  { v1.b, v2.b }[0], [x27], #2
+add x0, x27, 1
+ld2  { v1.b, v2.b }[8], [x27], #2
+add x0, x27, 1
+ld2  { v1.b, v2.b }[0], [x27], x28
+add x0, x27, 1
+ld2  { v1.b, v2.b }[8], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G23
+ld2  { v1.h, v2.h }[0], [x27], #4
+add x0, x27, 1
+ld2  { v1.h, v2.h }[4], [x27], #4
+add x0, x27, 1
+ld2  { v1.h, v2.h }[0], [x27], x28
+add x0, x27, 1
+ld2  { v1.h, v2.h }[4], [x27], x28
+add x0, x27, 1
+ld2  { v1.s, v2.s }[0], [x27], #8
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G24
+ld2  { v1.s, v2.s }[0], [x27], x28
+add x0, x27, 1
+ld2  { v1.d, v2.d }[0], [x27], #16
+add x0, x27, 1
+ld2  { v1.d, v2.d }[0], [x27], x28
+add x0, x27, 1
+ld2r  { v1.1d, v2.1d }, [x27], #16
+add x0, x27, 1
+ld2r  { v1.2d, v2.2d }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G25
+ld2r  { v1.2s, v2.2s }, [x27], #8
+add x0, x27, 1
+ld2r  { v1.4h, v2.4h }, [x27], #4
+add x0, x27, 1
+ld2r  { v1.4s, v2.4s }, [x27], #8
+add x0, x27, 1
+ld2r  { v1.8b, v2.8b }, [x27], #2
+add x0, x27, 1
+ld2r  { v1.8h, v2.8h }, [x27], #4
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G26
+ld2r  { v1.16b, v2.16b }, [x27], #2
+add x0, x27, 1
+ld2r  { v1.1d, v2.1d }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G27
+ld2r  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+ld3  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G28
+ld3  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+ld3  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+ld3  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+ld3  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+ld3  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G29
+ld3  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+ld3  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+ld3  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+ld3  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+ld3  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G30
+ld3  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+ld3  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+ld3  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+ld3  { v1.b, v2.b, v3.b }[0], [x27], #3
+add x0, x27, 1
+ld3  { v1.b, v2.b, v3.b }[8], [x27], #3
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G31
+ld3  { v1.b, v2.b, v3.b }[0], [x27], x28
+add x0, x27, 1
+ld3  { v1.b, v2.b, v3.b }[8], [x27], x28
+add x0, x27, 1
+ld3  { v1.h, v2.h, v3.h }[0], [x27], #6
+add x0, x27, 1
+ld3  { v1.h, v2.h, v3.h }[4], [x27], #6
+add x0, x27, 1
+ld3  { v1.h, v2.h, v3.h }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G32
+ld3  { v1.h, v2.h, v3.h }[4], [x27], x28
+add x0, x27, 1
+ld3  { v1.s, v2.s, v3.s }[0], [x27], #12
+add x0, x27, 1
+ld3  { v1.s, v2.s, v3.s }[0], [x27], x28
+add x0, x27, 1
+ld3  { v1.d, v2.d, v3.d }[0], [x27], #24
+add x0, x27, 1
+ld3  { v1.d, v2.d, v3.d }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G33
+ld3r  { v1.1d, v2.1d, v3.1d }, [x27], #24
+add x0, x27, 1
+ld3r  { v1.2d, v2.2d, v3.2d }, [x27], #24
+add x0, x27, 1
+ld3r  { v1.2s, v2.2s, v3.2s }, [x27], #12
+add x0, x27, 1
+ld3r  { v1.4h, v2.4h, v3.4h }, [x27], #6
+add x0, x27, 1
+ld3r  { v1.4s, v2.4s, v3.4s }, [x27], #12
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G34
+ld3r  { v1.8b, v2.8b, v3.8b }, [x27], #3
+add x0, x27, 1
+ld3r  { v1.8h, v2.8h, v3.8h }, [x27], #6
+add x0, x27, 1
+ld3r  { v1.16b, v2.16b, v3.16b }, [x27], #3
+add x0, x27, 1
+ld3r  { v1.1d, v2.1d, v3.1d }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G35
+ld3r  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G36
+ld3r  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+ld4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+ld4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+ld4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+ld4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G37
+ld4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+ld4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+ld4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+ld4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+ld4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G38
+ld4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+ld4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+ld4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+ld4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+ld4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G39
+ld4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+add x0, x27, 1
+ld4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+add x0, x27, 1
+ld4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+add x0, x27, 1
+ld4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+add x0, x27, 1
+ld4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G40
+ld4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+add x0, x27, 1
+ld4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+add x0, x27, 1
+ld4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+add x0, x27, 1
+ld4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+add x0, x27, 1
+ld4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G41
+ld4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+add x0, x27, 1
+ld4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+add x0, x27, 1
+ld4r  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+add x0, x27, 1
+ld4r  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+add x0, x27, 1
+ld4r  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G42
+ld4r  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+add x0, x27, 1
+ld4r  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+add x0, x27, 1
+ld4r  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+add x0, x27, 1
+ld4r  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+add x0, x27, 1
+ld4r  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G43
+ld4r  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G44
+ld4r  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+ldp  s1, s2, [x27], #248
+add x0, x27, 1
+ldp  d1, d2, [x27], #496
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G45
+ldp  q1, q2, [x27], #992
+add x0, x27, 1
+ldp  s1, s2, [x27, #248]!
+add x0, x27, 1
+ldp  d1, d2, [x27, #496]!
+add x0, x27, 1
+ldp  q1, q2, [x27, #992]!
+add x0, x27, 1
+ldp  w1, w2, [x27], #248
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G46
+ldp  x1, x2, [x27], #496
+add x0, x27, 1
+ldp  w1, w2, [x27, #248]!
+add x0, x27, 1
+ldp  x1, x2, [x27, #496]!
+add x0, x27, 1
+ldpsw  x1, x2, [x27], #248
+add x0, x27, 1
+ldpsw  x1, x2, [x27, #248]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G47
+ldr  b1, [x27], #254
+add x0, x27, 1
+ldr  h1, [x27], #254
+add x0, x27, 1
+ldr  s1, [x27], #254
+add x0, x27, 1
+ldr  d1, [x27], #254
+add x0, x27, 1
+ldr  q1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G48
+ldr  b1, [x27, #254]!
+add x0, x27, 1
+ldr  h1, [x27, #254]!
+add x0, x27, 1
+ldr  s1, [x27, #254]!
+add x0, x27, 1
+ldr  d1, [x27, #254]!
+add x0, x27, 1
+ldr  q1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G49
+ldr  w1, [x27], #254
+add x0, x27, 1
+ldr  x1, [x27], #254
+add x0, x27, 1
+ldr  w1, [x27, #254]!
+add x0, x27, 1
+ldr  x1, [x27, #254]!
+add x0, x27, 1
+ldrb  w1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G50
+ldrb  w1, [x27, #254]!
+add x0, x27, 1
+ldrh  w1, [x27], #254
+add x0, x27, 1
+ldrh  w1, [x27, #254]!
+add x0, x27, 1
+ldrsb  w1, [x27], #254
+add x0, x27, 1
+ldrsb  x1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G51
+ldrsb  w1, [x27, #254]!
+add x0, x27, 1
+ldrsb  x1, [x27, #254]!
+add x0, x27, 1
+ldrsh  w1, [x27], #254
+add x0, x27, 1
+ldrsh  x1, [x27], #254
+add x0, x27, 1
+ldrsh  w1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G52
+ldrsh  x1, [x27, #254]!
+add x0, x27, 1
+ldrsw  x1, [x27], #254
+add x0, x27, 1
+ldrsw  x1, [x27, #254]!
+add x0, x27, 1
+st1  { v1.1d }, [x27], #8
+add x0, x27, 1
+st1  { v1.2d }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G53
+st1  { v1.2s }, [x27], #8
+add x0, x27, 1
+st1  { v1.4h }, [x27], #8
+add x0, x27, 1
+st1  { v1.4s }, [x27], #16
+add x0, x27, 1
+st1  { v1.8b }, [x27], #8
+add x0, x27, 1
+st1  { v1.8h }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G54
+st1  { v1.16b }, [x27], #16
+add x0, x27, 1
+st1  { v1.1d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2s }, [x27], x28
+add x0, x27, 1
+st1  { v1.4h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G55
+st1  { v1.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b }, [x27], x28
+add x0, x27, 1
+st1  { v1.8h }, [x27], x28
+add x0, x27, 1
+st1  { v1.16b }, [x27], x28
+add x0, x27, 1
+st1  { v1.1d, v2.1d }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G56
+st1  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+st1  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+st1  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+st1  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+st1  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G57
+st1  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+st1  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+st1  { v1.1d, v2.1d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G58
+st1  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+st1  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+st1  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+st1  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G59
+st1  { v1.1d, v2.1d, v3.1d }, [x27], #24
+add x0, x27, 1
+st1  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+st1  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+st1  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G60
+st1  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+st1  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+st1  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+st1  { v1.1d, v2.1d, v3.1d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G61
+st1  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+st1  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+st1  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G62
+st1  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+st1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+add x0, x27, 1
+st1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+st1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G63
+st1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+st1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+st1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+st1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+st1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G64
+st1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+st1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G65
+st1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+st1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+st1  { v1.b }[0], [x27], #1
+add x0, x27, 1
+st1  { v1.b }[8], [x27], #1
+add x0, x27, 1
+st1  { v1.b }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G66
+st1  { v1.b }[8], [x27], x28
+add x0, x27, 1
+st1  { v1.h }[0], [x27], #2
+add x0, x27, 1
+st1  { v1.h }[4], [x27], #2
+add x0, x27, 1
+st1  { v1.h }[0], [x27], x28
+add x0, x27, 1
+st1  { v1.h }[4], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G67
+st1  { v1.s }[0], [x27], #4
+add x0, x27, 1
+st1  { v1.s }[0], [x27], x28
+add x0, x27, 1
+st1  { v1.d }[0], [x27], #8
+add x0, x27, 1
+st1  { v1.d }[0], [x27], x28
+add x0, x27, 1
+st2  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G68
+st2  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+st2  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+st2  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+st2  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+st2  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G69
+st2  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+st2  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+st2  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+st2  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+st2  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G70
+st2  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+st2  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+st2  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+st2  { v1.b, v2.b }[0], [x27], #2
+add x0, x27, 1
+st2  { v1.b, v2.b }[8], [x27], #2
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G71
+st2  { v1.b, v2.b }[0], [x27], x28
+add x0, x27, 1
+st2  { v1.b, v2.b }[8], [x27], x28
+add x0, x27, 1
+st2  { v1.h, v2.h }[0], [x27], #4
+add x0, x27, 1
+st2  { v1.h, v2.h }[4], [x27], #4
+add x0, x27, 1
+st2  { v1.h, v2.h }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G72
+st2  { v1.h, v2.h }[4], [x27], x28
+add x0, x27, 1
+st2  { v1.s, v2.s }[0], [x27], #8
+add x0, x27, 1
+st2  { v1.s, v2.s }[0], [x27], x28
+add x0, x27, 1
+st2  { v1.d, v2.d }[0], [x27], #16
+add x0, x27, 1
+st2  { v1.d, v2.d }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G73
+st3  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+st3  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+st3  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G74
+st3  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+st3  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+st3  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+st3  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+st3  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G75
+st3  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+st3  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+st3  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+st3  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+st3  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G76
+st3  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[0], [x27], #3
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[8], [x27], #3
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[0], [x27], x28
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[8], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G77
+st3  { v1.h, v2.h, v3.h }[0], [x27], #6
+add x0, x27, 1
+st3  { v1.h, v2.h, v3.h }[4], [x27], #6
+add x0, x27, 1
+st3  { v1.h, v2.h, v3.h }[0], [x27], x28
+add x0, x27, 1
+st3  { v1.h, v2.h, v3.h }[4], [x27], x28
+add x0, x27, 1
+st3  { v1.s, v2.s, v3.s }[0], [x27], #12
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G78
+st3  { v1.s, v2.s, v3.s }[0], [x27], x28
+add x0, x27, 1
+st3  { v1.d, v2.d, v3.d }[0], [x27], #24
+add x0, x27, 1
+st3  { v1.d, v2.d, v3.d }[0], [x27], x28
+add x0, x27, 1
+st4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+st4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G79
+st4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+st4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+st4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+st4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+st4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G80
+st4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+st4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+st4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+st4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+st4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G81
+st4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+st4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+st4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+add x0, x27, 1
+st4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+add x0, x27, 1
+st4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G82
+st4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G83
+st4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+add x0, x27, 1
+st4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+add x0, x27, 1
+st4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+add x0, x27, 1
+st4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G84
+stp  s1, s2, [x27], #248
+add x0, x27, 1
+stp  d1, d2, [x27], #496
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G85
+stp  q1, q2, [x27], #992
+add x0, x27, 1
+stp  s1, s2, [x27, #248]!
+add x0, x27, 1
+stp  d1, d2, [x27, #496]!
+add x0, x27, 1
+stp  q1, q2, [x27, #992]!
+add x0, x27, 1
+stp  w1, w2, [x27], #248
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G86
+stp  x1, x2, [x27], #496
+add x0, x27, 1
+stp  w1, w2, [x27, #248]!
+add x0, x27, 1
+stp  x1, x2, [x27, #496]!
+add x0, x27, 1
+str  b1, [x27], #254
+add x0, x27, 1
+str  h1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G87
+str  s1, [x27], #254
+add x0, x27, 1
+str  d1, [x27], #254
+add x0, x27, 1
+str  q1, [x27], #254
+add x0, x27, 1
+str  b1, [x27, #254]!
+add x0, x27, 1
+str  h1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G88
+str  s1, [x27, #254]!
+add x0, x27, 1
+str  d1, [x27, #254]!
+add x0, x27, 1
+str  q1, [x27, #254]!
+add x0, x27, 1
+str  w1, [x27], #254
+add x0, x27, 1
+str  x1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G89
+str  w1, [x27, #254]!
+add x0, x27, 1
+str  x1, [x27, #254]!
+add x0, x27, 1
+strb  w1, [x27], #254
+add x0, x27, 1
+strb  w1, [x27, #254]!
+add x0, x27, 1
+strh  w1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G90
+strh  w1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G91
+ldr  x1, [x27], #254
+add x0, x27, 1
+ldr  x2, [x1], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# CHECK:      [0] Code Region - G01
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2201
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.68
+# CHECK-NEXT: IPC:               0.45
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012
+
+# CHECK:      [0,0]     DeeE .    .    .    . .   ld1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,1]     .  DE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    . .   ld1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .  DE.    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .   DeeE  .    . .   ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,5]     .    .    . DE .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .  DeeE   . .   ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,7]     .    .    .    .DE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    . DeeeE.   ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    .DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [1] Code Region - G02
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2301
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.65
+# CHECK-NEXT: IPC:               0.43
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeE .    .    .    .  .   ld1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .  .   ld1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .  DE.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .   DeeeE .    .  .   ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,5]     .    .    .  DE.    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .   DeeE  .  .   ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DE .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .  DeeeE.   ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [2] Code Region - G03
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2201
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.68
+# CHECK-NEXT: IPC:               0.45
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012
+
+# CHECK:      [0,0]     DeeE .    .    .    . .   ld1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeE  .    .    . .   ld1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DE .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    . .   ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DE .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .  DeeE   . .   ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .DE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    . DeeeE.   ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [3] Code Region - G04
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [4] Code Region - G05
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [5] Code Region - G06
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [6] Code Region - G07
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [7] Code Region - G08
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [8] Code Region - G09
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [9] Code Region - G10
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [10] Code Region - G11
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [11] Code Region - G12
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [12] Code Region - G13
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2401
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.62
+# CHECK-NEXT: IPC:               0.42
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234
+
+# CHECK:      [0,0]     DeeeE.    .    .    .   .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .   .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .   .   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.   .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeE.   ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,9]     .    .    .    .    .  DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [13] Code Region - G14
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeE .    .    .    .   ld1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeE  .    .    .   ld1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeE   .    .   ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeE    .   ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeE.   ld1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [14] Code Region - G15
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeE .    .    .    .   ld1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeE  .    .    .   ld1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeE   .    .   ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeE    .   ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeE.   ld1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [15] Code Region - G16
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeE .    .    .    .   ld1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeE  .    .    .   ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeE   .    .   ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeE    .   ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeE.   ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [16] Code Region - G17
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeE .    .    .    .   ld1r	{ v1.4s }, [x27], #4
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeE  .    .    .   ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeE   .    .   ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeE    .   ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeE.   ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1r	{ v1.4s }, [x27], #4
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [17] Code Region - G18
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeE .    .    .    .   ld1r	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeE  .    .    .   ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeE   .    .   ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeE    .   ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeE.   ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1r	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [18] Code Region - G19
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeE .    .    .    .    .   ld1r	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeE  .    .    .    .   ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DE .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeeeE.    .    .   ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1r	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [19] Code Region - G20
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3301
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.45
+# CHECK-NEXT: IPC:               0.30
+# CHECK-NEXT: Block RThroughput: 9.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .  .   ld2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     .    .DE  .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .  .   ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .    .DE  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeeE .    .    .  .   ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    .  DE.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .   DeeeeeE    .  .   ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DE   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .DeeeeeE.   ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [20] Code Region - G21
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2901
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.52
+# CHECK-NEXT: IPC:               0.34
+# CHECK-NEXT: Block RThroughput: 7.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   .   ld2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   .   ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeeE   .    .   .   ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .DE  .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    . DeeeE   .   .   ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .DE  .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    . DeeeeeE.   ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .  DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [21] Code Region - G22
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2701
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.56
+# CHECK-NEXT: IPC:               0.37
+# CHECK-NEXT: Block RThroughput: 6.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    . .   ld2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .    .DE  .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    . .   ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,3]     .    .    .DE  .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeE   .    . .   ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,5]     .    .    .    .DE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    . DeeeE   . .   ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .DE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    . DeeeE.   ld2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [22] Code Region - G23
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [23] Code Region - G24
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [24] Code Region - G25
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld2r	{ v1.2s, v2.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], #8
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [25] Code Region - G26
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld2r	{ v1.16b, v2.16b }, [x27], #2
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], #2
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [26] Code Region - G27
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2601
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.58
+# CHECK-NEXT: IPC:               0.38
+# CHECK-NEXT: Block RThroughput: 5.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    ..   ld2r	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    ..   ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    ..   ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    ..   ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [27] Code Region - G28
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,3]     .    .    .DE  .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    .   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,5]     .    .    .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DE.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE.   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [28] Code Region - G29
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    .   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DE  .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DE.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE.   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [29] Code Region - G30
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2801
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.54
+# CHECK-NEXT: IPC:               0.36
+# CHECK-NEXT: Block RThroughput: 6.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .  .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .  .   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DE  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .  .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    . DE .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeE  .  .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,7]     .    .    .    .    . DE .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .  DeeeE.   ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,9]     .    .    .    .    .    . DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [30] Code Region - G31
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [31] Code Region - G32
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [32] Code Region - G33
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [33] Code Region - G34
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [34] Code Region - G35
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [35] Code Region - G36
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3301
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.45
+# CHECK-NEXT: IPC:               0.30
+# CHECK-NEXT: Block RThroughput: 9.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .    .  .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeeE   .    .    .    .  .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,3]     .    .    .DE  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeeE .    .    .  .   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    .  DE.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .   DeeeeeE    .  .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DE   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .DeeeeeE.   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [36] Code Region - G37
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,1]     .    .DE  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeeeE .    .    .    .    .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,3]     .    .    .  DE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeeeE    .    .    .   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .    .    .    DE   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeeeE  .    .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    . DE .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeeeE.   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [37] Code Region - G38
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     .    .DE  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeeeE .    .    .    .    .   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .  DE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeeeE    .    .    .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    DE   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeeeE  .    .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    . DE .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeeeE.   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [38] Code Region - G39
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [39] Code Region - G40
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [40] Code Region - G41
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [41] Code Region - G42
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [42] Code Region - G43
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [43] Code Region - G44
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2101
+# CHECK-NEXT: Total uOps:        1700
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.81
+# CHECK-NEXT: IPC:               0.48
+# CHECK-NEXT: Block RThroughput: 5.7
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01
+
+# CHECK:      [0,0]     DeeeE.    .    .    ..   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    ..   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    ..   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeE ..   ldp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,7]     .    .    .    . DE ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .  DeeE   ldp	d1, d2, [x27], #496
+# CHECK-NEXT: [0,9]     .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ldp	s1, s2, [x27], #248
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ldp	d1, d2, [x27], #496
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [44] Code Region - G45
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1501
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.33
+# CHECK-NEXT: IPC:               0.67
+# CHECK-NEXT: Block RThroughput: 6.7
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012345
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeE .    .    .   ldp	q1, q2, [x27], #992
+# CHECK-NEXT: [0,1]     . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  DeeE   .    .   ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,3]     .    DE   .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .DeeE.    .   ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .    .  DE.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .   DeeE  .   ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,7]     .    .    .DE  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    . DeeE   ldp	w1, w2, [x27], #248
+# CHECK-NEXT: [0,9]     .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldp	q1, q2, [x27], #992
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ldp	w1, w2, [x27], #248
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [45] Code Region - G46
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1501
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.33
+# CHECK-NEXT: IPC:               0.67
+# CHECK-NEXT: Block RThroughput: 6.7
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012345
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeE .    .    .   ldp	x1, x2, [x27], #496
+# CHECK-NEXT: [0,1]     . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  DeeE   .    .   ldp	w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,3]     .    DE   .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .DeeE.    .   ldp	x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .    .  DE.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .   DeeE  .   ldpsw	x1, x2, [x27], #248
+# CHECK-NEXT: [0,7]     .    .    .DE  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    . DeeE   ldpsw	x1, x2, [x27, #248]!
+# CHECK-NEXT: [0,9]     .    .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldp	x1, x2, [x27], #496
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldp	w1, w2, [x27, #248]!
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ldp	x1, x2, [x27, #496]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ldpsw	x1, x2, [x27], #248
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ldpsw	x1, x2, [x27, #248]!
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [46] Code Region - G47
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.50
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeE  .    .   ldr	b1, [x27], #254
+# CHECK-NEXT: [0,1]     .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeE.    .   ldr	h1, [x27], #254
+# CHECK-NEXT: [0,3]     .  DE.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeE   .   ldr	s1, [x27], #254
+# CHECK-NEXT: [0,5]     .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeE .   ldr	d1, [x27], #254
+# CHECK-NEXT: [0,7]     .    . DE .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeE   ldr	q1, [x27], #254
+# CHECK-NEXT: [0,9]     .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldr	b1, [x27], #254
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldr	h1, [x27], #254
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ldr	s1, [x27], #254
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ldr	d1, [x27], #254
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ldr	q1, [x27], #254
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [47] Code Region - G48
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.50
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeE  .    .   ldr	b1, [x27, #254]!
+# CHECK-NEXT: [0,1]     .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeE.    .   ldr	h1, [x27, #254]!
+# CHECK-NEXT: [0,3]     .  DE.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeE   .   ldr	s1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeE .   ldr	d1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .    . DE .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeE   ldr	q1, [x27, #254]!
+# CHECK-NEXT: [0,9]     .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldr	b1, [x27, #254]!
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldr	h1, [x27, #254]!
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ldr	s1, [x27, #254]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ldr	d1, [x27, #254]!
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ldr	q1, [x27, #254]!
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [48] Code Region - G49
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.50
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeE  .    .   ldr	w1, [x27], #254
+# CHECK-NEXT: [0,1]     .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeE.    .   ldr	x1, [x27], #254
+# CHECK-NEXT: [0,3]     .  DE.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeE   .   ldr	w1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeE .   ldr	x1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .    . DE .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeE   ldrb	w1, [x27], #254
+# CHECK-NEXT: [0,9]     .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldr	w1, [x27], #254
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldr	x1, [x27], #254
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ldr	w1, [x27, #254]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ldr	x1, [x27, #254]!
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ldrb	w1, [x27], #254
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [49] Code Region - G50
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.50
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeE  .    .   ldrb	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeE.    .   ldrh	w1, [x27], #254
+# CHECK-NEXT: [0,3]     .  DE.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeE   .   ldrh	w1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeE .   ldrsb	w1, [x27], #254
+# CHECK-NEXT: [0,7]     .    . DE .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeE   ldrsb	x1, [x27], #254
+# CHECK-NEXT: [0,9]     .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldrb	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldrh	w1, [x27], #254
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ldrh	w1, [x27, #254]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ldrsb	w1, [x27], #254
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ldrsb	x1, [x27], #254
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [50] Code Region - G51
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.50
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeE  .    .   ldrsb	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeE.    .   ldrsb	x1, [x27, #254]!
+# CHECK-NEXT: [0,3]     .  DE.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeE   .   ldrsh	w1, [x27], #254
+# CHECK-NEXT: [0,5]     .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeE .   ldrsh	x1, [x27], #254
+# CHECK-NEXT: [0,7]     .    . DE .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeE   ldrsh	w1, [x27, #254]!
+# CHECK-NEXT: [0,9]     .    .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldrsb	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldrsb	x1, [x27, #254]!
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ldrsh	w1, [x27], #254
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ldrsh	x1, [x27], #254
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ldrsh	w1, [x27, #254]!
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [51] Code Region - G52
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1401
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.07
+# CHECK-NEXT: IPC:               0.71
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeE  .    .   .   ldrsh	x1, [x27, #254]!
+# CHECK-NEXT: [0,1]     .DE  .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeE.    .   .   ldrsw	x1, [x27], #254
+# CHECK-NEXT: [0,3]     .  DE.    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeE   .   .   ldrsw	x1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .    DE   .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    DeeeE.   .   st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,7]     .    .   DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .   DeeeE.   st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .  DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldrsh	x1, [x27, #254]!
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldrsw	x1, [x27], #254
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ldrsw	x1, [x27, #254]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [52] Code Region - G53
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2002
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01
+
+# CHECK:      [0,0]     DeeeE.    .    .    ..   st1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     .   DE    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    ..   st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,3]     .    .  DE.    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    ..   st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,5]     .    .    . DE .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   ..   st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,7]     .    .    .    .DE  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE.   st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [53] Code Region - G54
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2002
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01
+
+# CHECK:      [0,0]     DeeeE.    .    .    ..   st1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,1]     .   DE    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    ..   st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .  DE.    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    ..   st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DE .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   ..   st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .DE  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE.   st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [54] Code Region - G55
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2002
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01
+
+# CHECK:      [0,0]     DeeeE.    .    .    ..   st1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    ..   st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .  DE.    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    ..   st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DE .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   ..   st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .DE  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE.   st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [55] Code Region - G56
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2002
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01
+
+# CHECK:      [0,0]     DeeeE.    .    .    ..   st1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,1]     .   DE    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    ..   st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .  DE.    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    ..   st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,5]     .    .    . DE .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   ..   st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .DE  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE.   st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [56] Code Region - G57
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2002
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01
+
+# CHECK:      [0,0]     DeeeE.    .    .    ..   st1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,1]     .   DE    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    ..   st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,3]     .    .  DE.    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    ..   st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DE .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   ..   st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .DE  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE.   st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [57] Code Region - G58
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2002
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01
+
+# CHECK:      [0,0]     DeeeE.    .    .    ..   st1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    ..   st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .  DE.    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    ..   st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DE .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   ..   st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .DE  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE.   st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [58] Code Region - G59
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [59] Code Region - G60
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [60] Code Region - G61
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [61] Code Region - G62
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 18.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [62] Code Region - G63
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 20.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [63] Code Region - G64
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 20.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [64] Code Region - G65
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2202
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.68
+# CHECK-NEXT: IPC:               0.45
+# CHECK-NEXT: Block RThroughput: 11.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeE    .    .    .  .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DE   .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .  .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    DE   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .  .   st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,5]     .    .    .   DE    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .   DeeeE .  .   st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,7]     .    .    .    .  DE.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .  DeeeE.   st1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [65] Code Region - G66
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2002
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01
+
+# CHECK:      [0,0]     DeeeE.    .    .    ..   st1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    ..   st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,3]     .    .  DE.    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    ..   st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,5]     .    .    . DE .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   ..   st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .DE  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE.   st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [66] Code Region - G67
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2102
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.71
+# CHECK-NEXT: IPC:               0.48
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012
+
+# CHECK:      [0,0]     DeeeE.    .    .    . .   st1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,1]     .   DE    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    . .   st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    .  DE.    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    . .   st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,5]     .    .    . DE .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   . .   st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .DE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeeE.   st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .    .    .DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [67] Code Region - G68
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 14.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [68] Code Region - G69
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 16.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [69] Code Region - G70
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 14.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [70] Code Region - G71
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [71] Code Region - G72
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [72] Code Region - G73
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      600
+# CHECK-NEXT: Total Cycles:      1502
+# CHECK-NEXT: Total uOps:        900
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 12.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    ..   st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,1]     .    DE   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    ..   st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,3]     .    .    DE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE.   st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [73] Code Region - G74
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 20.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [74] Code Region - G75
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 20.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [75] Code Region - G76
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 12.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [76] Code Region - G77
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [77] Code Region - G78
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 14.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [78] Code Region - G79
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 20.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [79] Code Region - G80
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 20.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [80] Code Region - G81
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 14.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [81] Code Region - G82
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [82] Code Region - G83
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      800
+# CHECK-NEXT: Total Cycles:      2002
+# CHECK-NEXT: Total uOps:        1200
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01
+
+# CHECK:      [0,0]     DeeeeE    .    .    ..   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,1]     .    DE   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    ..   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    .    DE   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    ..   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    DE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE.   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [83] Code Region - G84
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      201
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    2.99
+# CHECK-NEXT: IPC:               1.99
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012
+
+# CHECK:      [0,0]     DE.   stp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,1]     DE.   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DE   stp	d1, d2, [x27], #496
+# CHECK-NEXT: [0,3]     .DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       stp	s1, s2, [x27], #248
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       stp	d1, d2, [x27], #496
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [84] Code Region - G85
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    2.99
+# CHECK-NEXT: IPC:               2.00
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345
+
+# CHECK:      [0,0]     DE   .   stp	q1, q2, [x27], #992
+# CHECK-NEXT: [0,1]     DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DE  .   stp	s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,3]     .DE  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DE .   stp	d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,5]     . DE .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DE.   stp	q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,7]     .  DE.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   DE   stp	w1, w2, [x27], #248
+# CHECK-NEXT: [0,9]     .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       stp	q1, q2, [x27], #992
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       stp	s1, s2, [x27, #248]!
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       stp	d1, d2, [x27, #496]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       stp	q1, q2, [x27, #992]!
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       stp	w1, w2, [x27], #248
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [85] Code Region - G86
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    2.99
+# CHECK-NEXT: IPC:               2.00
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345
+
+# CHECK:      [0,0]     DE   .   stp	x1, x2, [x27], #496
+# CHECK-NEXT: [0,1]     DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DE  .   stp	w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,3]     .DE  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DE .   stp	x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,5]     . DE .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DE.   str	b1, [x27], #254
+# CHECK-NEXT: [0,7]     .  DE.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   DE   str	h1, [x27], #254
+# CHECK-NEXT: [0,9]     .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       stp	x1, x2, [x27], #496
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       stp	w1, w2, [x27, #248]!
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       stp	x1, x2, [x27, #496]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       str	b1, [x27], #254
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       str	h1, [x27], #254
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [86] Code Region - G87
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    2.99
+# CHECK-NEXT: IPC:               2.00
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345
+
+# CHECK:      [0,0]     DE   .   str	s1, [x27], #254
+# CHECK-NEXT: [0,1]     DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DE  .   str	d1, [x27], #254
+# CHECK-NEXT: [0,3]     .DE  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DE .   str	q1, [x27], #254
+# CHECK-NEXT: [0,5]     . DE .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DE.   str	b1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .  DE.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   DE   str	h1, [x27, #254]!
+# CHECK-NEXT: [0,9]     .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       str	s1, [x27], #254
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       str	d1, [x27], #254
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       str	q1, [x27], #254
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       str	b1, [x27, #254]!
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       str	h1, [x27, #254]!
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [87] Code Region - G88
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    2.99
+# CHECK-NEXT: IPC:               2.00
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345
+
+# CHECK:      [0,0]     DE   .   str	s1, [x27, #254]!
+# CHECK-NEXT: [0,1]     DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DE  .   str	d1, [x27, #254]!
+# CHECK-NEXT: [0,3]     .DE  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DE .   str	q1, [x27, #254]!
+# CHECK-NEXT: [0,5]     . DE .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DE.   str	w1, [x27], #254
+# CHECK-NEXT: [0,7]     .  DE.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   DE   str	x1, [x27], #254
+# CHECK-NEXT: [0,9]     .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       str	s1, [x27, #254]!
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       str	d1, [x27, #254]!
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       str	q1, [x27, #254]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       str	w1, [x27], #254
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       str	x1, [x27], #254
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [88] Code Region - G89
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    2.99
+# CHECK-NEXT: IPC:               2.00
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345
+
+# CHECK:      [0,0]     DE   .   str	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DE  .   str	x1, [x27, #254]!
+# CHECK-NEXT: [0,3]     .DE  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DE .   strb	w1, [x27], #254
+# CHECK-NEXT: [0,5]     . DE .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DE.   strb	w1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .  DE.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   DE   strh	w1, [x27], #254
+# CHECK-NEXT: [0,9]     .   DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       str	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       str	x1, [x27, #254]!
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       strb	w1, [x27], #254
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       strb	w1, [x27, #254]!
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       strh	w1, [x27], #254
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [89] Code Region - G90
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      101
+# CHECK-NEXT: Total uOps:        300
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    2.97
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     01
+
+# CHECK:      [0,0]     DE   strh	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       strh	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [90] Code Region - G91
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      401
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.50
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     01234
+
+# CHECK:      [0,0]     DeE .   ldr	x1, [x27], #254
+# CHECK-NEXT: [0,1]     .DE .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeE   ldr	x2, [x1], #254
+# CHECK-NEXT: [0,3]     .  DE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldr	x1, [x27], #254
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldr	x2, [x1], #254
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>

diff  --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A53-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A53-writeback.s
new file mode 100644
index 000000000000000..c5ca6f9f1764aa0
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A53-writeback.s
@@ -0,0 +1,5290 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a53 --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=1 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN G01
+ld1  { v1.1d }, [x27], #8
+add x0, x27, 1
+ld1  { v1.2d }, [x27], #16
+add x0, x27, 1
+ld1  { v1.2s }, [x27], #8
+add x0, x27, 1
+ld1  { v1.4h }, [x27], #8
+add x0, x27, 1
+ld1  { v1.4s }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G02
+ld1  { v1.8b }, [x27], #8
+add x0, x27, 1
+ld1  { v1.8h }, [x27], #16
+add x0, x27, 1
+ld1  { v1.16b }, [x27], #16
+add x0, x27, 1
+ld1  { v1.1d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G03
+ld1  { v1.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G04
+ld1  { v1.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.1d, v2.1d }, [x27], #16
+add x0, x27, 1
+ld1  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+ld1  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+ld1  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G05
+ld1  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+ld1  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+ld1  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+ld1  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+ld1  { v1.1d, v2.1d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G06
+ld1  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G07
+ld1  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.1d, v2.1d, v3.1d }, [x27], #24
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+ld1  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G08
+ld1  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+ld1  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+ld1  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+ld1  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G09
+ld1  { v1.1d, v2.1d, v3.1d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G10
+ld1  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G11
+ld1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+ld1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+ld1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+ld1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G12
+ld1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+ld1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G13
+ld1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.b }[0], [x27], #1
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G14
+ld1  { v1.b }[8], [x27], #1
+add x0, x27, 1
+ld1  { v1.b }[0], [x27], x28
+add x0, x27, 1
+ld1  { v1.b }[8], [x27], x28
+add x0, x27, 1
+ld1  { v1.h }[0], [x27], #2
+add x0, x27, 1
+ld1  { v1.h }[4], [x27], #2
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G15
+ld1  { v1.h }[0], [x27], x28
+add x0, x27, 1
+ld1  { v1.h }[4], [x27], x28
+add x0, x27, 1
+ld1  { v1.s }[0], [x27], #4
+add x0, x27, 1
+ld1  { v1.s }[0], [x27], x28
+add x0, x27, 1
+ld1  { v1.d }[0], [x27], #8
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G16
+ld1  { v1.d }[0], [x27], x28
+add x0, x27, 1
+ld1r  { v1.1d }, [x27], #8
+add x0, x27, 1
+ld1r  { v1.2d }, [x27], #8
+add x0, x27, 1
+ld1r  { v1.2s }, [x27], #4
+add x0, x27, 1
+ld1r  { v1.4h }, [x27], #2
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G17
+ld1r  { v1.4s }, [x27], #4
+add x0, x27, 1
+ld1r  { v1.8b }, [x27], #1
+add x0, x27, 1
+ld1r  { v1.8h }, [x27], #2
+add x0, x27, 1
+ld1r  { v1.16b }, [x27], #1
+add x0, x27, 1
+ld1r  { v1.1d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G18
+ld1r  { v1.2d }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.2s }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.4h }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.4s }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G19
+ld1r  { v1.8h }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.16b }, [x27], x28
+add x0, x27, 1
+ld2  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+ld2  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+ld2  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G20
+ld2  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+ld2  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+ld2  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+ld2  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+ld2  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G21
+ld2  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+ld2  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+ld2  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+ld2  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+ld2  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G22
+ld2  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+ld2  { v1.b, v2.b }[0], [x27], #2
+add x0, x27, 1
+ld2  { v1.b, v2.b }[8], [x27], #2
+add x0, x27, 1
+ld2  { v1.b, v2.b }[0], [x27], x28
+add x0, x27, 1
+ld2  { v1.b, v2.b }[8], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G23
+ld2  { v1.h, v2.h }[0], [x27], #4
+add x0, x27, 1
+ld2  { v1.h, v2.h }[4], [x27], #4
+add x0, x27, 1
+ld2  { v1.h, v2.h }[0], [x27], x28
+add x0, x27, 1
+ld2  { v1.h, v2.h }[4], [x27], x28
+add x0, x27, 1
+ld2  { v1.s, v2.s }[0], [x27], #8
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G24
+ld2  { v1.s, v2.s }[0], [x27], x28
+add x0, x27, 1
+ld2  { v1.d, v2.d }[0], [x27], #16
+add x0, x27, 1
+ld2  { v1.d, v2.d }[0], [x27], x28
+add x0, x27, 1
+ld2r  { v1.1d, v2.1d }, [x27], #16
+add x0, x27, 1
+ld2r  { v1.2d, v2.2d }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G25
+ld2r  { v1.2s, v2.2s }, [x27], #8
+add x0, x27, 1
+ld2r  { v1.4h, v2.4h }, [x27], #4
+add x0, x27, 1
+ld2r  { v1.4s, v2.4s }, [x27], #8
+add x0, x27, 1
+ld2r  { v1.8b, v2.8b }, [x27], #2
+add x0, x27, 1
+ld2r  { v1.8h, v2.8h }, [x27], #4
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G26
+ld2r  { v1.16b, v2.16b }, [x27], #2
+add x0, x27, 1
+ld2r  { v1.1d, v2.1d }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G27
+ld2r  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+ld3  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G28
+ld3  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+ld3  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+ld3  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+ld3  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+ld3  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G29
+ld3  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+ld3  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+ld3  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+ld3  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+ld3  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G30
+ld3  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+ld3  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+ld3  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+ld3  { v1.b, v2.b, v3.b }[0], [x27], #3
+add x0, x27, 1
+ld3  { v1.b, v2.b, v3.b }[8], [x27], #3
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G31
+ld3  { v1.b, v2.b, v3.b }[0], [x27], x28
+add x0, x27, 1
+ld3  { v1.b, v2.b, v3.b }[8], [x27], x28
+add x0, x27, 1
+ld3  { v1.h, v2.h, v3.h }[0], [x27], #6
+add x0, x27, 1
+ld3  { v1.h, v2.h, v3.h }[4], [x27], #6
+add x0, x27, 1
+ld3  { v1.h, v2.h, v3.h }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G32
+ld3  { v1.h, v2.h, v3.h }[4], [x27], x28
+add x0, x27, 1
+ld3  { v1.s, v2.s, v3.s }[0], [x27], #12
+add x0, x27, 1
+ld3  { v1.s, v2.s, v3.s }[0], [x27], x28
+add x0, x27, 1
+ld3  { v1.d, v2.d, v3.d }[0], [x27], #24
+add x0, x27, 1
+ld3  { v1.d, v2.d, v3.d }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G33
+ld3r  { v1.1d, v2.1d, v3.1d }, [x27], #24
+add x0, x27, 1
+ld3r  { v1.2d, v2.2d, v3.2d }, [x27], #24
+add x0, x27, 1
+ld3r  { v1.2s, v2.2s, v3.2s }, [x27], #12
+add x0, x27, 1
+ld3r  { v1.4h, v2.4h, v3.4h }, [x27], #6
+add x0, x27, 1
+ld3r  { v1.4s, v2.4s, v3.4s }, [x27], #12
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G34
+ld3r  { v1.8b, v2.8b, v3.8b }, [x27], #3
+add x0, x27, 1
+ld3r  { v1.8h, v2.8h, v3.8h }, [x27], #6
+add x0, x27, 1
+ld3r  { v1.16b, v2.16b, v3.16b }, [x27], #3
+add x0, x27, 1
+ld3r  { v1.1d, v2.1d, v3.1d }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G35
+ld3r  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G36
+ld3r  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+ld4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+ld4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+ld4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+ld4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G37
+ld4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+ld4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+ld4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+ld4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+ld4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G38
+ld4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+ld4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+ld4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+ld4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+ld4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G39
+ld4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+add x0, x27, 1
+ld4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+add x0, x27, 1
+ld4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+add x0, x27, 1
+ld4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+add x0, x27, 1
+ld4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G40
+ld4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+add x0, x27, 1
+ld4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+add x0, x27, 1
+ld4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+add x0, x27, 1
+ld4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+add x0, x27, 1
+ld4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G41
+ld4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+add x0, x27, 1
+ld4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+add x0, x27, 1
+ld4r  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+add x0, x27, 1
+ld4r  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+add x0, x27, 1
+ld4r  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G42
+ld4r  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+add x0, x27, 1
+ld4r  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+add x0, x27, 1
+ld4r  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+add x0, x27, 1
+ld4r  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+add x0, x27, 1
+ld4r  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G43
+ld4r  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G44
+ld4r  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+ldp  s1, s2, [x27], #248
+add x0, x27, 1
+ldp  d1, d2, [x27], #496
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G45
+ldp  q1, q2, [x27], #992
+add x0, x27, 1
+ldp  s1, s2, [x27, #248]!
+add x0, x27, 1
+ldp  d1, d2, [x27, #496]!
+add x0, x27, 1
+ldp  q1, q2, [x27, #992]!
+add x0, x27, 1
+ldp  w1, w2, [x27], #248
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G46
+ldp  x1, x2, [x27], #496
+add x0, x27, 1
+ldp  w1, w2, [x27, #248]!
+add x0, x27, 1
+ldp  x1, x2, [x27, #496]!
+add x0, x27, 1
+ldpsw  x1, x2, [x27], #248
+add x0, x27, 1
+ldpsw  x1, x2, [x27, #248]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G47
+ldr  b1, [x27], #254
+add x0, x27, 1
+ldr  h1, [x27], #254
+add x0, x27, 1
+ldr  s1, [x27], #254
+add x0, x27, 1
+ldr  d1, [x27], #254
+add x0, x27, 1
+ldr  q1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G48
+ldr  b1, [x27, #254]!
+add x0, x27, 1
+ldr  h1, [x27, #254]!
+add x0, x27, 1
+ldr  s1, [x27, #254]!
+add x0, x27, 1
+ldr  d1, [x27, #254]!
+add x0, x27, 1
+ldr  q1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G49
+ldr  w1, [x27], #254
+add x0, x27, 1
+ldr  x1, [x27], #254
+add x0, x27, 1
+ldr  w1, [x27, #254]!
+add x0, x27, 1
+ldr  x1, [x27, #254]!
+add x0, x27, 1
+ldrb  w1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G50
+ldrb  w1, [x27, #254]!
+add x0, x27, 1
+ldrh  w1, [x27], #254
+add x0, x27, 1
+ldrh  w1, [x27, #254]!
+add x0, x27, 1
+ldrsb  w1, [x27], #254
+add x0, x27, 1
+ldrsb  x1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G51
+ldrsb  w1, [x27, #254]!
+add x0, x27, 1
+ldrsb  x1, [x27, #254]!
+add x0, x27, 1
+ldrsh  w1, [x27], #254
+add x0, x27, 1
+ldrsh  x1, [x27], #254
+add x0, x27, 1
+ldrsh  w1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G52
+ldrsh  x1, [x27, #254]!
+add x0, x27, 1
+ldrsw  x1, [x27], #254
+add x0, x27, 1
+ldrsw  x1, [x27, #254]!
+add x0, x27, 1
+st1  { v1.1d }, [x27], #8
+add x0, x27, 1
+st1  { v1.2d }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G53
+st1  { v1.2s }, [x27], #8
+add x0, x27, 1
+st1  { v1.4h }, [x27], #8
+add x0, x27, 1
+st1  { v1.4s }, [x27], #16
+add x0, x27, 1
+st1  { v1.8b }, [x27], #8
+add x0, x27, 1
+st1  { v1.8h }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G54
+st1  { v1.16b }, [x27], #16
+add x0, x27, 1
+st1  { v1.1d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2s }, [x27], x28
+add x0, x27, 1
+st1  { v1.4h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G55
+st1  { v1.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b }, [x27], x28
+add x0, x27, 1
+st1  { v1.8h }, [x27], x28
+add x0, x27, 1
+st1  { v1.16b }, [x27], x28
+add x0, x27, 1
+st1  { v1.1d, v2.1d }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G56
+st1  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+st1  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+st1  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+st1  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+st1  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G57
+st1  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+st1  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+st1  { v1.1d, v2.1d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G58
+st1  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+st1  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+st1  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+st1  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G59
+st1  { v1.1d, v2.1d, v3.1d }, [x27], #24
+add x0, x27, 1
+st1  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+st1  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+st1  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G60
+st1  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+st1  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+st1  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+st1  { v1.1d, v2.1d, v3.1d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G61
+st1  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+st1  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+st1  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G62
+st1  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+st1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+add x0, x27, 1
+st1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+st1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G63
+st1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+st1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+st1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+st1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+st1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G64
+st1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+st1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G65
+st1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+st1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+st1  { v1.b }[0], [x27], #1
+add x0, x27, 1
+st1  { v1.b }[8], [x27], #1
+add x0, x27, 1
+st1  { v1.b }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G66
+st1  { v1.b }[8], [x27], x28
+add x0, x27, 1
+st1  { v1.h }[0], [x27], #2
+add x0, x27, 1
+st1  { v1.h }[4], [x27], #2
+add x0, x27, 1
+st1  { v1.h }[0], [x27], x28
+add x0, x27, 1
+st1  { v1.h }[4], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G67
+st1  { v1.s }[0], [x27], #4
+add x0, x27, 1
+st1  { v1.s }[0], [x27], x28
+add x0, x27, 1
+st1  { v1.d }[0], [x27], #8
+add x0, x27, 1
+st1  { v1.d }[0], [x27], x28
+add x0, x27, 1
+st2  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G68
+st2  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+st2  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+st2  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+st2  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+st2  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G69
+st2  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+st2  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+st2  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+st2  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+st2  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G70
+st2  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+st2  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+st2  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+st2  { v1.b, v2.b }[0], [x27], #2
+add x0, x27, 1
+st2  { v1.b, v2.b }[8], [x27], #2
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G71
+st2  { v1.b, v2.b }[0], [x27], x28
+add x0, x27, 1
+st2  { v1.b, v2.b }[8], [x27], x28
+add x0, x27, 1
+st2  { v1.h, v2.h }[0], [x27], #4
+add x0, x27, 1
+st2  { v1.h, v2.h }[4], [x27], #4
+add x0, x27, 1
+st2  { v1.h, v2.h }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G72
+st2  { v1.h, v2.h }[4], [x27], x28
+add x0, x27, 1
+st2  { v1.s, v2.s }[0], [x27], #8
+add x0, x27, 1
+st2  { v1.s, v2.s }[0], [x27], x28
+add x0, x27, 1
+st2  { v1.d, v2.d }[0], [x27], #16
+add x0, x27, 1
+st2  { v1.d, v2.d }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G73
+st3  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+st3  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+st3  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G74
+st3  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+st3  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+st3  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+st3  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+st3  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G75
+st3  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+st3  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+st3  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+st3  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+st3  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G76
+st3  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[0], [x27], #3
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[8], [x27], #3
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[0], [x27], x28
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[8], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G77
+st3  { v1.h, v2.h, v3.h }[0], [x27], #6
+add x0, x27, 1
+st3  { v1.h, v2.h, v3.h }[4], [x27], #6
+add x0, x27, 1
+st3  { v1.h, v2.h, v3.h }[0], [x27], x28
+add x0, x27, 1
+st3  { v1.h, v2.h, v3.h }[4], [x27], x28
+add x0, x27, 1
+st3  { v1.s, v2.s, v3.s }[0], [x27], #12
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G78
+st3  { v1.s, v2.s, v3.s }[0], [x27], x28
+add x0, x27, 1
+st3  { v1.d, v2.d, v3.d }[0], [x27], #24
+add x0, x27, 1
+st3  { v1.d, v2.d, v3.d }[0], [x27], x28
+add x0, x27, 1
+st4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+st4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G79
+st4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+st4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+st4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+st4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+st4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G80
+st4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+st4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+st4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+st4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+st4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G81
+st4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+st4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+st4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+add x0, x27, 1
+st4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+add x0, x27, 1
+st4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G82
+st4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G83
+st4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+add x0, x27, 1
+st4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+add x0, x27, 1
+st4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+add x0, x27, 1
+st4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G84
+stp  s1, s2, [x27], #248
+add x0, x27, 1
+stp  d1, d2, [x27], #496
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G85
+stp  q1, q2, [x27], #992
+add x0, x27, 1
+stp  s1, s2, [x27, #248]!
+add x0, x27, 1
+stp  d1, d2, [x27, #496]!
+add x0, x27, 1
+stp  q1, q2, [x27, #992]!
+add x0, x27, 1
+stp  w1, w2, [x27], #248
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G86
+stp  x1, x2, [x27], #496
+add x0, x27, 1
+stp  w1, w2, [x27, #248]!
+add x0, x27, 1
+stp  x1, x2, [x27, #496]!
+add x0, x27, 1
+str  b1, [x27], #254
+add x0, x27, 1
+str  h1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G87
+str  s1, [x27], #254
+add x0, x27, 1
+str  d1, [x27], #254
+add x0, x27, 1
+str  q1, [x27], #254
+add x0, x27, 1
+str  b1, [x27, #254]!
+add x0, x27, 1
+str  h1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G88
+str  s1, [x27, #254]!
+add x0, x27, 1
+str  d1, [x27, #254]!
+add x0, x27, 1
+str  q1, [x27, #254]!
+add x0, x27, 1
+str  w1, [x27], #254
+add x0, x27, 1
+str  x1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G89
+str  w1, [x27, #254]!
+add x0, x27, 1
+str  x1, [x27, #254]!
+add x0, x27, 1
+strb  w1, [x27], #254
+add x0, x27, 1
+strb  w1, [x27, #254]!
+add x0, x27, 1
+strh  w1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G90
+strh  w1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G91
+ldr  x1, [x27], #254
+add x0, x27, 1
+ldr  x2, [x1], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# CHECK:      [0] Code Region - G01
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [1] Code Region - G02
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [2] Code Region - G03
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [3] Code Region - G04
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3901
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.38
+# CHECK-NEXT: IPC:               0.26
+# CHECK-NEXT: Block RThroughput: 9.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   .   ld1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeeE  .    .    .    .    .   .   ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .    . DeeE    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    DeeeeE    .    .    .   .   ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    .    DeeE .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .  DeeeeE .    .   .   ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .  DeeE   .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .DeeeeE  .   ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [4] Code Region - G05
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [5] Code Region - G06
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [6] Code Region - G07
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4301
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.35
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 13.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .  .   ld1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .  .   ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeeE  .    .    .    .  .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .    .    .    . DeeE    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    DeeeeeE   .    .  .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .DeeE.    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .   DeeeeeE  .   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [7] Code Region - G08
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.33
+# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: Block RThroughput: 15.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012345
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    .    .    .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .   DeeeeeE    .    .    .    .    .    .   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,3]     .    .    .    DeeE .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .  DeeeeeE.    .    .    .    .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .    .    .    .   DeeE  .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    . DeeeeeE .    .    .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .  DeeE   .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .DeeeeeE  .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [8] Code Region - G09
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.33
+# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: Block RThroughput: 15.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012345
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    .    .    .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .   DeeeeeE    .    .    .    .    .    .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .    DeeE .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .  DeeeeeE.    .    .    .    .   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .   DeeE  .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    . DeeeeeE .    .    .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .  DeeE   .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .DeeeeeE  .   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [9] Code Region - G10
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4701
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.32
+# CHECK-NEXT: IPC:               0.21
+# CHECK-NEXT: Block RThroughput: 17.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01234567
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    .    .    . .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .   DeeeeeE    .    .    .    .    .    . .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .    DeeE .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .  DeeeeeE.    .    .    .    . .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .   DeeE  .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    . DeeeeeeE.    .    . .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .   DeeE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    . DeeeeeeE. .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [10] Code Region - G11
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      5001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.30
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 20.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    .    .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .    DeeeeeeE  .    .    .    .    .    .    .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,3]     .    .    .    . DeeE    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .    DeeeeeeE  .    .    .    .    .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .    .    .    .    . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .    DeeeeeeE  .    .    .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    DeeeeeeE  .   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [11] Code Region - G12
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      5001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.30
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 20.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    .    .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .    DeeeeeeE  .    .    .    .    .    .    .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .    . DeeE    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .    DeeeeeeE  .    .    .    .    .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .    . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .    DeeeeeeE  .    .    .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    DeeeeeeE  .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [12] Code Region - G13
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4701
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.32
+# CHECK-NEXT: IPC:               0.21
+# CHECK-NEXT: Block RThroughput: 17.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01234567
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    . .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .    DeeeeeeE  .    .    .    .    .    . .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .    . DeeE    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .    DeeeeeeE  .    .    .    . .   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .    . DeeE    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .    DeeeeeeE  .    . .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    . DeeE    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    DeeeE. .   ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [13] Code Region - G14
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [14] Code Region - G15
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [15] Code Region - G16
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [16] Code Region - G17
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1r	{ v1.4s }, [x27], #4
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1r	{ v1.4s }, [x27], #4
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [17] Code Region - G18
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1r	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1r	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [18] Code Region - G19
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .    .   ld1r	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .    .   ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeeeeE   .    .    .    .   ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1r	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [19] Code Region - G20
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4801
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.21
+# CHECK-NEXT: Block RThroughput: 18.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    .  .   ld2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .    DeeeeE    .    .    .    .    .    .  .   ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .    .    DeeE .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .  DeeeeeeE    .    .    .    .  .   ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    .    .    DeeE .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .  DeeeeeeE    .    .  .   ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    DeeE .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .  DeeeeeeE  .   ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [20] Code Region - G21
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4401
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.34
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 14.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01234
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   .   ld2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   .   ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeeeE .    .    .    .   .   ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .  DeeE   .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .DeeeeE   .    .   .   ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .DeeE.    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .   DeeeeeeE  .   ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [21] Code Region - G22
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3801
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.39
+# CHECK-NEXT: IPC:               0.26
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .  .   ld2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .    DeeeE.    .    .    .    .  .   ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,3]     .    .    .   DeeE  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    . DeeeE   .    .    .  .   ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeE .    .  .   ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .  DeeE   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .DeeeE  .   ld2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [22] Code Region - G23
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [23] Code Region - G24
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [24] Code Region - G25
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld2r	{ v1.2s, v2.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], #8
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [25] Code Region - G26
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld2r	{ v1.16b, v2.16b }, [x27], #2
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], #2
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [26] Code Region - G27
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3701
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.41
+# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    . .   ld2r	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    . .   ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    . .   ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    . .   ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeeeE. .   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [27] Code Region - G28
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      5001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.30
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 20.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    .    .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .    DeeeeeeE  .    .    .    .    .    .    .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,3]     .    .    .    . DeeE    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .    DeeeeeeE  .    .    .    .    .   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,5]     .    .    .    .    .    . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .    DeeeeeeE  .    .    .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    DeeeeeeE  .   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [28] Code Region - G29
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4901
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 19.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    .   .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .    DeeeeeE   .    .    .    .    .    .   .   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .    .DeeE.    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .   DeeeeeeE   .    .    .    .   .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .    .DeeE.    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .   DeeeeeeE   .    .   .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .DeeE.    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .   DeeeeeeE  .   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [29] Code Region - G30
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4601
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.33
+# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: Block RThroughput: 16.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    ..   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .    DeeeeeeE  .    .    .    .    .    ..   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .    . DeeE    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .    DeeeeeeE  .    .    .    ..   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .    . DeeE    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .    DeeeeE    .    ..   ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    DeeE .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .  DeeeeE ..   ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .  DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [30] Code Region - G31
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [31] Code Region - G32
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [32] Code Region - G33
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [33] Code Region - G34
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [34] Code Region - G35
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [35] Code Region - G36
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      5101
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 21.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          01
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .    .    ..   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeeeE    .    .    .    .    .    .    ..   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,3]     .    .    .    DeeE .    .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .  DeeeeeeeE   .    .    .    .    ..   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    .    .    .DeeE.    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .   DeeeeeeeE  .    .    ..   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    . DeeE    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    DeeeeeeeE ..   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .  DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [36] Code Region - G37
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      5401
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.28
+# CHECK-NEXT: IPC:               0.19
+# CHECK-NEXT: Block RThroughput: 24.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          01234
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeE .    .    .    .    .    .    .    .    .   .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,1]     .    .  DeeE   .    .    .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .    .DeeeeeeeE.    .    .    .    .    .    .   .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,3]     .    .    .    .   DeeE  .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .    . DeeeeeeeE    .    .    .    .   .   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .    .    .    .    .    DeeE .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .    .  DeeeeeeE    .    .   .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .    DeeE .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    .  DeeeeeeeE  .   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [37] Code Region - G38
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      5501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.27
+# CHECK-NEXT: IPC:               0.18
+# CHECK-NEXT: Block RThroughput: 25.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeE .    .    .    .    .    .    .    .    .    .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     .    .  DeeE   .    .    .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .    .DeeeeeeeE.    .    .    .    .    .    .    .   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .    .   DeeE  .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .    . DeeeeeeeE    .    .    .    .    .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .    .    DeeE .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .    .  DeeeeeeeE   .    .    .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .    .DeeE.    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    .   DeeeeeeeE  .   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [38] Code Region - G39
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [39] Code Region - G40
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [40] Code Region - G41
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [41] Code Region - G42
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [42] Code Region - G43
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [43] Code Region - G44
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3201
+# CHECK-NEXT: Total uOps:        1700
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.53
+# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    . .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    . .   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeE . .   ldp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE   ldp	d1, d2, [x27], #496
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ldp	s1, s2, [x27], #248
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ldp	d1, d2, [x27], #496
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [44] Code Region - G45
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeeE.    .    .    .   ldp	q1, q2, [x27], #992
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ldp	w1, w2, [x27], #248
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldp	q1, q2, [x27], #992
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ldp	w1, w2, [x27], #248
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [45] Code Region - G46
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeeE.    .    .    .   ldp	x1, x2, [x27], #496
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ldp	w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ldp	x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ldpsw	x1, x2, [x27], #248
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ldpsw	x1, x2, [x27, #248]!
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldp	x1, x2, [x27], #496
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldp	w1, w2, [x27, #248]!
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ldp	x1, x2, [x27, #496]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ldpsw	x1, x2, [x27], #248
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ldpsw	x1, x2, [x27, #248]!
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [46] Code Region - G47
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeeE.    .    .    .   ldr	b1, [x27], #254
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ldr	h1, [x27], #254
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ldr	s1, [x27], #254
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ldr	d1, [x27], #254
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ldr	q1, [x27], #254
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldr	b1, [x27], #254
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldr	h1, [x27], #254
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ldr	s1, [x27], #254
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ldr	d1, [x27], #254
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ldr	q1, [x27], #254
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [47] Code Region - G48
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeeE.    .    .    .   ldr	b1, [x27, #254]!
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ldr	h1, [x27, #254]!
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ldr	s1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ldr	d1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ldr	q1, [x27, #254]!
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldr	b1, [x27, #254]!
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldr	h1, [x27, #254]!
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ldr	s1, [x27, #254]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ldr	d1, [x27, #254]!
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ldr	q1, [x27, #254]!
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [48] Code Region - G49
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeeE.    .    .    .   ldr	w1, [x27], #254
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ldr	x1, [x27], #254
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ldr	w1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ldr	x1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ldrb	w1, [x27], #254
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldr	w1, [x27], #254
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldr	x1, [x27], #254
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ldr	w1, [x27, #254]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ldr	x1, [x27, #254]!
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ldrb	w1, [x27], #254
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [49] Code Region - G50
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeeE.    .    .    .   ldrb	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ldrh	w1, [x27], #254
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ldrh	w1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ldrsb	w1, [x27], #254
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ldrsb	x1, [x27], #254
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldrb	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldrh	w1, [x27], #254
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ldrh	w1, [x27, #254]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ldrsb	w1, [x27], #254
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ldrsb	x1, [x27], #254
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [50] Code Region - G51
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeeE.    .    .    .   ldrsb	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ldrsb	x1, [x27, #254]!
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ldrsh	w1, [x27], #254
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ldrsh	x1, [x27], #254
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ldrsh	w1, [x27, #254]!
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldrsb	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldrsb	x1, [x27, #254]!
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ldrsh	w1, [x27], #254
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ldrsh	x1, [x27], #254
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ldrsh	w1, [x27, #254]!
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [51] Code Region - G52
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2201
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.68
+# CHECK-NEXT: IPC:               0.45
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012
+
+# CHECK:      [0,0]     DeeeE.    .    .    . .   ldrsh	x1, [x27, #254]!
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    . .   ldrsw	x1, [x27], #254
+# CHECK-NEXT: [0,3]     .    DeeE .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    . .   ldrsw	x1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    DeeeE.    . .   st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,7]     .    .    .   DeeE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    DeeeE. .   st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldrsh	x1, [x27, #254]!
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldrsw	x1, [x27], #254
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ldrsw	x1, [x27, #254]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [52] Code Region - G53
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [53] Code Region - G54
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [54] Code Region - G55
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [55] Code Region - G56
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [56] Code Region - G57
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [57] Code Region - G58
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [58] Code Region - G59
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [59] Code Region - G60
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [60] Code Region - G61
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [61] Code Region - G62
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [62] Code Region - G63
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [63] Code Region - G64
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [64] Code Region - G65
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2703
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.55
+# CHECK-NEXT: IPC:               0.37
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .   .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeE   .    .   .   st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,5]     .    .    .    .DeeE.    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    . DeeeE   .   .   st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,7]     .    .    .    .    .DeeE.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    . DeeeE  .   st1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [65] Code Region - G66
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [66] Code Region - G67
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2603
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.58
+# CHECK-NEXT: IPC:               0.38
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .  .   st1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .  .   st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .  .   st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .  .   st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE  .   st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .    .    .    DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [67] Code Region - G68
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2703
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.55
+# CHECK-NEXT: IPC:               0.37
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   .   st2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   .   st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   .   st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    DeeE .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .DeeeE    .   .   st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    .    DeeE .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .DeeeeE  .   st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [68] Code Region - G69
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2803
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.54
+# CHECK-NEXT: IPC:               0.36
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .   st2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    .   st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeE   .    .    .   st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .DeeE.    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    . DeeeE   .    .   st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .DeeE.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    . DeeeeE  .   st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [69] Code Region - G70
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2703
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.55
+# CHECK-NEXT: IPC:               0.37
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   .   st2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   .   st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    DeeE .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .DeeeeE   .    .   .   st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .DeeE.    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    . DeeeE   .   .   st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .    .    .    .    .DeeE.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    . DeeeE  .   st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [70] Code Region - G71
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [71] Code Region - G72
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [72] Code Region - G73
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      600
+# CHECK-NEXT: Total Cycles:      2003
+# CHECK-NEXT: Total uOps:        900
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.45
+# CHECK-NEXT: IPC:               0.30
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012
+
+# CHECK:      [0,0]     DeeeeE    .    .    . .   st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,1]     .    DeeE .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeeE  .    . .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,3]     .    .    . DeeE    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .  DeeeeeE. .   st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [73] Code Region - G74
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3403
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.44
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 14.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    ..   st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeeeE .    .    .    .    ..   st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeeeE    .    .    ..   st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,5]     .    .    .    .    DeeE .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeeeE  .    ..   st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,7]     .    .    .    .    .    . DeeE    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeeE ..   st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .  DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [74] Code Region - G75
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3503
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 15.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    . .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeeeE .    .    .    .    . .   st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeeeE    .    .    . .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    DeeE .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeeeE  .    . .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    . DeeE    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeeeE. .   st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [75] Code Region - G76
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3103
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.48
+# CHECK-NEXT: IPC:               0.32
+# CHECK-NEXT: Block RThroughput: 11.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .  .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeeE  .    .    .    .  .   st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,3]     .    .    . DeeE    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .  DeeeeE .    .    .  .   st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .   DeeeeE.    .  .   st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .   DeeE  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    DeeeeE  .   st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [76] Code Region - G77
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [77] Code Region - G78
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3103
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.48
+# CHECK-NEXT: IPC:               0.32
+# CHECK-NEXT: Block RThroughput: 11.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .  .   st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    .  .   st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    .  .   st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    .  .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeeE  .   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [78] Code Region - G79
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3503
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 15.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    . .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeeeE .    .    .    .    . .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeeeE    .    .    . .   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    .    DeeE .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeeeE  .    . .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,7]     .    .    .    .    .    . DeeE    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeeeE. .   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [79] Code Region - G80
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3403
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.44
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 14.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    ..   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeeE  .    .    .    .    ..   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    . DeeE    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .  DeeeeeE.    .    .    ..   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .   DeeE  .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    DeeeeeE   .    ..   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .DeeE.    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    . DeeeeeE ..   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .  DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [80] Code Region - G81
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3203
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.47
+# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: Block RThroughput: 12.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .   .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeeeE .    .    .    .   .   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeeE.    .    .   .   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .    .    .    .   DeeE  .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    DeeeeE    .   .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .DeeeeE  .   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [81] Code Region - G82
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [82] Code Region - G83
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      800
+# CHECK-NEXT: Total Cycles:      2403
+# CHECK-NEXT: Total uOps:        1200
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    ..   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    ..   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE ..   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [83] Code Region - G84
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      801
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeeeE.  .   stp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,1]     .DeeE.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE   stp	d1, d2, [x27], #496
+# CHECK-NEXT: [0,3]     .    DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       stp	s1, s2, [x27], #248
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       stp	d1, d2, [x27], #496
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [84] Code Region - G85
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeeE.    .    .    .   stp	q1, q2, [x27], #992
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   stp	s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   stp	d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   stp	q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   stp	w1, w2, [x27], #248
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       stp	q1, q2, [x27], #992
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       stp	s1, s2, [x27, #248]!
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       stp	d1, d2, [x27, #496]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       stp	q1, q2, [x27, #992]!
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       stp	w1, w2, [x27], #248
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [85] Code Region - G86
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeeE.    .    .    .   stp	x1, x2, [x27], #496
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   stp	w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   stp	x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   str	b1, [x27], #254
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   str	h1, [x27], #254
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       stp	x1, x2, [x27], #496
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       stp	w1, w2, [x27, #248]!
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       stp	x1, x2, [x27, #496]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       str	b1, [x27], #254
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       str	h1, [x27], #254
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [86] Code Region - G87
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeeE.    .    .    .   str	s1, [x27], #254
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   str	d1, [x27], #254
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   str	q1, [x27], #254
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   str	b1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   str	h1, [x27, #254]!
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       str	s1, [x27], #254
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       str	d1, [x27], #254
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       str	q1, [x27], #254
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       str	b1, [x27, #254]!
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       str	h1, [x27, #254]!
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [87] Code Region - G88
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeeE.    .    .    .   str	s1, [x27, #254]!
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   str	d1, [x27, #254]!
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   str	q1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   str	w1, [x27], #254
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   str	x1, [x27], #254
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       str	s1, [x27, #254]!
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       str	d1, [x27, #254]!
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       str	q1, [x27, #254]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       str	w1, [x27], #254
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       str	x1, [x27], #254
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [88] Code Region - G89
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeeE.    .    .    .   str	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   str	x1, [x27, #254]!
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   strb	w1, [x27], #254
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   strb	w1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   strh	w1, [x27], #254
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       str	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       str	x1, [x27, #254]!
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       strb	w1, [x27], #254
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       strb	w1, [x27, #254]!
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       strh	w1, [x27], #254
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [89] Code Region - G90
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      401
+# CHECK-NEXT: Total uOps:        300
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     01234
+
+# CHECK:      [0,0]     DeeeE   strh	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     .DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       strh	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [90] Code Region - G91
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      801
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeeeE.  .   ldr	x1, [x27], #254
+# CHECK-NEXT: [0,1]     .DeeE.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE   ldr	x2, [x1], #254
+# CHECK-NEXT: [0,3]     .    DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldr	x1, [x27], #254
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldr	x2, [x1], #254
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>

diff  --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-writeback.s
new file mode 100644
index 000000000000000..76f46ccf0c5cb4a
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-writeback.s
@@ -0,0 +1,5290 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=1 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN G01
+ld1  { v1.1d }, [x27], #8
+add x0, x27, 1
+ld1  { v1.2d }, [x27], #16
+add x0, x27, 1
+ld1  { v1.2s }, [x27], #8
+add x0, x27, 1
+ld1  { v1.4h }, [x27], #8
+add x0, x27, 1
+ld1  { v1.4s }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G02
+ld1  { v1.8b }, [x27], #8
+add x0, x27, 1
+ld1  { v1.8h }, [x27], #16
+add x0, x27, 1
+ld1  { v1.16b }, [x27], #16
+add x0, x27, 1
+ld1  { v1.1d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G03
+ld1  { v1.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G04
+ld1  { v1.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.1d, v2.1d }, [x27], #16
+add x0, x27, 1
+ld1  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+ld1  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+ld1  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G05
+ld1  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+ld1  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+ld1  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+ld1  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+ld1  { v1.1d, v2.1d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G06
+ld1  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G07
+ld1  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.1d, v2.1d, v3.1d }, [x27], #24
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+ld1  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G08
+ld1  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+ld1  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+ld1  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+ld1  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G09
+ld1  { v1.1d, v2.1d, v3.1d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G10
+ld1  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G11
+ld1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+ld1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+ld1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+ld1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G12
+ld1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+ld1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G13
+ld1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.b }[0], [x27], #1
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G14
+ld1  { v1.b }[8], [x27], #1
+add x0, x27, 1
+ld1  { v1.b }[0], [x27], x28
+add x0, x27, 1
+ld1  { v1.b }[8], [x27], x28
+add x0, x27, 1
+ld1  { v1.h }[0], [x27], #2
+add x0, x27, 1
+ld1  { v1.h }[4], [x27], #2
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G15
+ld1  { v1.h }[0], [x27], x28
+add x0, x27, 1
+ld1  { v1.h }[4], [x27], x28
+add x0, x27, 1
+ld1  { v1.s }[0], [x27], #4
+add x0, x27, 1
+ld1  { v1.s }[0], [x27], x28
+add x0, x27, 1
+ld1  { v1.d }[0], [x27], #8
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G16
+ld1  { v1.d }[0], [x27], x28
+add x0, x27, 1
+ld1r  { v1.1d }, [x27], #8
+add x0, x27, 1
+ld1r  { v1.2d }, [x27], #8
+add x0, x27, 1
+ld1r  { v1.2s }, [x27], #4
+add x0, x27, 1
+ld1r  { v1.4h }, [x27], #2
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G17
+ld1r  { v1.4s }, [x27], #4
+add x0, x27, 1
+ld1r  { v1.8b }, [x27], #1
+add x0, x27, 1
+ld1r  { v1.8h }, [x27], #2
+add x0, x27, 1
+ld1r  { v1.16b }, [x27], #1
+add x0, x27, 1
+ld1r  { v1.1d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G18
+ld1r  { v1.2d }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.2s }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.4h }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.4s }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G19
+ld1r  { v1.8h }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.16b }, [x27], x28
+add x0, x27, 1
+ld2  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+ld2  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+ld2  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G20
+ld2  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+ld2  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+ld2  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+ld2  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+ld2  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G21
+ld2  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+ld2  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+ld2  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+ld2  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+ld2  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G22
+ld2  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+ld2  { v1.b, v2.b }[0], [x27], #2
+add x0, x27, 1
+ld2  { v1.b, v2.b }[8], [x27], #2
+add x0, x27, 1
+ld2  { v1.b, v2.b }[0], [x27], x28
+add x0, x27, 1
+ld2  { v1.b, v2.b }[8], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G23
+ld2  { v1.h, v2.h }[0], [x27], #4
+add x0, x27, 1
+ld2  { v1.h, v2.h }[4], [x27], #4
+add x0, x27, 1
+ld2  { v1.h, v2.h }[0], [x27], x28
+add x0, x27, 1
+ld2  { v1.h, v2.h }[4], [x27], x28
+add x0, x27, 1
+ld2  { v1.s, v2.s }[0], [x27], #8
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G24
+ld2  { v1.s, v2.s }[0], [x27], x28
+add x0, x27, 1
+ld2  { v1.d, v2.d }[0], [x27], #16
+add x0, x27, 1
+ld2  { v1.d, v2.d }[0], [x27], x28
+add x0, x27, 1
+ld2r  { v1.1d, v2.1d }, [x27], #16
+add x0, x27, 1
+ld2r  { v1.2d, v2.2d }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G25
+ld2r  { v1.2s, v2.2s }, [x27], #8
+add x0, x27, 1
+ld2r  { v1.4h, v2.4h }, [x27], #4
+add x0, x27, 1
+ld2r  { v1.4s, v2.4s }, [x27], #8
+add x0, x27, 1
+ld2r  { v1.8b, v2.8b }, [x27], #2
+add x0, x27, 1
+ld2r  { v1.8h, v2.8h }, [x27], #4
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G26
+ld2r  { v1.16b, v2.16b }, [x27], #2
+add x0, x27, 1
+ld2r  { v1.1d, v2.1d }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G27
+ld2r  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+ld3  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G28
+ld3  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+ld3  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+ld3  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+ld3  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+ld3  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G29
+ld3  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+ld3  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+ld3  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+ld3  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+ld3  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G30
+ld3  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+ld3  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+ld3  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+ld3  { v1.b, v2.b, v3.b }[0], [x27], #3
+add x0, x27, 1
+ld3  { v1.b, v2.b, v3.b }[8], [x27], #3
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G31
+ld3  { v1.b, v2.b, v3.b }[0], [x27], x28
+add x0, x27, 1
+ld3  { v1.b, v2.b, v3.b }[8], [x27], x28
+add x0, x27, 1
+ld3  { v1.h, v2.h, v3.h }[0], [x27], #6
+add x0, x27, 1
+ld3  { v1.h, v2.h, v3.h }[4], [x27], #6
+add x0, x27, 1
+ld3  { v1.h, v2.h, v3.h }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G32
+ld3  { v1.h, v2.h, v3.h }[4], [x27], x28
+add x0, x27, 1
+ld3  { v1.s, v2.s, v3.s }[0], [x27], #12
+add x0, x27, 1
+ld3  { v1.s, v2.s, v3.s }[0], [x27], x28
+add x0, x27, 1
+ld3  { v1.d, v2.d, v3.d }[0], [x27], #24
+add x0, x27, 1
+ld3  { v1.d, v2.d, v3.d }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G33
+ld3r  { v1.1d, v2.1d, v3.1d }, [x27], #24
+add x0, x27, 1
+ld3r  { v1.2d, v2.2d, v3.2d }, [x27], #24
+add x0, x27, 1
+ld3r  { v1.2s, v2.2s, v3.2s }, [x27], #12
+add x0, x27, 1
+ld3r  { v1.4h, v2.4h, v3.4h }, [x27], #6
+add x0, x27, 1
+ld3r  { v1.4s, v2.4s, v3.4s }, [x27], #12
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G34
+ld3r  { v1.8b, v2.8b, v3.8b }, [x27], #3
+add x0, x27, 1
+ld3r  { v1.8h, v2.8h, v3.8h }, [x27], #6
+add x0, x27, 1
+ld3r  { v1.16b, v2.16b, v3.16b }, [x27], #3
+add x0, x27, 1
+ld3r  { v1.1d, v2.1d, v3.1d }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G35
+ld3r  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G36
+ld3r  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+ld4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+ld4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+ld4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+ld4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G37
+ld4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+ld4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+ld4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+ld4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+ld4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G38
+ld4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+ld4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+ld4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+ld4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+ld4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G39
+ld4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+add x0, x27, 1
+ld4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+add x0, x27, 1
+ld4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+add x0, x27, 1
+ld4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+add x0, x27, 1
+ld4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G40
+ld4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+add x0, x27, 1
+ld4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+add x0, x27, 1
+ld4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+add x0, x27, 1
+ld4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+add x0, x27, 1
+ld4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G41
+ld4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+add x0, x27, 1
+ld4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+add x0, x27, 1
+ld4r  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+add x0, x27, 1
+ld4r  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+add x0, x27, 1
+ld4r  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G42
+ld4r  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+add x0, x27, 1
+ld4r  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+add x0, x27, 1
+ld4r  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+add x0, x27, 1
+ld4r  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+add x0, x27, 1
+ld4r  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G43
+ld4r  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G44
+ld4r  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+ldp  s1, s2, [x27], #248
+add x0, x27, 1
+ldp  d1, d2, [x27], #496
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G45
+ldp  q1, q2, [x27], #992
+add x0, x27, 1
+ldp  s1, s2, [x27, #248]!
+add x0, x27, 1
+ldp  d1, d2, [x27, #496]!
+add x0, x27, 1
+ldp  q1, q2, [x27, #992]!
+add x0, x27, 1
+ldp  w1, w2, [x27], #248
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G46
+ldp  x1, x2, [x27], #496
+add x0, x27, 1
+ldp  w1, w2, [x27, #248]!
+add x0, x27, 1
+ldp  x1, x2, [x27, #496]!
+add x0, x27, 1
+ldpsw  x1, x2, [x27], #248
+add x0, x27, 1
+ldpsw  x1, x2, [x27, #248]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G47
+ldr  b1, [x27], #254
+add x0, x27, 1
+ldr  h1, [x27], #254
+add x0, x27, 1
+ldr  s1, [x27], #254
+add x0, x27, 1
+ldr  d1, [x27], #254
+add x0, x27, 1
+ldr  q1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G48
+ldr  b1, [x27, #254]!
+add x0, x27, 1
+ldr  h1, [x27, #254]!
+add x0, x27, 1
+ldr  s1, [x27, #254]!
+add x0, x27, 1
+ldr  d1, [x27, #254]!
+add x0, x27, 1
+ldr  q1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G49
+ldr  w1, [x27], #254
+add x0, x27, 1
+ldr  x1, [x27], #254
+add x0, x27, 1
+ldr  w1, [x27, #254]!
+add x0, x27, 1
+ldr  x1, [x27, #254]!
+add x0, x27, 1
+ldrb  w1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G50
+ldrb  w1, [x27, #254]!
+add x0, x27, 1
+ldrh  w1, [x27], #254
+add x0, x27, 1
+ldrh  w1, [x27, #254]!
+add x0, x27, 1
+ldrsb  w1, [x27], #254
+add x0, x27, 1
+ldrsb  x1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G51
+ldrsb  w1, [x27, #254]!
+add x0, x27, 1
+ldrsb  x1, [x27, #254]!
+add x0, x27, 1
+ldrsh  w1, [x27], #254
+add x0, x27, 1
+ldrsh  x1, [x27], #254
+add x0, x27, 1
+ldrsh  w1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G52
+ldrsh  x1, [x27, #254]!
+add x0, x27, 1
+ldrsw  x1, [x27], #254
+add x0, x27, 1
+ldrsw  x1, [x27, #254]!
+add x0, x27, 1
+st1  { v1.1d }, [x27], #8
+add x0, x27, 1
+st1  { v1.2d }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G53
+st1  { v1.2s }, [x27], #8
+add x0, x27, 1
+st1  { v1.4h }, [x27], #8
+add x0, x27, 1
+st1  { v1.4s }, [x27], #16
+add x0, x27, 1
+st1  { v1.8b }, [x27], #8
+add x0, x27, 1
+st1  { v1.8h }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G54
+st1  { v1.16b }, [x27], #16
+add x0, x27, 1
+st1  { v1.1d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2s }, [x27], x28
+add x0, x27, 1
+st1  { v1.4h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G55
+st1  { v1.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b }, [x27], x28
+add x0, x27, 1
+st1  { v1.8h }, [x27], x28
+add x0, x27, 1
+st1  { v1.16b }, [x27], x28
+add x0, x27, 1
+st1  { v1.1d, v2.1d }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G56
+st1  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+st1  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+st1  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+st1  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+st1  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G57
+st1  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+st1  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+st1  { v1.1d, v2.1d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G58
+st1  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+st1  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+st1  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+st1  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G59
+st1  { v1.1d, v2.1d, v3.1d }, [x27], #24
+add x0, x27, 1
+st1  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+st1  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+st1  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G60
+st1  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+st1  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+st1  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+st1  { v1.1d, v2.1d, v3.1d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G61
+st1  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+st1  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+st1  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G62
+st1  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+st1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+add x0, x27, 1
+st1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+st1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G63
+st1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+st1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+st1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+st1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+st1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G64
+st1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+st1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G65
+st1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+st1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+st1  { v1.b }[0], [x27], #1
+add x0, x27, 1
+st1  { v1.b }[8], [x27], #1
+add x0, x27, 1
+st1  { v1.b }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G66
+st1  { v1.b }[8], [x27], x28
+add x0, x27, 1
+st1  { v1.h }[0], [x27], #2
+add x0, x27, 1
+st1  { v1.h }[4], [x27], #2
+add x0, x27, 1
+st1  { v1.h }[0], [x27], x28
+add x0, x27, 1
+st1  { v1.h }[4], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G67
+st1  { v1.s }[0], [x27], #4
+add x0, x27, 1
+st1  { v1.s }[0], [x27], x28
+add x0, x27, 1
+st1  { v1.d }[0], [x27], #8
+add x0, x27, 1
+st1  { v1.d }[0], [x27], x28
+add x0, x27, 1
+st2  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G68
+st2  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+st2  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+st2  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+st2  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+st2  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G69
+st2  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+st2  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+st2  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+st2  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+st2  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G70
+st2  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+st2  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+st2  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+st2  { v1.b, v2.b }[0], [x27], #2
+add x0, x27, 1
+st2  { v1.b, v2.b }[8], [x27], #2
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G71
+st2  { v1.b, v2.b }[0], [x27], x28
+add x0, x27, 1
+st2  { v1.b, v2.b }[8], [x27], x28
+add x0, x27, 1
+st2  { v1.h, v2.h }[0], [x27], #4
+add x0, x27, 1
+st2  { v1.h, v2.h }[4], [x27], #4
+add x0, x27, 1
+st2  { v1.h, v2.h }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G72
+st2  { v1.h, v2.h }[4], [x27], x28
+add x0, x27, 1
+st2  { v1.s, v2.s }[0], [x27], #8
+add x0, x27, 1
+st2  { v1.s, v2.s }[0], [x27], x28
+add x0, x27, 1
+st2  { v1.d, v2.d }[0], [x27], #16
+add x0, x27, 1
+st2  { v1.d, v2.d }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G73
+st3  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+st3  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+st3  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G74
+st3  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+st3  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+st3  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+st3  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+st3  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G75
+st3  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+st3  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+st3  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+st3  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+st3  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G76
+st3  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[0], [x27], #3
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[8], [x27], #3
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[0], [x27], x28
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[8], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G77
+st3  { v1.h, v2.h, v3.h }[0], [x27], #6
+add x0, x27, 1
+st3  { v1.h, v2.h, v3.h }[4], [x27], #6
+add x0, x27, 1
+st3  { v1.h, v2.h, v3.h }[0], [x27], x28
+add x0, x27, 1
+st3  { v1.h, v2.h, v3.h }[4], [x27], x28
+add x0, x27, 1
+st3  { v1.s, v2.s, v3.s }[0], [x27], #12
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G78
+st3  { v1.s, v2.s, v3.s }[0], [x27], x28
+add x0, x27, 1
+st3  { v1.d, v2.d, v3.d }[0], [x27], #24
+add x0, x27, 1
+st3  { v1.d, v2.d, v3.d }[0], [x27], x28
+add x0, x27, 1
+st4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+st4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G79
+st4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+st4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+st4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+st4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+st4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G80
+st4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+st4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+st4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+st4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+st4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G81
+st4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+st4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+st4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+add x0, x27, 1
+st4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+add x0, x27, 1
+st4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G82
+st4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G83
+st4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+add x0, x27, 1
+st4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+add x0, x27, 1
+st4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+add x0, x27, 1
+st4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G84
+stp  s1, s2, [x27], #248
+add x0, x27, 1
+stp  d1, d2, [x27], #496
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G85
+stp  q1, q2, [x27], #992
+add x0, x27, 1
+stp  s1, s2, [x27, #248]!
+add x0, x27, 1
+stp  d1, d2, [x27, #496]!
+add x0, x27, 1
+stp  q1, q2, [x27, #992]!
+add x0, x27, 1
+stp  w1, w2, [x27], #248
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G86
+stp  x1, x2, [x27], #496
+add x0, x27, 1
+stp  w1, w2, [x27, #248]!
+add x0, x27, 1
+stp  x1, x2, [x27, #496]!
+add x0, x27, 1
+str  b1, [x27], #254
+add x0, x27, 1
+str  h1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G87
+str  s1, [x27], #254
+add x0, x27, 1
+str  d1, [x27], #254
+add x0, x27, 1
+str  q1, [x27], #254
+add x0, x27, 1
+str  b1, [x27, #254]!
+add x0, x27, 1
+str  h1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G88
+str  s1, [x27, #254]!
+add x0, x27, 1
+str  d1, [x27, #254]!
+add x0, x27, 1
+str  q1, [x27, #254]!
+add x0, x27, 1
+str  w1, [x27], #254
+add x0, x27, 1
+str  x1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G89
+str  w1, [x27, #254]!
+add x0, x27, 1
+str  x1, [x27, #254]!
+add x0, x27, 1
+strb  w1, [x27], #254
+add x0, x27, 1
+strb  w1, [x27, #254]!
+add x0, x27, 1
+strh  w1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G90
+strh  w1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G91
+ldr  x1, [x27], #254
+add x0, x27, 1
+ldr  x2, [x1], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# CHECK:      [0] Code Region - G01
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3701
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.41
+# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    . .   ld1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeeE  .    .    .    .    . .   ld1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .    . DeeE    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    DeeeE.    .    .    . .   ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,5]     .    .    .    .   DeeE  .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    . DeeeE   .    . .   ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .DeeE.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .   DeeeeE. .   ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [1] Code Region - G02
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3801
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.39
+# CHECK-NEXT: IPC:               0.26
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .  .   ld1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeeE  .    .    .    .    .  .   ld1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .    . DeeE    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    DeeeeE    .    .    .  .   ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,5]     .    .    .    .    DeeE .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .  DeeeE  .    .  .   ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    . DeeE    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    DeeeeE  .   ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [2] Code Region - G03
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3701
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.41
+# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    . .   ld1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    . .   ld1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeeE.    .    .    . .   ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .   DeeE  .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    . DeeeE   .    . .   ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .DeeE.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .   DeeeeE. .   ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [3] Code Region - G04
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4201
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.24
+# CHECK-NEXT: Block RThroughput: 12.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    . .   ld1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    . .   ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeeeE .    .    .    . .   ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    .    .  DeeE   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .DeeeeE   .    . .   ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .DeeE.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .   DeeeeE. .   ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [4] Code Region - G05
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4601
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.33
+# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: Block RThroughput: 16.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    ..   ld1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .    DeeeeE    .    .    .    .    .    ..   ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .    .    DeeE .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .  DeeeeeeE    .    .    .    ..   ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    .    .    DeeE .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .  DeeeeeeE    .    ..   ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    DeeE .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .  DeeeeE ..   ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .  DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [5] Code Region - G06
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4401
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.34
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 14.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01234
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .   .   ld1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .    DeeeeE    .    .    .    .    .   .   ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .    DeeE .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .  DeeeeE .    .    .    .   .   ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .  DeeE   .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .DeeeeeeE .    .   .   ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .  DeeE   .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .DeeeeE  .   ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [6] Code Region - G07
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      5001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.30
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 20.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    .    .   ld1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .    DeeeeeeE  .    .    .    .    .    .    .   ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .    . DeeE    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .    DeeeeeE   .    .    .    .    .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .    .    .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .   DeeeeeeeeE .    .    .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .  DeeE   .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    .DeeeeeE  .   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [7] Code Region - G08
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      5401
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.28
+# CHECK-NEXT: IPC:               0.19
+# CHECK-NEXT: Block RThroughput: 24.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          01234
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    .    .    .    .   .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .   DeeeeeeeeE .    .    .    .    .    .    .   .   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,3]     .    .    .    .  DeeE   .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .    .DeeeeeE  .    .    .    .    .   .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .    .    .    .    . DeeE    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .    DeeeeeeeeE.    .    .   .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .   DeeE  .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    . DeeeeeeeeE  .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [8] Code Region - G09
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      5101
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 21.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          01
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    .    .    .    ..   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .   DeeeeeeeeE .    .    .    .    .    .    ..   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .    .  DeeE   .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .    .DeeeeeE  .    .    .    .    ..   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .    . DeeE    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .    DeeeeeE   .    .    ..   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .DeeE.    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .   DeeeeeeeeE ..   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .  DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [9] Code Region - G10
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      5701
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.26
+# CHECK-NEXT: IPC:               0.18
+# CHECK-NEXT: Block RThroughput: 27.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    .    .    .    .    . .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .   DeeeeeeeeE .    .    .    .    .    .    .    . .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .    .  DeeE   .    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .    .DeeeeeeeeE    .    .    .    .    . .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .    .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .    .  DeeeeeeE    .    .    . .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .    DeeE .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    .  DeeeeeeeeeeE. .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [10] Code Region - G11
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      5801
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.26
+# CHECK-NEXT: IPC:               0.17
+# CHECK-NEXT: Block RThroughput: 28.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    .    .    .  .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .    DeeeeeeE  .    .    .    .    .    .    .    .  .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,3]     .    .    .    . DeeE    .    .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .    DeeeeeeeeeeE   .    .    .    .    .  .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .    .    .    .    .    .DeeE.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .    .   DeeeeeeE   .    .    .  .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .    .DeeE.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    .   DeeeeeeeeeeE  .   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .    .    DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [11] Code Region - G12
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      5801
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.26
+# CHECK-NEXT: IPC:               0.17
+# CHECK-NEXT: Block RThroughput: 28.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeeeE   .    .    .    .    .    .    .    .    .  .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,1]     .    .    .DeeE.    .    .    .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .    .   DeeeeeeE   .    .    .    .    .    .    .  .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .    .    .DeeE.    .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .    .   DeeeeeeeeeeE    .    .    .    .  .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .    .    .    DeeE .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .    .    .  DeeeeeeE    .    .  .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .    .    DeeE .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    .    .  DeeeeeeE  .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .    .    DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [12] Code Region - G13
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      5901
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.25
+# CHECK-NEXT: IPC:               0.17
+# CHECK-NEXT: Block RThroughput: 29.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeeeE   .    .    .    .    .    .    .    .    .   .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .    .    .DeeE.    .    .    .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .    .   DeeeeeeE   .    .    .    .    .    .    .   .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .    .    .DeeE.    .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .    .   DeeeeeeeeeeE    .    .    .    .   .   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .    .    .    DeeE .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .    .    .  DeeeeeeeeeeE.    .   .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .    .    .   DeeE  .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    .    .    . DeeeE  .   ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [13] Code Region - G14
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [14] Code Region - G15
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [15] Code Region - G16
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [16] Code Region - G17
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1r	{ v1.4s }, [x27], #4
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1r	{ v1.4s }, [x27], #4
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [17] Code Region - G18
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1r	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1r	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [18] Code Region - G19
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .    .   ld1r	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .    .   ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeeeeE   .    .    .    .   ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld1r	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [19] Code Region - G20
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4801
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.21
+# CHECK-NEXT: Block RThroughput: 18.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    .  .   ld2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .    DeeeeE    .    .    .    .    .    .  .   ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .    .    DeeE .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .  DeeeeeeE    .    .    .    .  .   ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    .    .    DeeE .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .  DeeeeeeE    .    .  .   ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    DeeE .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .  DeeeeeeE  .   ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [20] Code Region - G21
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4401
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.34
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 14.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01234
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   .   ld2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   .   ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeeeE .    .    .    .   .   ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .  DeeE   .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .DeeeeE   .    .   .   ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .DeeE.    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .   DeeeeeeE  .   ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [21] Code Region - G22
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4201
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.24
+# CHECK-NEXT: Block RThroughput: 12.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    . .   ld2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .    DeeeeE    .    .    .    .    . .   ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,3]     .    .    .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .  DeeeeE .    .    .    . .   ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,5]     .    .    .    .    .  DeeE   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .DeeeeE   .    . .   ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .DeeE.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .   DeeeeE. .   ld2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [22] Code Region - G23
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [23] Code Region - G24
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [24] Code Region - G25
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld2r	{ v1.2s, v2.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], #8
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [25] Code Region - G26
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld2r	{ v1.16b, v2.16b }, [x27], #2
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], #2
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [26] Code Region - G27
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4401
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.34
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 14.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01234
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   .   ld2r	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   .   ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   .   ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   .   ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeeeeeE  .   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [27] Code Region - G28
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      5101
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 21.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          01
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    .    .    .    ..   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .   DeeeeeE    .    .    .    .    .    .    ..   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,3]     .    .    .    DeeE .    .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .  DeeeeeeeeE  .    .    .    .    ..   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,5]     .    .    .    .    .    . DeeE    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .    DeeeeeE   .    .    ..   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .DeeE.    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .   DeeeeeeeeE ..   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .  DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [28] Code Region - G29
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      5401
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.28
+# CHECK-NEXT: IPC:               0.19
+# CHECK-NEXT: Block RThroughput: 24.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          01234
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeE.    .    .    .    .    .    .    .    .   .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,1]     .    .   DeeE  .    .    .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .    . DeeeeeeeeE   .    .    .    .    .    .   .   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .    .    .DeeE.    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .    .   DeeeeeE    .    .    .    .   .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .    .    DeeE .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .    .  DeeeeeE.    .    .   .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .   DeeE  .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    . DeeeeeeeeE  .   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [29] Code Region - G30
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4901
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 19.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    .    .    .   .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .   DeeeeeeeeE .    .    .    .    .    .   .   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .    .  DeeE   .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .    .DeeeeeeeeE    .    .    .   .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .    .    DeeE .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .    .  DeeeeE .    .   .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .  DeeE   .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    .DeeeeE  .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [30] Code Region - G31
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [31] Code Region - G32
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [32] Code Region - G33
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [33] Code Region - G34
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [34] Code Region - G35
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [35] Code Region - G36
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      5601
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.27
+# CHECK-NEXT: IPC:               0.18
+# CHECK-NEXT: Block RThroughput: 26.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .    .    .    ..   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeeeeeeeE.    .    .    .    .    .    .    ..   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,3]     .    .    .    .   DeeE  .    .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .    . DeeeeeeE.    .    .    .    .    ..   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    .    .    .   DeeE  .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .    . DeeeeeeE.    .    .    ..   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .   DeeE  .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    . DeeeeeeeeeeE ..   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .    .  DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [36] Code Region - G37
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      6201
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.24
+# CHECK-NEXT: IPC:               0.16
+# CHECK-NEXT: Block RThroughput: 32.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    .    .    .    . .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .    DeeeeeeeeeeE   .    .    .    .    .    .    .    . .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,3]     .    .    .    .    .DeeE.    .    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .    .   DeeeeeeeeeeE    .    .    .    .    . .   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .    .    .    .    .    .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .    .    .  DeeeeeeeeeeE.    .    . .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .    .    .   DeeE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    .    .    . DeeeeeeE. .   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [37] Code Region - G38
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      6201
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.24
+# CHECK-NEXT: IPC:               0.16
+# CHECK-NEXT: Block RThroughput: 32.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    .    .    .    . .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .    DeeeeeeeeeeE   .    .    .    .    .    .    .    . .   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .    .    .DeeE.    .    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .    .   DeeeeeeE   .    .    .    .    .    . .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .    .    .DeeE.    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .    .   DeeeeeeeeeeE    .    .    . .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .    .    DeeE .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    .    .  DeeeeeeeeeeE. .   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [38] Code Region - G39
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [39] Code Region - G40
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [40] Code Region - G41
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [41] Code Region - G42
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [42] Code Region - G43
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [43] Code Region - G44
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3301
+# CHECK-NEXT: Total uOps:        1700
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.51
+# CHECK-NEXT: IPC:               0.30
+# CHECK-NEXT: Block RThroughput: 9.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .  .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .  .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .  .   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeE .  .   ldp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeeE   ldp	d1, d2, [x27], #496
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ldp	s1, s2, [x27], #248
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ldp	d1, d2, [x27], #496
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [44] Code Region - G45
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2601
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.77
+# CHECK-NEXT: IPC:               0.38
+# CHECK-NEXT: Block RThroughput: 16.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    ..   ldp	q1, q2, [x27], #992
+# CHECK-NEXT: [0,1]     .  DeeE   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeE    .    .    ..   ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeeE   ..   ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,7]     .    .    .    .  DeeE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .DeeeE.   ldp	w1, w2, [x27], #248
+# CHECK-NEXT: [0,9]     .    .    .    .    .  DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldp	q1, q2, [x27], #992
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ldp	w1, w2, [x27], #248
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [45] Code Region - G46
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.80
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ldp	x1, x2, [x27], #496
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ldp	w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ldp	x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ldpsw	x1, x2, [x27], #248
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ldpsw	x1, x2, [x27, #248]!
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldp	x1, x2, [x27], #496
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldp	w1, w2, [x27, #248]!
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ldp	x1, x2, [x27, #496]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ldpsw	x1, x2, [x27], #248
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ldpsw	x1, x2, [x27, #248]!
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [46] Code Region - G47
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeE .    .    .    .   ldr	b1, [x27], #254
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeE  .    .    .   ldr	h1, [x27], #254
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeE   .    .   ldr	s1, [x27], #254
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeE    .   ldr	d1, [x27], #254
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeE.   ldr	q1, [x27], #254
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldr	b1, [x27], #254
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldr	h1, [x27], #254
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ldr	s1, [x27], #254
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ldr	d1, [x27], #254
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ldr	q1, [x27], #254
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [47] Code Region - G48
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeE .    .    .    .   ldr	b1, [x27, #254]!
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeE  .    .    .   ldr	h1, [x27, #254]!
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeE   .    .   ldr	s1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeE    .   ldr	d1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeE.   ldr	q1, [x27, #254]!
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldr	b1, [x27, #254]!
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldr	h1, [x27, #254]!
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ldr	s1, [x27, #254]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ldr	d1, [x27, #254]!
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ldr	q1, [x27, #254]!
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [48] Code Region - G49
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeE .    .    .    .   ldr	w1, [x27], #254
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeE  .    .    .   ldr	x1, [x27], #254
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeE   .    .   ldr	w1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeE    .   ldr	x1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeE.   ldrb	w1, [x27], #254
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldr	w1, [x27], #254
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldr	x1, [x27], #254
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ldr	w1, [x27, #254]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ldr	x1, [x27, #254]!
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ldrb	w1, [x27], #254
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [49] Code Region - G50
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeE .    .    .    .   ldrb	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeE  .    .    .   ldrh	w1, [x27], #254
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeE   .    .   ldrh	w1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeE    .   ldrsb	w1, [x27], #254
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeE.   ldrsb	x1, [x27], #254
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldrb	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldrh	w1, [x27], #254
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ldrh	w1, [x27, #254]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ldrsb	w1, [x27], #254
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ldrsb	x1, [x27], #254
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [50] Code Region - G51
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeE .    .    .    .   ldrsb	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeE  .    .    .   ldrsb	x1, [x27, #254]!
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeE   .    .   ldrsh	w1, [x27], #254
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeE    .   ldrsh	x1, [x27], #254
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeE.   ldrsh	w1, [x27, #254]!
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldrsb	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldrsb	x1, [x27, #254]!
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ldrsh	w1, [x27], #254
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       ldrsh	x1, [x27], #254
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       ldrsh	w1, [x27, #254]!
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [51] Code Region - G52
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2201
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.68
+# CHECK-NEXT: IPC:               0.45
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012
+
+# CHECK:      [0,0]     DeeE .    .    .    . .   ldrsh	x1, [x27, #254]!
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeE  .    .    . .   ldrsw	x1, [x27], #254
+# CHECK-NEXT: [0,3]     .    DeeE .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeE   .    . .   ldrsw	x1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    DeeeE.    . .   st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,7]     .    .    .   DeeE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    DeeeE. .   st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldrsh	x1, [x27, #254]!
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldrsw	x1, [x27], #254
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       ldrsw	x1, [x27, #254]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [52] Code Region - G53
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [53] Code Region - G54
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [54] Code Region - G55
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [55] Code Region - G56
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [56] Code Region - G57
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [57] Code Region - G58
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [58] Code Region - G59
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [59] Code Region - G60
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [60] Code Region - G61
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [61] Code Region - G62
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 18.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [62] Code Region - G63
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 20.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [63] Code Region - G64
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 20.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [64] Code Region - G65
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2703
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.55
+# CHECK-NEXT: IPC:               0.37
+# CHECK-NEXT: Block RThroughput: 11.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .   .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeE   .    .   .   st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,5]     .    .    .    .DeeE.    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    . DeeeE   .   .   st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,7]     .    .    .    .    .DeeE.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    . DeeeE  .   st1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [65] Code Region - G66
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [66] Code Region - G67
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2603
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.58
+# CHECK-NEXT: IPC:               0.38
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .  .   st1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .  .   st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .  .   st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .  .   st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE  .   st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .    .    .    DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [67] Code Region - G68
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 14.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [68] Code Region - G69
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 16.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [69] Code Region - G70
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 14.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [70] Code Region - G71
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [71] Code Region - G72
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [72] Code Region - G73
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      600
+# CHECK-NEXT: Total Cycles:      1803
+# CHECK-NEXT: Total uOps:        900
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 12.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeeeE    .    .    .   st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .   st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [73] Code Region - G74
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 20.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [74] Code Region - G75
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 20.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [75] Code Region - G76
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 12.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [76] Code Region - G77
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [77] Code Region - G78
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 14.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [78] Code Region - G79
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 20.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [79] Code Region - G80
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 20.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [80] Code Region - G81
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 14.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [81] Code Region - G82
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [82] Code Region - G83
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      800
+# CHECK-NEXT: Total Cycles:      2403
+# CHECK-NEXT: Total uOps:        1200
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    ..   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    ..   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE ..   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [83] Code Region - G84
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      403
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    1.49
+# CHECK-NEXT: IPC:               0.99
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     0123456
+
+# CHECK:      [0,0]     DE   ..   stp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,1]     .DeeE..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DE ..   stp	d1, d2, [x27], #496
+# CHECK-NEXT: [0,3]     .  DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       stp	s1, s2, [x27], #248
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       stp	d1, d2, [x27], #496
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [84] Code Region - G85
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    1.50
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DE   .    . .   stp	q1, q2, [x27], #992
+# CHECK-NEXT: [0,1]     .DeeE.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DE .    . .   stp	s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,3]     .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DE    . .   stp	d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .    DeeE . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DE  . .   stp	q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,7]     .    . DeeE .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DE. .   stp	w1, w2, [x27], #248
+# CHECK-NEXT: [0,9]     .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       stp	q1, q2, [x27], #992
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       stp	s1, s2, [x27, #248]!
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       stp	d1, d2, [x27, #496]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       stp	q1, q2, [x27, #992]!
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       stp	w1, w2, [x27], #248
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [85] Code Region - G86
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    1.50
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DE   .    . .   stp	x1, x2, [x27], #496
+# CHECK-NEXT: [0,1]     .DeeE.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DE .    . .   stp	w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,3]     .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DE    . .   stp	x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .    DeeE . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DE  . .   str	b1, [x27], #254
+# CHECK-NEXT: [0,7]     .    . DeeE .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DE. .   str	h1, [x27], #254
+# CHECK-NEXT: [0,9]     .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       stp	x1, x2, [x27], #496
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       stp	w1, w2, [x27, #248]!
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       stp	x1, x2, [x27, #496]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       str	b1, [x27], #254
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       str	h1, [x27], #254
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [86] Code Region - G87
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    1.50
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DE   .    . .   str	s1, [x27], #254
+# CHECK-NEXT: [0,1]     .DeeE.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DE .    . .   str	d1, [x27], #254
+# CHECK-NEXT: [0,3]     .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DE    . .   str	q1, [x27], #254
+# CHECK-NEXT: [0,5]     .    DeeE . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DE  . .   str	b1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .    . DeeE .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DE. .   str	h1, [x27, #254]!
+# CHECK-NEXT: [0,9]     .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       str	s1, [x27], #254
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       str	d1, [x27], #254
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       str	q1, [x27], #254
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       str	b1, [x27, #254]!
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       str	h1, [x27, #254]!
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [87] Code Region - G88
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    1.50
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DE   .    . .   str	s1, [x27, #254]!
+# CHECK-NEXT: [0,1]     .DeeE.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DE .    . .   str	d1, [x27, #254]!
+# CHECK-NEXT: [0,3]     .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DE    . .   str	q1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .    DeeE . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DE  . .   str	w1, [x27], #254
+# CHECK-NEXT: [0,7]     .    . DeeE .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DE. .   str	x1, [x27], #254
+# CHECK-NEXT: [0,9]     .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       str	s1, [x27, #254]!
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       str	d1, [x27, #254]!
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       str	q1, [x27, #254]!
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       str	w1, [x27], #254
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       str	x1, [x27], #254
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [88] Code Region - G89
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    1.50
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DE   .    . .   str	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     .DeeE.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DE .    . .   str	x1, [x27, #254]!
+# CHECK-NEXT: [0,3]     .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DE    . .   strb	w1, [x27], #254
+# CHECK-NEXT: [0,5]     .    DeeE . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DE  . .   strb	w1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .    . DeeE .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DE. .   strh	w1, [x27], #254
+# CHECK-NEXT: [0,9]     .    .   DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       str	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       str	x1, [x27, #254]!
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       strb	w1, [x27], #254
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       strb	w1, [x27, #254]!
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       strh	w1, [x27], #254
+# CHECK-NEXT: 9.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [89] Code Region - G90
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      203
+# CHECK-NEXT: Total uOps:        300
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    1.48
+# CHECK-NEXT: IPC:               0.99
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     01234
+
+# CHECK:      [0,0]     DE  .   strh	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     .DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       strh	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>
+
+# CHECK:      [90] Code Region - G91
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      801
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeeE .  .   ldr	x1, [x27], #254
+# CHECK-NEXT: [0,1]     .DeeE.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeE.   ldr	x2, [x1], #254
+# CHECK-NEXT: [0,3]     .    DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       ldr	x1, [x27], #254
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       ldr	x2, [x1], #254
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     0.0    0.0    0.0       <total>

diff  --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A57-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A57-writeback.s
new file mode 100644
index 000000000000000..6993401ffa259a2
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A57-writeback.s
@@ -0,0 +1,5289 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a57 --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=1 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN G01
+ld1  { v1.1d }, [x27], #8
+add x0, x27, 1
+ld1  { v1.2d }, [x27], #16
+add x0, x27, 1
+ld1  { v1.2s }, [x27], #8
+add x0, x27, 1
+ld1  { v1.4h }, [x27], #8
+add x0, x27, 1
+ld1  { v1.4s }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G02
+ld1  { v1.8b }, [x27], #8
+add x0, x27, 1
+ld1  { v1.8h }, [x27], #16
+add x0, x27, 1
+ld1  { v1.16b }, [x27], #16
+add x0, x27, 1
+ld1  { v1.1d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G03
+ld1  { v1.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G04
+ld1  { v1.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.1d, v2.1d }, [x27], #16
+add x0, x27, 1
+ld1  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+ld1  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+ld1  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G05
+ld1  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+ld1  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+ld1  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+ld1  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+ld1  { v1.1d, v2.1d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G06
+ld1  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G07
+ld1  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.1d, v2.1d, v3.1d }, [x27], #24
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+ld1  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G08
+ld1  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+ld1  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+ld1  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+ld1  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G09
+ld1  { v1.1d, v2.1d, v3.1d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G10
+ld1  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G11
+ld1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+ld1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+ld1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+ld1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G12
+ld1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+ld1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G13
+ld1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.b }[0], [x27], #1
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G14
+ld1  { v1.b }[8], [x27], #1
+add x0, x27, 1
+ld1  { v1.b }[0], [x27], x28
+add x0, x27, 1
+ld1  { v1.b }[8], [x27], x28
+add x0, x27, 1
+ld1  { v1.h }[0], [x27], #2
+add x0, x27, 1
+ld1  { v1.h }[4], [x27], #2
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G15
+ld1  { v1.h }[0], [x27], x28
+add x0, x27, 1
+ld1  { v1.h }[4], [x27], x28
+add x0, x27, 1
+ld1  { v1.s }[0], [x27], #4
+add x0, x27, 1
+ld1  { v1.s }[0], [x27], x28
+add x0, x27, 1
+ld1  { v1.d }[0], [x27], #8
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G16
+ld1  { v1.d }[0], [x27], x28
+add x0, x27, 1
+ld1r  { v1.1d }, [x27], #8
+add x0, x27, 1
+ld1r  { v1.2d }, [x27], #8
+add x0, x27, 1
+ld1r  { v1.2s }, [x27], #4
+add x0, x27, 1
+ld1r  { v1.4h }, [x27], #2
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G17
+ld1r  { v1.4s }, [x27], #4
+add x0, x27, 1
+ld1r  { v1.8b }, [x27], #1
+add x0, x27, 1
+ld1r  { v1.8h }, [x27], #2
+add x0, x27, 1
+ld1r  { v1.16b }, [x27], #1
+add x0, x27, 1
+ld1r  { v1.1d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G18
+ld1r  { v1.2d }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.2s }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.4h }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.4s }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G19
+ld1r  { v1.8h }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.16b }, [x27], x28
+add x0, x27, 1
+ld2  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+ld2  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+ld2  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G20
+ld2  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+ld2  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+ld2  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+ld2  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+ld2  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G21
+ld2  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+ld2  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+ld2  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+ld2  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+ld2  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G22
+ld2  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+ld2  { v1.b, v2.b }[0], [x27], #2
+add x0, x27, 1
+ld2  { v1.b, v2.b }[8], [x27], #2
+add x0, x27, 1
+ld2  { v1.b, v2.b }[0], [x27], x28
+add x0, x27, 1
+ld2  { v1.b, v2.b }[8], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G23
+ld2  { v1.h, v2.h }[0], [x27], #4
+add x0, x27, 1
+ld2  { v1.h, v2.h }[4], [x27], #4
+add x0, x27, 1
+ld2  { v1.h, v2.h }[0], [x27], x28
+add x0, x27, 1
+ld2  { v1.h, v2.h }[4], [x27], x28
+add x0, x27, 1
+ld2  { v1.s, v2.s }[0], [x27], #8
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G24
+ld2  { v1.s, v2.s }[0], [x27], x28
+add x0, x27, 1
+ld2  { v1.d, v2.d }[0], [x27], #16
+add x0, x27, 1
+ld2  { v1.d, v2.d }[0], [x27], x28
+add x0, x27, 1
+ld2r  { v1.1d, v2.1d }, [x27], #16
+add x0, x27, 1
+ld2r  { v1.2d, v2.2d }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G25
+ld2r  { v1.2s, v2.2s }, [x27], #8
+add x0, x27, 1
+ld2r  { v1.4h, v2.4h }, [x27], #4
+add x0, x27, 1
+ld2r  { v1.4s, v2.4s }, [x27], #8
+add x0, x27, 1
+ld2r  { v1.8b, v2.8b }, [x27], #2
+add x0, x27, 1
+ld2r  { v1.8h, v2.8h }, [x27], #4
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G26
+ld2r  { v1.16b, v2.16b }, [x27], #2
+add x0, x27, 1
+ld2r  { v1.1d, v2.1d }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G27
+ld2r  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+ld3  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G28
+ld3  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+ld3  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+ld3  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+ld3  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+ld3  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G29
+ld3  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+ld3  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+ld3  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+ld3  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+ld3  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G30
+ld3  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+ld3  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+ld3  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+ld3  { v1.b, v2.b, v3.b }[0], [x27], #3
+add x0, x27, 1
+ld3  { v1.b, v2.b, v3.b }[8], [x27], #3
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G31
+ld3  { v1.b, v2.b, v3.b }[0], [x27], x28
+add x0, x27, 1
+ld3  { v1.b, v2.b, v3.b }[8], [x27], x28
+add x0, x27, 1
+ld3  { v1.h, v2.h, v3.h }[0], [x27], #6
+add x0, x27, 1
+ld3  { v1.h, v2.h, v3.h }[4], [x27], #6
+add x0, x27, 1
+ld3  { v1.h, v2.h, v3.h }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G32
+ld3  { v1.h, v2.h, v3.h }[4], [x27], x28
+add x0, x27, 1
+ld3  { v1.s, v2.s, v3.s }[0], [x27], #12
+add x0, x27, 1
+ld3  { v1.s, v2.s, v3.s }[0], [x27], x28
+add x0, x27, 1
+ld3  { v1.d, v2.d, v3.d }[0], [x27], #24
+add x0, x27, 1
+ld3  { v1.d, v2.d, v3.d }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G33
+ld3r  { v1.1d, v2.1d, v3.1d }, [x27], #24
+add x0, x27, 1
+ld3r  { v1.2d, v2.2d, v3.2d }, [x27], #24
+add x0, x27, 1
+ld3r  { v1.2s, v2.2s, v3.2s }, [x27], #12
+add x0, x27, 1
+ld3r  { v1.4h, v2.4h, v3.4h }, [x27], #6
+add x0, x27, 1
+ld3r  { v1.4s, v2.4s, v3.4s }, [x27], #12
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G34
+ld3r  { v1.8b, v2.8b, v3.8b }, [x27], #3
+add x0, x27, 1
+ld3r  { v1.8h, v2.8h, v3.8h }, [x27], #6
+add x0, x27, 1
+ld3r  { v1.16b, v2.16b, v3.16b }, [x27], #3
+add x0, x27, 1
+ld3r  { v1.1d, v2.1d, v3.1d }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G35
+ld3r  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G36
+ld3r  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+ld4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+ld4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+ld4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+ld4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G37
+ld4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+ld4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+ld4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+ld4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+ld4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G38
+ld4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+ld4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+ld4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+ld4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+ld4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G39
+ld4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+add x0, x27, 1
+ld4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+add x0, x27, 1
+ld4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+add x0, x27, 1
+ld4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+add x0, x27, 1
+ld4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G40
+ld4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+add x0, x27, 1
+ld4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+add x0, x27, 1
+ld4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+add x0, x27, 1
+ld4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+add x0, x27, 1
+ld4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G41
+ld4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+add x0, x27, 1
+ld4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+add x0, x27, 1
+ld4r  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+add x0, x27, 1
+ld4r  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+add x0, x27, 1
+ld4r  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G42
+ld4r  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+add x0, x27, 1
+ld4r  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+add x0, x27, 1
+ld4r  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+add x0, x27, 1
+ld4r  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+add x0, x27, 1
+ld4r  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G43
+ld4r  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G44
+ld4r  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+ldp  s1, s2, [x27], #248
+add x0, x27, 1
+ldp  d1, d2, [x27], #496
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G45
+ldp  q1, q2, [x27], #992
+add x0, x27, 1
+ldp  s1, s2, [x27, #248]!
+add x0, x27, 1
+ldp  d1, d2, [x27, #496]!
+add x0, x27, 1
+ldp  q1, q2, [x27, #992]!
+add x0, x27, 1
+ldp  w1, w2, [x27], #248
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G46
+ldp  x1, x2, [x27], #496
+add x0, x27, 1
+ldp  w1, w2, [x27, #248]!
+add x0, x27, 1
+ldp  x1, x2, [x27, #496]!
+add x0, x27, 1
+ldpsw  x1, x2, [x27], #248
+add x0, x27, 1
+ldpsw  x1, x2, [x27, #248]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G47
+ldr  b1, [x27], #254
+add x0, x27, 1
+ldr  h1, [x27], #254
+add x0, x27, 1
+ldr  s1, [x27], #254
+add x0, x27, 1
+ldr  d1, [x27], #254
+add x0, x27, 1
+ldr  q1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G48
+ldr  b1, [x27, #254]!
+add x0, x27, 1
+ldr  h1, [x27, #254]!
+add x0, x27, 1
+ldr  s1, [x27, #254]!
+add x0, x27, 1
+ldr  d1, [x27, #254]!
+add x0, x27, 1
+ldr  q1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G49
+ldr  w1, [x27], #254
+add x0, x27, 1
+ldr  x1, [x27], #254
+add x0, x27, 1
+ldr  w1, [x27, #254]!
+add x0, x27, 1
+ldr  x1, [x27, #254]!
+add x0, x27, 1
+ldrb  w1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G50
+ldrb  w1, [x27, #254]!
+add x0, x27, 1
+ldrh  w1, [x27], #254
+add x0, x27, 1
+ldrh  w1, [x27, #254]!
+add x0, x27, 1
+ldrsb  w1, [x27], #254
+add x0, x27, 1
+ldrsb  x1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G51
+ldrsb  w1, [x27, #254]!
+add x0, x27, 1
+ldrsb  x1, [x27, #254]!
+add x0, x27, 1
+ldrsh  w1, [x27], #254
+add x0, x27, 1
+ldrsh  x1, [x27], #254
+add x0, x27, 1
+ldrsh  w1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G52
+ldrsh  x1, [x27, #254]!
+add x0, x27, 1
+ldrsw  x1, [x27], #254
+add x0, x27, 1
+ldrsw  x1, [x27, #254]!
+add x0, x27, 1
+st1  { v1.1d }, [x27], #8
+add x0, x27, 1
+st1  { v1.2d }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G53
+st1  { v1.2s }, [x27], #8
+add x0, x27, 1
+st1  { v1.4h }, [x27], #8
+add x0, x27, 1
+st1  { v1.4s }, [x27], #16
+add x0, x27, 1
+st1  { v1.8b }, [x27], #8
+add x0, x27, 1
+st1  { v1.8h }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G54
+st1  { v1.16b }, [x27], #16
+add x0, x27, 1
+st1  { v1.1d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2s }, [x27], x28
+add x0, x27, 1
+st1  { v1.4h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G55
+st1  { v1.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b }, [x27], x28
+add x0, x27, 1
+st1  { v1.8h }, [x27], x28
+add x0, x27, 1
+st1  { v1.16b }, [x27], x28
+add x0, x27, 1
+st1  { v1.1d, v2.1d }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G56
+st1  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+st1  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+st1  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+st1  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+st1  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G57
+st1  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+st1  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+st1  { v1.1d, v2.1d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G58
+st1  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+st1  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+st1  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+st1  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G59
+st1  { v1.1d, v2.1d, v3.1d }, [x27], #24
+add x0, x27, 1
+st1  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+st1  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+st1  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G60
+st1  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+st1  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+st1  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+st1  { v1.1d, v2.1d, v3.1d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G61
+st1  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+st1  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+st1  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G62
+st1  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+st1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+add x0, x27, 1
+st1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+st1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G63
+st1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+st1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+st1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+st1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+st1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G64
+st1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+st1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G65
+st1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+st1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+st1  { v1.b }[0], [x27], #1
+add x0, x27, 1
+st1  { v1.b }[8], [x27], #1
+add x0, x27, 1
+st1  { v1.b }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G66
+st1  { v1.b }[8], [x27], x28
+add x0, x27, 1
+st1  { v1.h }[0], [x27], #2
+add x0, x27, 1
+st1  { v1.h }[4], [x27], #2
+add x0, x27, 1
+st1  { v1.h }[0], [x27], x28
+add x0, x27, 1
+st1  { v1.h }[4], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G67
+st1  { v1.s }[0], [x27], #4
+add x0, x27, 1
+st1  { v1.s }[0], [x27], x28
+add x0, x27, 1
+st1  { v1.d }[0], [x27], #8
+add x0, x27, 1
+st1  { v1.d }[0], [x27], x28
+add x0, x27, 1
+st2  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G68
+st2  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+st2  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+st2  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+st2  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+st2  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G69
+st2  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+st2  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+st2  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+st2  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+st2  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G70
+st2  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+st2  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+st2  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+st2  { v1.b, v2.b }[0], [x27], #2
+add x0, x27, 1
+st2  { v1.b, v2.b }[8], [x27], #2
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G71
+st2  { v1.b, v2.b }[0], [x27], x28
+add x0, x27, 1
+st2  { v1.b, v2.b }[8], [x27], x28
+add x0, x27, 1
+st2  { v1.h, v2.h }[0], [x27], #4
+add x0, x27, 1
+st2  { v1.h, v2.h }[4], [x27], #4
+add x0, x27, 1
+st2  { v1.h, v2.h }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G72
+st2  { v1.h, v2.h }[4], [x27], x28
+add x0, x27, 1
+st2  { v1.s, v2.s }[0], [x27], #8
+add x0, x27, 1
+st2  { v1.s, v2.s }[0], [x27], x28
+add x0, x27, 1
+st2  { v1.d, v2.d }[0], [x27], #16
+add x0, x27, 1
+st2  { v1.d, v2.d }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G73
+st3  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+st3  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+st3  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G74
+st3  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+st3  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+st3  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+st3  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+st3  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G75
+st3  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+st3  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+st3  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+st3  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+st3  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G76
+st3  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[0], [x27], #3
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[8], [x27], #3
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[0], [x27], x28
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[8], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G77
+st3  { v1.h, v2.h, v3.h }[0], [x27], #6
+add x0, x27, 1
+st3  { v1.h, v2.h, v3.h }[4], [x27], #6
+add x0, x27, 1
+st3  { v1.h, v2.h, v3.h }[0], [x27], x28
+add x0, x27, 1
+st3  { v1.h, v2.h, v3.h }[4], [x27], x28
+add x0, x27, 1
+st3  { v1.s, v2.s, v3.s }[0], [x27], #12
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G78
+st3  { v1.s, v2.s, v3.s }[0], [x27], x28
+add x0, x27, 1
+st3  { v1.d, v2.d, v3.d }[0], [x27], #24
+add x0, x27, 1
+st3  { v1.d, v2.d, v3.d }[0], [x27], x28
+add x0, x27, 1
+st4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+st4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G79
+st4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+st4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+st4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+st4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+st4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G80
+st4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+st4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+st4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+st4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+st4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G81
+st4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+st4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+st4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+add x0, x27, 1
+st4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+add x0, x27, 1
+st4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G82
+st4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G83
+st4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+add x0, x27, 1
+st4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+add x0, x27, 1
+st4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+add x0, x27, 1
+st4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G84
+stp  s1, s2, [x27], #248
+add x0, x27, 1
+stp  d1, d2, [x27], #496
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G85
+stp  q1, q2, [x27], #992
+add x0, x27, 1
+stp  s1, s2, [x27, #248]!
+add x0, x27, 1
+stp  d1, d2, [x27, #496]!
+add x0, x27, 1
+stp  q1, q2, [x27, #992]!
+add x0, x27, 1
+stp  w1, w2, [x27], #248
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G86
+stp  x1, x2, [x27], #496
+add x0, x27, 1
+stp  w1, w2, [x27, #248]!
+add x0, x27, 1
+stp  x1, x2, [x27, #496]!
+add x0, x27, 1
+str  b1, [x27], #254
+add x0, x27, 1
+str  h1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G87
+str  s1, [x27], #254
+add x0, x27, 1
+str  d1, [x27], #254
+add x0, x27, 1
+str  q1, [x27], #254
+add x0, x27, 1
+str  b1, [x27, #254]!
+add x0, x27, 1
+str  h1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G88
+str  s1, [x27, #254]!
+add x0, x27, 1
+str  d1, [x27, #254]!
+add x0, x27, 1
+str  q1, [x27, #254]!
+add x0, x27, 1
+str  w1, [x27], #254
+add x0, x27, 1
+str  x1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G89
+str  w1, [x27, #254]!
+add x0, x27, 1
+str  x1, [x27, #254]!
+add x0, x27, 1
+strb  w1, [x27], #254
+add x0, x27, 1
+strb  w1, [x27, #254]!
+add x0, x27, 1
+strh  w1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G90
+strh  w1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G91
+ldr  x1, [x27], #254
+add x0, x27, 1
+ldr  x2, [x1], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# CHECK:      [0] Code Region - G01
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   ld1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D====eeeeeER  .    .    .  .   ld1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,3]     .D=========eER .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D========eeeeeER  .    .  .   ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,5]     . D=============eER .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D============eeeeeER  .  .   ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,7]     .  D=================eER .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D================eeeeeER.   ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,9]     .   D=====================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: 5.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     13.0   0.0    0.0       ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: 7.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     17.0   0.0    0.0       ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: 9.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     11.5   0.1    0.0       <total>
+
+# CHECK:      [1] Code Region - G02
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   ld1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D====eeeeeER  .    .    .  .   ld1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,3]     .D=========eER .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D========eeeeeER  .    .  .   ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,5]     . D=============eER .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D============eeeeeER  .  .   ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=================eER .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D================eeeeeER.   ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D=====================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: 5.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     13.0   0.0    0.0       ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     17.0   0.0    0.0       ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     11.5   0.1    0.0       <total>
+
+# CHECK:      [2] Code Region - G03
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   ld1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D====eeeeeER  .    .    .  .   ld1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=========eER .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D========eeeeeER  .    .  .   ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D=============eER .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D============eeeeeER  .  .   ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=================eER .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D================eeeeeER.   ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D=====================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     13.0   0.0    0.0       ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     17.0   0.0    0.0       ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     11.5   0.1    0.0       <total>
+
+# CHECK:      [3] Code Region - G04
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2604
+# CHECK-NEXT: Total uOps:        1600
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.61
+# CHECK-NEXT: IPC:               0.38
+# CHECK-NEXT: Block RThroughput: 6.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .   .   ld1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=====eER .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D====eeeeeER  .    .    .   .   ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,3]     .D=========eER .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D========eeeeeeER .    .   .   ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .  D=============eER.    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=============eeeeeER .   .   ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     .   D=================eER.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=================eeeeeER.   ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     .    D=====================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     14.0   0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     18.0   0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     11.7   0.1    0.0       <total>
+
+# CHECK:      [4] Code Region - G05
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2804
+# CHECK-NEXT: Total uOps:        1800
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.64
+# CHECK-NEXT: IPC:               0.36
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    ..   ld1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     .D=====eER.    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=====eeeeeER .    .    .    ..   ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     . D=========eER.    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .  D========eeeeeeER.    .    ..   ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     .   D=============eER    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    D============eeeeeeER    ..   ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .D=================eER   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .D=================eeeeeER.   ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    . D=====================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     13.0   0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     18.0   0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     11.7   0.1    0.0       <total>
+
+# CHECK:      [5] Code Region - G06
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2704
+# CHECK-NEXT: Total uOps:        1700
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.63
+# CHECK-NEXT: IPC:               0.37
+# CHECK-NEXT: Block RThroughput: 7.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .   ld1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .D=====eER.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=====eeeeeER .    .    .    .   ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     . D=========eER.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=========eeeeeER .    .    .   ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .  D=============eER.    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .   D============eeeeeeER.    .   ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    D=================eER    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    D=================eeeeeER.   ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .D=====================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     10.0   0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     13.0   0.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     18.0   0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     11.8   0.1    0.0       <total>
+
+# CHECK:      [6] Code Region - G07
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3104
+# CHECK-NEXT: Total uOps:        2100
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.68
+# CHECK-NEXT: IPC:               0.32
+# CHECK-NEXT: Block RThroughput: 11.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .   .   ld1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .D=====eER.    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D====eeeeeeER.    .    .    .   .   ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D=========eER    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D========eeeeeeER    .    .   .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,5]     .    D=============eER   .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D============eeeeeeeER  .   .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,7]     .    . D==================eER .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D=================eeeeeeER.   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,9]     .    .   D======================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 5.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     13.0   0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 7.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     18.0   0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 9.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     11.8   0.1    0.0       <total>
+
+# CHECK:      [7] Code Region - G08
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3304
+# CHECK-NEXT: Total uOps:        2300
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.70
+# CHECK-NEXT: IPC:               0.30
+# CHECK-NEXT: Block RThroughput: 13.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .    ..   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,1]     .D=====eER.    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D====eeeeeeeER    .    .    .    ..   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,3]     .  D==========eER   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D=========eeeeeeER   .    .    ..   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,5]     .    D==============eER  .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D=============eeeeeeeER .    ..   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,7]     .    . D===================eER.    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D==================eeeeeeeER.   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .   D========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     10.0   0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     14.0   0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 7.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     19.0   0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 9.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     12.6   0.1    0.0       <total>
+
+# CHECK:      [8] Code Region - G09
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3204
+# CHECK-NEXT: Total uOps:        2200
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.69
+# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: Block RThroughput: 12.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .    .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     .D=====eER.    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D====eeeeeeeER    .    .    .    .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D==========eER   .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D=========eeeeeeER   .    .    .   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    D==============eER  .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D=============eeeeeeER  .    .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . D==================eER .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D=================eeeeeeeER.   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D=======================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     10.0   0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     14.0   0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     18.0   0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     12.3   0.1    0.0       <total>
+
+# CHECK:      [9] Code Region - G10
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3404
+# CHECK-NEXT: Total uOps:        2400
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.71
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 14.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .    . .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .D=====eER.    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D====eeeeeeeER    .    .    .    . .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D==========eER   .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D=========eeeeeeeER  .    .    . .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    D===============eER .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==============eeeeeeER .    . .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,7]     .    . D===================eER.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D==================eeeeeeeeER.   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .   D=========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     10.0   0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     15.0   0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 7.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     19.0   0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 9.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     12.9   0.1    0.0       <total>
+
+# CHECK:      [10] Code Region - G11
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3404
+# CHECK-NEXT: Total uOps:        2400
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.71
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 14.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .    . .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,1]     .D=====eER.    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D====eeeeeeER.    .    .    .    . .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,3]     .  D=========eER    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D========eeeeeeeeER  .    .    . .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,5]     .    D===============eER .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==============eeeeeeER .    . .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,7]     .    . D===================eER.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D==================eeeeeeeeER.   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .   D=========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 5.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     15.0   0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 7.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     19.0   0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 9.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     12.7   0.1    0.0       <total>
+
+# CHECK:      [11] Code Region - G12
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3404
+# CHECK-NEXT: Total uOps:        2400
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.71
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 14.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    . .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeER   .    .    .    . .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D===========eER  .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==========eeeeeeeeER.    .    . .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    D=================eER    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D================eeeeeeER    . .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . D=====================eER   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D====================eeeeeeER.   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D=========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     17.0   0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     21.0   0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     14.3   0.1    0.0       <total>
+
+# CHECK:      [12] Code Region - G13
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3804
+# CHECK-NEXT: Total uOps:        2600
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.68
+# CHECK-NEXT: IPC:               0.26
+# CHECK-NEXT: Block RThroughput: 15.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    ..   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeER   .    .    .    .    ..   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D===========eER  .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==========eeeeeeeeER.    .    .    ..   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    D=================eER    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D================eeeeeeeeER  .    ..   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . D=======================eER .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D======================eeeeeeeeER.   ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,9]     .    .   D=============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     17.0   0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.1   0.1    0.0       <total>
+
+# CHECK:      [13] Code Region - G14
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 6.7
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .  .   ld1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    .  .   ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,5]     .    D===================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    .  .   ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .    . D=========================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeER.   ld1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,9]     .    .   D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [14] Code Region - G15
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3704
+# CHECK-NEXT: Total uOps:        1900
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.51
+# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: Block RThroughput: 6.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .   ld1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .   ld1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    .   ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .    D===================eER  .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    .   ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    . D=========================eER    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    . D=========================eeeeeER.   ld1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .  D=============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     26.0   0.0    0.0       ld1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.4   0.1    0.0       <total>
+
+# CHECK:      [15] Code Region - G16
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3404
+# CHECK-NEXT: Total uOps:        1800
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.53
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 6.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    . .   ld1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D=====eER .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D====eeeeeER  .    .    .    .    . .   ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,3]     .D=========eER .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D========eeeeeeeeER    .    .    . .   ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: [0,5]     .  D===============eER   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .   D==============eeeeeeeeER .    . .   ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: [0,7]     .    D=====================eER.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .D====================eeeeeeeeER.   ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: [0,9]     .    . D===========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: 5.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     15.0   0.0    0.0       ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     21.0   0.0    0.0       ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: 9.     1     28.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     13.3   0.1    0.0       <total>
+
+# CHECK:      [16] Code Region - G17
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3704
+# CHECK-NEXT: Total uOps:        1900
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.51
+# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: Block RThroughput: 6.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .   ld1r	{ v1.4s }, [x27], #4
+# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .   ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    .   ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: [0,5]     .    D===================eER  .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    .   ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: [0,7]     .    . D=========================eER    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    . D=========================eeeeeER.   ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .  D=============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.4s }, [x27], #4
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     26.0   0.0    0.0       ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.4   0.1    0.0       <total>
+
+# CHECK:      [17] Code Region - G18
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 6.7
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld1r	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .  .   ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    .  .   ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    D===================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    .  .   ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . D=========================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeER.   ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [18] Code Region - G19
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3804
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.53
+# CHECK-NEXT: IPC:               0.26
+# CHECK-NEXT: Block RThroughput: 6.7
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    ..   ld1r	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    ..   ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D============eeeeeeER.    .    .    ..   ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .    D=================eER    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D================eeeeeeeeER  .    ..   ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     .    . D=======================eER .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D======================eeeeeeeeER.   ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .   D=============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     17.0   0.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.5   0.1    0.0       <total>
+
+# CHECK:      [19] Code Region - G20
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4104
+# CHECK-NEXT: Total uOps:        2600
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.63
+# CHECK-NEXT: IPC:               0.24
+# CHECK-NEXT: Block RThroughput: 9.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01234
+
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .   .   ld2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     .D========eER  .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=======eeeeeeeeER.    .    .    .    .   .   ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     .  D==============eER    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D=============eeeeeeeeeER .    .    .   .   ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     .    D=====================eER.    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D====================eeeeeeeeeER  .   .   ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     .    . D============================eER .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D===========================eeeeeeER.   ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     21.0   0.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7.     1     29.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     28.0   0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     33.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     18.0   0.1    0.0       <total>
+
+# CHECK:      [20] Code Region - G21
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4204
+# CHECK-NEXT: Total uOps:        2400
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.57
+# CHECK-NEXT: IPC:               0.24
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012345
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .    .   ld2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .    .   ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D============eeeeeeeeeER  .    .    .    .   ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    D====================eER .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D===================eeeeeeeeER    .    .   ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . D==========================eER   .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D=========================eeeeeeeeeER.   ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D=================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     21.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     20.0   0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     27.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     26.0   0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     34.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     17.1   0.1    0.0       <total>
+
+# CHECK:      [21] Code Region - G22
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4104
+# CHECK-NEXT: Total uOps:        2600
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.63
+# CHECK-NEXT: IPC:               0.24
+# CHECK-NEXT: Block RThroughput: 8.7
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01234
+
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .   .   ld2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .D========eER  .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=======eeeeeeeeER.    .    .    .    .   .   ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,3]     .  D==============eER    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D=============eeeeeeeeER  .    .    .   .   ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,5]     .    D====================eER .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D===================eeeeeeeeER    .   .   ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    . D==========================eER   .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D=========================eeeeeeeeER.   ld2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 5.     1     21.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     20.0   0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: 7.     1     27.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     26.0   0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 9.     1     33.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     17.4   0.1    0.0       <total>
+
+# CHECK:      [22] Code Region - G23
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3804
+# CHECK-NEXT: Total uOps:        2400
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.63
+# CHECK-NEXT: IPC:               0.26
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    ..   ld2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    ..   ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    ..   ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    D===================eER  .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    ..   ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .    . D=========================eER    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeER.   ld2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .   D=============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.3   0.1    0.0       <total>
+
+# CHECK:      [23] Code Region - G24
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.57
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 6.7
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .    .  .   ld2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .D=====eER.    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D====eeeeeeeeER   .    .    .    .  .   ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,3]     .  D===========eER  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==========eeeeeeeeER.    .    .  .   ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    D=================eER    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    D=================eeeeeER.    .  .   ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .D=====================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    . D====================eeeeeeeeER.   ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .  D===========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     21.0   0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: 9.     1     28.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     14.2   0.1    0.0       <total>
+
+# CHECK:      [24] Code Region - G25
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2200
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.55
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 7.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2r	{ v1.2s, v2.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .  .   ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    .  .   ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: [0,5]     .    D===================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    .  .   ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: [0,7]     .    . D=========================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeER.   ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: [0,9]     .    .   D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], #8
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [25] Code Region - G26
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3704
+# CHECK-NEXT: Total uOps:        2100
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.57
+# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: Block RThroughput: 7.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .   ld2r	{ v1.16b, v2.16b }, [x27], #2
+# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeER    .    .    .    .    .   ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     . D===========eER   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .  D==========eeeeeeeeER .    .    .    .   ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .   D=================eER.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    D================eeeeeeeeER   .    .   ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .D=======================eER  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    . D======================eeeeeeeeER.   ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .  D=============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], #2
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     17.0   0.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.1   0.1    0.0       <total>
+
+# CHECK:      [26] Code Region - G27
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.62
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 8.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2r	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .  .   ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    .  .   ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    D===================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    .  .   ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . D=========================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeER.   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .   D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [27] Code Region - G28
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4704
+# CHECK-NEXT: Total uOps:        3600
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.77
+# CHECK-NEXT: IPC:               0.21
+# CHECK-NEXT: Block RThroughput: 12.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .    .    .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,1]     .D========eER  .    .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=======eeeeeeeeeER    .    .    .    .    .    .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,3]     .  D===============eER   .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==============eeeeeeeeeeER    .    .    .    .   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,5]     .    .D======================eER   .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    . D=====================eeeeeeeeeER.    .    .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,7]     .    .  D=============================eER    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .   D============================eeeeeeeeeeER.   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .D====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+
+# CHECK:      [28] Code Region - G29
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4604
+# CHECK-NEXT: Total uOps:        3600
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.78
+# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: Block RThroughput: 14.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeeeER  .    .    .    .    .    .    .   .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,1]     . D========eER .    .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  D=======eeeeeeeeER    .    .    .    .    .   .   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .   D==============eER   .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    D=============eeeeeeeeeER.    .    .    .   .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .D=====================eER    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    . D====================eeeeeeeeeER .    .   .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .  D============================eER.    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .   D===========================eeeeeeeeeeER.   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .D===================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     21.0   0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     29.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     28.0   0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     36.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     18.3   0.1    0.0       <total>
+
+# CHECK:      [29] Code Region - G30
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4704
+# CHECK-NEXT: Total uOps:        3600
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.77
+# CHECK-NEXT: IPC:               0.21
+# CHECK-NEXT: Block RThroughput: 12.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .    .    .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .D========eER  .    .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=======eeeeeeeeeeER   .    .    .    .    .    .   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .   D===============eER  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    D==============eeeeeeeeeeER   .    .    .    .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    . D======================eER  .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .  D=====================eeeeeeeeeER    .    .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,7]     .    .   D=============================eER   .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    D============================eeeeeeeeeER.   ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,9]     .    .    .D====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+
+# CHECK:      [30] Code Region - G31
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4504
+# CHECK-NEXT: Total uOps:        3000
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.67
+# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .    .  .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .D========eER  .    .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=======eeeeeeeeeER    .    .    .    .    .  .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     .  D===============eER   .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==============eeeeeeeeeER.    .    .    .  .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,5]     .    D======================eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D=====================eeeeeeeeeER .    .  .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,7]     .    . D=============================eER.    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D============================eeeeeeeeeER.   ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+
+# CHECK:      [31] Code Region - G32
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3704
+# CHECK-NEXT: Total uOps:        2400
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.65
+# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     .D========eER  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=======eeeeeeeeER.    .    .    .    .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,3]     .  D==============eER    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D=============eeeeeeeeER  .    .    .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    D====================eER .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D===================eeeeeeER .    .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,7]     .    . D========================eER.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D=======================eeeeeeER.   ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: 5.     1     21.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     20.0   0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 7.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     24.0   0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 9.     1     29.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.6   0.1    0.0       <total>
+
+# CHECK:      [32] Code Region - G33
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2700
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.67
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 9.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .    .    .  .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     .D=====eER.    .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D====eeeeeeeeeER  .    .    .    .    .  .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: [0,3]     .   D===========eER .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    D==========eeeeeeeeER    .    .    .  .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: [0,5]     .    .D=================eER   .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    . D================eeeeeeeeER .    .  .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: [0,7]     .    .  D=======================eER.    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .   D======================eeeeeeeeeER.   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: [0,9]     .    .    D==============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     17.0   0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: 9.     1     31.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     14.8   0.1    0.0       <total>
+
+# CHECK:      [33] Code Region - G34
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4104
+# CHECK-NEXT: Total uOps:        2800
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.68
+# CHECK-NEXT: IPC:               0.24
+# CHECK-NEXT: Block RThroughput: 9.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01234
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .   .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeeER.    .    .    .    .   .   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: [0,3]     .  D==============eER    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D=============eeeeeeeeeER .    .    .   .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: [0,5]     .    D=====================eER.    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D====================eeeeeeER.    .   .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . D=========================eER    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeeER.   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: 5.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     21.0   0.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     17.1   0.1    0.0       <total>
+
+# CHECK:      [34] Code Region - G35
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4204
+# CHECK-NEXT: Total uOps:        2700
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.64
+# CHECK-NEXT: IPC:               0.24
+# CHECK-NEXT: Block RThroughput: 9.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012345
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .    .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .    .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D============eeeeeeeeeER  .    .    .    .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    D====================eER .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D===================eeeeeeeeER    .    .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . D==========================eER   .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D=========================eeeeeeeeeER.   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D=================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     21.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     20.0   0.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     27.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     26.0   0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     34.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     17.1   0.1    0.0       <total>
+
+# CHECK:      [35] Code Region - G36
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4604
+# CHECK-NEXT: Total uOps:        3400
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.74
+# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: Block RThroughput: 13.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .    .   .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .D========eER  .    .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=======eeeeeeeeER.    .    .    .    .    .   .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,3]     .  D==============eER    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D=============eeeeeeeeeER .    .    .    .   .   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,5]     .    D=====================eER.    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D====================eeeeeeeeeER  .    .   .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,7]     .    . D============================eER .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D===========================eeeeeeeeeeeER.   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .D===================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 5.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     21.0   0.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 7.     1     29.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     28.0   0.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 9.     1     36.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     18.3   0.1    0.0       <total>
+
+# CHECK:      [36] Code Region - G37
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4804
+# CHECK-NEXT: Total uOps:        3800
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.79
+# CHECK-NEXT: IPC:               0.21
+# CHECK-NEXT: Block RThroughput: 16.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          01
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .    .    ..   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,1]     .D========eER  .    .    .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=======eeeeeeeeeeeER  .    .    .    .    .    ..   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,3]     .    D===============eER .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .D==============eeeeeeeeeeeER .    .    .    ..   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .   D======================eER.    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    D=====================eeeeeeeeER   .    ..   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .D============================eER  .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    . D===========================eeeeeeeeeER.   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .  D===================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 7.     1     29.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     28.0   0.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 9.     1     36.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     18.7   0.1    0.0       <total>
+
+# CHECK:      [37] Code Region - G38
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      5104
+# CHECK-NEXT: Total uOps:        4200
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.82
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 16.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          01234
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .    .    .   .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     .D========eER  .    .    .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=======eeeeeeeeeeeER  .    .    .    .    .    .   .   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    D===============eER .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .D==============eeeeeeeeeER   .    .    .    .   .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    . D======================eER  .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .  D=====================eeeeeeeeeeeER  .    .   .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .D=============================eER .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    . D============================eeeeeeeeeeeER.   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    D====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+
+# CHECK:      [38] Code Region - G39
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4504
+# CHECK-NEXT: Total uOps:        3500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.78
+# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: Block RThroughput: 11.7
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,1]     . D=======eER  .    .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  D======eeeeeeeeeER    .    .    .    .    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,3]     .    D=============eER   .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .D============eeeeeeeeeER.    .    .    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .  D===================eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .   D==================eeeeeeeeeER .    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .D=========================eER.    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    . D========================eeeeeeeeeER.   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .    .   D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [39] Code Region - G40
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4304
+# CHECK-NEXT: Total uOps:        3100
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.72
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 10.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .    ..   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,1]     . D=======eER  .    .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  D======eeeeeeeeeER    .    .    .    .    ..   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    D=============eER   .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .D============eeeeeeeeeER.    .    .    ..   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,5]     .    .  D===================eER    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .   D==================eeeeeeeeER  .    ..   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .    .    D=========================eER .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .D========================eeeeeeeeER.   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    . D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [40] Code Region - G41
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4104
+# CHECK-NEXT: Total uOps:        3100
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.76
+# CHECK-NEXT: IPC:               0.24
+# CHECK-NEXT: Block RThroughput: 10.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01234
+
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .   .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,1]     . D=======eER  .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  D======eeeeeeeeeER    .    .    .    .   .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    D=============eER   .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .D============eeeeeeER   .    .    .   .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,5]     .    . D=================eER  .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .  D================eeeeeeeeeER    .   .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    D=======================eER   .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .D======================eeeeeeeeER.   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    . D=============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     17.0   0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.5   0.1    0.0       <total>
+
+# CHECK:      [41] Code Region - G42
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4304
+# CHECK-NEXT: Total uOps:        3100
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.72
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 10.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .    ..   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeeER.    .    .    .    .    ..   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: [0,3]     .   D=============eER    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    D============eeeeeeeeER  .    .    .    ..   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: [0,5]     .    .D===================eER .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    . D==================eeeeeeeeeER   .    ..   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: [0,7]     .    .   D=========================eER  .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    D========================eeeeeeeeeER.   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: [0,9]     .    .    . D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [42] Code Region - G43
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2900
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.72
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 9.7
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .    .    .  .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     .D=====eER.    .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D====eeeeeeeeeER  .    .    .    .    .  .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .   D===========eER .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    D==========eeeeeeeeER    .    .    .  .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .D=================eER   .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    . D================eeeeeeeeER .    .  .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .  D=======================eER.    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .   D======================eeeeeeeeeER.   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .D=============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     17.0   0.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     14.7   0.1    0.0       <total>
+
+# CHECK:      [43] Code Region - G44
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3604
+# CHECK-NEXT: Total uOps:        2700
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.28
+# CHECK-NEXT: Block RThroughput: 9.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .   .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeeER.    .    .    .   .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .   D=============eER    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    D============eeeeeeeeeER .    .   .   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    . D===================eER.    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .  D==================eeeeeER .   .   ldp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,7]     .    .   D======================eER.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    D=====================eeeeeER.   ldp	d1, d2, [x27], #496
+# CHECK-NEXT: [0,9]     .    .    .D=========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ldp	s1, s2, [x27], #248
+# CHECK-NEXT: 7.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     22.0   0.0    0.0       ldp	d1, d2, [x27], #496
+# CHECK-NEXT: 9.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.3   0.1    0.0       <total>
+
+# CHECK:      [44] Code Region - G45
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2306
+# CHECK-NEXT: Total uOps:        2200
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.95
+# CHECK-NEXT: IPC:               0.43
+# CHECK-NEXT: Block RThroughput: 7.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .  .   ldp	q1, q2, [x27], #992
+# CHECK-NEXT: [0,1]     .D=====eER.    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D====eeeeeER .    .    .  .   ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,3]     .  D========eER.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D=======eeeeeER .    .  .   ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .    D===========eER.    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==========eeeeeeER.  .   ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,7]     .    . D===============eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D==============eeeeER   ldp	w1, w2, [x27], #248
+# CHECK-NEXT: [0,9]     .    .   D==============eE--R   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldp	q1, q2, [x27], #992
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     8.0    0.0    0.0       ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     11.0   0.0    0.0       ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     15.0   0.0    0.0       ldp	w1, w2, [x27], #248
+# CHECK-NEXT: 9.     1     15.0   0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     9.8    0.1    0.2       <total>
+
+# CHECK:      [45] Code Region - G46
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1307
+# CHECK-NEXT: Total uOps:        2400
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.84
+# CHECK-NEXT: IPC:               0.77
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    .   .   ldp	x1, x2, [x27], #496
+# CHECK-NEXT: [0,1]     .DeE--R   .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeER .    .   .   ldp	w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,3]     .  DeE--R .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeeeER    .   .   ldp	x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .    DeE--R    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeeeeeER .   .   ldpsw	x1, x2, [x27], #248
+# CHECK-NEXT: [0,7]     .    . D====eER.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D===eeeeeER.   ldpsw	x1, x2, [x27, #248]!
+# CHECK-NEXT: [0,9]     .    .   D=======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldp	x1, x2, [x27], #496
+# CHECK-NEXT: 1.     1     1.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ldp	w1, w2, [x27, #248]!
+# CHECK-NEXT: 3.     1     1.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ldp	x1, x2, [x27, #496]!
+# CHECK-NEXT: 5.     1     1.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ldpsw	x1, x2, [x27], #248
+# CHECK-NEXT: 7.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ldpsw	x1, x2, [x27, #248]!
+# CHECK-NEXT: 9.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.4    0.4    0.6       <total>
+
+# CHECK:      [46] Code Region - G47
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   ldr	b1, [x27], #254
+# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D====eeeeeER  .    .    .  .   ldr	h1, [x27], #254
+# CHECK-NEXT: [0,3]     .D=========eER .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D========eeeeeER  .    .  .   ldr	s1, [x27], #254
+# CHECK-NEXT: [0,5]     . D=============eER .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D============eeeeeER  .  .   ldr	d1, [x27], #254
+# CHECK-NEXT: [0,7]     .  D=================eER .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D================eeeeeER.   ldr	q1, [x27], #254
+# CHECK-NEXT: [0,9]     .   D=====================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	b1, [x27], #254
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ldr	h1, [x27], #254
+# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       ldr	s1, [x27], #254
+# CHECK-NEXT: 5.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     13.0   0.0    0.0       ldr	d1, [x27], #254
+# CHECK-NEXT: 7.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     17.0   0.0    0.0       ldr	q1, [x27], #254
+# CHECK-NEXT: 9.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     11.5   0.1    0.0       <total>
+
+# CHECK:      [47] Code Region - G48
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   ldr	b1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D====eeeeeER  .    .    .  .   ldr	h1, [x27, #254]!
+# CHECK-NEXT: [0,3]     .D=========eER .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D========eeeeeER  .    .  .   ldr	s1, [x27, #254]!
+# CHECK-NEXT: [0,5]     . D=============eER .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D============eeeeeER  .  .   ldr	d1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .  D=================eER .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D================eeeeeER.   ldr	q1, [x27, #254]!
+# CHECK-NEXT: [0,9]     .   D=====================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	b1, [x27, #254]!
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ldr	h1, [x27, #254]!
+# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       ldr	s1, [x27, #254]!
+# CHECK-NEXT: 5.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     13.0   0.0    0.0       ldr	d1, [x27, #254]!
+# CHECK-NEXT: 7.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     17.0   0.0    0.0       ldr	q1, [x27, #254]!
+# CHECK-NEXT: 9.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     11.5   0.1    0.0       <total>
+
+# CHECK:      [48] Code Region - G49
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      506
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    2.96
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .   ldr	w1, [x27], #254
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeER  .   ldr	x1, [x27], #254
+# CHECK-NEXT: [0,3]     .D=eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeER .   ldr	w1, [x27, #254]!
+# CHECK-NEXT: [0,5]     . D=eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DeeeeER.   ldr	x1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .  D=eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   DeeeeER   ldrb	w1, [x27], #254
+# CHECK-NEXT: [0,9]     .   D=eE--R   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	w1, [x27], #254
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ldr	x1, [x27], #254
+# CHECK-NEXT: 3.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ldr	w1, [x27, #254]!
+# CHECK-NEXT: 5.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ldr	x1, [x27, #254]!
+# CHECK-NEXT: 7.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ldrb	w1, [x27], #254
+# CHECK-NEXT: 9.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.5    0.1    1.0       <total>
+
+# CHECK:      [49] Code Region - G50
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      506
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    2.96
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .   ldrb	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeER  .   ldrh	w1, [x27], #254
+# CHECK-NEXT: [0,3]     .D=eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeER .   ldrh	w1, [x27, #254]!
+# CHECK-NEXT: [0,5]     . D=eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DeeeeER.   ldrsb	w1, [x27], #254
+# CHECK-NEXT: [0,7]     .  D=eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   DeeeeER   ldrsb	x1, [x27], #254
+# CHECK-NEXT: [0,9]     .   D=eE--R   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldrb	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ldrh	w1, [x27], #254
+# CHECK-NEXT: 3.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ldrh	w1, [x27, #254]!
+# CHECK-NEXT: 5.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ldrsb	w1, [x27], #254
+# CHECK-NEXT: 7.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ldrsb	x1, [x27], #254
+# CHECK-NEXT: 9.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.5    0.1    1.0       <total>
+
+# CHECK:      [50] Code Region - G51
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      506
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    2.96
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .   ldrsb	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeER  .   ldrsb	x1, [x27, #254]!
+# CHECK-NEXT: [0,3]     .D=eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeER .   ldrsh	w1, [x27], #254
+# CHECK-NEXT: [0,5]     . D=eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DeeeeER.   ldrsh	x1, [x27], #254
+# CHECK-NEXT: [0,7]     .  D=eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   DeeeeER   ldrsh	w1, [x27, #254]!
+# CHECK-NEXT: [0,9]     .   D=eE--R   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldrsb	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ldrsb	x1, [x27, #254]!
+# CHECK-NEXT: 3.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ldrsh	w1, [x27], #254
+# CHECK-NEXT: 5.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ldrsh	x1, [x27], #254
+# CHECK-NEXT: 7.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ldrsh	w1, [x27, #254]!
+# CHECK-NEXT: 9.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.5    0.1    1.0       <total>
+
+# CHECK:      [51] Code Region - G52
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      604
+# CHECK-NEXT: Total uOps:        1600
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    2.65
+# CHECK-NEXT: IPC:               1.66
+# CHECK-NEXT: Block RThroughput: 5.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER  .   ldrsh	x1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeER .   ldrsw	x1, [x27], #254
+# CHECK-NEXT: [0,3]     .D=eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeER.   ldrsw	x1, [x27, #254]!
+# CHECK-NEXT: [0,5]     . D=eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DeE--R.   st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,7]     .  D=eE-R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   DeeER.   st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     .    D=eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldrsh	x1, [x27, #254]!
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ldrsw	x1, [x27], #254
+# CHECK-NEXT: 3.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ldrsw	x1, [x27, #254]!
+# CHECK-NEXT: 5.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    2.0       st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 7.     1     2.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    0.0    0.0       st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: 9.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.5    0.1    0.9       <total>
+
+# CHECK:      [52] Code Region - G53
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      704
+# CHECK-NEXT: Total uOps:        1700
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    2.41
+# CHECK-NEXT: IPC:               1.42
+# CHECK-NEXT: Block RThroughput: 7.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeER .    .   st1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     D=eER.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeER.    .   st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,3]     .D=eER    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeER   .   st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,5]     .  D=eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eER  .   st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,7]     .   D=eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    DeeER.   st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .D=eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: 7.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    0.0    0.0       st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: 9.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.6    0.1    0.0       <total>
+
+# CHECK:      [53] Code Region - G54
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      704
+# CHECK-NEXT: Total uOps:        1700
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    2.41
+# CHECK-NEXT: IPC:               1.42
+# CHECK-NEXT: Block RThroughput: 7.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .   st1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,1]     .D=eER    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=eER    .   st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     . D=eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .  DeeER  .   st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .   D=eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .   D=eER .   st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    D=eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    D=eER.   st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .D=eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    0.0    0.0       st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.8    0.1    0.0       <total>
+
+# CHECK:      [54] Code Region - G55
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      904
+# CHECK-NEXT: Total uOps:        1900
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    2.10
+# CHECK-NEXT: IPC:               1.11
+# CHECK-NEXT: Block RThroughput: 9.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    . .   st1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .D=eER    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=eER    . .   st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     . D=eER   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .  DeeER  . .   st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .   D=eER . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    DeeER. .   st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .D=eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    . DeeER.   st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .  D=eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 9.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.6    0.1    0.0       <total>
+
+# CHECK:      [55] Code Region - G56
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1404
+# CHECK-NEXT: Total uOps:        2400
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.71
+# CHECK-NEXT: IPC:               0.71
+# CHECK-NEXT: Block RThroughput: 14.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01234567
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    . .   st1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,1]     .D===eER  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D==eeER .    . .   st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,3]     .  D===eER.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==eeER    . .   st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,5]     .    D===eER   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==eeeeER. .   st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,7]     .    . D=====eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D====eeER.   st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .   D=====eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 1.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 5.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 9.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.9    0.1    0.0       <total>
+
+# CHECK:      [56] Code Region - G57
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1604
+# CHECK-NEXT: Total uOps:        2600
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.62
+# CHECK-NEXT: IPC:               0.62
+# CHECK-NEXT: Block RThroughput: 16.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    .   .   st1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,1]     .D===eER  .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D==eeeeER    .   .   st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,3]     .  D=====eER   .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D====eeER  .   .   st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    D=====eER .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D====eeeeER  .   st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . D=======eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D======eeER.   st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D=======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 1.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 3.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     5.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     5.0    0.0    0.0       st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 9.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.3    0.1    0.0       <total>
+
+# CHECK:      [57] Code Region - G58
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1604
+# CHECK-NEXT: Total uOps:        2600
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.62
+# CHECK-NEXT: IPC:               0.62
+# CHECK-NEXT: Block RThroughput: 16.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .    .   .   st1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     .D=eER    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeER .    .   .   st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D===eER.    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==eeER    .   .   st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    D===eER   .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==eeeeER.   .   st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . D=====eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D====eeeeER.   st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D=======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 5.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 9.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.7    0.1    0.0       <total>
+
+# CHECK:      [58] Code Region - G59
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2104
+# CHECK-NEXT: Total uOps:        3100
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.47
+# CHECK-NEXT: IPC:               0.48
+# CHECK-NEXT: Block RThroughput: 21.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234
+
+# CHECK:      [0,0]     DeeeER    .    .    .   .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     .D==eER   .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=eeeeeeER   .    .   .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,3]     .   D=====eER  .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    D====eeeER.    .   .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .D======eER    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    . D=====eeeER  .   .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,7]     .    .  D=======eER .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .   D======eeeeeeER.   st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .D==========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 3.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     5.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 5.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 9.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.6    0.1    0.0       <total>
+
+# CHECK:      [59] Code Region - G60
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2404
+# CHECK-NEXT: Total uOps:        3400
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.41
+# CHECK-NEXT: IPC:               0.42
+# CHECK-NEXT: Block RThroughput: 24.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeER    .    .    .    . .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,1]     .D==eER   .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=eeeeeeER   .    .    . .   st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,3]     .   D=====eER  .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    D====eeeeeeER  .    . .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,5]     .    . D========eER .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .  D=======eeeER    . .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .   D=========eER   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    D========eeeeeeER.   st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    . D============eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 3.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     5.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 5.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     8.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     9.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     6.6    0.1    0.0       <total>
+
+# CHECK:      [60] Code Region - G61
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2104
+# CHECK-NEXT: Total uOps:        3100
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.47
+# CHECK-NEXT: IPC:               0.48
+# CHECK-NEXT: Block RThroughput: 21.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234
+
+# CHECK:      [0,0]     DeeeER    .    .    .   .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .D==eER   .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=eeeER .    .    .   .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D===eER.    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==eeeeeeER.    .   .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .D======eER    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    . D=====eeeER  .   .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .  D=======eER .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .   D======eeeeeeER.   st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .D==========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.2    0.1    0.0       <total>
+
+# CHECK:      [61] Code Region - G62
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2604
+# CHECK-NEXT: Total uOps:        3600
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.38
+# CHECK-NEXT: IPC:               0.38
+# CHECK-NEXT: Block RThroughput: 26.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .   .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     . D====eER.    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  D===eeeeER  .    .    .   .   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,3]     .   D======eER .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    D=====eeeeeeeeER    .   .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .  D==========eER   .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .   D=========eeeeER.   .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    D============eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .D===========eeeeER.   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    . D==============eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     4.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 3.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     6.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 5.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     10.0   0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 7.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     12.0   0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 9.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     8.4    0.1    0.0       <total>
+
+# CHECK:      [62] Code Region - G63
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3204
+# CHECK-NEXT: Total uOps:        4200
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.31
+# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: Block RThroughput: 32.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,1]     .  D=====eER   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   D====eeeeER.    .    .    .    .   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,3]     .    D=======eER    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .D======eeeeeeeeER  .    .    .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .   D===========eER .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    D==========eeeeeeeeER    .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,7]     .    .    .  D===============eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .   D==============eeeeER.   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    D=================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 3.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     7.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     11.0   0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     9.9    0.1    0.0       <total>
+
+# CHECK:      [63] Code Region - G64
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2804
+# CHECK-NEXT: Total uOps:        3800
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.36
+# CHECK-NEXT: IPC:               0.36
+# CHECK-NEXT: Block RThroughput: 28.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    ..   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .  D=====eER   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   D====eeeeER.    .    .    ..   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    D=======eER    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .D======eeeeER .    .    ..   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    . D=========eER.    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .  D========eeeeeeeeER   ..   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .D=============eER  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    . D============eeeeER.   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .  D===============eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     7.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     9.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     13.0   0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     8.9    0.1    0.0       <total>
+
+# CHECK:      [64] Code Region - G65
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1904
+# CHECK-NEXT: Total uOps:        2900
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.52
+# CHECK-NEXT: IPC:               0.53
+# CHECK-NEXT: Block RThroughput: 19.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    . .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .  D=====eER   .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   D====eeeeeeeeER . .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . D=========eER. .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    . D=========eER. .   st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,5]     .    .  D=========eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .  D=========eER .   st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,7]     .    .   D=========eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .   D=========eER.   st1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    D=========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     10.0   0.0    0.0       st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: 5.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     10.0   0.0    0.0       st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: 7.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     10.0   0.0    0.0       st1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: 9.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     8.2    0.1    0.0       <total>
+
+# CHECK:      [65] Code Region - G66
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      504
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    2.98
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeER .  .   st1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeER.  .   st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,3]     .D=eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeER  .   st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,5]     . D=eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DeER .   st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .  D=eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   DeER.   st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     .   D=eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: 7.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    0.0    0.0       st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: 9.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.5    0.1    0.0       <total>
+
+# CHECK:      [66] Code Region - G67
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1204
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.66
+# CHECK-NEXT: IPC:               0.83
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012345
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeER .    .    .   st1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,1]     D=eER.    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeER.    .    .   st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .D=eER    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeER  .    .   st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,5]     .  D==eER .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .   D=eeeER    .   st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    D===eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .D==eeeeER.   st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,9]     .    . D=====eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 9.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.5    0.1    0.0       <total>
+
+# CHECK:      [67] Code Region - G68
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1704
+# CHECK-NEXT: Total uOps:        3100
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.82
+# CHECK-NEXT: IPC:               0.59
+# CHECK-NEXT: Block RThroughput: 14.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeeER    .    .    .   st2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,1]     .D==eER   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=eeeER .    .    .   st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,3]     .  D===eER.    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==eeeeER  .    .   st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .D====eER .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    . D===eeeER    .   st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .  D=====eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .   D====eeeeER.   st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .D======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+
+# CHECK:      [68] Code Region - G69
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1804
+# CHECK-NEXT: Total uOps:        3200
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.77
+# CHECK-NEXT: IPC:               0.55
+# CHECK-NEXT: Block RThroughput: 16.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01
+
+# CHECK:      [0,0]     DeeeeER   .    .    ..   st2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,1]     . D==eER  .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  D=eeeeER    .    ..   st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .   D====eER   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    D===eeeER .    ..   st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .D=====eER.    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    . D====eeeER   ..   st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .  D======eER  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .   D=====eeeeER.   st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .D=======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     5.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     6.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.7    0.1    0.0       <total>
+
+# CHECK:      [69] Code Region - G70
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1704
+# CHECK-NEXT: Total uOps:        2900
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.70
+# CHECK-NEXT: IPC:               0.59
+# CHECK-NEXT: Block RThroughput: 12.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeeER    .    .    .   st2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .D==eER   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=eeeeER.    .    .   st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .   D===eER    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    D==eeeeER .    .   st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    . D====eER.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .  D===eeeER   .   st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .    .   D=====eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    D====eeeER.   st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,9]     .    .    .D======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+
+# CHECK:      [70] Code Region - G71
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1504
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.33
+# CHECK-NEXT: IPC:               0.66
+# CHECK-NEXT: Block RThroughput: 6.7
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .  .   st2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .D==eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=eeeER .    .  .   st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     .  D===eER.    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==eeeER   .  .   st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .    D====eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D===eeeER.  .   st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,7]     .    . D=====eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D====eeeER.   st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+
+# CHECK:      [71] Code Region - G72
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1304
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.53
+# CHECK-NEXT: IPC:               0.77
+# CHECK-NEXT: Block RThroughput: 7.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeER    .    ..   st2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     .D==eER   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=eeeER .    ..   st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,3]     .  D===eER.    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==eeeER   ..   st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    D====eER  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D===eeER ..   st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .    . D====eER..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D===eeER.   st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D====eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 7.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 9.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.6    0.1    0.0       <total>
+
+# CHECK:      [72] Code Region - G73
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      600
+# CHECK-NEXT: Total Cycles:      1204
+# CHECK-NEXT: Total uOps:        2200
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.83
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 12.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012345
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .   st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,1]     . D====eER.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  D===eeeER   .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,3]     .    D====eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .D===eeeER.   st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .  D====eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     4.0    0.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.0    0.2    0.0       <total>
+
+# CHECK:      [73] Code Region - G74
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2704
+# CHECK-NEXT: Total uOps:        5100
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.89
+# CHECK-NEXT: IPC:               0.37
+# CHECK-NEXT: Block RThroughput: 27.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,1]     .  D===eER.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   D==eeeER   .    .    .    .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,3]     .    .D===eER  .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    . D==eeeeeeER  .    .    .   st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,5]     .    .    D=====eER .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .D====eeeeeeER .    .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,7]     .    .    .   D=======eER.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    D======eeeeeeER.   st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    . D==========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 1.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     5.0    0.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.2    0.1    0.0       <total>
+
+# CHECK:      [74] Code Region - G75
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2104
+# CHECK-NEXT: Total uOps:        4500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    2.14
+# CHECK-NEXT: IPC:               0.48
+# CHECK-NEXT: Block RThroughput: 21.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234
+
+# CHECK:      [0,0]     DeeeER    .    .    .   .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     . D=eER   .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  DeeeER .    .    .   .   st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    D=eER.    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .DeeeeeeER.    .   .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .   D===eER    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    D==eeeER  .   .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    . D===eER .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .  D==eeeeeeER.   st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .D=====eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
+
+# CHECK:      [75] Code Region - G76
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1804
+# CHECK-NEXT: Total uOps:        2800
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.55
+# CHECK-NEXT: IPC:               0.55
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01
+
+# CHECK:      [0,0]     DeeeeeeER .    .    ..   st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .  D===eER.    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   D==eeeER   .    ..   st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,3]     .    D====eER  .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .D===eeeER.    ..   st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,5]     .    . D=====eER    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .  D====eeeER  ..   st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .   D======eER ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    D=====eeeER.   st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .D=======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     5.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 7.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     6.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 9.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.9    0.1    0.0       <total>
+
+# CHECK:      [76] Code Region - G77
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1504
+# CHECK-NEXT: Total uOps:        2100
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.40
+# CHECK-NEXT: IPC:               0.66
+# CHECK-NEXT: Block RThroughput: 7.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .  .   st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,1]     .D==eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=eeeER .    .  .   st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,3]     .  D===eER.    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==eeeER   .  .   st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    D====eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D===eeeER.  .   st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .    . D=====eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D====eeeER.   st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,9]     .    .   D======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+
+# CHECK:      [77] Code Region - G78
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2104
+# CHECK-NEXT: Total uOps:        3300
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.57
+# CHECK-NEXT: IPC:               0.48
+# CHECK-NEXT: Block RThroughput: 19.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234
+
+# CHECK:      [0,0]     DeeeER    .    .    .   .   st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .D==eER   .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=eeeER .    .    .   .   st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,3]     .  D===eER.    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==eeeER   .    .   .   st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    D====eER  .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D===eeeeeeeeER.   .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,7]     .    .   D========eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    D=======eeeeER.   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    . D=========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 7.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     8.0    0.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 9.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.9    0.1    0.0       <total>
+
+# CHECK:      [78] Code Region - G79
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3204
+# CHECK-NEXT: Total uOps:        5800
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.81
+# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: Block RThroughput: 32.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .    .    .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,1]     . D==eER  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  D=eeeeeeeeER.    .    .    .    .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,3]     .    . D=====eER    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  D====eeeeER .    .    .    .   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    D======eER.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .D=====eeeeeeeeER   .    .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,7]     .    .    .    D=========eER  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .D========eeeeeeeeER.   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .    D============eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 3.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     5.0    0.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 5.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 7.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     9.0    0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 9.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     6.2    0.1    0.0       <total>
+
+# CHECK:      [79] Code Region - G80
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2804
+# CHECK-NEXT: Total uOps:        4800
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.71
+# CHECK-NEXT: IPC:               0.36
+# CHECK-NEXT: Block RThroughput: 28.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    ..   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .  D=====eER   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   D====eeeeER.    .    .    ..   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .D======eER    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    . D=====eeeeER .    .    ..   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .   D=======eER.    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    D======eeeeeeeeER   ..   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .   D==========eER  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    D=========eeeeER.   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    . D===========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     6.0    0.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     7.0    0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     10.0   0.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     7.3    0.1    0.0       <total>
+
+# CHECK:      [80] Code Region - G81
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total uOps:        4000
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 19.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .  .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .   D====eER   .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    D===eeeeeeeeER .    .  .   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   D=======eER.    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    D======eeeER   .  .   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .    .    .D========eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . D=======eeeER.  .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,7]     .    .    .  D=========eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .   D========eeeER.   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    D==========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     4.0    0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     7.0    0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 5.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     8.0    0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 7.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     9.0    0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 9.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     7.2    0.1    0.0       <total>
+
+# CHECK:      [81] Code Region - G82
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1504
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.33
+# CHECK-NEXT: IPC:               0.66
+# CHECK-NEXT: Block RThroughput: 6.7
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .  .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     .D==eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=eeeER .    .  .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,3]     .  D===eER.    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==eeeER   .  .   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,5]     .    D====eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D===eeeER.  .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    . D=====eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D====eeeER.   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+
+# CHECK:      [82] Code Region - G83
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      800
+# CHECK-NEXT: Total Cycles:      1404
+# CHECK-NEXT: Total uOps:        2200
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.57
+# CHECK-NEXT: IPC:               0.57
+# CHECK-NEXT: Block RThroughput: 12.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01234567
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    . .   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,1]     .D===eER  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D==eeeeER    . .   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .  D=====eER   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D====eeeER . .   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,5]     .    D======eER. .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D=====eeeER.   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    . D=======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: 1.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 3.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     5.0    0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 5.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.0    0.1    0.0       <total>
+
+# CHECK:      [83] Code Region - G84
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      403
+# CHECK-NEXT: Total uOps:        900
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    2.23
+# CHECK-NEXT: IPC:               0.99
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     0123456
+
+# CHECK:      [0,0]     DeER ..   stp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,1]     .DeER..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeER   stp	d1, d2, [x27], #496
+# CHECK-NEXT: [0,3]     .  DeER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       stp	s1, s2, [x27], #248
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       stp	d1, d2, [x27], #496
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.5    0.0       <total>
+
+# CHECK:      [84] Code Region - G85
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1207
+# CHECK-NEXT: Total uOps:        2800
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    2.32
+# CHECK-NEXT: IPC:               0.83
+# CHECK-NEXT: Block RThroughput: 12.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    ..   stp	q1, q2, [x27], #992
+# CHECK-NEXT: [0,1]     . DeE-R   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  D=eER  .    ..   stp	s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,3]     .   D=eER .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    D=eeER    ..   stp	d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .    .D=eER    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    . D=eeeeER..   stp	q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,7]     .    .   DeE--R..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    D==eER.   stp	w1, w2, [x27], #248
+# CHECK-NEXT: [0,9]     .    .    .D==eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       stp	q1, q2, [x27], #992
+# CHECK-NEXT: 1.     1     1.0    1.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    2.0    0.0       stp	s1, s2, [x27, #248]!
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       stp	d1, d2, [x27, #496]!
+# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    1.0    0.0       stp	q1, q2, [x27, #992]!
+# CHECK-NEXT: 7.     1     1.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    3.0    0.0       stp	w1, w2, [x27], #248
+# CHECK-NEXT: 9.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.9    0.9    0.3       <total>
+
+# CHECK:      [85] Code Region - G86
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1003
+# CHECK-NEXT: Total uOps:        2200
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    2.19
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    . .   stp	x1, x2, [x27], #496
+# CHECK-NEXT: [0,1]     .DeER.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeER    . .   stp	w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,3]     .  DeER   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeER . .   stp	x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .    DeER . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeER. .   str	b1, [x27], #254
+# CHECK-NEXT: [0,7]     .    . DeER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeER.   str	h1, [x27], #254
+# CHECK-NEXT: [0,9]     .    .   DeER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       stp	x1, x2, [x27], #496
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       stp	w1, w2, [x27, #248]!
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       stp	x1, x2, [x27, #496]!
+# CHECK-NEXT: 5.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       str	b1, [x27], #254
+# CHECK-NEXT: 7.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       str	h1, [x27], #254
+# CHECK-NEXT: 9.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.5    0.0       <total>
+
+# CHECK:      [86] Code Region - G87
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1003
+# CHECK-NEXT: Total uOps:        2100
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    2.09
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeER .    . .   str	s1, [x27], #254
+# CHECK-NEXT: [0,1]     .DeER.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeER    . .   str	d1, [x27], #254
+# CHECK-NEXT: [0,3]     .  DeER   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeER . .   str	q1, [x27], #254
+# CHECK-NEXT: [0,5]     .    DeER . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeER. .   str	b1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .    . DeER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeER.   str	h1, [x27, #254]!
+# CHECK-NEXT: [0,9]     .    .   DeER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       str	s1, [x27], #254
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       str	d1, [x27], #254
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       str	q1, [x27], #254
+# CHECK-NEXT: 5.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       str	b1, [x27, #254]!
+# CHECK-NEXT: 7.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       str	h1, [x27, #254]!
+# CHECK-NEXT: 9.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.5    0.0       <total>
+
+# CHECK:      [87] Code Region - G88
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1003
+# CHECK-NEXT: Total uOps:        2100
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    2.09
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeER .    . .   str	s1, [x27, #254]!
+# CHECK-NEXT: [0,1]     .DeER.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeER    . .   str	d1, [x27, #254]!
+# CHECK-NEXT: [0,3]     .  DeER   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeER . .   str	q1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .    DeER . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeER. .   str	w1, [x27], #254
+# CHECK-NEXT: [0,7]     .    . DeER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeER.   str	x1, [x27], #254
+# CHECK-NEXT: [0,9]     .    .   DeER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       str	s1, [x27, #254]!
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       str	d1, [x27, #254]!
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       str	q1, [x27, #254]!
+# CHECK-NEXT: 5.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       str	w1, [x27], #254
+# CHECK-NEXT: 7.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       str	x1, [x27], #254
+# CHECK-NEXT: 9.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.5    0.0       <total>
+
+# CHECK:      [88] Code Region - G89
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1003
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeER .    . .   str	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     .DeER.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeER    . .   str	x1, [x27, #254]!
+# CHECK-NEXT: [0,3]     .  DeER   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeER  . .   strb	w1, [x27], #254
+# CHECK-NEXT: [0,5]     .    DeER . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeER. .   strb	w1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .    . DeER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeER.   strh	w1, [x27], #254
+# CHECK-NEXT: [0,9]     .    .   DeER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       str	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       str	x1, [x27, #254]!
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       strb	w1, [x27], #254
+# CHECK-NEXT: 5.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       strb	w1, [x27, #254]!
+# CHECK-NEXT: 7.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       strh	w1, [x27], #254
+# CHECK-NEXT: 9.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.5    0.0       <total>
+
+# CHECK:      [89] Code Region - G90
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      203
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    1.97
+# CHECK-NEXT: IPC:               0.99
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     01234
+
+# CHECK:      [0,0]     DeER.   strh	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     .DeER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       strh	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.5    0.0       <total>
+
+# CHECK:      [90] Code Region - G91
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      210
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    2.86
+# CHECK-NEXT: IPC:               1.90
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .   ldr	x1, [x27], #254
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D===eeeeER   ldr	x2, [x1], #254
+# CHECK-NEXT: [0,3]     .DeE------R   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	x1, [x27], #254
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     4.0    0.0    0.0       ldr	x2, [x1], #254
+# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.0    0.3    2.0       <total>

diff  --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-writeback.s
new file mode 100644
index 000000000000000..b4e7a4e40ba7721
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-writeback.s
@@ -0,0 +1,5290 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-n1 --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=1 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN G01
+ld1  { v1.1d }, [x27], #8
+add x0, x27, 1
+ld1  { v1.2d }, [x27], #16
+add x0, x27, 1
+ld1  { v1.2s }, [x27], #8
+add x0, x27, 1
+ld1  { v1.4h }, [x27], #8
+add x0, x27, 1
+ld1  { v1.4s }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G02
+ld1  { v1.8b }, [x27], #8
+add x0, x27, 1
+ld1  { v1.8h }, [x27], #16
+add x0, x27, 1
+ld1  { v1.16b }, [x27], #16
+add x0, x27, 1
+ld1  { v1.1d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G03
+ld1  { v1.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G04
+ld1  { v1.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.1d, v2.1d }, [x27], #16
+add x0, x27, 1
+ld1  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+ld1  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+ld1  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G05
+ld1  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+ld1  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+ld1  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+ld1  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+ld1  { v1.1d, v2.1d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G06
+ld1  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G07
+ld1  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.1d, v2.1d, v3.1d }, [x27], #24
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+ld1  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G08
+ld1  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+ld1  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+ld1  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+ld1  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G09
+ld1  { v1.1d, v2.1d, v3.1d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G10
+ld1  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G11
+ld1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+ld1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+ld1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+ld1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G12
+ld1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+ld1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G13
+ld1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.b }[0], [x27], #1
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G14
+ld1  { v1.b }[8], [x27], #1
+add x0, x27, 1
+ld1  { v1.b }[0], [x27], x28
+add x0, x27, 1
+ld1  { v1.b }[8], [x27], x28
+add x0, x27, 1
+ld1  { v1.h }[0], [x27], #2
+add x0, x27, 1
+ld1  { v1.h }[4], [x27], #2
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G15
+ld1  { v1.h }[0], [x27], x28
+add x0, x27, 1
+ld1  { v1.h }[4], [x27], x28
+add x0, x27, 1
+ld1  { v1.s }[0], [x27], #4
+add x0, x27, 1
+ld1  { v1.s }[0], [x27], x28
+add x0, x27, 1
+ld1  { v1.d }[0], [x27], #8
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G16
+ld1  { v1.d }[0], [x27], x28
+add x0, x27, 1
+ld1r  { v1.1d }, [x27], #8
+add x0, x27, 1
+ld1r  { v1.2d }, [x27], #8
+add x0, x27, 1
+ld1r  { v1.2s }, [x27], #4
+add x0, x27, 1
+ld1r  { v1.4h }, [x27], #2
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G17
+ld1r  { v1.4s }, [x27], #4
+add x0, x27, 1
+ld1r  { v1.8b }, [x27], #1
+add x0, x27, 1
+ld1r  { v1.8h }, [x27], #2
+add x0, x27, 1
+ld1r  { v1.16b }, [x27], #1
+add x0, x27, 1
+ld1r  { v1.1d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G18
+ld1r  { v1.2d }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.2s }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.4h }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.4s }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G19
+ld1r  { v1.8h }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.16b }, [x27], x28
+add x0, x27, 1
+ld2  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+ld2  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+ld2  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G20
+ld2  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+ld2  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+ld2  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+ld2  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+ld2  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G21
+ld2  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+ld2  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+ld2  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+ld2  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+ld2  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G22
+ld2  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+ld2  { v1.b, v2.b }[0], [x27], #2
+add x0, x27, 1
+ld2  { v1.b, v2.b }[8], [x27], #2
+add x0, x27, 1
+ld2  { v1.b, v2.b }[0], [x27], x28
+add x0, x27, 1
+ld2  { v1.b, v2.b }[8], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G23
+ld2  { v1.h, v2.h }[0], [x27], #4
+add x0, x27, 1
+ld2  { v1.h, v2.h }[4], [x27], #4
+add x0, x27, 1
+ld2  { v1.h, v2.h }[0], [x27], x28
+add x0, x27, 1
+ld2  { v1.h, v2.h }[4], [x27], x28
+add x0, x27, 1
+ld2  { v1.s, v2.s }[0], [x27], #8
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G24
+ld2  { v1.s, v2.s }[0], [x27], x28
+add x0, x27, 1
+ld2  { v1.d, v2.d }[0], [x27], #16
+add x0, x27, 1
+ld2  { v1.d, v2.d }[0], [x27], x28
+add x0, x27, 1
+ld2r  { v1.1d, v2.1d }, [x27], #16
+add x0, x27, 1
+ld2r  { v1.2d, v2.2d }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G25
+ld2r  { v1.2s, v2.2s }, [x27], #8
+add x0, x27, 1
+ld2r  { v1.4h, v2.4h }, [x27], #4
+add x0, x27, 1
+ld2r  { v1.4s, v2.4s }, [x27], #8
+add x0, x27, 1
+ld2r  { v1.8b, v2.8b }, [x27], #2
+add x0, x27, 1
+ld2r  { v1.8h, v2.8h }, [x27], #4
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G26
+ld2r  { v1.16b, v2.16b }, [x27], #2
+add x0, x27, 1
+ld2r  { v1.1d, v2.1d }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G27
+ld2r  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+ld3  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G28
+ld3  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+ld3  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+ld3  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+ld3  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+ld3  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G29
+ld3  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+ld3  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+ld3  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+ld3  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+ld3  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G30
+ld3  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+ld3  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+ld3  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+ld3  { v1.b, v2.b, v3.b }[0], [x27], #3
+add x0, x27, 1
+ld3  { v1.b, v2.b, v3.b }[8], [x27], #3
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G31
+ld3  { v1.b, v2.b, v3.b }[0], [x27], x28
+add x0, x27, 1
+ld3  { v1.b, v2.b, v3.b }[8], [x27], x28
+add x0, x27, 1
+ld3  { v1.h, v2.h, v3.h }[0], [x27], #6
+add x0, x27, 1
+ld3  { v1.h, v2.h, v3.h }[4], [x27], #6
+add x0, x27, 1
+ld3  { v1.h, v2.h, v3.h }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G32
+ld3  { v1.h, v2.h, v3.h }[4], [x27], x28
+add x0, x27, 1
+ld3  { v1.s, v2.s, v3.s }[0], [x27], #12
+add x0, x27, 1
+ld3  { v1.s, v2.s, v3.s }[0], [x27], x28
+add x0, x27, 1
+ld3  { v1.d, v2.d, v3.d }[0], [x27], #24
+add x0, x27, 1
+ld3  { v1.d, v2.d, v3.d }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G33
+ld3r  { v1.1d, v2.1d, v3.1d }, [x27], #24
+add x0, x27, 1
+ld3r  { v1.2d, v2.2d, v3.2d }, [x27], #24
+add x0, x27, 1
+ld3r  { v1.2s, v2.2s, v3.2s }, [x27], #12
+add x0, x27, 1
+ld3r  { v1.4h, v2.4h, v3.4h }, [x27], #6
+add x0, x27, 1
+ld3r  { v1.4s, v2.4s, v3.4s }, [x27], #12
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G34
+ld3r  { v1.8b, v2.8b, v3.8b }, [x27], #3
+add x0, x27, 1
+ld3r  { v1.8h, v2.8h, v3.8h }, [x27], #6
+add x0, x27, 1
+ld3r  { v1.16b, v2.16b, v3.16b }, [x27], #3
+add x0, x27, 1
+ld3r  { v1.1d, v2.1d, v3.1d }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G35
+ld3r  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G36
+ld3r  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+ld4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+ld4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+ld4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+ld4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G37
+ld4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+ld4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+ld4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+ld4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+ld4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G38
+ld4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+ld4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+ld4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+ld4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+ld4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G39
+ld4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+add x0, x27, 1
+ld4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+add x0, x27, 1
+ld4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+add x0, x27, 1
+ld4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+add x0, x27, 1
+ld4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G40
+ld4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+add x0, x27, 1
+ld4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+add x0, x27, 1
+ld4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+add x0, x27, 1
+ld4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+add x0, x27, 1
+ld4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G41
+ld4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+add x0, x27, 1
+ld4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+add x0, x27, 1
+ld4r  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+add x0, x27, 1
+ld4r  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+add x0, x27, 1
+ld4r  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G42
+ld4r  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+add x0, x27, 1
+ld4r  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+add x0, x27, 1
+ld4r  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+add x0, x27, 1
+ld4r  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+add x0, x27, 1
+ld4r  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G43
+ld4r  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G44
+ld4r  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+ldp  s1, s2, [x27], #248
+add x0, x27, 1
+ldp  d1, d2, [x27], #496
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G45
+ldp  q1, q2, [x27], #992
+add x0, x27, 1
+ldp  s1, s2, [x27, #248]!
+add x0, x27, 1
+ldp  d1, d2, [x27, #496]!
+add x0, x27, 1
+ldp  q1, q2, [x27, #992]!
+add x0, x27, 1
+ldp  w1, w2, [x27], #248
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G46
+ldp  x1, x2, [x27], #496
+add x0, x27, 1
+ldp  w1, w2, [x27, #248]!
+add x0, x27, 1
+ldp  x1, x2, [x27, #496]!
+add x0, x27, 1
+ldpsw  x1, x2, [x27], #248
+add x0, x27, 1
+ldpsw  x1, x2, [x27, #248]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G47
+ldr  b1, [x27], #254
+add x0, x27, 1
+ldr  h1, [x27], #254
+add x0, x27, 1
+ldr  s1, [x27], #254
+add x0, x27, 1
+ldr  d1, [x27], #254
+add x0, x27, 1
+ldr  q1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G48
+ldr  b1, [x27, #254]!
+add x0, x27, 1
+ldr  h1, [x27, #254]!
+add x0, x27, 1
+ldr  s1, [x27, #254]!
+add x0, x27, 1
+ldr  d1, [x27, #254]!
+add x0, x27, 1
+ldr  q1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G49
+ldr  w1, [x27], #254
+add x0, x27, 1
+ldr  x1, [x27], #254
+add x0, x27, 1
+ldr  w1, [x27, #254]!
+add x0, x27, 1
+ldr  x1, [x27, #254]!
+add x0, x27, 1
+ldrb  w1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G50
+ldrb  w1, [x27, #254]!
+add x0, x27, 1
+ldrh  w1, [x27], #254
+add x0, x27, 1
+ldrh  w1, [x27, #254]!
+add x0, x27, 1
+ldrsb  w1, [x27], #254
+add x0, x27, 1
+ldrsb  x1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G51
+ldrsb  w1, [x27, #254]!
+add x0, x27, 1
+ldrsb  x1, [x27, #254]!
+add x0, x27, 1
+ldrsh  w1, [x27], #254
+add x0, x27, 1
+ldrsh  x1, [x27], #254
+add x0, x27, 1
+ldrsh  w1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G52
+ldrsh  x1, [x27, #254]!
+add x0, x27, 1
+ldrsw  x1, [x27], #254
+add x0, x27, 1
+ldrsw  x1, [x27, #254]!
+add x0, x27, 1
+st1  { v1.1d }, [x27], #8
+add x0, x27, 1
+st1  { v1.2d }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G53
+st1  { v1.2s }, [x27], #8
+add x0, x27, 1
+st1  { v1.4h }, [x27], #8
+add x0, x27, 1
+st1  { v1.4s }, [x27], #16
+add x0, x27, 1
+st1  { v1.8b }, [x27], #8
+add x0, x27, 1
+st1  { v1.8h }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G54
+st1  { v1.16b }, [x27], #16
+add x0, x27, 1
+st1  { v1.1d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2s }, [x27], x28
+add x0, x27, 1
+st1  { v1.4h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G55
+st1  { v1.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b }, [x27], x28
+add x0, x27, 1
+st1  { v1.8h }, [x27], x28
+add x0, x27, 1
+st1  { v1.16b }, [x27], x28
+add x0, x27, 1
+st1  { v1.1d, v2.1d }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G56
+st1  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+st1  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+st1  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+st1  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+st1  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G57
+st1  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+st1  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+st1  { v1.1d, v2.1d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G58
+st1  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+st1  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+st1  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+st1  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G59
+st1  { v1.1d, v2.1d, v3.1d }, [x27], #24
+add x0, x27, 1
+st1  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+st1  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+st1  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G60
+st1  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+st1  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+st1  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+st1  { v1.1d, v2.1d, v3.1d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G61
+st1  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+st1  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+st1  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G62
+st1  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+st1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+add x0, x27, 1
+st1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+st1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G63
+st1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+st1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+st1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+st1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+st1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G64
+st1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+st1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G65
+st1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+st1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+st1  { v1.b }[0], [x27], #1
+add x0, x27, 1
+st1  { v1.b }[8], [x27], #1
+add x0, x27, 1
+st1  { v1.b }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G66
+st1  { v1.b }[8], [x27], x28
+add x0, x27, 1
+st1  { v1.h }[0], [x27], #2
+add x0, x27, 1
+st1  { v1.h }[4], [x27], #2
+add x0, x27, 1
+st1  { v1.h }[0], [x27], x28
+add x0, x27, 1
+st1  { v1.h }[4], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G67
+st1  { v1.s }[0], [x27], #4
+add x0, x27, 1
+st1  { v1.s }[0], [x27], x28
+add x0, x27, 1
+st1  { v1.d }[0], [x27], #8
+add x0, x27, 1
+st1  { v1.d }[0], [x27], x28
+add x0, x27, 1
+st2  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G68
+st2  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+st2  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+st2  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+st2  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+st2  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G69
+st2  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+st2  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+st2  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+st2  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+st2  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G70
+st2  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+st2  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+st2  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+st2  { v1.b, v2.b }[0], [x27], #2
+add x0, x27, 1
+st2  { v1.b, v2.b }[8], [x27], #2
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G71
+st2  { v1.b, v2.b }[0], [x27], x28
+add x0, x27, 1
+st2  { v1.b, v2.b }[8], [x27], x28
+add x0, x27, 1
+st2  { v1.h, v2.h }[0], [x27], #4
+add x0, x27, 1
+st2  { v1.h, v2.h }[4], [x27], #4
+add x0, x27, 1
+st2  { v1.h, v2.h }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G72
+st2  { v1.h, v2.h }[4], [x27], x28
+add x0, x27, 1
+st2  { v1.s, v2.s }[0], [x27], #8
+add x0, x27, 1
+st2  { v1.s, v2.s }[0], [x27], x28
+add x0, x27, 1
+st2  { v1.d, v2.d }[0], [x27], #16
+add x0, x27, 1
+st2  { v1.d, v2.d }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G73
+st3  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+st3  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+st3  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G74
+st3  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+st3  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+st3  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+st3  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+st3  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G75
+st3  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+st3  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+st3  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+st3  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+st3  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G76
+st3  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[0], [x27], #3
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[8], [x27], #3
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[0], [x27], x28
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[8], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G77
+st3  { v1.h, v2.h, v3.h }[0], [x27], #6
+add x0, x27, 1
+st3  { v1.h, v2.h, v3.h }[4], [x27], #6
+add x0, x27, 1
+st3  { v1.h, v2.h, v3.h }[0], [x27], x28
+add x0, x27, 1
+st3  { v1.h, v2.h, v3.h }[4], [x27], x28
+add x0, x27, 1
+st3  { v1.s, v2.s, v3.s }[0], [x27], #12
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G78
+st3  { v1.s, v2.s, v3.s }[0], [x27], x28
+add x0, x27, 1
+st3  { v1.d, v2.d, v3.d }[0], [x27], #24
+add x0, x27, 1
+st3  { v1.d, v2.d, v3.d }[0], [x27], x28
+add x0, x27, 1
+st4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+st4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G79
+st4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+st4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+st4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+st4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+st4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G80
+st4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+st4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+st4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+st4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+st4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G81
+st4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+st4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+st4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+add x0, x27, 1
+st4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+add x0, x27, 1
+st4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G82
+st4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G83
+st4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+add x0, x27, 1
+st4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+add x0, x27, 1
+st4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+add x0, x27, 1
+st4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G84
+stp  s1, s2, [x27], #248
+add x0, x27, 1
+stp  d1, d2, [x27], #496
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G85
+stp  q1, q2, [x27], #992
+add x0, x27, 1
+stp  s1, s2, [x27, #248]!
+add x0, x27, 1
+stp  d1, d2, [x27, #496]!
+add x0, x27, 1
+stp  q1, q2, [x27, #992]!
+add x0, x27, 1
+stp  w1, w2, [x27], #248
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G86
+stp  x1, x2, [x27], #496
+add x0, x27, 1
+stp  w1, w2, [x27, #248]!
+add x0, x27, 1
+stp  x1, x2, [x27, #496]!
+add x0, x27, 1
+str  b1, [x27], #254
+add x0, x27, 1
+str  h1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G87
+str  s1, [x27], #254
+add x0, x27, 1
+str  d1, [x27], #254
+add x0, x27, 1
+str  q1, [x27], #254
+add x0, x27, 1
+str  b1, [x27, #254]!
+add x0, x27, 1
+str  h1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G88
+str  s1, [x27, #254]!
+add x0, x27, 1
+str  d1, [x27, #254]!
+add x0, x27, 1
+str  q1, [x27, #254]!
+add x0, x27, 1
+str  w1, [x27], #254
+add x0, x27, 1
+str  x1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G89
+str  w1, [x27, #254]!
+add x0, x27, 1
+str  x1, [x27, #254]!
+add x0, x27, 1
+strb  w1, [x27], #254
+add x0, x27, 1
+strb  w1, [x27, #254]!
+add x0, x27, 1
+strh  w1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G90
+strh  w1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G91
+ldr  x1, [x27], #254
+add x0, x27, 1
+ldr  x2, [x1], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# CHECK:      [0] Code Region - G01
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   ld1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .  .   ld1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,3]     D==========eER .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==========eeeeeER  .    .  .   ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,5]     .D==============eER .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==============eeeeeER  .  .   ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,7]     .D===================eER .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===================eeeeeER.   ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,9]     .D========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     15.0   0.0    0.0       ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: 7.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     20.0   0.0    0.0       ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: 9.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     13.0   0.1    0.0       <total>
+
+# CHECK:      [1] Code Region - G02
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   ld1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .  .   ld1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,3]     D==========eER .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==========eeeeeER  .    .  .   ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,5]     .D==============eER .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==============eeeeeER  .  .   ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===================eER .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===================eeeeeER.   ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .D========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     15.0   0.0    0.0       ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     20.0   0.0    0.0       ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     13.0   0.1    0.0       <total>
+
+# CHECK:      [2] Code Region - G03
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   ld1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .  .   ld1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     D==========eER .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==========eeeeeER  .    .  .   ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==============eER .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==============eeeeeER  .  .   ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===================eER .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===================eeeeeER.   ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .D========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     15.0   0.0    0.0       ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     20.0   0.0    0.0       ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     13.0   0.1    0.0       <total>
+
+# CHECK:      [3] Code Region - G04
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total uOps:        1900
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.76
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 4.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   ld1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .  .   ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,3]     D==========eER .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=========eeeeeER  .    .  .   ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .D==============eER .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==============eeeeeER  .  .   ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     .D===================eER .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==================eeeeeER.   ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     . D=======================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     10.0   0.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     15.0   0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     19.0   0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     12.7   0.1    0.0       <total>
+
+# CHECK:      [4] Code Region - G05
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.80
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   ld1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .  .   ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     D==========eER .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=========eeeeeER  .    .  .   ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     .D==============eER .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==============eeeeeER  .  .   ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     .D===================eER .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==================eeeeeER.   ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     . D=======================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     10.0   0.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     15.0   0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     19.0   0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     12.7   0.1    0.0       <total>
+
+# CHECK:      [5] Code Region - G06
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.80
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   ld1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .  .   ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     D==========eER .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=========eeeeeER  .    .  .   ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==============eER .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==============eeeeeER  .  .   ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===================eER .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==================eeeeeER.   ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     . D=======================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     10.0   0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     15.0   0.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     19.0   0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     12.7   0.1    0.0       <total>
+
+# CHECK:      [6] Code Region - G07
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2804
+# CHECK-NEXT: Total uOps:        2300
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.82
+# CHECK-NEXT: IPC:               0.36
+# CHECK-NEXT: Block RThroughput: 6.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    ..   ld1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     D=====eER .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .    ..   ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     D==========eER .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=========eeeeeeER .    .    ..   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,5]     .D===============eER.    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D==============eeeeeeER.    ..   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,7]     . D====================eER    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D===================eeeeeeER.   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,9]     .  D=========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     10.0   0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 5.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     15.0   0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 7.     1     21.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     20.0   0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 9.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     13.2   0.1    0.0       <total>
+
+# CHECK:      [7] Code Region - G08
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.83
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=====eeeeeeER.    .    .    .  .   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,3]     .D===========eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==========eeeeeeER    .    .  .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,5]     . D================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D===============eeeeeeER   .  .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,7]     .  D=====================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====================eeeeeeER.   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,9]     .   D==========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     16.0   0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     21.0   0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 9.     1     27.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     14.0   0.1    0.0       <total>
+
+# CHECK:      [8] Code Region - G09
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.83
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=====eeeeeeER.    .    .    .  .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D===========eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==========eeeeeeER    .    .  .   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D===============eeeeeeER   .  .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=====================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====================eeeeeeER.   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D==========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     16.0   0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     21.0   0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     27.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     14.0   0.1    0.0       <total>
+
+# CHECK:      [9] Code Region - G10
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        2700
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.90
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 8.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=====eeeeeeER.    .    .    .  .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D===========eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==========eeeeeeER    .    .  .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     . D================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D===============eeeeeeER   .  .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,7]     .  D=====================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====================eeeeeeER.   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,9]     .   D==========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     16.0   0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     21.0   0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 9.     1     27.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     14.0   0.1    0.0       <total>
+
+# CHECK:      [10] Code Region - G11
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        3000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=====eeeeeeER.    .    .    .  .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,3]     .D===========eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==========eeeeeeER    .    .  .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,5]     . D================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D===============eeeeeeER   .  .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,7]     .  D=====================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====================eeeeeeER.   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,9]     .   D==========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     16.0   0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     21.0   0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 9.     1     27.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     14.0   0.1    0.0       <total>
+
+# CHECK:      [11] Code Region - G12
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        3000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=====eeeeeeER.    .    .    .  .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D===========eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==========eeeeeeER    .    .  .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     . D================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D===============eeeeeeER   .  .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=====================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====================eeeeeeER.   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D==========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     16.0   0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     21.0   0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     27.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     14.0   0.1    0.0       <total>
+
+# CHECK:      [12] Code Region - G13
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3104
+# CHECK-NEXT: Total uOps:        2800
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.90
+# CHECK-NEXT: IPC:               0.32
+# CHECK-NEXT: Block RThroughput: 8.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .   .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=====eeeeeeER.    .    .    .   .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .D===========eER    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==========eeeeeeER    .    .   .   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     . D================eER   .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D===============eeeeeeER   .   .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=====================eER  .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====================eeeeeeeER.   ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,9]     .   D===========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     16.0   0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     21.0   0.0    0.0       ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: 9.     1     28.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     14.1   0.1    0.0       <total>
+
+# CHECK:      [13] Code Region - G14
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.57
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=======eeeeeeeER   .    .    .    .  .   ld1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,3]     D==============eER  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=============eeeeeeeER .    .    .  .   ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,5]     .D====================eER.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D====================eeeeeeeER    .  .   ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .D===========================eER   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==========================eeeeeeeER.   ld1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,9]     . D=================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: 5.     1     21.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     21.0   0.0    0.0       ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: 7.     1     28.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     27.0   0.0    0.0       ld1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: 9.     1     34.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     17.7   0.1    0.0       <total>
+
+# CHECK:      [14] Code Region - G15
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.57
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=======eeeeeeeER   .    .    .    .  .   ld1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,3]     D==============eER  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=============eeeeeeeER .    .    .  .   ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .D====================eER.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D====================eeeeeeeER    .  .   ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .D===========================eER   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==========================eeeeeeeER.   ld1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,9]     . D=================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: 5.     1     21.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     21.0   0.0    0.0       ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: 7.     1     28.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     27.0   0.0    0.0       ld1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: 9.     1     34.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     17.7   0.1    0.0       <total>
+
+# CHECK:      [15] Code Region - G16
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.57
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=======eeeeeeeER   .    .    .    .  .   ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,3]     D==============eER  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=============eeeeeeeER .    .    .  .   ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: [0,5]     .D====================eER.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D====================eeeeeeeER    .  .   ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: [0,7]     .D===========================eER   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==========================eeeeeeeER.   ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: [0,9]     . D=================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: 5.     1     21.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     21.0   0.0    0.0       ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: 7.     1     28.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     27.0   0.0    0.0       ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: 9.     1     34.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     17.7   0.1    0.0       <total>
+
+# CHECK:      [16] Code Region - G17
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.57
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld1r	{ v1.4s }, [x27], #4
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=======eeeeeeeER   .    .    .    .  .   ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: [0,3]     D==============eER  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=============eeeeeeeER .    .    .  .   ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: [0,5]     .D====================eER.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D====================eeeeeeeER    .  .   ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: [0,7]     .D===========================eER   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==========================eeeeeeeER.   ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     . D=================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.4s }, [x27], #4
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: 5.     1     21.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     21.0   0.0    0.0       ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: 7.     1     28.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     27.0   0.0    0.0       ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     34.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     17.7   0.1    0.0       <total>
+
+# CHECK:      [17] Code Region - G18
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.57
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld1r	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=======eeeeeeeER   .    .    .    .  .   ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     D==============eER  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=============eeeeeeeER .    .    .  .   ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .D====================eER.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D====================eeeeeeeER    .  .   ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===========================eER   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==========================eeeeeeeER.   ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     . D=================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     21.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     21.0   0.0    0.0       ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     28.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     27.0   0.0    0.0       ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     34.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     17.7   0.1    0.0       <total>
+
+# CHECK:      [18] Code Region - G19
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total uOps:        2600
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.74
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld1r	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=======eeeeeeeER   .    .    .    .  .   ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     D==============eER  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=============eeeeeeeER .    .    .  .   ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .D====================eER.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D===================eeeeeeeER    .  .   ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     . D==========================eER   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D=========================eeeeeeeER.   ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     .  D================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5.     1     21.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     20.0   0.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7.     1     27.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     26.0   0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9.     1     33.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     17.3   0.1    0.0       <total>
+
+# CHECK:      [19] Code Region - G20
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total uOps:        3000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.86
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [20] Code Region - G21
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total uOps:        3000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.86
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [21] Code Region - G22
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total uOps:        3000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.86
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [22] Code Region - G23
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total uOps:        3000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.86
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [23] Code Region - G24
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total uOps:        3000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.86
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [24] Code Region - G25
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total uOps:        3000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.86
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld2r	{ v1.2s, v2.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], #8
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [25] Code Region - G26
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total uOps:        3000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.86
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld2r	{ v1.16b, v2.16b }, [x27], #2
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], #2
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [26] Code Region - G27
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3604
+# CHECK-NEXT: Total uOps:        3200
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.89
+# CHECK-NEXT: IPC:               0.28
+# CHECK-NEXT: Block RThroughput: 5.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .   .   ld2r	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .   .   ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .   .   ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     . D===================eER.    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .   .   ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=========================eER   .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D========================eeeeeeeeER.   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,9]     .   D================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 9.     1     33.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.6   0.1    0.0       <total>
+
+# CHECK:      [27] Code Region - G28
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        4000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+
+# CHECK:      [28] Code Region - G29
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        4000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+
+# CHECK:      [29] Code Region - G30
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3804
+# CHECK-NEXT: Total uOps:        3800
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.26
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    ..   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    ..   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    ..   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     . D======================eER  .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeER .    ..   ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,7]     .  D============================eER.    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D===========================eeeeeeeER.   ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,9]     .   D==================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 7.     1     29.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     28.0   0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 9.     1     35.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     18.6   0.1    0.0       <total>
+
+# CHECK:      [30] Code Region - G31
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total uOps:        3500
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [31] Code Region - G32
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total uOps:        3500
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [32] Code Region - G33
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total uOps:        3500
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [33] Code Region - G34
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total uOps:        3500
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [34] Code Region - G35
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total uOps:        3500
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [35] Code Region - G36
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4304
+# CHECK-NEXT: Total uOps:        4500
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.05
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 9.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .    .    ..   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D======eeeeeeeeeeER.    .    .    .    .    ..   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,3]     . D===============eER    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .  D==============eeeeeeeeER  .    .    .    ..   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,5]     .   D=====================eER .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    D====================eeeeeeeeER    .    ..   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .D===========================eER   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    . D==========================eeeeeeeeeeER.   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .  D===================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 5.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     21.0   0.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 7.     1     28.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     27.0   0.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 9.     1     36.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     18.1   0.1    0.0       <total>
+
+# CHECK:      [36] Code Region - G37
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4604
+# CHECK-NEXT: Total uOps:        4800
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.04
+# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .    .   .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeeeER    .    .    .    .    .   .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,3]     .  D===============eER   .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==============eeeeeeeeeeER    .    .    .   .   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,5]     .    D=======================eER   .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D======================eeeeeeeeeeER    .   .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . D===============================eER   .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D==============================eeeeeeeeER.   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D=====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     23.0   0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 9.     1     38.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.5   0.1    0.0       <total>
+
+# CHECK:      [37] Code Region - G38
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4604
+# CHECK-NEXT: Total uOps:        4800
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.04
+# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .    .   .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeeeER    .    .    .    .    .   .   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D===============eER   .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==============eeeeeeeeER .    .    .    .   .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    D=====================eER.    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D====================eeeeeeeeeeER .    .   .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . D=============================eER.    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D============================eeeeeeeeeeER.   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D=====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 5.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     21.0   0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 9.     1     38.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     18.7   0.1    0.0       <total>
+
+# CHECK:      [38] Code Region - G39
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        5000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.25
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    D===================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,7]     .    . D=========================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeER.   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .   D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [39] Code Region - G40
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        5000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.25
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .  .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    .  .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,5]     .    D===================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    .  .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .    . D=========================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeER.   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [40] Code Region - G41
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        5000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.25
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .  .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    .  .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,5]     .    D===================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    .  .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: [0,7]     .    . D=========================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeER.   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .   D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [41] Code Region - G42
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        5000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.25
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .  .   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    .  .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: [0,5]     .    D===================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    .  .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: [0,7]     .    . D=========================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeER.   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: [0,9]     .    .   D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [42] Code Region - G43
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        5000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.25
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .  .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    .  .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    D===================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    .  .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . D=========================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeER.   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [43] Code Region - G44
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3404
+# CHECK-NEXT: Total uOps:        3800
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.12
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    . .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    . .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    . .   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    D===================eER  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    D===================eeeeeER   . .   ldp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,7]     .    D========================eER  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .D=======================eeeeeER.   ldp	d1, d2, [x27], #496
+# CHECK-NEXT: [0,9]     .    .D============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     20.0   0.0    0.0       ldp	s1, s2, [x27], #248
+# CHECK-NEXT: 7.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     24.0   0.0    0.0       ldp	d1, d2, [x27], #496
+# CHECK-NEXT: 9.     1     29.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.1   0.1    0.0       <total>
+
+# CHECK:      [44] Code Region - G45
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2506
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.80
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .   ldp	q1, q2, [x27], #992
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=======eeeeeER.    .    .    .   ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,3]     D============eER    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===========eeeeeER.    .    .   ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .D================eER    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D================eeeeeeeER   .   ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,7]     .D=======================eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======================eeeeER   ldp	w1, w2, [x27], #248
+# CHECK-NEXT: [0,9]     . D=======================eE--R   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldp	q1, q2, [x27], #992
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     12.0   0.0    0.0       ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     17.0   0.0    0.0       ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ldp	w1, w2, [x27], #248
+# CHECK-NEXT: 9.     1     24.0   0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     14.7   0.1    0.2       <total>
+
+# CHECK:      [45] Code Region - G46
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1304
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.53
+# CHECK-NEXT: IPC:               0.77
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    ..   ldp	x1, x2, [x27], #496
+# CHECK-NEXT: [0,1]     D=eE--R   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .    ..   ldp	w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,3]     D==eE--R  .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeER .    ..   ldp	x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .D==eE--R .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeER    ..   ldpsw	x1, x2, [x27], #248
+# CHECK-NEXT: [0,7]     .D=======eER   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======eeeeeER.   ldpsw	x1, x2, [x27, #248]!
+# CHECK-NEXT: [0,9]     . D===========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldp	x1, x2, [x27], #496
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldp	w1, w2, [x27, #248]!
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ldp	x1, x2, [x27, #496]!
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ldpsw	x1, x2, [x27], #248
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    0.0    0.0       ldpsw	x1, x2, [x27, #248]!
+# CHECK-NEXT: 9.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.3    0.1    0.6       <total>
+
+# CHECK:      [46] Code Region - G47
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   ldr	b1, [x27], #254
+# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .  .   ldr	h1, [x27], #254
+# CHECK-NEXT: [0,3]     D==========eER .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==========eeeeeER  .    .  .   ldr	s1, [x27], #254
+# CHECK-NEXT: [0,5]     .D==============eER .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==============eeeeeER  .  .   ldr	d1, [x27], #254
+# CHECK-NEXT: [0,7]     .D===================eER .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===================eeeeeER.   ldr	q1, [x27], #254
+# CHECK-NEXT: [0,9]     .D========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	b1, [x27], #254
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ldr	h1, [x27], #254
+# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ldr	s1, [x27], #254
+# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     15.0   0.0    0.0       ldr	d1, [x27], #254
+# CHECK-NEXT: 7.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     20.0   0.0    0.0       ldr	q1, [x27], #254
+# CHECK-NEXT: 9.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     13.0   0.1    0.0       <total>
+
+# CHECK:      [47] Code Region - G48
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   ldr	b1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .  .   ldr	h1, [x27, #254]!
+# CHECK-NEXT: [0,3]     D==========eER .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==========eeeeeER  .    .  .   ldr	s1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .D==============eER .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==============eeeeeER  .  .   ldr	d1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .D===================eER .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===================eeeeeER.   ldr	q1, [x27, #254]!
+# CHECK-NEXT: [0,9]     .D========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	b1, [x27, #254]!
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ldr	h1, [x27, #254]!
+# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ldr	s1, [x27, #254]!
+# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     15.0   0.0    0.0       ldr	d1, [x27, #254]!
+# CHECK-NEXT: 7.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     20.0   0.0    0.0       ldr	q1, [x27, #254]!
+# CHECK-NEXT: 9.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     13.0   0.1    0.0       <total>
+
+# CHECK:      [48] Code Region - G49
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      506
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    2.96
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .   ldr	w1, [x27], #254
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   ldr	x1, [x27], #254
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeER .   ldr	w1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .D==eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeER.   ldr	x1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .D===eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eeeeER   ldrb	w1, [x27], #254
+# CHECK-NEXT: [0,9]     .D====eE--R   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	w1, [x27], #254
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldr	x1, [x27], #254
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ldr	w1, [x27, #254]!
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ldr	x1, [x27, #254]!
+# CHECK-NEXT: 7.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ldrb	w1, [x27], #254
+# CHECK-NEXT: 9.     1     5.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.0    0.1    1.0       <total>
+
+# CHECK:      [49] Code Region - G50
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      506
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    2.96
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .   ldrb	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   ldrh	w1, [x27], #254
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeER .   ldrh	w1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .D==eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeER.   ldrsb	w1, [x27], #254
+# CHECK-NEXT: [0,7]     .D===eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eeeeER   ldrsb	x1, [x27], #254
+# CHECK-NEXT: [0,9]     .D====eE--R   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldrb	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldrh	w1, [x27], #254
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ldrh	w1, [x27, #254]!
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ldrsb	w1, [x27], #254
+# CHECK-NEXT: 7.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ldrsb	x1, [x27], #254
+# CHECK-NEXT: 9.     1     5.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.0    0.1    1.0       <total>
+
+# CHECK:      [50] Code Region - G51
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      506
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    2.96
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .   ldrsb	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   ldrsb	x1, [x27, #254]!
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeER .   ldrsh	w1, [x27], #254
+# CHECK-NEXT: [0,5]     .D==eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeER.   ldrsh	x1, [x27], #254
+# CHECK-NEXT: [0,7]     .D===eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eeeeER   ldrsh	w1, [x27, #254]!
+# CHECK-NEXT: [0,9]     .D====eE--R   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldrsb	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldrsb	x1, [x27, #254]!
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ldrsh	w1, [x27], #254
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ldrsh	x1, [x27], #254
+# CHECK-NEXT: 7.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ldrsh	w1, [x27, #254]!
+# CHECK-NEXT: 9.     1     5.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.0    0.1    1.0       <total>
+
+# CHECK:      [51] Code Region - G52
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      704
+# CHECK-NEXT: Total uOps:        1700
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    2.41
+# CHECK-NEXT: IPC:               1.42
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .   ldrsh	x1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   ldrsw	x1, [x27], #254
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeER .   ldrsw	x1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .D==eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeE-R .   st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,7]     .D====eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D====eeER.   st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     . D=====eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldrsh	x1, [x27, #254]!
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldrsw	x1, [x27], #254
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ldrsw	x1, [x27, #254]!
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    1.0       st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 7.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: 9.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.3    0.1    0.7       <total>
+
+# CHECK:      [52] Code Region - G53
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===eeER .  .   st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=====eeER  .   st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======eeER.   st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,9]     . D========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: 9.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.2    0.1    0.0       <total>
+
+# CHECK:      [53] Code Region - G54
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===eeER .  .   st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=====eeER  .   st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======eeER.   st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     . D========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.2    0.1    0.0       <total>
+
+# CHECK:      [54] Code Region - G55
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        2100
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    2.09
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===eeER .  .   st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=====eeER  .   st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======eeER.   st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,9]     . D========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 9.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.2    0.1    0.0       <total>
+
+# CHECK:      [55] Code Region - G56
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        2700
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    2.69
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=eeER   .  .   st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,3]     .D===eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeER .  .   st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,5]     . D====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D===eeER  .   st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,7]     .  D=====eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====eeER.   st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,9]     .   D======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+
+# CHECK:      [56] Code Region - G57
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        2800
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    2.79
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=eeER   .  .   st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,3]     .D===eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeER .  .   st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,5]     . D====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D===eeER  .   st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=====eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====eeER.   st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+
+# CHECK:      [57] Code Region - G58
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        2800
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    2.79
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=eeER   .  .   st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .D===eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeER .  .   st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     . D====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D===eeER  .   st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=====eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====eeER.   st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+
+# CHECK:      [58] Code Region - G59
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1704
+# CHECK-NEXT: Total uOps:        3700
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    2.17
+# CHECK-NEXT: IPC:               0.59
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeeER    .    .    .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     D===eER   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D==eeeeER.    .    .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,3]     .D======eER    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=====eeeER  .    .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,5]     . D========eER .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=======eeeER    .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,7]     .  D==========eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=========eeeeER.   st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,9]     .   D=============eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 1.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 3.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     6.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 5.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     8.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 7.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     10.0   0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 9.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     7.3    0.1    0.0       <total>
+
+# CHECK:      [59] Code Region - G60
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1804
+# CHECK-NEXT: Total uOps:        3800
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    2.11
+# CHECK-NEXT: IPC:               0.55
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01
+
+# CHECK:      [0,0]     DeeeER    .    .    ..   st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,1]     D===eER   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D==eeeeER.    .    ..   st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,3]     .D======eER    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=====eeeeER .    ..   st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,5]     . D=========eER.    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D========eeeER   ..   st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D===========eER  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==========eeeeER.   st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D==============eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 1.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 3.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     6.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 5.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     9.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     11.0   0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     7.8    0.1    0.0       <total>
+
+# CHECK:      [60] Code Region - G61
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1704
+# CHECK-NEXT: Total uOps:        3700
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    2.17
+# CHECK-NEXT: IPC:               0.59
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeeER    .    .    .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D===eER   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D==eeeER .    .    .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=====eER.    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D====eeeeER  .    .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D========eER .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=======eeeER    .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D==========eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=========eeeeER.   st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D=============eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     5.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     8.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     10.0   0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     7.1    0.1    0.0       <total>
+
+# CHECK:      [61] Code Region - G62
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1504
+# CHECK-NEXT: Total uOps:        3600
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    2.39
+# CHECK-NEXT: IPC:               0.66
+# CHECK-NEXT: Block RThroughput: 6.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    .  .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D====eER  .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D===eeER .    .  .   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,3]     .D=====eER.    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D====eeeeeER .  .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,5]     .  D========eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D========eeER  .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,7]     .  D==========eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=========eeER.   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,9]     .   D===========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     4.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 3.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     5.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 5.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     9.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 7.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     10.0   0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 9.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     7.2    0.1    0.0       <total>
+
+# CHECK:      [62] Code Region - G63
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1904
+# CHECK-NEXT: Total uOps:        4200
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    2.21
+# CHECK-NEXT: IPC:               0.53
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012
+
+# CHECK:      [0,0]     DeeeeeER  .    .    . .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,1]     .D====eER .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D====eeER.    .    . .   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,3]     .D======eER    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=====eeeeeER.    . .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,5]     .  D=========eER    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .   D========eeeeeER. .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,7]     .    D============eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    D============eeER.   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    D==============eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 3.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     6.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 5.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     9.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 7.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     13.0   0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     8.4    0.1    0.0       <total>
+
+# CHECK:      [63] Code Region - G64
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1604
+# CHECK-NEXT: Total uOps:        3800
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    2.37
+# CHECK-NEXT: IPC:               0.62
+# CHECK-NEXT: Block RThroughput: 7.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .   .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .D====eER .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D====eeER.    .   .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .D======eER    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=====eeER   .   .   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     . D=======eER  .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D======eeeeeER  .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .   D==========eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==========eeER.   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D============eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     6.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     7.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     11.0   0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     7.4    0.1    0.0       <total>
+
+# CHECK:      [64] Code Region - G65
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2204
+# CHECK-NEXT: Total uOps:        3200
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.45
+# CHECK-NEXT: IPC:               0.45
+# CHECK-NEXT: Block RThroughput: 5.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .D====eER .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D===eeeeeER  .    .    .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D=======eER .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .  D=======eeeeER   .    .   st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,5]     .  D===========eER  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .   D==========eeeeER    .   st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,7]     .   D==============eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==============eeeeER.   st1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .   D==================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     4.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     11.0   0.0    0.0       st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: 7.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: 9.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     9.8    0.1    0.0       <total>
+
+# CHECK:      [65] Code Region - G66
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeER   .    .    .  .   st1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=======eeeeER.    .  .   st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,5]     .D===========eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D===========eeeeER .  .   st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============eeeeER.   st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     . D==================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: 9.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     10.2   0.1    0.0       <total>
+
+# CHECK:      [66] Code Region - G67
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2104
+# CHECK-NEXT: Total uOps:        2200
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.05
+# CHECK-NEXT: IPC:               0.48
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234
+
+# CHECK:      [0,0]     DeeeeER   .    .    .   .   st1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,1]     D====eER  .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D====eeeeER    .    .   .   st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     D========eER   .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=======eeeeER.    .   .   st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,5]     .D===========eER    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D===========eeeeER .   .   st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .D===============eER.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============eeeeeER.   st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,9]     . D===================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 9.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     10.3   0.1    0.0       <total>
+
+# CHECK:      [67] Code Region - G68
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2204
+# CHECK-NEXT: Total uOps:        2400
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.09
+# CHECK-NEXT: IPC:               0.45
+# CHECK-NEXT: Block RThroughput: 3.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .   st2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,1]     D====eER  .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D====eeeeER    .    .    .   st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,3]     D========eER   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=======eeeeeER    .    .   st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,5]     .D============eER   .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D===========eeeeER.    .   st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,7]     . D===============eER    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D==============eeeeeER.   st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,9]     .  D===================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 5.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 9.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     10.4   0.1    0.0       <total>
+
+# CHECK:      [68] Code Region - G69
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2304
+# CHECK-NEXT: Total uOps:        2600
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.13
+# CHECK-NEXT: IPC:               0.43
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    ..   st2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,1]     D=====eER .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D====eeeeeER  .    .    ..   st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=========eER .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D========eeeeER   .    ..   st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D============eER  .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D============eeeeER    ..   st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     . D================eER   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D===============eeeeeER.   st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .  D====================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     13.0   0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     16.0   0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     21.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     11.1   0.1    0.0       <total>
+
+# CHECK:      [69] Code Region - G70
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2204
+# CHECK-NEXT: Total uOps:        2400
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.09
+# CHECK-NEXT: IPC:               0.45
+# CHECK-NEXT: Block RThroughput: 3.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .   st2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     D====eER  .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D===eeeeeER   .    .    .   st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D========eER  .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=======eeeeeER   .    .   st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     . D============eER  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D===========eeeeER    .   st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .  D===============eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D===============eeeeER.   st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,9]     .  D===================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     4.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     16.0   0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 9.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     10.4   0.1    0.0       <total>
+
+# CHECK:      [70] Code Region - G71
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeER   .    .    .  .   st2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=======eeeeER.    .  .   st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .D===========eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D===========eeeeER .  .   st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============eeeeER.   st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     . D==================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 9.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     10.2   0.1    0.0       <total>
+
+# CHECK:      [71] Code Region - G72
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeER   .    .    .  .   st2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=======eeeeER.    .  .   st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .D===========eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D===========eeeeER .  .   st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============eeeeER.   st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     . D==================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 9.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     10.2   0.1    0.0       <total>
+
+# CHECK:      [72] Code Region - G73
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      600
+# CHECK-NEXT: Total Cycles:      1604
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.25
+# CHECK-NEXT: IPC:               0.37
+# CHECK-NEXT: Block RThroughput: 3.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .   .   st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,1]     D======eER.    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=====eeeeeER .   .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,3]     .D==========eER.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=========eeeeeER.   st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,5]     . D==============eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     10.0   0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     8.3    0.2    0.0       <total>
+
+# CHECK:      [73] Code Region - G74
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2904
+# CHECK-NEXT: Total uOps:        3800
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.31
+# CHECK-NEXT: IPC:               0.34
+# CHECK-NEXT: Block RThroughput: 7.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    . .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=====eeeeeER .    .    .    . .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,3]     .D==========eER.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=========eeeeeeER.    .    . .   st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,5]     . D===============eER    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==============eeeeeeER    . .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,7]     .  D====================eER   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D===================eeeeeeER.   st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D=========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     10.0   0.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 5.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     15.0   0.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 7.     1     21.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     20.0   0.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     13.3   0.1    0.0       <total>
+
+# CHECK:      [74] Code Region - G75
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2704
+# CHECK-NEXT: Total uOps:        3400
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.26
+# CHECK-NEXT: IPC:               0.37
+# CHECK-NEXT: Block RThroughput: 6.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=====eER .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D====eeeeeER  .    .    .    .   st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=========eER .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D========eeeeeeER .    .    .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D==============eER.    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=============eeeeeER .    .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D==================eER.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=================eeeeeeER.   st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D=======================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     14.0   0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     18.0   0.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     12.1   0.1    0.0       <total>
+
+# CHECK:      [75] Code Region - G76
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2204
+# CHECK-NEXT: Total uOps:        4000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.81
+# CHECK-NEXT: IPC:               0.45
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=====eeeeER  .    .    .   st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,3]     .D=========eER .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D========eeeeER   .    .   st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,5]     . D============eER  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D===========eeeeER    .   st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .  D===============eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==============eeeeER.   st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     .   D==================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 5.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 9.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     10.8   0.1    0.0       <total>
+
+# CHECK:      [76] Code Region - G77
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total uOps:        4000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    2.00
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeER   .    .    .  .   st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D===eeeeER    .    .  .   st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,3]     .D=======eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D======eeeeER.    .  .   st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     . D==========eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=========eeeeER .  .   st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .  D=============eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============eeeeER.   st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,9]     .   D================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     4.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 3.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     7.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 5.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     10.0   0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 7.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     13.0   0.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 9.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     9.0    0.1    0.0       <total>
+
+# CHECK:      [77] Code Region - G78
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2704
+# CHECK-NEXT: Total uOps:        4200
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.55
+# CHECK-NEXT: IPC:               0.37
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D====eER  .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D===eeeeeER   .    .    .    .   st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,3]     .D========eER  .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=======eeeeeER   .    .    .   st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     . D============eER  .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D===========eeeeeeER  .    .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,7]     .   D================eER .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    D===============eeeeeeeER.   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,9]     .    D======================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     4.0    0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 5.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 7.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     16.0   0.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 9.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     10.8   0.1    0.0       <total>
+
+# CHECK:      [78] Code Region - G79
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4104
+# CHECK-NEXT: Total uOps:        5800
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.41
+# CHECK-NEXT: IPC:               0.24
+# CHECK-NEXT: Block RThroughput: 12.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01234
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .    .   .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D======eeeeeeeeeER .    .    .    .    .   .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,3]     . D==============eER.    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .  D=============eeeeeeeER    .    .    .   .   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,5]     .  D====================eER   .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .   D===================eeeeeeeeeER.    .   .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,7]     .    D===========================eER    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .D==========================eeeeeeeeeER.   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,9]     .    . D==================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     14.0   0.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 5.     1     21.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     20.0   0.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 7.     1     28.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     27.0   0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 9.     1     35.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     17.6   0.1    0.0       <total>
+
+# CHECK:      [79] Code Region - G80
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3604
+# CHECK-NEXT: Total uOps:        4800
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.33
+# CHECK-NEXT: IPC:               0.28
+# CHECK-NEXT: Block RThroughput: 9.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .    .   .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .D=====eER.    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D====eeeeeeeER    .    .    .    .   .   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     . D===========eER   .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .  D==========eeeeeeeER  .    .    .   .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .  D=================eER .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .   D================eeeeeeeeeER   .   .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    D========================eER  .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .D=======================eeeeeeeER.   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .D==============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     17.0   0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     24.0   0.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     31.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.0   0.1    0.0       <total>
+
+# CHECK:      [80] Code Region - G81
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3304
+# CHECK-NEXT: Total uOps:        5200
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.57
+# CHECK-NEXT: IPC:               0.30
+# CHECK-NEXT: Block RThroughput: 10.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    ..   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .D========eER  .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=======eeeeeeeeeER    .    .    ..   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D===============eER   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==============eeeeeER    .    ..   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .   D===================eER   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    D==================eeeeeER    ..   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,7]     .    D=======================eER   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .D======================eeeeeER.   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .D===========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     23.0   0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 9.     1     28.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.3   0.1    0.0       <total>
+
+# CHECK:      [81] Code Region - G82
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total uOps:        4000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.60
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D====eeeeeER  .    .    .  .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,3]     .D=========eER .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D========eeeeeER  .    .  .   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,5]     . D=============eER .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D============eeeeeER  .  .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .  D=================eER .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D================eeeeeER.   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     .   D=====================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 5.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     13.0   0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 7.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     17.0   0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 9.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     11.5   0.1    0.0       <total>
+
+# CHECK:      [82] Code Region - G83
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      800
+# CHECK-NEXT: Total Cycles:      1804
+# CHECK-NEXT: Total uOps:        3200
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    1.77
+# CHECK-NEXT: IPC:               0.44
+# CHECK-NEXT: Block RThroughput: 6.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01
+
+# CHECK:      [0,0]     DeeeeeER  .    .    ..   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,1]     D=====eER .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D====eeeeeER  .    ..   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .D=========eER .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D========eeeeER   ..   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,5]     . D============eER  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D===========eeeeER.   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .  D===============eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 5.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     9.0    0.1    0.0       <total>
+
+# CHECK:      [83] Code Region - G84
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      404
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    2.48
+# CHECK-NEXT: IPC:               0.99
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     01234567
+
+# CHECK:      [0,0]     DeeER. .   stp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,1]     D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=eeER.   stp	d1, d2, [x27], #496
+# CHECK-NEXT: [0,3]     .D===eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       stp	s1, s2, [x27], #248
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       stp	d1, d2, [x27], #496
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.5    0.3    0.0       <total>
+
+# CHECK:      [84] Code Region - G85
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1104
+# CHECK-NEXT: Total uOps:        3100
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    2.81
+# CHECK-NEXT: IPC:               0.91
+# CHECK-NEXT: Block RThroughput: 6.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeER    .   .   stp	q1, q2, [x27], #992
+# CHECK-NEXT: [0,1]     D===eER   .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D==eeER  .   .   stp	s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,3]     .D====eER .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D===eeER.   .   stp	d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,5]     . D=====eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D====eeeER .   stp	q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,7]     .  D=======eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D======eER.   stp	w1, w2, [x27], #248
+# CHECK-NEXT: [0,9]     .   D=======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       stp	q1, q2, [x27], #992
+# CHECK-NEXT: 1.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       stp	s1, s2, [x27, #248]!
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       stp	d1, d2, [x27, #496]!
+# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     5.0    0.0    0.0       stp	q1, q2, [x27, #992]!
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    0.0    0.0       stp	w1, w2, [x27], #248
+# CHECK-NEXT: 9.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.1    0.1    0.0       <total>
+
+# CHECK:      [85] Code Region - G86
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      704
+# CHECK-NEXT: Total uOps:        2300
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    3.27
+# CHECK-NEXT: IPC:               1.42
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeER .    .   stp	x1, x2, [x27], #496
+# CHECK-NEXT: [0,1]     D=eER.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeER.    .   stp	w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,3]     .D=eER    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeER    .   stp	x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,5]     . D=eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D=eeER  .   str	b1, [x27], #254
+# CHECK-NEXT: [0,7]     .  D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D==eeER.   str	h1, [x27], #254
+# CHECK-NEXT: [0,9]     .  D====eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       stp	x1, x2, [x27], #496
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       stp	w1, w2, [x27, #248]!
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       stp	x1, x2, [x27, #496]!
+# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       str	b1, [x27], #254
+# CHECK-NEXT: 7.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       str	h1, [x27], #254
+# CHECK-NEXT: 9.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.2    0.1    0.0       <total>
+
+# CHECK:      [86] Code Region - G87
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        2200
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    2.19
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   str	s1, [x27], #254
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   .  .   str	d1, [x27], #254
+# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===eeER .  .   str	q1, [x27], #254
+# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D====eeER  .   str	b1, [x27, #254]!
+# CHECK-NEXT: [0,7]     . D======eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======eeER.   str	h1, [x27, #254]!
+# CHECK-NEXT: [0,9]     . D========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       str	s1, [x27], #254
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       str	d1, [x27], #254
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       str	q1, [x27], #254
+# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     5.0    0.0    0.0       str	b1, [x27, #254]!
+# CHECK-NEXT: 7.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    0.0    0.0       str	h1, [x27, #254]!
+# CHECK-NEXT: 9.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.0    0.1    0.0       <total>
+
+# CHECK:      [87] Code Region - G88
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      804
+# CHECK-NEXT: Total uOps:        2200
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    2.74
+# CHECK-NEXT: IPC:               1.24
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    ..   str	s1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D==eER    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   ..   str	d1, [x27, #254]!
+# CHECK-NEXT: [0,3]     D====eER  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===eeER ..   str	q1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .D=====eER..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D====eER..   str	w1, [x27], #254
+# CHECK-NEXT: [0,7]     . D=====eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D=====eER.   str	x1, [x27], #254
+# CHECK-NEXT: [0,9]     . D======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       str	s1, [x27, #254]!
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       str	d1, [x27, #254]!
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       str	q1, [x27, #254]!
+# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     5.0    0.0    0.0       str	w1, [x27], #254
+# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     6.0    0.0    0.0       str	x1, [x27], #254
+# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.6    0.1    0.0       <total>
+
+# CHECK:      [88] Code Region - G89
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      504
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    3.97
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeER .  .   str	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eER.  .   str	x1, [x27, #254]!
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eER  .   strb	w1, [x27], #254
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eER .   strb	w1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eER.   strh	w1, [x27], #254
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       str	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       str	x1, [x27, #254]!
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       strb	w1, [x27], #254
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       strb	w1, [x27, #254]!
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       strh	w1, [x27], #254
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
+
+# CHECK:      [89] Code Region - G90
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      104
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    3.85
+# CHECK-NEXT: IPC:               1.92
+# CHECK-NEXT: Block RThroughput: 0.7
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     01234
+
+# CHECK:      [0,0]     DeER.   strh	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       strh	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.5    0.5    0.0       <total>
+
+# CHECK:      [90] Code Region - G91
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      142
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    8
+# CHECK-NEXT: uOps Per Cycle:    4.23
+# CHECK-NEXT: IPC:               2.82
+# CHECK-NEXT: Block RThroughput: 1.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .   ldr	x1, [x27], #254
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D====eeeeER   ldr	x2, [x1], #254
+# CHECK-NEXT: [0,3]     D=eE------R   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	x1, [x27], #254
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ldr	x2, [x1], #254
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.5    0.3    2.0       <total>

diff  --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s
new file mode 100644
index 000000000000000..7ee97a088ecba18
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s
@@ -0,0 +1,5347 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-n2 --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=1 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN G01
+ld1  { v1.1d }, [x27], #8
+add x0, x27, 1
+ld1  { v1.2d }, [x27], #16
+add x0, x27, 1
+ld1  { v1.2s }, [x27], #8
+add x0, x27, 1
+ld1  { v1.4h }, [x27], #8
+add x0, x27, 1
+ld1  { v1.4s }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G02
+ld1  { v1.8b }, [x27], #8
+add x0, x27, 1
+ld1  { v1.8h }, [x27], #16
+add x0, x27, 1
+ld1  { v1.16b }, [x27], #16
+add x0, x27, 1
+ld1  { v1.1d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G03
+ld1  { v1.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G04
+ld1  { v1.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.1d, v2.1d }, [x27], #16
+add x0, x27, 1
+ld1  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+ld1  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+ld1  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G05
+ld1  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+ld1  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+ld1  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+ld1  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+ld1  { v1.1d, v2.1d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G06
+ld1  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G07
+ld1  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.1d, v2.1d, v3.1d }, [x27], #24
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+ld1  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G08
+ld1  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+ld1  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+ld1  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+ld1  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G09
+ld1  { v1.1d, v2.1d, v3.1d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G10
+ld1  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G11
+ld1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+ld1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+ld1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+ld1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G12
+ld1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+ld1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G13
+ld1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.b }[0], [x27], #1
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G14
+ld1  { v1.b }[8], [x27], #1
+add x0, x27, 1
+ld1  { v1.b }[0], [x27], x28
+add x0, x27, 1
+ld1  { v1.b }[8], [x27], x28
+add x0, x27, 1
+ld1  { v1.h }[0], [x27], #2
+add x0, x27, 1
+ld1  { v1.h }[4], [x27], #2
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G15
+ld1  { v1.h }[0], [x27], x28
+add x0, x27, 1
+ld1  { v1.h }[4], [x27], x28
+add x0, x27, 1
+ld1  { v1.s }[0], [x27], #4
+add x0, x27, 1
+ld1  { v1.s }[0], [x27], x28
+add x0, x27, 1
+ld1  { v1.d }[0], [x27], #8
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G16
+ld1  { v1.d }[0], [x27], x28
+add x0, x27, 1
+ld1r  { v1.1d }, [x27], #8
+add x0, x27, 1
+ld1r  { v1.2d }, [x27], #8
+add x0, x27, 1
+ld1r  { v1.2s }, [x27], #4
+add x0, x27, 1
+ld1r  { v1.4h }, [x27], #2
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G17
+ld1r  { v1.4s }, [x27], #4
+add x0, x27, 1
+ld1r  { v1.8b }, [x27], #1
+add x0, x27, 1
+ld1r  { v1.8h }, [x27], #2
+add x0, x27, 1
+ld1r  { v1.16b }, [x27], #1
+add x0, x27, 1
+ld1r  { v1.1d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G18
+ld1r  { v1.2d }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.2s }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.4h }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.4s }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G19
+ld1r  { v1.8h }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.16b }, [x27], x28
+add x0, x27, 1
+ld2  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+ld2  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+ld2  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G20
+ld2  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+ld2  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+ld2  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+ld2  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+ld2  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G21
+ld2  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+ld2  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+ld2  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+ld2  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+ld2  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G22
+ld2  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+ld2  { v1.b, v2.b }[0], [x27], #2
+add x0, x27, 1
+ld2  { v1.b, v2.b }[8], [x27], #2
+add x0, x27, 1
+ld2  { v1.b, v2.b }[0], [x27], x28
+add x0, x27, 1
+ld2  { v1.b, v2.b }[8], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G23
+ld2  { v1.h, v2.h }[0], [x27], #4
+add x0, x27, 1
+ld2  { v1.h, v2.h }[4], [x27], #4
+add x0, x27, 1
+ld2  { v1.h, v2.h }[0], [x27], x28
+add x0, x27, 1
+ld2  { v1.h, v2.h }[4], [x27], x28
+add x0, x27, 1
+ld2  { v1.s, v2.s }[0], [x27], #8
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G24
+ld2  { v1.s, v2.s }[0], [x27], x28
+add x0, x27, 1
+ld2  { v1.d, v2.d }[0], [x27], #16
+add x0, x27, 1
+ld2  { v1.d, v2.d }[0], [x27], x28
+add x0, x27, 1
+ld2r  { v1.1d, v2.1d }, [x27], #16
+add x0, x27, 1
+ld2r  { v1.2d, v2.2d }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G25
+ld2r  { v1.2s, v2.2s }, [x27], #8
+add x0, x27, 1
+ld2r  { v1.4h, v2.4h }, [x27], #4
+add x0, x27, 1
+ld2r  { v1.4s, v2.4s }, [x27], #8
+add x0, x27, 1
+ld2r  { v1.8b, v2.8b }, [x27], #2
+add x0, x27, 1
+ld2r  { v1.8h, v2.8h }, [x27], #4
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G26
+ld2r  { v1.16b, v2.16b }, [x27], #2
+add x0, x27, 1
+ld2r  { v1.1d, v2.1d }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G27
+ld2r  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+ld3  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G28
+ld3  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+ld3  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+ld3  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+ld3  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+ld3  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G29
+ld3  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+ld3  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+ld3  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+ld3  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+ld3  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G30
+ld3  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+ld3  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+ld3  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+ld3  { v1.b, v2.b, v3.b }[0], [x27], #3
+add x0, x27, 1
+ld3  { v1.b, v2.b, v3.b }[8], [x27], #3
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G31
+ld3  { v1.b, v2.b, v3.b }[0], [x27], x28
+add x0, x27, 1
+ld3  { v1.b, v2.b, v3.b }[8], [x27], x28
+add x0, x27, 1
+ld3  { v1.h, v2.h, v3.h }[0], [x27], #6
+add x0, x27, 1
+ld3  { v1.h, v2.h, v3.h }[4], [x27], #6
+add x0, x27, 1
+ld3  { v1.h, v2.h, v3.h }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G32
+ld3  { v1.h, v2.h, v3.h }[4], [x27], x28
+add x0, x27, 1
+ld3  { v1.s, v2.s, v3.s }[0], [x27], #12
+add x0, x27, 1
+ld3  { v1.s, v2.s, v3.s }[0], [x27], x28
+add x0, x27, 1
+ld3  { v1.d, v2.d, v3.d }[0], [x27], #24
+add x0, x27, 1
+ld3  { v1.d, v2.d, v3.d }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G33
+ld3r  { v1.1d, v2.1d, v3.1d }, [x27], #24
+add x0, x27, 1
+ld3r  { v1.2d, v2.2d, v3.2d }, [x27], #24
+add x0, x27, 1
+ld3r  { v1.2s, v2.2s, v3.2s }, [x27], #12
+add x0, x27, 1
+ld3r  { v1.4h, v2.4h, v3.4h }, [x27], #6
+add x0, x27, 1
+ld3r  { v1.4s, v2.4s, v3.4s }, [x27], #12
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G34
+ld3r  { v1.8b, v2.8b, v3.8b }, [x27], #3
+add x0, x27, 1
+ld3r  { v1.8h, v2.8h, v3.8h }, [x27], #6
+add x0, x27, 1
+ld3r  { v1.16b, v2.16b, v3.16b }, [x27], #3
+add x0, x27, 1
+ld3r  { v1.1d, v2.1d, v3.1d }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G35
+ld3r  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G36
+ld3r  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+ld4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+ld4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+ld4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+ld4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G37
+ld4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+ld4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+ld4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+ld4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+ld4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G38
+ld4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+ld4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+ld4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+ld4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+ld4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G39
+ld4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+add x0, x27, 1
+ld4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+add x0, x27, 1
+ld4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+add x0, x27, 1
+ld4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+add x0, x27, 1
+ld4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G40
+ld4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+add x0, x27, 1
+ld4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+add x0, x27, 1
+ld4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+add x0, x27, 1
+ld4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+add x0, x27, 1
+ld4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G41
+ld4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+add x0, x27, 1
+ld4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+add x0, x27, 1
+ld4r  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+add x0, x27, 1
+ld4r  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+add x0, x27, 1
+ld4r  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G42
+ld4r  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+add x0, x27, 1
+ld4r  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+add x0, x27, 1
+ld4r  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+add x0, x27, 1
+ld4r  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+add x0, x27, 1
+ld4r  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G43
+ld4r  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G44
+ld4r  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+ldp  s1, s2, [x27], #248
+add x0, x27, 1
+ldp  d1, d2, [x27], #496
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G45
+ldp  q1, q2, [x27], #992
+add x0, x27, 1
+ldp  s1, s2, [x27, #248]!
+add x0, x27, 1
+ldp  d1, d2, [x27, #496]!
+add x0, x27, 1
+ldp  q1, q2, [x27, #992]!
+add x0, x27, 1
+ldp  w1, w2, [x27], #248
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G46
+ldp  x1, x2, [x27], #496
+add x0, x27, 1
+ldp  w1, w2, [x27, #248]!
+add x0, x27, 1
+ldp  x1, x2, [x27, #496]!
+add x0, x27, 1
+ldpsw  x1, x2, [x27], #248
+add x0, x27, 1
+ldpsw  x1, x2, [x27, #248]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G47
+ldr  b1, [x27], #254
+add x0, x27, 1
+ldr  h1, [x27], #254
+add x0, x27, 1
+ldr  s1, [x27], #254
+add x0, x27, 1
+ldr  d1, [x27], #254
+add x0, x27, 1
+ldr  q1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G48
+ldr  b1, [x27, #254]!
+add x0, x27, 1
+ldr  h1, [x27, #254]!
+add x0, x27, 1
+ldr  s1, [x27, #254]!
+add x0, x27, 1
+ldr  d1, [x27, #254]!
+add x0, x27, 1
+ldr  q1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G49
+ldr  w1, [x27], #254
+add x0, x27, 1
+ldr  x1, [x27], #254
+add x0, x27, 1
+ldr  w1, [x27, #254]!
+add x0, x27, 1
+ldr  x1, [x27, #254]!
+add x0, x27, 1
+ldrb  w1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G50
+ldrb  w1, [x27, #254]!
+add x0, x27, 1
+ldrh  w1, [x27], #254
+add x0, x27, 1
+ldrh  w1, [x27, #254]!
+add x0, x27, 1
+ldrsb  w1, [x27], #254
+add x0, x27, 1
+ldrsb  x1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G51
+ldrsb  w1, [x27, #254]!
+add x0, x27, 1
+ldrsb  x1, [x27, #254]!
+add x0, x27, 1
+ldrsh  w1, [x27], #254
+add x0, x27, 1
+ldrsh  x1, [x27], #254
+add x0, x27, 1
+ldrsh  w1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G52
+ldrsh  x1, [x27, #254]!
+add x0, x27, 1
+ldrsw  x1, [x27], #254
+add x0, x27, 1
+ldrsw  x1, [x27, #254]!
+add x0, x27, 1
+st1  { v1.1d }, [x27], #8
+add x0, x27, 1
+st1  { v1.2d }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G53
+st1  { v1.2s }, [x27], #8
+add x0, x27, 1
+st1  { v1.4h }, [x27], #8
+add x0, x27, 1
+st1  { v1.4s }, [x27], #16
+add x0, x27, 1
+st1  { v1.8b }, [x27], #8
+add x0, x27, 1
+st1  { v1.8h }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G54
+st1  { v1.16b }, [x27], #16
+add x0, x27, 1
+st1  { v1.1d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2s }, [x27], x28
+add x0, x27, 1
+st1  { v1.4h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G55
+st1  { v1.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b }, [x27], x28
+add x0, x27, 1
+st1  { v1.8h }, [x27], x28
+add x0, x27, 1
+st1  { v1.16b }, [x27], x28
+add x0, x27, 1
+st1  { v1.1d, v2.1d }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G56
+st1  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+st1  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+st1  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+st1  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+st1  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G57
+st1  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+st1  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+st1  { v1.1d, v2.1d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G58
+st1  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+st1  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+st1  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+st1  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G59
+st1  { v1.1d, v2.1d, v3.1d }, [x27], #24
+add x0, x27, 1
+st1  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+st1  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+st1  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G60
+st1  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+st1  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+st1  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+st1  { v1.1d, v2.1d, v3.1d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G61
+st1  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+st1  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+st1  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G62
+st1  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+st1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+add x0, x27, 1
+st1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+st1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G63
+st1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+st1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+st1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+st1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+st1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G64
+st1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+st1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G65
+st1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+st1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+st1  { v1.b }[0], [x27], #1
+add x0, x27, 1
+st1  { v1.b }[8], [x27], #1
+add x0, x27, 1
+st1  { v1.b }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G66
+st1  { v1.b }[8], [x27], x28
+add x0, x27, 1
+st1  { v1.h }[0], [x27], #2
+add x0, x27, 1
+st1  { v1.h }[4], [x27], #2
+add x0, x27, 1
+st1  { v1.h }[0], [x27], x28
+add x0, x27, 1
+st1  { v1.h }[4], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G67
+st1  { v1.s }[0], [x27], #4
+add x0, x27, 1
+st1  { v1.s }[0], [x27], x28
+add x0, x27, 1
+st1  { v1.d }[0], [x27], #8
+add x0, x27, 1
+st1  { v1.d }[0], [x27], x28
+add x0, x27, 1
+st2  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G68
+st2  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+st2  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+st2  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+st2  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+st2  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G69
+st2  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+st2  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+st2  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+st2  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+st2  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G70
+st2  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+st2  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+st2  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+st2  { v1.b, v2.b }[0], [x27], #2
+add x0, x27, 1
+st2  { v1.b, v2.b }[8], [x27], #2
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G71
+st2  { v1.b, v2.b }[0], [x27], x28
+add x0, x27, 1
+st2  { v1.b, v2.b }[8], [x27], x28
+add x0, x27, 1
+st2  { v1.h, v2.h }[0], [x27], #4
+add x0, x27, 1
+st2  { v1.h, v2.h }[4], [x27], #4
+add x0, x27, 1
+st2  { v1.h, v2.h }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G72
+st2  { v1.h, v2.h }[4], [x27], x28
+add x0, x27, 1
+st2  { v1.s, v2.s }[0], [x27], #8
+add x0, x27, 1
+st2  { v1.s, v2.s }[0], [x27], x28
+add x0, x27, 1
+st2  { v1.d, v2.d }[0], [x27], #16
+add x0, x27, 1
+st2  { v1.d, v2.d }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G73
+st2g  x26, [x27], #4064
+add x0, x27, 1
+st2g  x26, [x27, #4064]!
+add x0, x27, 1
+st3  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+st3  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+st3  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G74
+st3  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+st3  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+st3  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+st3  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+st3  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G75
+st3  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+st3  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+st3  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+st3  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+st3  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G76
+st3  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[0], [x27], #3
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[8], [x27], #3
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[0], [x27], x28
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[8], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G77
+st3  { v1.h, v2.h, v3.h }[0], [x27], #6
+add x0, x27, 1
+st3  { v1.h, v2.h, v3.h }[4], [x27], #6
+add x0, x27, 1
+st3  { v1.h, v2.h, v3.h }[0], [x27], x28
+add x0, x27, 1
+st3  { v1.h, v2.h, v3.h }[4], [x27], x28
+add x0, x27, 1
+st3  { v1.s, v2.s, v3.s }[0], [x27], #12
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G78
+st3  { v1.s, v2.s, v3.s }[0], [x27], x28
+add x0, x27, 1
+st3  { v1.d, v2.d, v3.d }[0], [x27], #24
+add x0, x27, 1
+st3  { v1.d, v2.d, v3.d }[0], [x27], x28
+add x0, x27, 1
+st4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+st4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G79
+st4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+st4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+st4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+st4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+st4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G80
+st4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+st4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+st4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+st4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+st4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G81
+st4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+st4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+st4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+add x0, x27, 1
+st4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+add x0, x27, 1
+st4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G82
+st4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G83
+st4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+add x0, x27, 1
+st4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+add x0, x27, 1
+st4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+add x0, x27, 1
+st4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+add x0, x27, 1
+stg  x26, [x27], #4064
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G84
+stg  x26, [x27, #4064]!
+add x0, x27, 1
+stgp  x1, x2, [x27], #992
+add x0, x27, 1
+stgp  x1, x2, [x27, #992]!
+add x0, x27, 1
+stp  s1, s2, [x27], #248
+add x0, x27, 1
+stp  d1, d2, [x27], #496
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G85
+stp  q1, q2, [x27], #992
+add x0, x27, 1
+stp  s1, s2, [x27, #248]!
+add x0, x27, 1
+stp  d1, d2, [x27, #496]!
+add x0, x27, 1
+stp  q1, q2, [x27, #992]!
+add x0, x27, 1
+stp  w1, w2, [x27], #248
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G86
+stp  x1, x2, [x27], #496
+add x0, x27, 1
+stp  w1, w2, [x27, #248]!
+add x0, x27, 1
+stp  x1, x2, [x27, #496]!
+add x0, x27, 1
+str  b1, [x27], #254
+add x0, x27, 1
+str  h1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G87
+str  s1, [x27], #254
+add x0, x27, 1
+str  d1, [x27], #254
+add x0, x27, 1
+str  q1, [x27], #254
+add x0, x27, 1
+str  b1, [x27, #254]!
+add x0, x27, 1
+str  h1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G88
+str  s1, [x27, #254]!
+add x0, x27, 1
+str  d1, [x27, #254]!
+add x0, x27, 1
+str  q1, [x27, #254]!
+add x0, x27, 1
+str  w1, [x27], #254
+add x0, x27, 1
+str  x1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G89
+str  w1, [x27, #254]!
+add x0, x27, 1
+str  x1, [x27, #254]!
+add x0, x27, 1
+strb  w1, [x27], #254
+add x0, x27, 1
+strb  w1, [x27, #254]!
+add x0, x27, 1
+strh  w1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G90
+strh  w1, [x27, #254]!
+add x0, x27, 1
+stz2g  x26, [x27], #4064
+add x0, x27, 1
+stz2g  x26, [x27, #4064]!
+add x0, x27, 1
+stzg  x26, [x27], #4064
+add x0, x27, 1
+stzg  x26, [x27, #4064]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G91
+ldr  x1, [x27], #254
+add x0, x27, 1
+ldr  x2, [x1], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# CHECK:      [0] Code Region - G01
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D============eeeeeeER    .    .  .   ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,5]     D==================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=================eeeeeeER   .  .   ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D=======================eeeeeeER.   ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,9]     .D=============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: 5.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     24.0   0.0    0.0       ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.6   0.1    0.0       <total>
+
+# CHECK:      [1] Code Region - G02
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D============eeeeeeER    .    .  .   ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,5]     D==================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=================eeeeeeER   .  .   ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D=======================eeeeeeER.   ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .D=============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: 5.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     24.0   0.0    0.0       ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.6   0.1    0.0       <total>
+
+# CHECK:      [2] Code Region - G03
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D============eeeeeeER    .    .  .   ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     D==================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=================eeeeeeER   .  .   ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D=======================eeeeeeER.   ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .D=============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     24.0   0.0    0.0       ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.6   0.1    0.0       <total>
+
+# CHECK:      [3] Code Region - G04
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        1900
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.63
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D============eeeeeeER    .    .  .   ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .D=================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=================eeeeeeER   .  .   ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D=======================eeeeeeER.   ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     .D=============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     24.0   0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.5   0.1    0.0       <total>
+
+# CHECK:      [4] Code Region - G05
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.67
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===========eeeeeeER    .    .  .   ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     .D=================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=================eeeeeeER   .  .   ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======================eeeeeeER.   ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     . D============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     12.0   0.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     29.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.2   0.1    0.0       <total>
+
+# CHECK:      [5] Code Region - G06
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.67
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===========eeeeeeER    .    .  .   ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .D=================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=================eeeeeeER   .  .   ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======================eeeeeeER.   ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     . D============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     12.0   0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     29.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.2   0.1    0.0       <total>
+
+# CHECK:      [6] Code Region - G07
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        2300
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.77
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 4.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===========eeeeeeER    .    .  .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,5]     .D=================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=================eeeeeeER   .  .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======================eeeeeeER.   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,9]     . D============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     12.0   0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 9.     1     29.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.2   0.1    0.0       <total>
+
+# CHECK:      [7] Code Region - G08
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.83
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===========eeeeeeER    .    .  .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,5]     .D=================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=================eeeeeeER   .  .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======================eeeeeeER.   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,9]     . D============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     12.0   0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 9.     1     29.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.2   0.1    0.0       <total>
+
+# CHECK:      [8] Code Region - G09
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.83
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===========eeeeeeER    .    .  .   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .D=================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=================eeeeeeER   .  .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======================eeeeeeER.   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     . D============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     12.0   0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     29.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.2   0.1    0.0       <total>
+
+# CHECK:      [9] Code Region - G10
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3204
+# CHECK-NEXT: Total uOps:        2700
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.84
+# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: Block RThroughput: 5.7
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .    .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .    .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     D============eER    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===========eeeeeeER    .    .    .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .D=================eER   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=================eeeeeeeER  .    .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,7]     . D=======================eER .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D=======================eeeeeeeER.   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,9]     . D==============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     12.0   0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     24.0   0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 9.     1     31.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.5   0.1    0.0       <total>
+
+# CHECK:      [10] Code Region - G11
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total uOps:        3000
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.86
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 6.7
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [11] Code Region - G12
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total uOps:        3000
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.86
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 6.7
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+
+# CHECK:      [12] Code Region - G13
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3604
+# CHECK-NEXT: Total uOps:        2800
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.78
+# CHECK-NEXT: IPC:               0.28
+# CHECK-NEXT: Block RThroughput: 5.7
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .   .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .   .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .   .   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     . D===================eER.    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .   .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=========================eER   .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D=========================eeeeeeeeER.   ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,9]     .  D=================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     26.0   0.0    0.0       ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: 9.     1     34.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.8   0.1    0.0       <total>
+
+# CHECK:      [13] Code Region - G14
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+
+# CHECK:      [14] Code Region - G15
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+
+# CHECK:      [15] Code Region - G16
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+
+# CHECK:      [16] Code Region - G17
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld1r	{ v1.4s }, [x27], #4
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.4s }, [x27], #4
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+
+# CHECK:      [17] Code Region - G18
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld1r	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+
+# CHECK:      [18] Code Region - G19
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld1r	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     . D==============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7.     1     31.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.1   0.1    0.0       <total>
+
+# CHECK:      [19] Code Region - G20
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2900
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.72
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D======================eeeeeeeeER.    .  .   ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     . D==============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D=============================eeeeeeeeER.   ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .  D=====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     23.0   0.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7.     1     31.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     30.0   0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     38.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.7   0.1    0.0       <total>
+
+# CHECK:      [20] Code Region - G21
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2700
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.67
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     . D==============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     31.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.1   0.1    0.0       <total>
+
+# CHECK:      [21] Code Region - G22
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2600
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.65
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     . D==============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: 7.     1     31.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.0   0.1    0.0       <total>
+
+# CHECK:      [22] Code Region - G23
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.62
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+
+# CHECK:      [23] Code Region - G24
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.62
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+
+# CHECK:      [24] Code Region - G25
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.62
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2r	{ v1.2s, v2.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], #8
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+
+# CHECK:      [25] Code Region - G26
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.62
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2r	{ v1.16b, v2.16b }, [x27], #2
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], #2
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+
+# CHECK:      [26] Code Region - G27
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2800
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.70
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 5.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2r	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+
+# CHECK:      [27] Code Region - G28
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        3700
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.92
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+
+# CHECK:      [28] Code Region - G29
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        3800
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.95
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+
+# CHECK:      [29] Code Region - G30
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        3700
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.92
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+
+# CHECK:      [30] Code Region - G31
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        3500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.87
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+
+# CHECK:      [31] Code Region - G32
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        3500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.87
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+
+# CHECK:      [32] Code Region - G33
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        3700
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.92
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+
+# CHECK:      [33] Code Region - G34
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        3800
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.95
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+
+# CHECK:      [34] Code Region - G35
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        3700
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.92
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+
+# CHECK:      [35] Code Region - G36
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4204
+# CHECK-NEXT: Total uOps:        4600
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.09
+# CHECK-NEXT: IPC:               0.24
+# CHECK-NEXT: Block RThroughput: 9.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012345
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .    .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeeER.    .    .    .    .    .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,3]     .D================eER    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D===============eeeeeeeeER  .    .    .    .   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,5]     . D=======================eER .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D======================eeeeeeeeER    .    .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,7]     .  D==============================eER   .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=============================eeeeeeeeeER.   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,9]     .   D======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     23.0   0.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 7.     1     31.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     30.0   0.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.8   0.1    0.0       <total>
+
+# CHECK:      [36] Code Region - G37
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4304
+# CHECK-NEXT: Total uOps:        4800
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.12
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .    ..   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeeER.    .    .    .    .    ..   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,3]     .D================eER    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D===============eeeeeeeeeER .    .    .    ..   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,5]     . D========================eER.    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=======================eeeeeeeeeER  .    ..   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D================================eER .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D===============================eeeeeeeeER.   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D=======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 5.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 7.     1     33.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     32.0   0.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 9.     1     40.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.5   0.1    0.0       <total>
+
+# CHECK:      [37] Code Region - G38
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4304
+# CHECK-NEXT: Total uOps:        4800
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.12
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .    ..   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeeER.    .    .    .    .    ..   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .D================eER    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D===============eeeeeeeeER  .    .    .    ..   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     . D=======================eER .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D======================eeeeeeeeeER   .    ..   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D===============================eER  .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==============================eeeeeeeeeER.   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D=======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     23.0   0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 9.     1     40.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.1   0.1    0.0       <total>
+
+# CHECK:      [38] Code Region - G39
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        4500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.12
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+
+# CHECK:      [39] Code Region - G40
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        4500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.12
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+
+# CHECK:      [40] Code Region - G41
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        4600
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.15
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+
+# CHECK:      [41] Code Region - G42
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        4800
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.20
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+
+# CHECK:      [42] Code Region - G43
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        4700
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.17
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+
+# CHECK:      [43] Code Region - G44
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3604
+# CHECK-NEXT: Total uOps:        3900
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.08
+# CHECK-NEXT: IPC:               0.28
+# CHECK-NEXT: Block RThroughput: 6.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .   .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .   .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .   .   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     . D======================eER  .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeER  .   .   ldp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,7]     .  D===========================eER .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D===========================eeeeeeER.   ldp	d1, d2, [x27], #496
+# CHECK-NEXT: [0,9]     .  D=================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ldp	s1, s2, [x27], #248
+# CHECK-NEXT: 7.     1     28.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     28.0   0.0    0.0       ldp	d1, d2, [x27], #496
+# CHECK-NEXT: 9.     1     34.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     18.4   0.1    0.0       <total>
+
+# CHECK:      [44] Code Region - G45
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2506
+# CHECK-NEXT: Total uOps:        2800
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.12
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .   ldp	q1, q2, [x27], #992
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=====eeeeeeER.    .    .    .   ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,3]     .D===========eER    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===========eeeeeeER    .    .   ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .D=================eER   .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D================eeeeeeER   .   ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,7]     . D======================eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======================eeeeER   ldp	w1, w2, [x27], #248
+# CHECK-NEXT: [0,9]     .  D======================eE--R   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldp	q1, q2, [x27], #992
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     12.0   0.0    0.0       ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     17.0   0.0    0.0       ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: 7.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ldp	w1, w2, [x27], #248
+# CHECK-NEXT: 9.     1     23.0   0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     14.2   0.1    0.2       <total>
+
+# CHECK:      [45] Code Region - G46
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1304
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.53
+# CHECK-NEXT: IPC:               0.77
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    ..   ldp	x1, x2, [x27], #496
+# CHECK-NEXT: [0,1]     D=eE--R   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .    ..   ldp	w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,3]     D==eE--R  .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeER .    ..   ldp	x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .D==eE--R .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeER    ..   ldpsw	x1, x2, [x27], #248
+# CHECK-NEXT: [0,7]     .D=======eER   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======eeeeeER.   ldpsw	x1, x2, [x27, #248]!
+# CHECK-NEXT: [0,9]     . D===========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldp	x1, x2, [x27], #496
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldp	w1, w2, [x27, #248]!
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ldp	x1, x2, [x27, #496]!
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ldpsw	x1, x2, [x27], #248
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    0.0    0.0       ldpsw	x1, x2, [x27, #248]!
+# CHECK-NEXT: 9.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.3    0.1    0.6       <total>
+
+# CHECK:      [46] Code Region - G47
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.67
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 3.8
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ldr	b1, [x27], #254
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ldr	h1, [x27], #254
+# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===========eeeeeeER    .    .  .   ldr	s1, [x27], #254
+# CHECK-NEXT: [0,5]     .D=================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=================eeeeeeER   .  .   ldr	d1, [x27], #254
+# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======================eeeeeeER.   ldr	q1, [x27], #254
+# CHECK-NEXT: [0,9]     . D============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	b1, [x27], #254
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ldr	h1, [x27], #254
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     12.0   0.0    0.0       ldr	s1, [x27], #254
+# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ldr	d1, [x27], #254
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ldr	q1, [x27], #254
+# CHECK-NEXT: 9.     1     29.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.2   0.1    0.0       <total>
+
+# CHECK:      [47] Code Region - G48
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.67
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 3.8
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ldr	b1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ldr	h1, [x27, #254]!
+# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===========eeeeeeER    .    .  .   ldr	s1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .D=================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=================eeeeeeER   .  .   ldr	d1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======================eeeeeeER.   ldr	q1, [x27, #254]!
+# CHECK-NEXT: [0,9]     . D============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	b1, [x27, #254]!
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ldr	h1, [x27, #254]!
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     12.0   0.0    0.0       ldr	s1, [x27, #254]!
+# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ldr	d1, [x27, #254]!
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ldr	q1, [x27, #254]!
+# CHECK-NEXT: 9.     1     29.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.2   0.1    0.0       <total>
+
+# CHECK:      [48] Code Region - G49
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      506
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    2.96
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .   ldr	w1, [x27], #254
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   ldr	x1, [x27], #254
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeER .   ldr	w1, [x27, #254]!
+# CHECK-NEXT: [0,5]     D===eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeER.   ldr	x1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .D===eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eeeeER   ldrb	w1, [x27], #254
+# CHECK-NEXT: [0,9]     .D====eE--R   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	w1, [x27], #254
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldr	x1, [x27], #254
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ldr	w1, [x27, #254]!
+# CHECK-NEXT: 5.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ldr	x1, [x27, #254]!
+# CHECK-NEXT: 7.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ldrb	w1, [x27], #254
+# CHECK-NEXT: 9.     1     5.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.1    1.0       <total>
+
+# CHECK:      [49] Code Region - G50
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      506
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    2.96
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .   ldrb	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   ldrh	w1, [x27], #254
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeER .   ldrh	w1, [x27, #254]!
+# CHECK-NEXT: [0,5]     D===eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeER.   ldrsb	w1, [x27], #254
+# CHECK-NEXT: [0,7]     .D===eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eeeeER   ldrsb	x1, [x27], #254
+# CHECK-NEXT: [0,9]     .D====eE--R   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldrb	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldrh	w1, [x27], #254
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ldrh	w1, [x27, #254]!
+# CHECK-NEXT: 5.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ldrsb	w1, [x27], #254
+# CHECK-NEXT: 7.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ldrsb	x1, [x27], #254
+# CHECK-NEXT: 9.     1     5.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.1    1.0       <total>
+
+# CHECK:      [50] Code Region - G51
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      506
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    2.96
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .   ldrsb	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   ldrsb	x1, [x27, #254]!
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeER .   ldrsh	w1, [x27], #254
+# CHECK-NEXT: [0,5]     D===eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeER.   ldrsh	x1, [x27], #254
+# CHECK-NEXT: [0,7]     .D===eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eeeeER   ldrsh	w1, [x27, #254]!
+# CHECK-NEXT: [0,9]     .D====eE--R   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldrsb	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldrsb	x1, [x27, #254]!
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ldrsh	w1, [x27], #254
+# CHECK-NEXT: 5.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ldrsh	x1, [x27], #254
+# CHECK-NEXT: 7.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ldrsh	w1, [x27, #254]!
+# CHECK-NEXT: 9.     1     5.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.1    1.0       <total>
+
+# CHECK:      [51] Code Region - G52
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      704
+# CHECK-NEXT: Total uOps:        1700
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    2.41
+# CHECK-NEXT: IPC:               1.42
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .   ldrsh	x1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   ldrsw	x1, [x27], #254
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeER .   ldrsw	x1, [x27, #254]!
+# CHECK-NEXT: [0,5]     D===eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeE-R .   st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,7]     .D====eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D====eeER.   st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     .D======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldrsh	x1, [x27, #254]!
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldrsw	x1, [x27], #254
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ldrsw	x1, [x27, #254]!
+# CHECK-NEXT: 5.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    1.0       st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 7.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.5    0.1    0.7       <total>
+
+# CHECK:      [52] Code Region - G53
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===eeER .  .   st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=====eeER  .   st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======eeER.   st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,9]     . D========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: 9.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.2    0.1    0.0       <total>
+
+# CHECK:      [53] Code Region - G54
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===eeER .  .   st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=====eeER  .   st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======eeER.   st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     . D========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.2    0.1    0.0       <total>
+
+# CHECK:      [54] Code Region - G55
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===eeER .  .   st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=====eeER  .   st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======eeER.   st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,9]     . D========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 9.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.2    0.1    0.0       <total>
+
+# CHECK:      [55] Code Region - G56
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        2400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    2.39
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 3.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===eeER .  .   st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=====eeER  .   st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======eeER.   st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,9]     . D========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 9.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.2    0.1    0.0       <total>
+
+# CHECK:      [56] Code Region - G57
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        2600
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    2.59
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=eeER   .  .   st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,3]     .D===eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===eeER .  .   st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D====eeER  .   st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     . D======eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======eeER.   st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     . D========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     5.0    0.0    0.0       st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 7.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 9.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.8    0.1    0.0       <total>
+
+# CHECK:      [57] Code Region - G58
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        2600
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    2.59
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===eeER .  .   st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=====eeER  .   st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======eeER.   st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     . D========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 9.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.2    0.1    0.0       <total>
+
+# CHECK:      [58] Code Region - G59
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        3400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    3.39
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 6.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=eeER   .  .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,3]     .D===eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeER .  .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,5]     . D====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D===eeER  .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,7]     .  D=====eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====eeER.   st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,9]     .   D======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+
+# CHECK:      [59] Code Region - G60
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        3600
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    3.59
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 6.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=eeER   .  .   st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,3]     .D===eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeER .  .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,5]     . D====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D===eeER  .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=====eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====eeER.   st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+
+# CHECK:      [60] Code Region - G61
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        3400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    3.39
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 6.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=eeER   .  .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D===eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeER .  .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D===eeER  .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=====eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====eeER.   st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+
+# CHECK:      [61] Code Region - G62
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        3600
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    3.59
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 6.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=eeER   .  .   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,3]     .D===eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeER .  .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,5]     . D====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D===eeER  .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,7]     .  D=====eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====eeER.   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,9]     .   D======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+
+# CHECK:      [62] Code Region - G63
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        4200
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    4.18
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=eeER   .  .   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,3]     .D===eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeER .  .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,5]     . D====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D===eeER  .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,7]     .  D=====eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====eeER.   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+
+# CHECK:      [63] Code Region - G64
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        3800
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    3.78
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 7.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=eeER   .  .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .D===eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeER .  .   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     . D====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D===eeER  .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=====eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====eeER.   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+
+# CHECK:      [64] Code Region - G65
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1604
+# CHECK-NEXT: Total uOps:        3200
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    2.00
+# CHECK-NEXT: IPC:               0.62
+# CHECK-NEXT: Block RThroughput: 5.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .    .   .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     D==eER    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=eeER   .    .   .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .D===eER  .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeeeER    .   .   st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,5]     . D======eER   .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D======eeeeER.   .   st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,7]     . D==========eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D=========eeeeER.   st1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .  D=============eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: 5.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     7.0    0.0    0.0       st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: 7.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     10.0   0.0    0.0       st1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: 9.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     6.2    0.1    0.0       <total>
+
+# CHECK:      [65] Code Region - G66
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeER   .    .    .  .   st1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=======eeeeER.    .  .   st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,5]     .D===========eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D===========eeeeER .  .   st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============eeeeER.   st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     . D==================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: 9.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     10.2   0.1    0.0       <total>
+
+# CHECK:      [66] Code Region - G67
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total uOps:        2200
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.10
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeER   .    .    .  .   st1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=======eeeeER.    .  .   st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,5]     .D===========eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D===========eeeeER .  .   st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============eeeeER.   st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,9]     . D==================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 9.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     10.2   0.1    0.0       <total>
+
+# CHECK:      [67] Code Region - G68
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total uOps:        2400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.20
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 3.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeER   .    .    .  .   st2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=======eeeeER.    .  .   st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,5]     .D===========eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D===========eeeeER .  .   st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============eeeeER.   st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,9]     . D==================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 9.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     10.2   0.1    0.0       <total>
+
+# CHECK:      [68] Code Region - G69
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total uOps:        2600
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.30
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeER   .    .    .  .   st2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D===eeeeER    .    .  .   st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=======eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=======eeeeER.    .  .   st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .D===========eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D==========eeeeER .  .   st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     . D==============eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============eeeeER.   st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     . D==================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     4.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     11.0   0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     9.8    0.1    0.0       <total>
+
+# CHECK:      [69] Code Region - G70
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total uOps:        2400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.20
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 3.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeER   .    .    .  .   st2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=======eeeeER.    .  .   st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .D===========eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D===========eeeeER .  .   st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============eeeeER.   st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,9]     . D==================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 9.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     10.2   0.1    0.0       <total>
+
+# CHECK:      [70] Code Region - G71
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeER   .    .    .  .   st2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=======eeeeER.    .  .   st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .D===========eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D===========eeeeER .  .   st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============eeeeER.   st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     . D==================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 9.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     10.2   0.1    0.0       <total>
+
+# CHECK:      [71] Code Region - G72
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeER   .    .    .  .   st2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=======eeeeER.    .  .   st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .D===========eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D===========eeeeER .  .   st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============eeeeER.   st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     . D==================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 9.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     10.2   0.1    0.0       <total>
+
+# CHECK:      [72] Code Region - G73
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1804
+# CHECK-NEXT: Total uOps:        2800
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.55
+# CHECK-NEXT: IPC:               0.55
+# CHECK-NEXT: Block RThroughput: 4.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01
+
+# CHECK:      [0,0]     DeER .    .    .    ..   st2g	x26, [x27], #4064
+# CHECK-NEXT: [0,1]     D=eER.    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eER.    .    .    ..   st2g	x26, [x27, #4064]!
+# CHECK-NEXT: [0,3]     D==eER    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeER    .    ..   st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,5]     .D=======eER   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D======eeeeeER    ..   st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,7]     . D===========eER   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D==========eeeeeER.   st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,9]     .  D===============eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2g	x26, [x27], #4064
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st2g	x26, [x27, #4064]!
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 5.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     7.0    0.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 7.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     11.0   0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 9.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     6.4    0.1    0.0       <total>
+
+# CHECK:      [73] Code Region - G74
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2904
+# CHECK-NEXT: Total uOps:        3800
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.31
+# CHECK-NEXT: IPC:               0.34
+# CHECK-NEXT: Block RThroughput: 7.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    . .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=====eeeeeER .    .    .    . .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,3]     .D==========eER.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=========eeeeeeER.    .    . .   st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,5]     . D===============eER    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==============eeeeeeER    . .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,7]     .  D====================eER   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D===================eeeeeeER.   st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D=========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     10.0   0.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 5.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     15.0   0.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 7.     1     21.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     20.0   0.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     13.3   0.1    0.0       <total>
+
+# CHECK:      [74] Code Region - G75
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2704
+# CHECK-NEXT: Total uOps:        3400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.26
+# CHECK-NEXT: IPC:               0.37
+# CHECK-NEXT: Block RThroughput: 6.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=====eER .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D====eeeeeER  .    .    .    .   st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=========eER .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D========eeeeeeER .    .    .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D==============eER.    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=============eeeeeER .    .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D==================eER.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=================eeeeeeER.   st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D=======================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     14.0   0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     18.0   0.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     12.1   0.1    0.0       <total>
+
+# CHECK:      [75] Code Region - G76
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        4000
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.33
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=====eeeeeeER.    .    .    .  .   st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,3]     .D===========eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==========eeeeeeER    .    .  .   st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,5]     . D================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D===============eeeeeeER   .  .   st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .  D=====================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====================eeeeeeER.   st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     .   D==========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     16.0   0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     21.0   0.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 9.     1     27.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     14.0   0.1    0.0       <total>
+
+# CHECK:      [76] Code Region - G77
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        4000
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.33
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=====eeeeeeER.    .    .    .  .   st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,3]     .D===========eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==========eeeeeeER    .    .  .   st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     . D================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D===============eeeeeeER   .  .   st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .  D=====================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====================eeeeeeER.   st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,9]     .   D==========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     16.0   0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     21.0   0.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 9.     1     27.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     14.0   0.1    0.0       <total>
+
+# CHECK:      [77] Code Region - G78
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2904
+# CHECK-NEXT: Total uOps:        4200
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.45
+# CHECK-NEXT: IPC:               0.34
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    . .   st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=====eeeeeeER.    .    .    . .   st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,3]     .D===========eER    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==========eeeeeeER    .    . .   st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     . D================eER   .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D===============eeeeeER    . .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,7]     .  D====================eER   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D===================eeeeeeER.   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,9]     .   D=========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     16.0   0.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 7.     1     21.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     20.0   0.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 9.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     13.7   0.1    0.0       <total>
+
+# CHECK:      [78] Code Region - G79
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3304
+# CHECK-NEXT: Total uOps:        5800
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.76
+# CHECK-NEXT: IPC:               0.30
+# CHECK-NEXT: Block RThroughput: 12.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .    ..   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=====eeeeeeeER    .    .    .    ..   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,3]     . D===========eER   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .  D==========eeeeeeER   .    .    ..   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,5]     .  D================eER  .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .   D===============eeeeeeeER .    ..   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,7]     .    D=====================eER.    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .D====================eeeeeeeER.   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,9]     .    . D==========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     16.0   0.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     21.0   0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 9.     1     27.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     14.0   0.1    0.0       <total>
+
+# CHECK:      [79] Code Region - G80
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        4800
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.60
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 9.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .  .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     D=====eER .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D====eeeeeeER .    .    .    .  .   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .D==========eER.    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=========eeeeeeER.    .    .  .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     . D===============eER    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==============eeeeeeeER   .  .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .   D====================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    D===================eeeeeeER.   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    D=========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     10.0   0.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     15.0   0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     21.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     20.0   0.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     13.1   0.1    0.0       <total>
+
+# CHECK:      [80] Code Region - G81
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3204
+# CHECK-NEXT: Total uOps:        5200
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.62
+# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: Block RThroughput: 10.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .D======eER    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=====eeeeeeeER   .    .    .    .   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D===========eER  .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==========eeeeeeER  .    .    .   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .   D================eER .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    D===============eeeeeeER .    .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,7]     .    D=====================eER.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .D====================eeeeeeER.   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .D==========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     16.0   0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     21.0   0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 9.     1     27.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     14.0   0.1    0.0       <total>
+
+# CHECK:      [81] Code Region - G82
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        4000
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.33
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=====eeeeeeER.    .    .    .  .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,3]     .D===========eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==========eeeeeeER    .    .  .   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,5]     . D================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D===============eeeeeeER   .  .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .  D=====================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====================eeeeeeER.   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     .   D==========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     16.0   0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     21.0   0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 9.     1     27.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     14.0   0.1    0.0       <total>
+
+# CHECK:      [82] Code Region - G83
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2104
+# CHECK-NEXT: Total uOps:        3600
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    1.71
+# CHECK-NEXT: IPC:               0.48
+# CHECK-NEXT: Block RThroughput: 6.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .   .   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,1]     D======eER.    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=====eeeeeeER.    .   .   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .D===========eER    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==========eeeeER .   .   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,5]     . D==============eER.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=============eeeeER .   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .  D=================eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D================eER.   stg	x26, [x27], #4064
+# CHECK-NEXT: [0,9]     .   D=================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     14.0   0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 7.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     17.0   0.0    0.0       stg	x26, [x27], #4064
+# CHECK-NEXT: 9.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     11.9   0.1    0.0       <total>
+
+# CHECK:      [83] Code Region - G84
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      504
+# CHECK-NEXT: Total uOps:        2200
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    4.37
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeER .  .   stg	x26, [x27, #4064]!
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eER.  .   stgp	x1, x2, [x27], #992
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eER  .   stgp	x1, x2, [x27, #992]!
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeER.   stp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeER   stp	d1, d2, [x27], #496
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       stg	x26, [x27, #4064]!
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       stgp	x1, x2, [x27], #992
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       stgp	x1, x2, [x27, #992]!
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       stp	s1, s2, [x27], #248
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       stp	d1, d2, [x27], #496
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
+
+# CHECK:      [84] Code Region - G85
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      704
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    3.55
+# CHECK-NEXT: IPC:               1.42
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .   stp	q1, q2, [x27], #992
+# CHECK-NEXT: [0,1]     D==eER    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   .   stp	s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,3]     D===eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D==eeER  .   stp	d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .D===eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D===eeER .   stp	q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,7]     . D====eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D====eER.   stp	w1, w2, [x27], #248
+# CHECK-NEXT: [0,9]     . D=====eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       stp	q1, q2, [x27], #992
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       stp	s1, s2, [x27, #248]!
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       stp	d1, d2, [x27, #496]!
+# CHECK-NEXT: 5.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    0.0       stp	q1, q2, [x27, #992]!
+# CHECK-NEXT: 7.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       stp	w1, w2, [x27], #248
+# CHECK-NEXT: 9.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.8    0.1    0.0       <total>
+
+# CHECK:      [85] Code Region - G86
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      504
+# CHECK-NEXT: Total uOps:        2200
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    4.37
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeER .  .   stp	x1, x2, [x27], #496
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eER.  .   stp	w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eER  .   stp	x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeER.   str	b1, [x27], #254
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeER   str	h1, [x27], #254
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       stp	x1, x2, [x27], #496
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       stp	w1, w2, [x27, #248]!
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       stp	x1, x2, [x27, #496]!
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       str	b1, [x27], #254
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       str	h1, [x27], #254
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
+
+# CHECK:      [86] Code Region - G87
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      504
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    4.96
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 3.8
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeeER.  .   str	s1, [x27], #254
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeER  .   str	d1, [x27], #254
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeER .   str	q1, [x27], #254
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeER.   str	b1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeER   str	h1, [x27, #254]!
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       str	s1, [x27], #254
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       str	d1, [x27], #254
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       str	q1, [x27], #254
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       str	b1, [x27, #254]!
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       str	h1, [x27, #254]!
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
+
+# CHECK:      [87] Code Region - G88
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      504
+# CHECK-NEXT: Total uOps:        2300
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    4.56
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeeER.  .   str	s1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeER  .   str	d1, [x27, #254]!
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeER .   str	q1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eER .   str	w1, [x27], #254
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eER.   str	x1, [x27], #254
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       str	s1, [x27, #254]!
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       str	d1, [x27, #254]!
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       str	q1, [x27, #254]!
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       str	w1, [x27], #254
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       str	x1, [x27], #254
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
+
+# CHECK:      [88] Code Region - G89
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      504
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    3.97
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeER .  .   str	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eER.  .   str	x1, [x27, #254]!
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eER  .   strb	w1, [x27], #254
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eER .   strb	w1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eER.   strh	w1, [x27], #254
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       str	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       str	x1, [x27, #254]!
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       strb	w1, [x27], #254
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       strb	w1, [x27, #254]!
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       strh	w1, [x27], #254
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
+
+# CHECK:      [89] Code Region - G90
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      504
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    3.97
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeER .  .   strh	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eER.  .   stz2g	x26, [x27], #4064
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eER  .   stz2g	x26, [x27, #4064]!
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eER .   stzg	x26, [x27], #4064
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eER.   stzg	x26, [x27, #4064]!
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       strh	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       stz2g	x26, [x27], #4064
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       stz2g	x26, [x27, #4064]!
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       stzg	x26, [x27], #4064
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       stzg	x26, [x27, #4064]!
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
+
+# CHECK:      [90] Code Region - G91
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      110
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    5.45
+# CHECK-NEXT: IPC:               3.64
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .   ldr	x1, [x27], #254
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D====eeeeER   ldr	x2, [x1], #254
+# CHECK-NEXT: [0,3]     D=eE------R   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	x1, [x27], #254
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ldr	x2, [x1], #254
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.5    0.3    2.0       <total>

diff  --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-writeback.s
new file mode 100644
index 000000000000000..91adecac8d67e21
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-writeback.s
@@ -0,0 +1,5290 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v1 --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=1 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN G01
+ld1  { v1.1d }, [x27], #8
+add x0, x27, 1
+ld1  { v1.2d }, [x27], #16
+add x0, x27, 1
+ld1  { v1.2s }, [x27], #8
+add x0, x27, 1
+ld1  { v1.4h }, [x27], #8
+add x0, x27, 1
+ld1  { v1.4s }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G02
+ld1  { v1.8b }, [x27], #8
+add x0, x27, 1
+ld1  { v1.8h }, [x27], #16
+add x0, x27, 1
+ld1  { v1.16b }, [x27], #16
+add x0, x27, 1
+ld1  { v1.1d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G03
+ld1  { v1.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G04
+ld1  { v1.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.1d, v2.1d }, [x27], #16
+add x0, x27, 1
+ld1  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+ld1  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+ld1  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G05
+ld1  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+ld1  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+ld1  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+ld1  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+ld1  { v1.1d, v2.1d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G06
+ld1  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G07
+ld1  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.1d, v2.1d, v3.1d }, [x27], #24
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+ld1  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G08
+ld1  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+ld1  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+ld1  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+ld1  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G09
+ld1  { v1.1d, v2.1d, v3.1d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G10
+ld1  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G11
+ld1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+ld1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+ld1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+ld1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G12
+ld1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+ld1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+ld1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G13
+ld1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+ld1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+ld1  { v1.b }[0], [x27], #1
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G14
+ld1  { v1.b }[8], [x27], #1
+add x0, x27, 1
+ld1  { v1.b }[0], [x27], x28
+add x0, x27, 1
+ld1  { v1.b }[8], [x27], x28
+add x0, x27, 1
+ld1  { v1.h }[0], [x27], #2
+add x0, x27, 1
+ld1  { v1.h }[4], [x27], #2
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G15
+ld1  { v1.h }[0], [x27], x28
+add x0, x27, 1
+ld1  { v1.h }[4], [x27], x28
+add x0, x27, 1
+ld1  { v1.s }[0], [x27], #4
+add x0, x27, 1
+ld1  { v1.s }[0], [x27], x28
+add x0, x27, 1
+ld1  { v1.d }[0], [x27], #8
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G16
+ld1  { v1.d }[0], [x27], x28
+add x0, x27, 1
+ld1r  { v1.1d }, [x27], #8
+add x0, x27, 1
+ld1r  { v1.2d }, [x27], #8
+add x0, x27, 1
+ld1r  { v1.2s }, [x27], #4
+add x0, x27, 1
+ld1r  { v1.4h }, [x27], #2
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G17
+ld1r  { v1.4s }, [x27], #4
+add x0, x27, 1
+ld1r  { v1.8b }, [x27], #1
+add x0, x27, 1
+ld1r  { v1.8h }, [x27], #2
+add x0, x27, 1
+ld1r  { v1.16b }, [x27], #1
+add x0, x27, 1
+ld1r  { v1.1d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G18
+ld1r  { v1.2d }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.2s }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.4h }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.4s }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G19
+ld1r  { v1.8h }, [x27], x28
+add x0, x27, 1
+ld1r  { v1.16b }, [x27], x28
+add x0, x27, 1
+ld2  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+ld2  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+ld2  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G20
+ld2  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+ld2  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+ld2  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+ld2  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+ld2  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G21
+ld2  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+ld2  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+ld2  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+ld2  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+ld2  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G22
+ld2  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+ld2  { v1.b, v2.b }[0], [x27], #2
+add x0, x27, 1
+ld2  { v1.b, v2.b }[8], [x27], #2
+add x0, x27, 1
+ld2  { v1.b, v2.b }[0], [x27], x28
+add x0, x27, 1
+ld2  { v1.b, v2.b }[8], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G23
+ld2  { v1.h, v2.h }[0], [x27], #4
+add x0, x27, 1
+ld2  { v1.h, v2.h }[4], [x27], #4
+add x0, x27, 1
+ld2  { v1.h, v2.h }[0], [x27], x28
+add x0, x27, 1
+ld2  { v1.h, v2.h }[4], [x27], x28
+add x0, x27, 1
+ld2  { v1.s, v2.s }[0], [x27], #8
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G24
+ld2  { v1.s, v2.s }[0], [x27], x28
+add x0, x27, 1
+ld2  { v1.d, v2.d }[0], [x27], #16
+add x0, x27, 1
+ld2  { v1.d, v2.d }[0], [x27], x28
+add x0, x27, 1
+ld2r  { v1.1d, v2.1d }, [x27], #16
+add x0, x27, 1
+ld2r  { v1.2d, v2.2d }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G25
+ld2r  { v1.2s, v2.2s }, [x27], #8
+add x0, x27, 1
+ld2r  { v1.4h, v2.4h }, [x27], #4
+add x0, x27, 1
+ld2r  { v1.4s, v2.4s }, [x27], #8
+add x0, x27, 1
+ld2r  { v1.8b, v2.8b }, [x27], #2
+add x0, x27, 1
+ld2r  { v1.8h, v2.8h }, [x27], #4
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G26
+ld2r  { v1.16b, v2.16b }, [x27], #2
+add x0, x27, 1
+ld2r  { v1.1d, v2.1d }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G27
+ld2r  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+ld2r  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+ld3  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G28
+ld3  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+ld3  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+ld3  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+ld3  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+ld3  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G29
+ld3  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+ld3  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+ld3  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+ld3  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+ld3  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G30
+ld3  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+ld3  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+ld3  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+ld3  { v1.b, v2.b, v3.b }[0], [x27], #3
+add x0, x27, 1
+ld3  { v1.b, v2.b, v3.b }[8], [x27], #3
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G31
+ld3  { v1.b, v2.b, v3.b }[0], [x27], x28
+add x0, x27, 1
+ld3  { v1.b, v2.b, v3.b }[8], [x27], x28
+add x0, x27, 1
+ld3  { v1.h, v2.h, v3.h }[0], [x27], #6
+add x0, x27, 1
+ld3  { v1.h, v2.h, v3.h }[4], [x27], #6
+add x0, x27, 1
+ld3  { v1.h, v2.h, v3.h }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G32
+ld3  { v1.h, v2.h, v3.h }[4], [x27], x28
+add x0, x27, 1
+ld3  { v1.s, v2.s, v3.s }[0], [x27], #12
+add x0, x27, 1
+ld3  { v1.s, v2.s, v3.s }[0], [x27], x28
+add x0, x27, 1
+ld3  { v1.d, v2.d, v3.d }[0], [x27], #24
+add x0, x27, 1
+ld3  { v1.d, v2.d, v3.d }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G33
+ld3r  { v1.1d, v2.1d, v3.1d }, [x27], #24
+add x0, x27, 1
+ld3r  { v1.2d, v2.2d, v3.2d }, [x27], #24
+add x0, x27, 1
+ld3r  { v1.2s, v2.2s, v3.2s }, [x27], #12
+add x0, x27, 1
+ld3r  { v1.4h, v2.4h, v3.4h }, [x27], #6
+add x0, x27, 1
+ld3r  { v1.4s, v2.4s, v3.4s }, [x27], #12
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G34
+ld3r  { v1.8b, v2.8b, v3.8b }, [x27], #3
+add x0, x27, 1
+ld3r  { v1.8h, v2.8h, v3.8h }, [x27], #6
+add x0, x27, 1
+ld3r  { v1.16b, v2.16b, v3.16b }, [x27], #3
+add x0, x27, 1
+ld3r  { v1.1d, v2.1d, v3.1d }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G35
+ld3r  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+ld3r  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G36
+ld3r  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+ld4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+ld4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+ld4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+ld4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G37
+ld4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+ld4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+ld4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+ld4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+ld4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G38
+ld4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+ld4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+ld4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+ld4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+ld4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G39
+ld4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+add x0, x27, 1
+ld4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+add x0, x27, 1
+ld4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+add x0, x27, 1
+ld4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+add x0, x27, 1
+ld4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G40
+ld4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+add x0, x27, 1
+ld4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+add x0, x27, 1
+ld4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+add x0, x27, 1
+ld4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+add x0, x27, 1
+ld4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G41
+ld4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+add x0, x27, 1
+ld4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+add x0, x27, 1
+ld4r  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+add x0, x27, 1
+ld4r  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+add x0, x27, 1
+ld4r  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G42
+ld4r  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+add x0, x27, 1
+ld4r  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+add x0, x27, 1
+ld4r  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+add x0, x27, 1
+ld4r  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+add x0, x27, 1
+ld4r  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G43
+ld4r  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G44
+ld4r  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+ld4r  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+ldp  s1, s2, [x27], #248
+add x0, x27, 1
+ldp  d1, d2, [x27], #496
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G45
+ldp  q1, q2, [x27], #992
+add x0, x27, 1
+ldp  s1, s2, [x27, #248]!
+add x0, x27, 1
+ldp  d1, d2, [x27, #496]!
+add x0, x27, 1
+ldp  q1, q2, [x27, #992]!
+add x0, x27, 1
+ldp  w1, w2, [x27], #248
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G46
+ldp  x1, x2, [x27], #496
+add x0, x27, 1
+ldp  w1, w2, [x27, #248]!
+add x0, x27, 1
+ldp  x1, x2, [x27, #496]!
+add x0, x27, 1
+ldpsw  x1, x2, [x27], #248
+add x0, x27, 1
+ldpsw  x1, x2, [x27, #248]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G47
+ldr  b1, [x27], #254
+add x0, x27, 1
+ldr  h1, [x27], #254
+add x0, x27, 1
+ldr  s1, [x27], #254
+add x0, x27, 1
+ldr  d1, [x27], #254
+add x0, x27, 1
+ldr  q1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G48
+ldr  b1, [x27, #254]!
+add x0, x27, 1
+ldr  h1, [x27, #254]!
+add x0, x27, 1
+ldr  s1, [x27, #254]!
+add x0, x27, 1
+ldr  d1, [x27, #254]!
+add x0, x27, 1
+ldr  q1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G49
+ldr  w1, [x27], #254
+add x0, x27, 1
+ldr  x1, [x27], #254
+add x0, x27, 1
+ldr  w1, [x27, #254]!
+add x0, x27, 1
+ldr  x1, [x27, #254]!
+add x0, x27, 1
+ldrb  w1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G50
+ldrb  w1, [x27, #254]!
+add x0, x27, 1
+ldrh  w1, [x27], #254
+add x0, x27, 1
+ldrh  w1, [x27, #254]!
+add x0, x27, 1
+ldrsb  w1, [x27], #254
+add x0, x27, 1
+ldrsb  x1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G51
+ldrsb  w1, [x27, #254]!
+add x0, x27, 1
+ldrsb  x1, [x27, #254]!
+add x0, x27, 1
+ldrsh  w1, [x27], #254
+add x0, x27, 1
+ldrsh  x1, [x27], #254
+add x0, x27, 1
+ldrsh  w1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G52
+ldrsh  x1, [x27, #254]!
+add x0, x27, 1
+ldrsw  x1, [x27], #254
+add x0, x27, 1
+ldrsw  x1, [x27, #254]!
+add x0, x27, 1
+st1  { v1.1d }, [x27], #8
+add x0, x27, 1
+st1  { v1.2d }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G53
+st1  { v1.2s }, [x27], #8
+add x0, x27, 1
+st1  { v1.4h }, [x27], #8
+add x0, x27, 1
+st1  { v1.4s }, [x27], #16
+add x0, x27, 1
+st1  { v1.8b }, [x27], #8
+add x0, x27, 1
+st1  { v1.8h }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G54
+st1  { v1.16b }, [x27], #16
+add x0, x27, 1
+st1  { v1.1d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2s }, [x27], x28
+add x0, x27, 1
+st1  { v1.4h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G55
+st1  { v1.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b }, [x27], x28
+add x0, x27, 1
+st1  { v1.8h }, [x27], x28
+add x0, x27, 1
+st1  { v1.16b }, [x27], x28
+add x0, x27, 1
+st1  { v1.1d, v2.1d }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G56
+st1  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+st1  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+st1  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+st1  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+st1  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G57
+st1  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+st1  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+st1  { v1.1d, v2.1d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G58
+st1  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+st1  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+st1  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+st1  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G59
+st1  { v1.1d, v2.1d, v3.1d }, [x27], #24
+add x0, x27, 1
+st1  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+st1  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+st1  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G60
+st1  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+st1  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+st1  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+st1  { v1.1d, v2.1d, v3.1d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G61
+st1  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+st1  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+st1  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G62
+st1  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+st1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+add x0, x27, 1
+st1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+st1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G63
+st1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+st1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+st1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+st1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+st1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G64
+st1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+st1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+st1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+st1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+st1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G65
+st1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+st1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+st1  { v1.b }[0], [x27], #1
+add x0, x27, 1
+st1  { v1.b }[8], [x27], #1
+add x0, x27, 1
+st1  { v1.b }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G66
+st1  { v1.b }[8], [x27], x28
+add x0, x27, 1
+st1  { v1.h }[0], [x27], #2
+add x0, x27, 1
+st1  { v1.h }[4], [x27], #2
+add x0, x27, 1
+st1  { v1.h }[0], [x27], x28
+add x0, x27, 1
+st1  { v1.h }[4], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G67
+st1  { v1.s }[0], [x27], #4
+add x0, x27, 1
+st1  { v1.s }[0], [x27], x28
+add x0, x27, 1
+st1  { v1.d }[0], [x27], #8
+add x0, x27, 1
+st1  { v1.d }[0], [x27], x28
+add x0, x27, 1
+st2  { v1.2d, v2.2d }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G68
+st2  { v1.2s, v2.2s }, [x27], #16
+add x0, x27, 1
+st2  { v1.4h, v2.4h }, [x27], #16
+add x0, x27, 1
+st2  { v1.4s, v2.4s }, [x27], #32
+add x0, x27, 1
+st2  { v1.8b, v2.8b }, [x27], #16
+add x0, x27, 1
+st2  { v1.8h, v2.8h }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G69
+st2  { v1.16b, v2.16b }, [x27], #32
+add x0, x27, 1
+st2  { v1.2d, v2.2d }, [x27], x28
+add x0, x27, 1
+st2  { v1.2s, v2.2s }, [x27], x28
+add x0, x27, 1
+st2  { v1.4h, v2.4h }, [x27], x28
+add x0, x27, 1
+st2  { v1.4s, v2.4s }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G70
+st2  { v1.8b, v2.8b }, [x27], x28
+add x0, x27, 1
+st2  { v1.8h, v2.8h }, [x27], x28
+add x0, x27, 1
+st2  { v1.16b, v2.16b }, [x27], x28
+add x0, x27, 1
+st2  { v1.b, v2.b }[0], [x27], #2
+add x0, x27, 1
+st2  { v1.b, v2.b }[8], [x27], #2
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G71
+st2  { v1.b, v2.b }[0], [x27], x28
+add x0, x27, 1
+st2  { v1.b, v2.b }[8], [x27], x28
+add x0, x27, 1
+st2  { v1.h, v2.h }[0], [x27], #4
+add x0, x27, 1
+st2  { v1.h, v2.h }[4], [x27], #4
+add x0, x27, 1
+st2  { v1.h, v2.h }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G72
+st2  { v1.h, v2.h }[4], [x27], x28
+add x0, x27, 1
+st2  { v1.s, v2.s }[0], [x27], #8
+add x0, x27, 1
+st2  { v1.s, v2.s }[0], [x27], x28
+add x0, x27, 1
+st2  { v1.d, v2.d }[0], [x27], #16
+add x0, x27, 1
+st2  { v1.d, v2.d }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G73
+st3  { v1.2d, v2.2d, v3.2d }, [x27], #48
+add x0, x27, 1
+st3  { v1.2s, v2.2s, v3.2s }, [x27], #24
+add x0, x27, 1
+st3  { v1.4h, v2.4h, v3.4h }, [x27], #24
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G74
+st3  { v1.4s, v2.4s, v3.4s }, [x27], #48
+add x0, x27, 1
+st3  { v1.8b, v2.8b, v3.8b }, [x27], #24
+add x0, x27, 1
+st3  { v1.8h, v2.8h, v3.8h }, [x27], #48
+add x0, x27, 1
+st3  { v1.16b, v2.16b, v3.16b }, [x27], #48
+add x0, x27, 1
+st3  { v1.2d, v2.2d, v3.2d }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G75
+st3  { v1.2s, v2.2s, v3.2s }, [x27], x28
+add x0, x27, 1
+st3  { v1.4h, v2.4h, v3.4h }, [x27], x28
+add x0, x27, 1
+st3  { v1.4s, v2.4s, v3.4s }, [x27], x28
+add x0, x27, 1
+st3  { v1.8b, v2.8b, v3.8b }, [x27], x28
+add x0, x27, 1
+st3  { v1.8h, v2.8h, v3.8h }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G76
+st3  { v1.16b, v2.16b, v3.16b }, [x27], x28
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[0], [x27], #3
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[8], [x27], #3
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[0], [x27], x28
+add x0, x27, 1
+st3  { v1.b, v2.b, v3.b }[8], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G77
+st3  { v1.h, v2.h, v3.h }[0], [x27], #6
+add x0, x27, 1
+st3  { v1.h, v2.h, v3.h }[4], [x27], #6
+add x0, x27, 1
+st3  { v1.h, v2.h, v3.h }[0], [x27], x28
+add x0, x27, 1
+st3  { v1.h, v2.h, v3.h }[4], [x27], x28
+add x0, x27, 1
+st3  { v1.s, v2.s, v3.s }[0], [x27], #12
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G78
+st3  { v1.s, v2.s, v3.s }[0], [x27], x28
+add x0, x27, 1
+st3  { v1.d, v2.d, v3.d }[0], [x27], #24
+add x0, x27, 1
+st3  { v1.d, v2.d, v3.d }[0], [x27], x28
+add x0, x27, 1
+st4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+add x0, x27, 1
+st4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G79
+st4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+add x0, x27, 1
+st4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+add x0, x27, 1
+st4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+add x0, x27, 1
+st4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+add x0, x27, 1
+st4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G80
+st4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+add x0, x27, 1
+st4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+add x0, x27, 1
+st4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+add x0, x27, 1
+st4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+add x0, x27, 1
+st4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G81
+st4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+add x0, x27, 1
+st4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+add x0, x27, 1
+st4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+add x0, x27, 1
+st4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+add x0, x27, 1
+st4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G82
+st4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+add x0, x27, 1
+st4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G83
+st4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+add x0, x27, 1
+st4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+add x0, x27, 1
+st4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+add x0, x27, 1
+st4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G84
+stp  s1, s2, [x27], #248
+add x0, x27, 1
+stp  d1, d2, [x27], #496
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G85
+stp  q1, q2, [x27], #992
+add x0, x27, 1
+stp  s1, s2, [x27, #248]!
+add x0, x27, 1
+stp  d1, d2, [x27, #496]!
+add x0, x27, 1
+stp  q1, q2, [x27, #992]!
+add x0, x27, 1
+stp  w1, w2, [x27], #248
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G86
+stp  x1, x2, [x27], #496
+add x0, x27, 1
+stp  w1, w2, [x27, #248]!
+add x0, x27, 1
+stp  x1, x2, [x27, #496]!
+add x0, x27, 1
+str  b1, [x27], #254
+add x0, x27, 1
+str  h1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G87
+str  s1, [x27], #254
+add x0, x27, 1
+str  d1, [x27], #254
+add x0, x27, 1
+str  q1, [x27], #254
+add x0, x27, 1
+str  b1, [x27, #254]!
+add x0, x27, 1
+str  h1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G88
+str  s1, [x27, #254]!
+add x0, x27, 1
+str  d1, [x27, #254]!
+add x0, x27, 1
+str  q1, [x27, #254]!
+add x0, x27, 1
+str  w1, [x27], #254
+add x0, x27, 1
+str  x1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G89
+str  w1, [x27, #254]!
+add x0, x27, 1
+str  x1, [x27, #254]!
+add x0, x27, 1
+strb  w1, [x27], #254
+add x0, x27, 1
+strb  w1, [x27, #254]!
+add x0, x27, 1
+strh  w1, [x27], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G90
+strh  w1, [x27, #254]!
+add x0, x27, 1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G91
+ldr  x1, [x27], #254
+add x0, x27, 1
+ldr  x2, [x1], #254
+add x0, x27, 1
+# LLVM-MCA-END
+
+# CHECK:      [0] Code Region - G01
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D============eeeeeeER    .    .  .   ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,5]     D==================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D==================eeeeeeER   .  .   ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,7]     D========================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     D========================eeeeeeER.   ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,9]     D==============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: 5.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: 7.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: 9.     1     31.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.0   0.1    0.0       <total>
+
+# CHECK:      [1] Code Region - G02
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D============eeeeeeER    .    .  .   ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,5]     D==================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D==================eeeeeeER   .  .   ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     D========================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     D========================eeeeeeER.   ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     D==============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: 5.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     31.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.0   0.1    0.0       <total>
+
+# CHECK:      [2] Code Region - G03
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D============eeeeeeER    .    .  .   ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     D==================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D==================eeeeeeER   .  .   ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     D========================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     D========================eeeeeeER.   ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     D==============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     31.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.0   0.1    0.0       <total>
+
+# CHECK:      [3] Code Region - G04
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        1900
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.63
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D============eeeeeeER    .    .  .   ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     D==================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D==================eeeeeeER   .  .   ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     D========================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D=======================eeeeeeER.   ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     .D=============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     24.0   0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.8   0.1    0.0       <total>
+
+# CHECK:      [4] Code Region - G05
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.67
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D============eeeeeeER    .    .  .   ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     D==================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D==================eeeeeeER   .  .   ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D=======================eeeeeeER.   ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .D=============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     24.0   0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.7   0.1    0.0       <total>
+
+# CHECK:      [5] Code Region - G06
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.67
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D============eeeeeeER    .    .  .   ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     D==================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D==================eeeeeeER   .  .   ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D=======================eeeeeeER.   ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .D=============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     24.0   0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.7   0.1    0.0       <total>
+
+# CHECK:      [6] Code Region - G07
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        2300
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.77
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 4.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D============eeeeeeER    .    .  .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,5]     D==================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=================eeeeeeER   .  .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D=======================eeeeeeER.   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,9]     .D=============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 5.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     24.0   0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.6   0.1    0.0       <total>
+
+# CHECK:      [7] Code Region - G08
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.83
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D============eeeeeeER    .    .  .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,5]     D==================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=================eeeeeeER   .  .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D=======================eeeeeeER.   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,9]     .D=============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 5.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     24.0   0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.6   0.1    0.0       <total>
+
+# CHECK:      [8] Code Region - G09
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.83
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D============eeeeeeER    .    .  .   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     D==================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=================eeeeeeER   .  .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D=======================eeeeeeER.   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .D=============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     24.0   0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.6   0.1    0.0       <total>
+
+# CHECK:      [9] Code Region - G10
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3104
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.81
+# CHECK-NEXT: IPC:               0.32
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .   .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .   .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     D============eER    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D============eeeeeeER    .    .   .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     D==================eER   .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=================eeeeeeER   .   .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,7]     .D=======================eER  .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D=======================eeeeeeeER.   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,9]     .D==============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     24.0   0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 9.     1     31.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.7   0.1    0.0       <total>
+
+# CHECK:      [10] Code Region - G11
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3204
+# CHECK-NEXT: Total uOps:        2400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: Block RThroughput: 4.7
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .    .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .    .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,3]     D============eER    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D============eeeeeeeER   .    .    .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,5]     D===================eER  .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==================eeeeeeER  .    .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,7]     .D========================eER .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D========================eeeeeeeER.   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,9]     .D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 7.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.2   0.1    0.0       <total>
+
+# CHECK:      [11] Code Region - G12
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3204
+# CHECK-NEXT: Total uOps:        2400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: Block RThroughput: 4.7
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=======eeeeeeER    .    .    .    .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     D=============eER   .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D=============eeeeeeeER  .    .    .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .D===================eER .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D===================eeeeeeER .    .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .D=========================eER.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D=========================eeeeeeER.   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .D===============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     20.0   0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     26.0   0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.9   0.1    0.0       <total>
+
+# CHECK:      [12] Code Region - G13
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total uOps:        2600
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.74
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=======eeeeeeER    .    .    .    .  .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     D=============eER   .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D=============eeeeeeeER  .    .    .  .   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .D===================eER .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D===================eeeeeeeER.    .  .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .D==========================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D==========================eeeeeeeeER.   ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,9]     .D==================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     20.0   0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     27.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     27.0   0.0    0.0       ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: 9.     1     35.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     17.4   0.1    0.0       <total>
+
+# CHECK:      [13] Code Region - G14
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D================eeeeeeeeER   .    .    .  .   ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,5]     D========================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D========================eeeeeeeeER.    .  .   ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===============================eeeeeeeeER.   ld1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,9]     .D=======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     17.0   0.0    0.0       ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: 5.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     25.0   0.0    0.0       ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     32.0   0.0    0.0       ld1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: 9.     1     40.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.7   0.1    0.0       <total>
+
+# CHECK:      [14] Code Region - G15
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D================eeeeeeeeER   .    .    .  .   ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,5]     D========================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D========================eeeeeeeeER.    .  .   ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===============================eeeeeeeeER.   ld1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .D=======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     17.0   0.0    0.0       ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: 5.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     25.0   0.0    0.0       ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     32.0   0.0    0.0       ld1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: 9.     1     40.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.7   0.1    0.0       <total>
+
+# CHECK:      [15] Code Region - G16
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D================eeeeeeeeER   .    .    .  .   ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: [0,5]     D========================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D========================eeeeeeeeER.    .  .   ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===============================eeeeeeeeER.   ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: [0,9]     .D=======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     17.0   0.0    0.0       ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: 5.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     25.0   0.0    0.0       ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     32.0   0.0    0.0       ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: 9.     1     40.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.7   0.1    0.0       <total>
+
+# CHECK:      [16] Code Region - G17
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld1r	{ v1.4s }, [x27], #4
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D================eeeeeeeeER   .    .    .  .   ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: [0,5]     D========================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D========================eeeeeeeeER.    .  .   ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===============================eeeeeeeeER.   ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .D=======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.4s }, [x27], #4
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     17.0   0.0    0.0       ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: 5.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     25.0   0.0    0.0       ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     32.0   0.0    0.0       ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     40.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.7   0.1    0.0       <total>
+
+# CHECK:      [17] Code Region - G18
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld1r	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D================eeeeeeeeER   .    .    .  .   ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     D========================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D========================eeeeeeeeER.    .  .   ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===============================eeeeeeeeER.   ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .D=======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     17.0   0.0    0.0       ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     25.0   0.0    0.0       ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     32.0   0.0    0.0       ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     40.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.7   0.1    0.0       <total>
+
+# CHECK:      [18] Code Region - G19
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld1r	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D================eeeeeeeeER   .    .    .  .   ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     D========================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===============================eeeeeeeeER.   ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     .D=======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     17.0   0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     32.0   0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9.     1     40.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.6   0.1    0.0       <total>
+
+# CHECK:      [19] Code Region - G20
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2900
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.72
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+
+# CHECK:      [20] Code Region - G21
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2700
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.67
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D================eeeeeeeeER   .    .    .  .   ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===============================eeeeeeeeER.   ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .D=======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     17.0   0.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     32.0   0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     40.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.5   0.1    0.0       <total>
+
+# CHECK:      [21] Code Region - G22
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2600
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.65
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D================eeeeeeeeER   .    .    .  .   ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===============================eeeeeeeeER.   ld2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     .D=======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     17.0   0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     32.0   0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 9.     1     40.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.5   0.1    0.0       <total>
+
+# CHECK:      [22] Code Region - G23
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.62
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D================eeeeeeeeER   .    .    .  .   ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     D========================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===============================eeeeeeeeER.   ld2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .D=======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     17.0   0.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 5.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     32.0   0.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 9.     1     40.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.6   0.1    0.0       <total>
+
+# CHECK:      [23] Code Region - G24
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.62
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D================eeeeeeeeER   .    .    .  .   ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     D========================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===============================eeeeeeeeER.   ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     .D=======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     17.0   0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 5.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     32.0   0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: 9.     1     40.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.6   0.1    0.0       <total>
+
+# CHECK:      [24] Code Region - G25
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.62
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2r	{ v1.2s, v2.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D================eeeeeeeeER   .    .    .  .   ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: [0,5]     D========================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===============================eeeeeeeeER.   ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: [0,9]     .D=======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], #8
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     17.0   0.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: 5.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     32.0   0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: 9.     1     40.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.6   0.1    0.0       <total>
+
+# CHECK:      [25] Code Region - G26
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.62
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2r	{ v1.16b, v2.16b }, [x27], #2
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D================eeeeeeeeER   .    .    .  .   ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     D========================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===============================eeeeeeeeER.   ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .D=======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], #2
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     17.0   0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     32.0   0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     40.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.6   0.1    0.0       <total>
+
+# CHECK:      [26] Code Region - G27
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        2800
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.70
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.8
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2r	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D================eeeeeeeeER   .    .    .  .   ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     D========================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===============================eeeeeeeeER.   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,9]     .D=======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     17.0   0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     32.0   0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 9.     1     40.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.6   0.1    0.0       <total>
+
+# CHECK:      [27] Code Region - G28
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        3700
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.92
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+
+# CHECK:      [28] Code Region - G29
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        3800
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.95
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 4.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.1   0.1    0.0       <total>
+
+# CHECK:      [29] Code Region - G30
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        3700
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.92
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+
+# CHECK:      [30] Code Region - G31
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        3500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.87
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 3.8
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+
+# CHECK:      [31] Code Region - G32
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        3500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.87
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 3.8
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+
+# CHECK:      [32] Code Region - G33
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        3500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.87
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 3.8
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+
+# CHECK:      [33] Code Region - G34
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        3500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.87
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 3.8
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+
+# CHECK:      [34] Code Region - G35
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        3500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.87
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 3.8
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+
+# CHECK:      [35] Code Region - G36
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4204
+# CHECK-NEXT: Total uOps:        4500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.07
+# CHECK-NEXT: IPC:               0.24
+# CHECK-NEXT: Block RThroughput: 5.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012345
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .    .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeeER.    .    .    .    .    .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,3]     .D================eER    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D===============eeeeeeeeER  .    .    .    .   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,5]     . D=======================eER .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D======================eeeeeeeeER    .    .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,7]     .  D==============================eER   .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=============================eeeeeeeeeER.   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,9]     .   D======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     23.0   0.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 7.     1     31.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     30.0   0.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.8   0.1    0.0       <total>
+
+# CHECK:      [36] Code Region - G37
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4304
+# CHECK-NEXT: Total uOps:        4800
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.12
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 6.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .    ..   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeeER.    .    .    .    .    ..   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,3]     .D================eER    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D===============eeeeeeeeeER .    .    .    ..   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,5]     . D========================eER.    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=======================eeeeeeeeeER  .    ..   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D================================eER .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D===============================eeeeeeeeER.   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D=======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 5.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 7.     1     33.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     32.0   0.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 9.     1     40.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.5   0.1    0.0       <total>
+
+# CHECK:      [37] Code Region - G38
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4304
+# CHECK-NEXT: Total uOps:        4800
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.12
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 6.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .    ..   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeeER.    .    .    .    .    ..   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .D================eER    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D===============eeeeeeeeER  .    .    .    ..   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     . D=======================eER .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D======================eeeeeeeeeER   .    ..   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D===============================eER  .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==============================eeeeeeeeeER.   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D=======================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     23.0   0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 9.     1     40.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     20.1   0.1    0.0       <total>
+
+# CHECK:      [38] Code Region - G39
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        4500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.12
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+
+# CHECK:      [39] Code Region - G40
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        4500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.12
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+
+# CHECK:      [40] Code Region - G41
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        4500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.12
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+
+# CHECK:      [41] Code Region - G42
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        4500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.12
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+
+# CHECK:      [42] Code Region - G43
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total uOps:        4500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.12
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+
+# CHECK:      [43] Code Region - G44
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3604
+# CHECK-NEXT: Total uOps:        3300
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.92
+# CHECK-NEXT: IPC:               0.28
+# CHECK-NEXT: Block RThroughput: 3.7
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .   .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .   .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .   .   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     . D======================eER  .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D======================eeeeeeER  .   .   ldp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,7]     . D============================eER .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D============================eeeeeeER.   ldp	d1, d2, [x27], #496
+# CHECK-NEXT: [0,9]     . D==================================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     23.0   0.0    0.0       ldp	s1, s2, [x27], #248
+# CHECK-NEXT: 7.     1     29.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ldp	d1, d2, [x27], #496
+# CHECK-NEXT: 9.     1     35.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     18.8   0.1    0.0       <total>
+
+# CHECK:      [44] Code Region - G45
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2804
+# CHECK-NEXT: Total uOps:        1700
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.61
+# CHECK-NEXT: IPC:               0.36
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    ..   ldp	q1, q2, [x27], #992
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    ..   ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,3]     D============eER    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D============eeeeeeER    .    ..   ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,5]     D==================eER   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D==================eeeeeeER   ..   ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,7]     D========================eER  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D=======================eeeeER.   ldp	w1, w2, [x27], #248
+# CHECK-NEXT: [0,9]     .D===========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldp	q1, q2, [x27], #992
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: 5.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: 7.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     24.0   0.0    0.0       ldp	w1, w2, [x27], #248
+# CHECK-NEXT: 9.     1     28.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.6   0.1    0.0       <total>
+
+# CHECK:      [45] Code Region - G46
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1604
+# CHECK-NEXT: Total uOps:        1900
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.18
+# CHECK-NEXT: IPC:               0.62
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    .   .   ldp	x1, x2, [x27], #496
+# CHECK-NEXT: [0,1]     D=eE--R   .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .    .   .   ldp	w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,3]     D=====eER .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D=====eeeeER   .   .   ldp	x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,5]     D======eE--R   .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D======eeeeeER .   .   ldpsw	x1, x2, [x27], #248
+# CHECK-NEXT: [0,7]     D===========eER.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D==========eeeeeER.   ldpsw	x1, x2, [x27, #248]!
+# CHECK-NEXT: [0,9]     .D===============eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldp	x1, x2, [x27], #496
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldp	w1, w2, [x27, #248]!
+# CHECK-NEXT: 3.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     6.0    0.0    0.0       ldp	x1, x2, [x27, #496]!
+# CHECK-NEXT: 5.     1     7.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     7.0    0.0    0.0       ldpsw	x1, x2, [x27], #248
+# CHECK-NEXT: 7.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     11.0   0.0    0.0       ldpsw	x1, x2, [x27, #248]!
+# CHECK-NEXT: 9.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     7.0    0.1    0.4       <total>
+
+# CHECK:      [46] Code Region - G47
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ldr	b1, [x27], #254
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ldr	h1, [x27], #254
+# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D============eeeeeeER    .    .  .   ldr	s1, [x27], #254
+# CHECK-NEXT: [0,5]     D==================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D==================eeeeeeER   .  .   ldr	d1, [x27], #254
+# CHECK-NEXT: [0,7]     D========================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     D========================eeeeeeER.   ldr	q1, [x27], #254
+# CHECK-NEXT: [0,9]     D==============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	b1, [x27], #254
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ldr	h1, [x27], #254
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ldr	s1, [x27], #254
+# CHECK-NEXT: 5.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ldr	d1, [x27], #254
+# CHECK-NEXT: 7.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ldr	q1, [x27], #254
+# CHECK-NEXT: 9.     1     31.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.0   0.1    0.0       <total>
+
+# CHECK:      [47] Code Region - G48
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ldr	b1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ldr	h1, [x27, #254]!
+# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D============eeeeeeER    .    .  .   ldr	s1, [x27, #254]!
+# CHECK-NEXT: [0,5]     D==================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D==================eeeeeeER   .  .   ldr	d1, [x27, #254]!
+# CHECK-NEXT: [0,7]     D========================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     D========================eeeeeeER.   ldr	q1, [x27, #254]!
+# CHECK-NEXT: [0,9]     D==============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	b1, [x27, #254]!
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ldr	h1, [x27, #254]!
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ldr	s1, [x27, #254]!
+# CHECK-NEXT: 5.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ldr	d1, [x27, #254]!
+# CHECK-NEXT: 7.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ldr	q1, [x27, #254]!
+# CHECK-NEXT: 9.     1     31.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.0   0.1    0.0       <total>
+
+# CHECK:      [48] Code Region - G49
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      506
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    2.96
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .   ldr	w1, [x27], #254
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   ldr	x1, [x27], #254
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeER .   ldr	w1, [x27, #254]!
+# CHECK-NEXT: [0,5]     D===eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D===eeeeER.   ldr	x1, [x27, #254]!
+# CHECK-NEXT: [0,7]     D====eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     D====eeeeER   ldrb	w1, [x27], #254
+# CHECK-NEXT: [0,9]     D=====eE--R   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	w1, [x27], #254
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldr	x1, [x27], #254
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ldr	w1, [x27, #254]!
+# CHECK-NEXT: 5.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    0.0       ldr	x1, [x27, #254]!
+# CHECK-NEXT: 7.     1     5.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       ldrb	w1, [x27], #254
+# CHECK-NEXT: 9.     1     6.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.5    0.1    1.0       <total>
+
+# CHECK:      [49] Code Region - G50
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      506
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    2.96
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .   ldrb	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   ldrh	w1, [x27], #254
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeER .   ldrh	w1, [x27, #254]!
+# CHECK-NEXT: [0,5]     D===eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D===eeeeER.   ldrsb	w1, [x27], #254
+# CHECK-NEXT: [0,7]     D====eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     D====eeeeER   ldrsb	x1, [x27], #254
+# CHECK-NEXT: [0,9]     D=====eE--R   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldrb	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldrh	w1, [x27], #254
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ldrh	w1, [x27, #254]!
+# CHECK-NEXT: 5.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    0.0       ldrsb	w1, [x27], #254
+# CHECK-NEXT: 7.     1     5.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       ldrsb	x1, [x27], #254
+# CHECK-NEXT: 9.     1     6.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.5    0.1    1.0       <total>
+
+# CHECK:      [50] Code Region - G51
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      506
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    2.96
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .   ldrsb	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   ldrsb	x1, [x27, #254]!
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeER .   ldrsh	w1, [x27], #254
+# CHECK-NEXT: [0,5]     D===eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D===eeeeER.   ldrsh	x1, [x27], #254
+# CHECK-NEXT: [0,7]     D====eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     D====eeeeER   ldrsh	w1, [x27, #254]!
+# CHECK-NEXT: [0,9]     D=====eE--R   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldrsb	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldrsb	x1, [x27, #254]!
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ldrsh	w1, [x27], #254
+# CHECK-NEXT: 5.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    0.0       ldrsh	x1, [x27], #254
+# CHECK-NEXT: 7.     1     5.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       ldrsh	w1, [x27, #254]!
+# CHECK-NEXT: 9.     1     6.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.5    0.1    1.0       <total>
+
+# CHECK:      [51] Code Region - G52
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      704
+# CHECK-NEXT: Total uOps:        1700
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    2.41
+# CHECK-NEXT: IPC:               1.42
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .   ldrsh	x1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   ldrsw	x1, [x27], #254
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeER .   ldrsw	x1, [x27, #254]!
+# CHECK-NEXT: [0,5]     D===eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D===eeE-R .   st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,7]     D=====eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D====eeER.   st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     .D======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldrsh	x1, [x27, #254]!
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldrsw	x1, [x27], #254
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ldrsw	x1, [x27, #254]!
+# CHECK-NEXT: 5.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    1.0       st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.7    0.1    0.7       <total>
+
+# CHECK:      [52] Code Region - G53
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D====eeER .  .   st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,5]     D======eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D======eeER  .   st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D=======eeER.   st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,9]     .D=========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     5.0    0.0    0.0       st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: 5.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     7.0    0.0    0.0       st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     8.0    0.0    0.0       st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: 9.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.7    0.1    0.0       <total>
+
+# CHECK:      [53] Code Region - G54
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D====eeER .  .   st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     D======eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D======eeER  .   st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D=======eeER.   st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .D=========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     5.0    0.0    0.0       st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     7.0    0.0    0.0       st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     8.0    0.0    0.0       st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.7    0.1    0.0       <total>
+
+# CHECK:      [54] Code Region - G55
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D====eeER .  .   st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     D======eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D======eeER  .   st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D=======eeER.   st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,9]     .D=========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     5.0    0.0    0.0       st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     7.0    0.0    0.0       st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     8.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 9.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.7    0.1    0.0       <total>
+
+# CHECK:      [55] Code Region - G56
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        2400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    2.39
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 3.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D====eeER .  .   st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,5]     D======eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=====eeER  .   st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D=======eeER.   st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,9]     .D=========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     5.0    0.0    0.0       st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 5.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     8.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 9.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.6    0.1    0.0       <total>
+
+# CHECK:      [56] Code Region - G57
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        2600
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    2.59
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D====eeER .  .   st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=====eeER  .   st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D=======eeER.   st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     .D=========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     5.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     8.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 9.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.5    0.1    0.0       <total>
+
+# CHECK:      [57] Code Region - G58
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        2600
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    2.59
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D====eeER .  .   st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     D======eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=====eeER  .   st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D=======eeER.   st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     .D=========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     5.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 5.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     8.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 9.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.6    0.1    0.0       <total>
+
+# CHECK:      [58] Code Region - G59
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        3400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    3.39
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 6.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===eeER .  .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=====eeER  .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======eeER.   st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,9]     . D========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 9.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.2    0.1    0.0       <total>
+
+# CHECK:      [59] Code Region - G60
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        3600
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    3.59
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 6.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===eeER .  .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=====eeER  .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======eeER.   st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     . D========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.2    0.1    0.0       <total>
+
+# CHECK:      [60] Code Region - G61
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        3400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    3.39
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 6.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===eeER .  .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=====eeER  .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======eeER.   st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     . D========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.2    0.1    0.0       <total>
+
+# CHECK:      [61] Code Region - G62
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        3600
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    3.59
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 6.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===eeER .  .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=====eeER  .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,7]     . D======eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======eeER.   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,9]     . D========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 7.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 9.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.1    0.1    0.0       <total>
+
+# CHECK:      [62] Code Region - G63
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        4200
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    4.18
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,3]     .D===eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===eeER .  .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D====eeER  .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,7]     . D======eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======eeER.   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .  D=======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     5.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 7.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.8    0.1    0.0       <total>
+
+# CHECK:      [63] Code Region - G64
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        3800
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    3.78
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 7.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .D===eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===eeER .  .   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D====eeER  .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     . D======eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======eeER.   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .  D=======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     5.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.8    0.1    0.0       <total>
+
+# CHECK:      [64] Code Region - G65
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1604
+# CHECK-NEXT: Total uOps:        3200
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    2.00
+# CHECK-NEXT: IPC:               0.62
+# CHECK-NEXT: Block RThroughput: 5.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .    .   .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     D==eER    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=eeER   .    .   .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .D===eER  .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===eeeeER    .   .   st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,5]     .D=======eER   .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D======eeeeER.   .   st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,7]     . D==========eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==========eeeeER.   st1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     . D==============eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: 5.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     7.0    0.0    0.0       st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: 7.     1     11.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     11.0   0.0    0.0       st1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: 9.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     6.6    0.1    0.0       <total>
+
+# CHECK:      [65] Code Region - G66
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeER   .    .    .  .   st1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D========eeeeER.    .  .   st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,5]     D============eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D============eeeeER .  .   st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===============eeeeER.   st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     .D===================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: 5.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     13.0   0.0    0.0       st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     16.0   0.0    0.0       st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: 9.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     10.7   0.1    0.0       <total>
+
+# CHECK:      [66] Code Region - G67
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total uOps:        2200
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.10
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeER   .    .    .  .   st1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D========eeeeER.    .  .   st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,5]     D============eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D============eeeeER .  .   st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===============eeeeER.   st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,9]     .D===================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: 5.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     13.0   0.0    0.0       st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     16.0   0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 9.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     10.7   0.1    0.0       <total>
+
+# CHECK:      [67] Code Region - G68
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total uOps:        2400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.20
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 3.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeER   .    .    .  .   st2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D========eeeeER.    .  .   st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,5]     D============eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D===========eeeeER .  .   st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===============eeeeER.   st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,9]     .D===================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 5.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     16.0   0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 9.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     10.6   0.1    0.0       <total>
+
+# CHECK:      [68] Code Region - G69
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total uOps:        2600
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.30
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeER   .    .    .  .   st2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D========eeeeER.    .  .   st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .D===========eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D===========eeeeER .  .   st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===============eeeeER.   st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .D===================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     16.0   0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     10.5   0.1    0.0       <total>
+
+# CHECK:      [69] Code Region - G70
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total uOps:        2400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.20
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 3.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeER   .    .    .  .   st2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D========eeeeER.    .  .   st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .D===========eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D===========eeeeER .  .   st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===============eeeeER.   st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,9]     .D===================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     16.0   0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 9.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     10.5   0.1    0.0       <total>
+
+# CHECK:      [70] Code Region - G71
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeER   .    .    .  .   st2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D========eeeeER.    .  .   st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,5]     D============eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D============eeeeER .  .   st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===============eeeeER.   st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .D===================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 5.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     13.0   0.0    0.0       st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     16.0   0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 9.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     10.7   0.1    0.0       <total>
+
+# CHECK:      [71] Code Region - G72
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeER   .    .    .  .   st2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D========eeeeER.    .  .   st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     D============eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D============eeeeER .  .   st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===============eeeeER.   st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .D===================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 5.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     13.0   0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     16.0   0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 9.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     10.7   0.1    0.0       <total>
+
+# CHECK:      [72] Code Region - G73
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      600
+# CHECK-NEXT: Total Cycles:      1304
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.53
+# CHECK-NEXT: IPC:               0.46
+# CHECK-NEXT: Block RThroughput: 3.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    ..   st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,1]     D=====eER .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=====eeeeER   ..   st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,3]     D=========eER  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D========eeeeER.   st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,5]     .D============eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 5.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     7.5    0.2    0.0       <total>
+
+# CHECK:      [73] Code Region - G74
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2404
+# CHECK-NEXT: Total uOps:        3800
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.58
+# CHECK-NEXT: IPC:               0.42
+# CHECK-NEXT: Block RThroughput: 7.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    . .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,1]     D=====eER .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    . .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,3]     D=========eER  .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D========eeeeeER   .    . .   st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,5]     .D=============eER  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=============eeeeeER   . .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,7]     . D=================eER  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D=================eeeeeER.   st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     . D======================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 5.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     14.0   0.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 7.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     18.0   0.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     11.9   0.1    0.0       <total>
+
+# CHECK:      [74] Code Region - G75
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2204
+# CHECK-NEXT: Total uOps:        3400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.54
+# CHECK-NEXT: IPC:               0.45
+# CHECK-NEXT: Block RThroughput: 6.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D====eER  .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D====eeeeER    .    .    .   st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     D========eER   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=======eeeeeER    .    .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .D============eER   .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D============eeeeER.    .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .D================eER    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D===============eeeeeER.   st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     . D====================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     13.0   0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     16.0   0.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     21.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     10.8   0.1    0.0       <total>
+
+# CHECK:      [75] Code Region - G76
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2104
+# CHECK-NEXT: Total uOps:        3200
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.52
+# CHECK-NEXT: IPC:               0.48
+# CHECK-NEXT: Block RThroughput: 5.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .   .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=====eER .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .   .   st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,3]     D=========eER  .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D========eeeeER    .   .   st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,5]     .D============eER   .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D============eeeeER.   .   st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .D================eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D===============eeeeER.   st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     . D===================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 5.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     13.0   0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 7.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     16.0   0.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 9.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     11.1   0.1    0.0       <total>
+
+# CHECK:      [76] Code Region - G77
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total uOps:        3000
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.50
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeER   .    .    .  .   st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=======eeeeER.    .  .   st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .D===========eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D===========eeeeER .  .   st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============eeeeER.   st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,9]     . D==================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 9.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     10.2   0.1    0.0       <total>
+
+# CHECK:      [77] Code Region - G78
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2204
+# CHECK-NEXT: Total uOps:        3600
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.63
+# CHECK-NEXT: IPC:               0.45
+# CHECK-NEXT: Block RThroughput: 6.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .   st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D====eER  .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D====eeeeER    .    .    .   st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,3]     D========eER   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=======eeeeER.    .    .   st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .D===========eER    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D===========eeeeER .    .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,7]     . D==============eER.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============eeeeeeER.   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,9]     . D====================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 7.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 9.     1     21.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     10.3   0.1    0.0       <total>
+
+# CHECK:      [78] Code Region - G79
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3304
+# CHECK-NEXT: Total uOps:        5800
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.76
+# CHECK-NEXT: IPC:               0.30
+# CHECK-NEXT: Block RThroughput: 12.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .    ..   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=====eeeeeeeER    .    .    .    ..   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,3]     .D============eER   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D===========eeeeeeER   .    .    ..   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,5]     . D=================eER  .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D================eeeeeeeER .    ..   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,7]     .  D=======================eER.    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D======================eeeeeeeER.   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,9]     .   D=============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     12.0   0.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     17.0   0.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     23.0   0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.1   0.1    0.0       <total>
+
+# CHECK:      [79] Code Region - G80
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      2904
+# CHECK-NEXT: Total uOps:        4800
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.65
+# CHECK-NEXT: IPC:               0.34
+# CHECK-NEXT: Block RThroughput: 9.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .    . .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     D====eER  .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D===eeeeeeER  .    .    .    . .   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=========eER .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=========eeeeeeER .    .    . .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     . D==============eER.    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D==============eeeeeeeER    . .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     . D=====================eER   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D====================eeeeeeER.   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .  D==========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     4.0    0.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     10.0   0.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     15.0   0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     21.0   0.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     27.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     13.0   0.1    0.0       <total>
+
+# CHECK:      [80] Code Region - G81
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3204
+# CHECK-NEXT: Total uOps:        5200
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.62
+# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: Block RThroughput: 6.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D============eeeeeeER  .    .    .   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,5]     . D==================eER .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D==================eeeeeeER .    .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,7]     .  D=======================eER.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D=======================eeeeeeER.   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .  D=============================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 5.     1     19.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     19.0   0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     24.0   0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.9   0.1    0.0       <total>
+
+# CHECK:      [81] Code Region - G82
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total uOps:        4000
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.33
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,3]     .D===========eER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===========eeeeeeER    .    .  .   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,5]     .D=================eER   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D================eeeeeeER   .  .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     . D======================eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======================eeeeeeER.   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     .  D===========================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     12.0   0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     17.0   0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 7.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     23.0   0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 9.     1     28.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     14.8   0.1    0.0       <total>
+
+# CHECK:      [82] Code Region - G83
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      800
+# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total uOps:        2800
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.40
+# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .  .   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,1]     D======eER.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D======eeeeeeER.    .  .   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .D===========eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===========eeeeER .  .   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,5]     .D===============eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D===============eeeeER.   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .D===================eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     12.0   0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 5.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     16.0   0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 7.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     11.4   0.1    0.0       <total>
+
+# CHECK:      [83] Code Region - G84
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      404
+# CHECK-NEXT: Total uOps:        800
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.98
+# CHECK-NEXT: IPC:               0.99
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     01234567
+
+# CHECK:      [0,0]     DeeER. .   stp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,1]     D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER.   stp	d1, d2, [x27], #496
+# CHECK-NEXT: [0,3]     D====eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       stp	s1, s2, [x27], #248
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       stp	d1, d2, [x27], #496
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.0    0.3    0.0       <total>
+
+# CHECK:      [84] Code Region - G85
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      904
+# CHECK-NEXT: Total uOps:        2200
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    2.43
+# CHECK-NEXT: IPC:               1.11
+# CHECK-NEXT: Block RThroughput: 3.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    . .   stp	q1, q2, [x27], #992
+# CHECK-NEXT: [0,1]     D==eER    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   . .   stp	s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,3]     D====eER  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D====eeER . .   stp	d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,5]     D======eER. .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=====eeER .   stp	q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,7]     .D=======eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D=======eER.   stp	w1, w2, [x27], #248
+# CHECK-NEXT: [0,9]     .D========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       stp	q1, q2, [x27], #992
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       stp	s1, s2, [x27, #248]!
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     5.0    0.0    0.0       stp	d1, d2, [x27, #496]!
+# CHECK-NEXT: 5.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     6.0    0.0    0.0       stp	q1, q2, [x27, #992]!
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     8.0    0.0    0.0       stp	w1, w2, [x27], #248
+# CHECK-NEXT: 9.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.5    0.1    0.0       <total>
+
+# CHECK:      [85] Code Region - G86
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      704
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    2.84
+# CHECK-NEXT: IPC:               1.42
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeER .    .   stp	x1, x2, [x27], #496
+# CHECK-NEXT: [0,1]     D=eER.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eER.    .   stp	w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,3]     D==eER    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eER    .   stp	x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,5]     D===eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D===eeER  .   str	b1, [x27], #254
+# CHECK-NEXT: [0,7]     .D====eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D====eeER.   str	h1, [x27], #254
+# CHECK-NEXT: [0,9]     .D======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       stp	x1, x2, [x27], #496
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       stp	w1, w2, [x27, #248]!
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       stp	x1, x2, [x27, #496]!
+# CHECK-NEXT: 5.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    0.0       str	b1, [x27], #254
+# CHECK-NEXT: 7.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       str	h1, [x27], #254
+# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.6    0.1    0.0       <total>
+
+# CHECK:      [86] Code Region - G87
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .  .   str	s1, [x27], #254
+# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   .  .   str	d1, [x27], #254
+# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D====eeER .  .   str	q1, [x27], #254
+# CHECK-NEXT: [0,5]     D======eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D======eeER  .   str	b1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D=======eeER.   str	h1, [x27, #254]!
+# CHECK-NEXT: [0,9]     .D=========eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       str	s1, [x27], #254
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       str	d1, [x27], #254
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     5.0    0.0    0.0       str	q1, [x27], #254
+# CHECK-NEXT: 5.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     7.0    0.0    0.0       str	b1, [x27, #254]!
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     8.0    0.0    0.0       str	h1, [x27, #254]!
+# CHECK-NEXT: 9.     1     10.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.7    0.1    0.0       <total>
+
+# CHECK:      [87] Code Region - G88
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      804
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    2.49
+# CHECK-NEXT: IPC:               1.24
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    ..   str	s1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D==eER    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   ..   str	d1, [x27, #254]!
+# CHECK-NEXT: [0,3]     D====eER  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D====eeER ..   str	q1, [x27, #254]!
+# CHECK-NEXT: [0,5]     D======eER..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D======eER..   str	w1, [x27], #254
+# CHECK-NEXT: [0,7]     .D======eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D======eER.   str	x1, [x27], #254
+# CHECK-NEXT: [0,9]     .D=======eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       str	s1, [x27, #254]!
+# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       str	d1, [x27, #254]!
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     5.0    0.0    0.0       str	q1, [x27, #254]!
+# CHECK-NEXT: 5.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     7.0    0.0    0.0       str	w1, [x27], #254
+# CHECK-NEXT: 7.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    0.0    0.0       str	x1, [x27], #254
+# CHECK-NEXT: 9.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.3    0.1    0.0       <total>
+
+# CHECK:      [88] Code Region - G89
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1000
+# CHECK-NEXT: Total Cycles:      504
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    3.97
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeER .  .   str	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eER.  .   str	x1, [x27, #254]!
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eER  .   strb	w1, [x27], #254
+# CHECK-NEXT: [0,5]     D===eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     D===eER .   strb	w1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eER.   strh	w1, [x27], #254
+# CHECK-NEXT: [0,9]     .D====eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       str	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       str	x1, [x27, #254]!
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       strb	w1, [x27], #254
+# CHECK-NEXT: 5.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    0.0       strb	w1, [x27, #254]!
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       strh	w1, [x27], #254
+# CHECK-NEXT: 9.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.2    0.1    0.0       <total>
+
+# CHECK:      [89] Code Region - G90
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      104
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    3.85
+# CHECK-NEXT: IPC:               1.92
+# CHECK-NEXT: Block RThroughput: 0.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     01234
+
+# CHECK:      [0,0]     DeER.   strh	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eER   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       strh	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.5    0.5    0.0       <total>
+
+# CHECK:      [90] Code Region - G91
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      110
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    5.45
+# CHECK-NEXT: IPC:               3.64
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .   ldr	x1, [x27], #254
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D====eeeeER   ldr	x2, [x1], #254
+# CHECK-NEXT: [0,3]     D=eE------R   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	x1, [x27], #254
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ldr	x2, [x1], #254
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.5    0.3    2.0       <total>


        


More information about the llvm-commits mailing list