[llvm] 2c1f37b - [test] precommit sched model for tsv110, NFC

via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 12 06:44:51 PDT 2023


Author: zhongyunde 00443407
Date: 2023-10-12T21:42:50+08:00
New Revision: 2c1f37b3b95233c9e8eb9d51b50ac37640919eba

URL: https://github.com/llvm/llvm-project/commit/2c1f37b3b95233c9e8eb9d51b50ac37640919eba
DIFF: https://github.com/llvm/llvm-project/commit/2c1f37b3b95233c9e8eb9d51b50ac37640919eba.diff

LOG: [test] precommit sched model for tsv110, NFC

Added: 
    llvm/test/tools/llvm-mca/AArch64/HiSilicon/tsv110-writeback.s

Modified: 
    

Removed: 
    


################################################################################
diff  --git a/llvm/test/tools/llvm-mca/AArch64/HiSilicon/tsv110-writeback.s b/llvm/test/tools/llvm-mca/AArch64/HiSilicon/tsv110-writeback.s
new file mode 100644
index 000000000000000..8a7022aaca05138
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/HiSilicon/tsv110-writeback.s
@@ -0,0 +1,3959 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=tsv110 --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=1 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN G01
+ld1  { v1.1d }, [x27], #8
+ld1  { v1.2d }, [x27], #16
+ld1  { v1.2s }, [x27], #8
+ld1  { v1.4h }, [x27], #8
+ld1  { v1.4s }, [x27], #16
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G02
+ld1  { v1.8b }, [x27], #8
+ld1  { v1.8h }, [x27], #16
+ld1  { v1.16b }, [x27], #16
+ld1  { v1.1d }, [x27], x28
+ld1  { v1.2d }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G03
+ld1  { v1.2s }, [x27], x28
+ld1  { v1.4h }, [x27], x28
+ld1  { v1.4s }, [x27], x28
+ld1  { v1.8b }, [x27], x28
+ld1  { v1.8h }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G04
+ld1  { v1.16b }, [x27], x28
+ld1  { v1.1d, v2.1d }, [x27], #16
+ld1  { v1.2d, v2.2d }, [x27], #32
+ld1  { v1.2s, v2.2s }, [x27], #16
+ld1  { v1.4h, v2.4h }, [x27], #16
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G05
+ld1  { v1.4s, v2.4s }, [x27], #32
+ld1  { v1.8b, v2.8b }, [x27], #16
+ld1  { v1.8h, v2.8h }, [x27], #32
+ld1  { v1.16b, v2.16b }, [x27], #32
+ld1  { v1.1d, v2.1d }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G06
+ld1  { v1.2d, v2.2d }, [x27], x28
+ld1  { v1.2s, v2.2s }, [x27], x28
+ld1  { v1.4h, v2.4h }, [x27], x28
+ld1  { v1.4s, v2.4s }, [x27], x28
+ld1  { v1.8b, v2.8b }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G07
+ld1  { v1.8h, v2.8h }, [x27], x28
+ld1  { v1.16b, v2.16b }, [x27], x28
+ld1  { v1.1d, v2.1d, v3.1d }, [x27], #24
+ld1  { v1.2d, v2.2d, v3.2d }, [x27], #48
+ld1  { v1.2s, v2.2s, v3.2s }, [x27], #24
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G08
+ld1  { v1.4h, v2.4h, v3.4h }, [x27], #24
+ld1  { v1.4s, v2.4s, v3.4s }, [x27], #48
+ld1  { v1.8b, v2.8b, v3.8b }, [x27], #24
+ld1  { v1.8h, v2.8h, v3.8h }, [x27], #48
+ld1  { v1.16b, v2.16b, v3.16b }, [x27], #48
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G09
+ld1  { v1.1d, v2.1d, v3.1d }, [x27], x28
+ld1  { v1.2d, v2.2d, v3.2d }, [x27], x28
+ld1  { v1.2s, v2.2s, v3.2s }, [x27], x28
+ld1  { v1.4h, v2.4h, v3.4h }, [x27], x28
+ld1  { v1.4s, v2.4s, v3.4s }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G10
+ld1  { v1.8b, v2.8b, v3.8b }, [x27], x28
+ld1  { v1.8h, v2.8h, v3.8h }, [x27], x28
+ld1  { v1.16b, v2.16b, v3.16b }, [x27], x28
+ld1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+ld1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G11
+ld1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+ld1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+ld1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+ld1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+ld1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G12
+ld1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+ld1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+ld1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+ld1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+ld1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G13
+ld1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+ld1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+ld1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+ld1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+ld1  { v1.b }[0], [x27], #1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G14
+ld1  { v1.b }[8], [x27], #1
+ld1  { v1.b }[0], [x27], x28
+ld1  { v1.b }[8], [x27], x28
+ld1  { v1.h }[0], [x27], #2
+ld1  { v1.h }[4], [x27], #2
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G15
+ld1  { v1.h }[0], [x27], x28
+ld1  { v1.h }[4], [x27], x28
+ld1  { v1.s }[0], [x27], #4
+ld1  { v1.s }[0], [x27], x28
+ld1  { v1.d }[0], [x27], #8
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G16
+ld1  { v1.d }[0], [x27], x28
+ld1r  { v1.1d }, [x27], #8
+ld1r  { v1.2d }, [x27], #8
+ld1r  { v1.2s }, [x27], #4
+ld1r  { v1.4h }, [x27], #2
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G17
+ld1r  { v1.4s }, [x27], #4
+ld1r  { v1.8b }, [x27], #1
+ld1r  { v1.8h }, [x27], #2
+ld1r  { v1.16b }, [x27], #1
+ld1r  { v1.1d }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G18
+ld1r  { v1.2d }, [x27], x28
+ld1r  { v1.2s }, [x27], x28
+ld1r  { v1.4h }, [x27], x28
+ld1r  { v1.4s }, [x27], x28
+ld1r  { v1.8b }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G19
+ld1r  { v1.8h }, [x27], x28
+ld1r  { v1.16b }, [x27], x28
+ld2  { v1.2d, v2.2d }, [x27], #32
+ld2  { v1.2s, v2.2s }, [x27], #16
+ld2  { v1.4h, v2.4h }, [x27], #16
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G20
+ld2  { v1.4s, v2.4s }, [x27], #32
+ld2  { v1.8b, v2.8b }, [x27], #16
+ld2  { v1.8h, v2.8h }, [x27], #32
+ld2  { v1.16b, v2.16b }, [x27], #32
+ld2  { v1.2d, v2.2d }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G21
+ld2  { v1.2s, v2.2s }, [x27], x28
+ld2  { v1.4h, v2.4h }, [x27], x28
+ld2  { v1.4s, v2.4s }, [x27], x28
+ld2  { v1.8b, v2.8b }, [x27], x28
+ld2  { v1.8h, v2.8h }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G22
+ld2  { v1.16b, v2.16b }, [x27], x28
+ld2  { v1.b, v2.b }[0], [x27], #2
+ld2  { v1.b, v2.b }[8], [x27], #2
+ld2  { v1.b, v2.b }[0], [x27], x28
+ld2  { v1.b, v2.b }[8], [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G23
+ld2  { v1.h, v2.h }[0], [x27], #4
+ld2  { v1.h, v2.h }[4], [x27], #4
+ld2  { v1.h, v2.h }[0], [x27], x28
+ld2  { v1.h, v2.h }[4], [x27], x28
+ld2  { v1.s, v2.s }[0], [x27], #8
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G24
+ld2  { v1.s, v2.s }[0], [x27], x28
+ld2  { v1.d, v2.d }[0], [x27], #16
+ld2  { v1.d, v2.d }[0], [x27], x28
+ld2r  { v1.1d, v2.1d }, [x27], #16
+ld2r  { v1.2d, v2.2d }, [x27], #16
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G25
+ld2r  { v1.2s, v2.2s }, [x27], #8
+ld2r  { v1.4h, v2.4h }, [x27], #4
+ld2r  { v1.4s, v2.4s }, [x27], #8
+ld2r  { v1.8b, v2.8b }, [x27], #2
+ld2r  { v1.8h, v2.8h }, [x27], #4
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G26
+ld2r  { v1.16b, v2.16b }, [x27], #2
+ld2r  { v1.1d, v2.1d }, [x27], x28
+ld2r  { v1.2d, v2.2d }, [x27], x28
+ld2r  { v1.2s, v2.2s }, [x27], x28
+ld2r  { v1.4h, v2.4h }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G27
+ld2r  { v1.4s, v2.4s }, [x27], x28
+ld2r  { v1.8b, v2.8b }, [x27], x28
+ld2r  { v1.8h, v2.8h }, [x27], x28
+ld2r  { v1.16b, v2.16b }, [x27], x28
+ld3  { v1.2d, v2.2d, v3.2d }, [x27], #48
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G28
+ld3  { v1.2s, v2.2s, v3.2s }, [x27], #24
+ld3  { v1.4h, v2.4h, v3.4h }, [x27], #24
+ld3  { v1.4s, v2.4s, v3.4s }, [x27], #48
+ld3  { v1.8b, v2.8b, v3.8b }, [x27], #24
+ld3  { v1.8h, v2.8h, v3.8h }, [x27], #48
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G29
+ld3  { v1.16b, v2.16b, v3.16b }, [x27], #48
+ld3  { v1.2d, v2.2d, v3.2d }, [x27], x28
+ld3  { v1.2s, v2.2s, v3.2s }, [x27], x28
+ld3  { v1.4h, v2.4h, v3.4h }, [x27], x28
+ld3  { v1.4s, v2.4s, v3.4s }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G30
+ld3  { v1.8b, v2.8b, v3.8b }, [x27], x28
+ld3  { v1.8h, v2.8h, v3.8h }, [x27], x28
+ld3  { v1.16b, v2.16b, v3.16b }, [x27], x28
+ld3  { v1.b, v2.b, v3.b }[0], [x27], #3
+ld3  { v1.b, v2.b, v3.b }[8], [x27], #3
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G31
+ld3  { v1.b, v2.b, v3.b }[0], [x27], x28
+ld3  { v1.b, v2.b, v3.b }[8], [x27], x28
+ld3  { v1.h, v2.h, v3.h }[0], [x27], #6
+ld3  { v1.h, v2.h, v3.h }[4], [x27], #6
+ld3  { v1.h, v2.h, v3.h }[0], [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G32
+ld3  { v1.h, v2.h, v3.h }[4], [x27], x28
+ld3  { v1.s, v2.s, v3.s }[0], [x27], #12
+ld3  { v1.s, v2.s, v3.s }[0], [x27], x28
+ld3  { v1.d, v2.d, v3.d }[0], [x27], #24
+ld3  { v1.d, v2.d, v3.d }[0], [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G33
+ld3r  { v1.1d, v2.1d, v3.1d }, [x27], #24
+ld3r  { v1.2d, v2.2d, v3.2d }, [x27], #24
+ld3r  { v1.2s, v2.2s, v3.2s }, [x27], #12
+ld3r  { v1.4h, v2.4h, v3.4h }, [x27], #6
+ld3r  { v1.4s, v2.4s, v3.4s }, [x27], #12
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G34
+ld3r  { v1.8b, v2.8b, v3.8b }, [x27], #3
+ld3r  { v1.8h, v2.8h, v3.8h }, [x27], #6
+ld3r  { v1.16b, v2.16b, v3.16b }, [x27], #3
+ld3r  { v1.1d, v2.1d, v3.1d }, [x27], x28
+ld3r  { v1.2d, v2.2d, v3.2d }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G35
+ld3r  { v1.2s, v2.2s, v3.2s }, [x27], x28
+ld3r  { v1.4h, v2.4h, v3.4h }, [x27], x28
+ld3r  { v1.4s, v2.4s, v3.4s }, [x27], x28
+ld3r  { v1.8b, v2.8b, v3.8b }, [x27], x28
+ld3r  { v1.8h, v2.8h, v3.8h }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G36
+ld3r  { v1.16b, v2.16b, v3.16b }, [x27], x28
+ld4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+ld4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+ld4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+ld4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G37
+ld4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+ld4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+ld4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+ld4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+ld4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G38
+ld4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+ld4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+ld4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+ld4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+ld4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G39
+ld4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+ld4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+ld4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+ld4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+ld4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G40
+ld4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+ld4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+ld4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+ld4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+ld4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G41
+ld4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+ld4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+ld4r  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+ld4r  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+ld4r  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G42
+ld4r  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+ld4r  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+ld4r  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+ld4r  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+ld4r  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G43
+ld4r  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+ld4r  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+ld4r  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+ld4r  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+ld4r  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G44
+ld4r  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+ld4r  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+ld4r  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+ldp  s1, s2, [x27], #248
+ldp  d1, d2, [x27], #496
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G45
+ldp  q1, q2, [x27], #992
+ldp  s1, s2, [x27, #248]!
+ldp  d1, d2, [x27, #496]!
+ldp  q1, q2, [x27, #992]!
+ldp  w1, w2, [x27], #248
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G46
+ldp  x1, x2, [x27], #496
+ldp  w1, w2, [x27, #248]!
+ldp  x1, x2, [x27, #496]!
+ldpsw  x1, x2, [x27], #248
+ldpsw  x1, x2, [x27, #248]!
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G47
+ldr  b1, [x27], #254
+ldr  h1, [x27], #254
+ldr  s1, [x27], #254
+ldr  d1, [x27], #254
+ldr  q1, [x27], #254
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G48
+ldr  b1, [x27, #254]!
+ldr  h1, [x27, #254]!
+ldr  s1, [x27, #254]!
+ldr  d1, [x27, #254]!
+ldr  q1, [x27, #254]!
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G49
+ldr  w1, [x27], #254
+ldr  x1, [x27], #254
+ldr  w1, [x27, #254]!
+ldr  x1, [x27, #254]!
+ldrb  w1, [x27], #254
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G50
+ldrb  w1, [x27, #254]!
+ldrh  w1, [x27], #254
+ldrh  w1, [x27, #254]!
+ldrsb  w1, [x27], #254
+ldrsb  x1, [x27], #254
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G51
+ldrsb  w1, [x27, #254]!
+ldrsb  x1, [x27, #254]!
+ldrsh  w1, [x27], #254
+ldrsh  x1, [x27], #254
+ldrsh  w1, [x27, #254]!
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G52
+ldrsh  x1, [x27, #254]!
+ldrsw  x1, [x27], #254
+ldrsw  x1, [x27, #254]!
+st1  { v1.1d }, [x27], #8
+st1  { v1.2d }, [x27], #16
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G53
+st1  { v1.2s }, [x27], #8
+st1  { v1.4h }, [x27], #8
+st1  { v1.4s }, [x27], #16
+st1  { v1.8b }, [x27], #8
+st1  { v1.8h }, [x27], #16
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G54
+st1  { v1.16b }, [x27], #16
+st1  { v1.1d }, [x27], x28
+st1  { v1.2d }, [x27], x28
+st1  { v1.2s }, [x27], x28
+st1  { v1.4h }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G55
+st1  { v1.4s }, [x27], x28
+st1  { v1.8b }, [x27], x28
+st1  { v1.8h }, [x27], x28
+st1  { v1.16b }, [x27], x28
+st1  { v1.1d, v2.1d }, [x27], #16
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G56
+st1  { v1.2d, v2.2d }, [x27], #32
+st1  { v1.2s, v2.2s }, [x27], #16
+st1  { v1.4h, v2.4h }, [x27], #16
+st1  { v1.4s, v2.4s }, [x27], #32
+st1  { v1.8b, v2.8b }, [x27], #16
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G57
+st1  { v1.8h, v2.8h }, [x27], #32
+st1  { v1.16b, v2.16b }, [x27], #32
+st1  { v1.1d, v2.1d }, [x27], x28
+st1  { v1.2d, v2.2d }, [x27], x28
+st1  { v1.2s, v2.2s }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G58
+st1  { v1.4h, v2.4h }, [x27], x28
+st1  { v1.4s, v2.4s }, [x27], x28
+st1  { v1.8b, v2.8b }, [x27], x28
+st1  { v1.8h, v2.8h }, [x27], x28
+st1  { v1.16b, v2.16b }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G59
+st1  { v1.1d, v2.1d, v3.1d }, [x27], #24
+st1  { v1.2d, v2.2d, v3.2d }, [x27], #48
+st1  { v1.2s, v2.2s, v3.2s }, [x27], #24
+st1  { v1.4h, v2.4h, v3.4h }, [x27], #24
+st1  { v1.4s, v2.4s, v3.4s }, [x27], #48
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G60
+st1  { v1.8b, v2.8b, v3.8b }, [x27], #24
+st1  { v1.8h, v2.8h, v3.8h }, [x27], #48
+st1  { v1.16b, v2.16b, v3.16b }, [x27], #48
+st1  { v1.1d, v2.1d, v3.1d }, [x27], x28
+st1  { v1.2d, v2.2d, v3.2d }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G61
+st1  { v1.2s, v2.2s, v3.2s }, [x27], x28
+st1  { v1.4h, v2.4h, v3.4h }, [x27], x28
+st1  { v1.4s, v2.4s, v3.4s }, [x27], x28
+st1  { v1.8b, v2.8b, v3.8b }, [x27], x28
+st1  { v1.8h, v2.8h, v3.8h }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G62
+st1  { v1.16b, v2.16b, v3.16b }, [x27], x28
+st1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+st1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+st1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+st1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G63
+st1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+st1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+st1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+st1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+st1  { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G64
+st1  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+st1  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+st1  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+st1  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+st1  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G65
+st1  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+st1  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+st1  { v1.b }[0], [x27], #1
+st1  { v1.b }[8], [x27], #1
+st1  { v1.b }[0], [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G66
+st1  { v1.b }[8], [x27], x28
+st1  { v1.h }[0], [x27], #2
+st1  { v1.h }[4], [x27], #2
+st1  { v1.h }[0], [x27], x28
+st1  { v1.h }[4], [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G67
+st1  { v1.s }[0], [x27], #4
+st1  { v1.s }[0], [x27], x28
+st1  { v1.d }[0], [x27], #8
+st1  { v1.d }[0], [x27], x28
+st2  { v1.2d, v2.2d }, [x27], #32
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G68
+st2  { v1.2s, v2.2s }, [x27], #16
+st2  { v1.4h, v2.4h }, [x27], #16
+st2  { v1.4s, v2.4s }, [x27], #32
+st2  { v1.8b, v2.8b }, [x27], #16
+st2  { v1.8h, v2.8h }, [x27], #32
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G69
+st2  { v1.16b, v2.16b }, [x27], #32
+st2  { v1.2d, v2.2d }, [x27], x28
+st2  { v1.2s, v2.2s }, [x27], x28
+st2  { v1.4h, v2.4h }, [x27], x28
+st2  { v1.4s, v2.4s }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G70
+st2  { v1.8b, v2.8b }, [x27], x28
+st2  { v1.8h, v2.8h }, [x27], x28
+st2  { v1.16b, v2.16b }, [x27], x28
+st2  { v1.b, v2.b }[0], [x27], #2
+st2  { v1.b, v2.b }[8], [x27], #2
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G71
+st2  { v1.b, v2.b }[0], [x27], x28
+st2  { v1.b, v2.b }[8], [x27], x28
+st2  { v1.h, v2.h }[0], [x27], #4
+st2  { v1.h, v2.h }[4], [x27], #4
+st2  { v1.h, v2.h }[0], [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G72
+st2  { v1.h, v2.h }[4], [x27], x28
+st2  { v1.s, v2.s }[0], [x27], #8
+st2  { v1.s, v2.s }[0], [x27], x28
+st2  { v1.d, v2.d }[0], [x27], #16
+st2  { v1.d, v2.d }[0], [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G73
+st3  { v1.2d, v2.2d, v3.2d }, [x27], #48
+st3  { v1.2s, v2.2s, v3.2s }, [x27], #24
+st3  { v1.4h, v2.4h, v3.4h }, [x27], #24
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G74
+st3  { v1.4s, v2.4s, v3.4s }, [x27], #48
+st3  { v1.8b, v2.8b, v3.8b }, [x27], #24
+st3  { v1.8h, v2.8h, v3.8h }, [x27], #48
+st3  { v1.16b, v2.16b, v3.16b }, [x27], #48
+st3  { v1.2d, v2.2d, v3.2d }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G75
+st3  { v1.2s, v2.2s, v3.2s }, [x27], x28
+st3  { v1.4h, v2.4h, v3.4h }, [x27], x28
+st3  { v1.4s, v2.4s, v3.4s }, [x27], x28
+st3  { v1.8b, v2.8b, v3.8b }, [x27], x28
+st3  { v1.8h, v2.8h, v3.8h }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G76
+st3  { v1.16b, v2.16b, v3.16b }, [x27], x28
+st3  { v1.b, v2.b, v3.b }[0], [x27], #3
+st3  { v1.b, v2.b, v3.b }[8], [x27], #3
+st3  { v1.b, v2.b, v3.b }[0], [x27], x28
+st3  { v1.b, v2.b, v3.b }[8], [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G77
+st3  { v1.h, v2.h, v3.h }[0], [x27], #6
+st3  { v1.h, v2.h, v3.h }[4], [x27], #6
+st3  { v1.h, v2.h, v3.h }[0], [x27], x28
+st3  { v1.h, v2.h, v3.h }[4], [x27], x28
+st3  { v1.s, v2.s, v3.s }[0], [x27], #12
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G78
+st3  { v1.s, v2.s, v3.s }[0], [x27], x28
+st3  { v1.d, v2.d, v3.d }[0], [x27], #24
+st3  { v1.d, v2.d, v3.d }[0], [x27], x28
+st4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+st4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G79
+st4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+st4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+st4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+st4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+st4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G80
+st4  { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+st4  { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+st4  { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+st4  { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+st4  { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G81
+st4  { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+st4  { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+st4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+st4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+st4  { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G82
+st4  { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+st4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+st4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+st4  { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+st4  { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G83
+st4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+st4  { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+st4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+st4  { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G84
+stp  s1, s2, [x27], #248
+stp  d1, d2, [x27], #496
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G85
+stp  q1, q2, [x27], #992
+stp  s1, s2, [x27, #248]!
+stp  d1, d2, [x27, #496]!
+stp  q1, q2, [x27, #992]!
+stp  w1, w2, [x27], #248
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G86
+stp  x1, x2, [x27], #496
+stp  w1, w2, [x27, #248]!
+stp  x1, x2, [x27, #496]!
+str  b1, [x27], #254
+str  h1, [x27], #254
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G87
+str  s1, [x27], #254
+str  d1, [x27], #254
+str  q1, [x27], #254
+str  b1, [x27, #254]!
+str  h1, [x27, #254]!
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G88
+str  s1, [x27, #254]!
+str  d1, [x27, #254]!
+str  q1, [x27, #254]!
+str  w1, [x27], #254
+str  x1, [x27], #254
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G89
+str  w1, [x27, #254]!
+str  x1, [x27, #254]!
+strb  w1, [x27], #254
+strb  w1, [x27, #254]!
+strh  w1, [x27], #254
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G90
+strh  w1, [x27, #254]!
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G91
+ldr  x1, [x27], #254
+ldr  x2, [x1], #254
+# LLVM-MCA-END
+
+# CHECK:      [0] Code Region - G01
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.40
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    . .   ld1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    . .   ld1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,2]     .D=========eeeeeER  .    . .   ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,3]     .D==============eeeeeER  . .   ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,4]     . D==================eeeeeER   ld1	{ v1.4s }, [x27], #16
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       ld1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: 2.     1     10.0   0.0    0.0       ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: 4.     1     19.0   0.0    0.0       ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT:        1     10.2   0.2    0.0       <total>
+
+# CHECK:      [1] Code Region - G02
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.40
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    . .   ld1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    . .   ld1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,2]     .D=========eeeeeER  .    . .   ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,3]     .D==============eeeeeER  . .   ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,4]     . D==================eeeeeER   ld1	{ v1.2d }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       ld1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: 2.     1     10.0   0.0    0.0       ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 4.     1     19.0   0.0    0.0       ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT:        1     10.2   0.2    0.0       <total>
+
+# CHECK:      [2] Code Region - G03
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.40
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    . .   ld1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    . .   ld1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,2]     .D=========eeeeeER  .    . .   ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .D==============eeeeeER  . .   ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,4]     . D==================eeeeeER   ld1	{ v1.8h }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       ld1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 2.     1     10.0   0.0    0.0       ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 4.     1     19.0   0.0    0.0       ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT:        1     10.2   0.2    0.0       <total>
+
+# CHECK:      [3] Code Region - G04
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.40
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    . .   ld1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    . .   ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,2]     .D=========eeeeeER  .    . .   ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,3]     .D==============eeeeeER  . .   ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,4]     . D==================eeeeeER   ld1	{ v1.4h, v2.4h }, [x27], #16
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 2.     1     10.0   0.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 4.     1     19.0   0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT:        1     10.2   0.2    0.0       <total>
+
+# CHECK:      [4] Code Region - G05
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.40
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    . .   ld1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    . .   ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,2]     .D=========eeeeeER  .    . .   ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,3]     .D==============eeeeeER  . .   ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,4]     . D==================eeeeeER   ld1	{ v1.1d, v2.1d }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 2.     1     10.0   0.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 4.     1     19.0   0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT:        1     10.2   0.2    0.0       <total>
+
+# CHECK:      [5] Code Region - G06
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.40
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    . .   ld1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    . .   ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,2]     .D=========eeeeeER  .    . .   ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D==============eeeeeER  . .   ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,4]     . D==================eeeeeER   ld1	{ v1.8b, v2.8b }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 2.     1     10.0   0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 4.     1     19.0   0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT:        1     10.2   0.2    0.0       <total>
+
+# CHECK:      [6] Code Region - G07
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2803
+# CHECK-NEXT: Total uOps:        1600
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.57
+# CHECK-NEXT: IPC:               0.18
+# CHECK-NEXT: Block RThroughput: 5.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   ld1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .   ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,2]     .D=========eeeeeeER .    .    .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,3]     . D==============eeeeeeER.    .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,4]     .  D===================eeeeeeER   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 2.     1     10.0   0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 4.     1     20.0   0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT:        1     10.4   0.2    0.0       <total>
+
+# CHECK:      [7] Code Region - G08
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.67
+# CHECK-NEXT: IPC:               0.17
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    . .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,1]     .D=====eeeeeeER.    .    .    . .   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,2]     . D==========eeeeeeER    .    . .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,3]     .  D===============eeeeeeER   . .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,4]     .   D====================eeeeeeER   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 2.     1     11.0   0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 4.     1     21.0   0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT:        1     11.0   0.2    0.0       <total>
+
+# CHECK:      [8] Code Region - G09
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.67
+# CHECK-NEXT: IPC:               0.17
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    . .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     .D=====eeeeeeER.    .    .    . .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,2]     . D==========eeeeeeER    .    . .   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D===============eeeeeeER   . .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,4]     .   D====================eeeeeeER   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 2.     1     11.0   0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 4.     1     21.0   0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT:        1     11.0   0.2    0.0       <total>
+
+# CHECK:      [9] Code Region - G10
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1800
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.17
+# CHECK-NEXT: Block RThroughput: 6.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    . .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .D=====eeeeeeER.    .    .    . .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,2]     . D==========eeeeeeER    .    . .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D===============eeeeeeER   . .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,4]     .   D====================eeeeeeER   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 2.     1     11.0   0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 4.     1     21.0   0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT:        1     11.0   0.2    0.0       <total>
+
+# CHECK:      [10] Code Region - G11
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.17
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    . .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,1]     .D=====eeeeeeER.    .    .    . .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,2]     . D==========eeeeeeER    .    . .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,3]     .  D===============eeeeeeER   . .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,4]     .   D====================eeeeeeER   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 2.     1     11.0   0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 4.     1     21.0   0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT:        1     11.0   0.2    0.0       <total>
+
+# CHECK:      [11] Code Region - G12
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.17
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    . .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,1]     .D=====eeeeeeER.    .    .    . .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,2]     . D==========eeeeeeER    .    . .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D===============eeeeeeER   . .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,4]     .   D====================eeeeeeER   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 2.     1     11.0   0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 4.     1     21.0   0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT:        1     11.0   0.2    0.0       <total>
+
+# CHECK:      [12] Code Region - G13
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3103
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.48
+# CHECK-NEXT: IPC:               0.16
+# CHECK-NEXT: Block RThroughput: 4.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .D=====eeeeeeER.    .    .    .  .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,2]     . D==========eeeeeeER    .    .  .   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D===============eeeeeeER   .  .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,4]     .   D====================eeeeeeeER   ld1	{ v1.b }[0], [x27], #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 2.     1     11.0   0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     16.0   0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 4.     1     21.0   0.0    0.0       ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT:        1     11.0   0.2    0.0       <total>
+
+# CHECK:      [13] Code Region - G14
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3503
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.14
+# CHECK-NEXT: Block RThroughput: 3.8
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    . .   ld1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,1]     .D======eeeeeeeER   .    .    .    . .   ld1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,2]     . D============eeeeeeeER .    .    . .   ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     .  D==================eeeeeeeER    . .   ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,4]     .   D========================eeeeeeeER   ld1	{ v1.h }[4], [x27], #2
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [14] Code Region - G15
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3503
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.14
+# CHECK-NEXT: Block RThroughput: 3.8
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    . .   ld1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .D======eeeeeeeER   .    .    .    . .   ld1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,2]     . D============eeeeeeeER .    .    . .   ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,3]     .  D==================eeeeeeeER    . .   ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,4]     .   D========================eeeeeeeER   ld1	{ v1.d }[0], [x27], #8
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [15] Code Region - G16
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3503
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.14
+# CHECK-NEXT: Block RThroughput: 3.8
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    . .   ld1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .D======eeeeeeeER   .    .    .    . .   ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,2]     . D============eeeeeeeER .    .    . .   ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: [0,3]     .  D==================eeeeeeeER    . .   ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: [0,4]     .   D========================eeeeeeeER   ld1r	{ v1.4h }, [x27], #2
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [16] Code Region - G17
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3503
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.14
+# CHECK-NEXT: Block RThroughput: 3.8
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    . .   ld1r	{ v1.4s }, [x27], #4
+# CHECK-NEXT: [0,1]     .D======eeeeeeeER   .    .    .    . .   ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: [0,2]     . D============eeeeeeeER .    .    . .   ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: [0,3]     .  D==================eeeeeeeER    . .   ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: [0,4]     .   D========================eeeeeeeER   ld1r	{ v1.1d }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.4s }, [x27], #4
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [17] Code Region - G18
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3503
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.14
+# CHECK-NEXT: Block RThroughput: 3.8
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    . .   ld1r	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .D======eeeeeeeER   .    .    .    . .   ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,2]     . D============eeeeeeeER .    .    . .   ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D==================eeeeeeeER    . .   ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,4]     .   D========================eeeeeeeER   ld1r	{ v1.8b }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [18] Code Region - G19
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3503
+# CHECK-NEXT: Total uOps:        1800
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.51
+# CHECK-NEXT: IPC:               0.14
+# CHECK-NEXT: Block RThroughput: 4.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    . .   ld1r	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .D======eeeeeeeER   .    .    .    . .   ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,2]     . D============eeeeeeeER .    .    . .   ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,3]     .  D==================eeeeeeeER    . .   ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,4]     .   D========================eeeeeeeER   ld2	{ v1.4h, v2.4h }, [x27], #16
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [19] Code Region - G20
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3503
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.57
+# CHECK-NEXT: IPC:               0.14
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    . .   ld2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     .D======eeeeeeeER   .    .    .    . .   ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,2]     . D============eeeeeeeER .    .    . .   ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,3]     .  D==================eeeeeeeER    . .   ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,4]     .   D========================eeeeeeeER   ld2	{ v1.2d, v2.2d }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [20] Code Region - G21
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3503
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.57
+# CHECK-NEXT: IPC:               0.14
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    . .   ld2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .D======eeeeeeeER   .    .    .    . .   ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,2]     . D============eeeeeeeER .    .    . .   ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D==================eeeeeeeER    . .   ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,4]     .   D========================eeeeeeeER   ld2	{ v1.8h, v2.8h }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [21] Code Region - G22
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3503
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.57
+# CHECK-NEXT: IPC:               0.14
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    . .   ld2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .D======eeeeeeeER   .    .    .    . .   ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,2]     . D============eeeeeeeER .    .    . .   ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,3]     .  D==================eeeeeeeER    . .   ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,4]     .   D========================eeeeeeeER   ld2	{ v1.b, v2.b }[8], [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [22] Code Region - G23
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3503
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.57
+# CHECK-NEXT: IPC:               0.14
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    . .   ld2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,1]     .D======eeeeeeeER   .    .    .    . .   ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,2]     . D============eeeeeeeER .    .    . .   ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .  D==================eeeeeeeER    . .   ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,4]     .   D========================eeeeeeeER   ld2	{ v1.s, v2.s }[0], [x27], #8
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [23] Code Region - G24
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3503
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.57
+# CHECK-NEXT: IPC:               0.14
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    . .   ld2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .D======eeeeeeeER   .    .    .    . .   ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,2]     . D============eeeeeeeER .    .    . .   ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .  D==================eeeeeeeER    . .   ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,4]     .   D========================eeeeeeeER   ld2r	{ v1.2d, v2.2d }, [x27], #16
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [24] Code Region - G25
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3503
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.57
+# CHECK-NEXT: IPC:               0.14
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    . .   ld2r	{ v1.2s, v2.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     .D======eeeeeeeER   .    .    .    . .   ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: [0,2]     . D============eeeeeeeER .    .    . .   ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: [0,3]     .  D==================eeeeeeeER    . .   ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: [0,4]     .   D========================eeeeeeeER   ld2r	{ v1.8h, v2.8h }, [x27], #4
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], #8
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [25] Code Region - G26
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3503
+# CHECK-NEXT: Total uOps:        2000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.57
+# CHECK-NEXT: IPC:               0.14
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    . .   ld2r	{ v1.16b, v2.16b }, [x27], #2
+# CHECK-NEXT: [0,1]     .D======eeeeeeeER   .    .    .    . .   ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,2]     . D============eeeeeeeER .    .    . .   ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D==================eeeeeeeER    . .   ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,4]     .   D========================eeeeeeeER   ld2r	{ v1.4h, v2.4h }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], #2
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [26] Code Region - G27
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3603
+# CHECK-NEXT: Total uOps:        2100
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.58
+# CHECK-NEXT: IPC:               0.14
+# CHECK-NEXT: Block RThroughput: 5.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld2r	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .D======eeeeeeeER   .    .    .    .  .   ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,2]     . D============eeeeeeeER .    .    .  .   ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D==================eeeeeeeER    .  .   ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,4]     .   D========================eeeeeeeeER   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [27] Code Region - G28
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      4003
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.62
+# CHECK-NEXT: IPC:               0.12
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,1]     . D======eeeeeeeeER .    .    .    .    . .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,2]     .   D============eeeeeeeeER   .    .    . .   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,3]     .    .D==================eeeeeeeeER.    . .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,4]     .    .  D========================eeeeeeeeER   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [28] Code Region - G29
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      4003
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.62
+# CHECK-NEXT: IPC:               0.12
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,1]     . D======eeeeeeeeER .    .    .    .    . .   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,2]     .   D============eeeeeeeeER   .    .    . .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .D==================eeeeeeeeER.    . .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,4]     .    .  D========================eeeeeeeeER   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [29] Code Region - G30
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      4003
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.62
+# CHECK-NEXT: IPC:               0.12
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     . D======eeeeeeeeER .    .    .    .    . .   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,2]     .   D============eeeeeeeeER   .    .    . .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .D==================eeeeeeeeER.    . .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,4]     .    .  D========================eeeeeeeeER   ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [30] Code Region - G31
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      4003
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.62
+# CHECK-NEXT: IPC:               0.12
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     . D======eeeeeeeeER .    .    .    .    . .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,2]     .   D============eeeeeeeeER   .    .    . .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,3]     .    .D==================eeeeeeeeER.    . .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,4]     .    .  D========================eeeeeeeeER   ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [31] Code Region - G32
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      4003
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.62
+# CHECK-NEXT: IPC:               0.12
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     . D======eeeeeeeeER .    .    .    .    . .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,2]     .   D============eeeeeeeeER   .    .    . .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    .D==================eeeeeeeeER.    . .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,4]     .    .  D========================eeeeeeeeER   ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [32] Code Region - G33
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      4003
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.62
+# CHECK-NEXT: IPC:               0.12
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     . D======eeeeeeeeER .    .    .    .    . .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: [0,2]     .   D============eeeeeeeeER   .    .    . .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: [0,3]     .    .D==================eeeeeeeeER.    . .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: [0,4]     .    .  D========================eeeeeeeeER   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [33] Code Region - G34
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      4003
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.62
+# CHECK-NEXT: IPC:               0.12
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: [0,1]     . D======eeeeeeeeER .    .    .    .    . .   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: [0,2]     .   D============eeeeeeeeER   .    .    . .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: [0,3]     .    .D==================eeeeeeeeER.    . .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,4]     .    .  D========================eeeeeeeeER   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [34] Code Region - G35
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      4003
+# CHECK-NEXT: Total uOps:        2500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.62
+# CHECK-NEXT: IPC:               0.12
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     . D======eeeeeeeeER .    .    .    .    . .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,2]     .   D============eeeeeeeeER   .    .    . .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .D==================eeeeeeeeER.    . .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,4]     .    .  D========================eeeeeeeeER   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [35] Code Region - G36
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      4803
+# CHECK-NEXT: Total uOps:        4100
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.85
+# CHECK-NEXT: IPC:               0.10
+# CHECK-NEXT: Block RThroughput: 10.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .    .    .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     . D======eeeeeeeeeeER    .    .    .    .    .    .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,2]     .    D=============eeeeeeeeeeER    .    .    .    .   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,3]     .    .  D====================eeeeeeeeeeER    .    .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,4]     .    .    .D===========================eeeeeeeeeeER   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 2.     1     14.0   0.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 3.     1     21.0   0.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 4.     1     28.0   0.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT:        1     14.2   0.2    0.0       <total>
+
+# CHECK:      [36] Code Region - G37
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      5003
+# CHECK-NEXT: Total uOps:        4500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.90
+# CHECK-NEXT: IPC:               0.10
+# CHECK-NEXT: Block RThroughput: 11.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeeeER  .    .    .    .    .    .    .    . .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,1]     .  D=======eeeeeeeeeeER  .    .    .    .    .    . .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,2]     .    .D==============eeeeeeeeeeER  .    .    .    . .   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,3]     .    .   D=====================eeeeeeeeeeER  .    . .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,4]     .    .    . D============================eeeeeeeeeeER   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 2.     1     15.0   0.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 3.     1     22.0   0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 4.     1     29.0   0.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT:        1     15.0   0.2    0.0       <total>
+
+# CHECK:      [37] Code Region - G38
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      5003
+# CHECK-NEXT: Total uOps:        4500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.90
+# CHECK-NEXT: IPC:               0.10
+# CHECK-NEXT: Block RThroughput: 11.3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeeeER  .    .    .    .    .    .    .    . .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     .  D=======eeeeeeeeeeER  .    .    .    .    .    . .   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,2]     .    .D==============eeeeeeeeeeER  .    .    .    . .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   D=====================eeeeeeeeeeER  .    . .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,4]     .    .    . D============================eeeeeeeeeeER   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 2.     1     15.0   0.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     22.0   0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 4.     1     29.0   0.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT:        1     15.0   0.2    0.0       <total>
+
+# CHECK:      [38] Code Region - G39
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      4003
+# CHECK-NEXT: Total uOps:        3000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.12
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,1]     . D======eeeeeeeeER .    .    .    .    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,2]     .   D============eeeeeeeeER   .    .    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    .D==================eeeeeeeeER.    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,4]     .    .  D========================eeeeeeeeER   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [39] Code Region - G40
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      4003
+# CHECK-NEXT: Total uOps:        3000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.12
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,1]     . D======eeeeeeeeER .    .    .    .    . .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,2]     .   D============eeeeeeeeER   .    .    . .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,3]     .    .D==================eeeeeeeeER.    . .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,4]     .    .  D========================eeeeeeeeER   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [40] Code Region - G41
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      4003
+# CHECK-NEXT: Total uOps:        3000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.12
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,1]     . D======eeeeeeeeER .    .    .    .    . .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,2]     .   D============eeeeeeeeER   .    .    . .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,3]     .    .D==================eeeeeeeeER.    . .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: [0,4]     .    .  D========================eeeeeeeeER   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [41] Code Region - G42
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      4003
+# CHECK-NEXT: Total uOps:        3000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.12
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: [0,1]     . D======eeeeeeeeER .    .    .    .    . .   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: [0,2]     .   D============eeeeeeeeER   .    .    . .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: [0,3]     .    .D==================eeeeeeeeER.    . .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: [0,4]     .    .  D========================eeeeeeeeER   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [42] Code Region - G43
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      4003
+# CHECK-NEXT: Total uOps:        3000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.12
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     . D======eeeeeeeeER .    .    .    .    . .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,2]     .   D============eeeeeeeeER   .    .    . .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .D==================eeeeeeeeER.    . .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,4]     .    .  D========================eeeeeeeeER   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
+
+# CHECK:      [43] Code Region - G44
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3403
+# CHECK-NEXT: Total uOps:        2400
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.71
+# CHECK-NEXT: IPC:               0.15
+# CHECK-NEXT: Block RThroughput: 6.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    ..   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     . D======eeeeeeeeER .    .    .    ..   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,2]     .   D============eeeeeeeeER   .    ..   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .D==================eeeeeER   ..   ldp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,4]     .    . D======================eeeeeER   ldp	d1, d2, [x27], #496
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ldp	s1, s2, [x27], #248
+# CHECK-NEXT: 4.     1     23.0   0.0    0.0       ldp	d1, d2, [x27], #496
+# CHECK-NEXT:        1     12.6   0.2    0.0       <total>
+
+# CHECK:      [44] Code Region - G45
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2403
+# CHECK-NEXT: Total uOps:        1600
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.67
+# CHECK-NEXT: IPC:               0.21
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    ..   ldp	q1, q2, [x27], #992
+# CHECK-NEXT: [0,1]     .D====eeeeeER  .    .    ..   ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,2]     . D========eeeeeER  .    ..   ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,3]     .  D============eeeeeER  ..   ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,4]     .   D================eeeeER   ldp	w1, w2, [x27], #248
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldp	q1, q2, [x27], #992
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: 3.     1     13.0   0.0    0.0       ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: 4.     1     17.0   0.0    0.0       ldp	w1, w2, [x27], #248
+# CHECK-NEXT:        1     9.0    0.2    0.0       <total>
+
+# CHECK:      [45] Code Region - G46
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2003
+# CHECK-NEXT: Total uOps:        1800
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.90
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 4.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012
+
+# CHECK:      [0,0]     DeeeeER   .    .    . .   ldp	x1, x2, [x27], #496
+# CHECK-NEXT: [0,1]     .D===eeeeER    .    . .   ldp	w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,2]     . D======eeeeER.    . .   ldp	x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,3]     .  D=========eeeeER . .   ldpsw	x1, x2, [x27], #248
+# CHECK-NEXT: [0,4]     .   D============eeeeER   ldpsw	x1, x2, [x27, #248]!
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldp	x1, x2, [x27], #496
+# CHECK-NEXT: 1.     1     4.0    0.0    0.0       ldp	w1, w2, [x27, #248]!
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ldp	x1, x2, [x27, #496]!
+# CHECK-NEXT: 3.     1     10.0   0.0    0.0       ldpsw	x1, x2, [x27], #248
+# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ldpsw	x1, x2, [x27, #248]!
+# CHECK-NEXT:        1     7.0    0.2    0.0       <total>
+
+# CHECK:      [46] Code Region - G47
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.40
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    . .   ldr	b1, [x27], #254
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    . .   ldr	h1, [x27], #254
+# CHECK-NEXT: [0,2]     .D=========eeeeeER  .    . .   ldr	s1, [x27], #254
+# CHECK-NEXT: [0,3]     .D==============eeeeeER  . .   ldr	d1, [x27], #254
+# CHECK-NEXT: [0,4]     . D==================eeeeeER   ldr	q1, [x27], #254
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	b1, [x27], #254
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       ldr	h1, [x27], #254
+# CHECK-NEXT: 2.     1     10.0   0.0    0.0       ldr	s1, [x27], #254
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       ldr	d1, [x27], #254
+# CHECK-NEXT: 4.     1     19.0   0.0    0.0       ldr	q1, [x27], #254
+# CHECK-NEXT:        1     10.2   0.2    0.0       <total>
+
+# CHECK:      [47] Code Region - G48
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.40
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    . .   ldr	b1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    . .   ldr	h1, [x27, #254]!
+# CHECK-NEXT: [0,2]     .D=========eeeeeER  .    . .   ldr	s1, [x27, #254]!
+# CHECK-NEXT: [0,3]     .D==============eeeeeER  . .   ldr	d1, [x27, #254]!
+# CHECK-NEXT: [0,4]     . D==================eeeeeER   ldr	q1, [x27, #254]!
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	b1, [x27, #254]!
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       ldr	h1, [x27, #254]!
+# CHECK-NEXT: 2.     1     10.0   0.0    0.0       ldr	s1, [x27, #254]!
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       ldr	d1, [x27, #254]!
+# CHECK-NEXT: 4.     1     19.0   0.0    0.0       ldr	q1, [x27, #254]!
+# CHECK-NEXT:        1     10.2   0.2    0.0       <total>
+
+# CHECK:      [48] Code Region - G49
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2003
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012
+
+# CHECK:      [0,0]     DeeeeER   .    .    . .   ldr	w1, [x27], #254
+# CHECK-NEXT: [0,1]     D====eeeeER    .    . .   ldr	x1, [x27], #254
+# CHECK-NEXT: [0,2]     .D=======eeeeER.    . .   ldr	w1, [x27, #254]!
+# CHECK-NEXT: [0,3]     .D===========eeeeER . .   ldr	x1, [x27, #254]!
+# CHECK-NEXT: [0,4]     . D==============eeeeER   ldrb	w1, [x27], #254
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	w1, [x27], #254
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       ldr	x1, [x27], #254
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ldr	w1, [x27, #254]!
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       ldr	x1, [x27, #254]!
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ldrb	w1, [x27], #254
+# CHECK-NEXT:        1     8.2    0.2    0.0       <total>
+
+# CHECK:      [49] Code Region - G50
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2003
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012
+
+# CHECK:      [0,0]     DeeeeER   .    .    . .   ldrb	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D====eeeeER    .    . .   ldrh	w1, [x27], #254
+# CHECK-NEXT: [0,2]     .D=======eeeeER.    . .   ldrh	w1, [x27, #254]!
+# CHECK-NEXT: [0,3]     .D===========eeeeER . .   ldrsb	w1, [x27], #254
+# CHECK-NEXT: [0,4]     . D==============eeeeER   ldrsb	x1, [x27], #254
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldrb	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       ldrh	w1, [x27], #254
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ldrh	w1, [x27, #254]!
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       ldrsb	w1, [x27], #254
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ldrsb	x1, [x27], #254
+# CHECK-NEXT:        1     8.2    0.2    0.0       <total>
+
+# CHECK:      [50] Code Region - G51
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2003
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012
+
+# CHECK:      [0,0]     DeeeeER   .    .    . .   ldrsb	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D====eeeeER    .    . .   ldrsb	x1, [x27, #254]!
+# CHECK-NEXT: [0,2]     .D=======eeeeER.    . .   ldrsh	w1, [x27], #254
+# CHECK-NEXT: [0,3]     .D===========eeeeER . .   ldrsh	x1, [x27], #254
+# CHECK-NEXT: [0,4]     . D==============eeeeER   ldrsh	w1, [x27, #254]!
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldrsb	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       ldrsb	x1, [x27, #254]!
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ldrsh	w1, [x27], #254
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       ldrsh	x1, [x27], #254
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ldrsh	w1, [x27, #254]!
+# CHECK-NEXT:        1     8.2    0.2    0.0       <total>
+
+# CHECK:      [51] Code Region - G52
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      1803
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.55
+# CHECK-NEXT: IPC:               0.28
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeeeER   .    .    .   ldrsh	x1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D====eeeeER    .    .   ldrsw	x1, [x27], #254
+# CHECK-NEXT: [0,2]     .D=======eeeeER.    .   ldrsw	x1, [x27, #254]!
+# CHECK-NEXT: [0,3]     .D===========eeeER  .   st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,4]     . D=============eeeER   st1	{ v1.2d }, [x27], #16
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldrsh	x1, [x27, #254]!
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       ldrsw	x1, [x27], #254
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ldrsw	x1, [x27, #254]!
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 4.     1     14.0   0.0    0.0       st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT:        1     8.0    0.2    0.0       <total>
+
+# CHECK:      [52] Code Region - G53
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.67
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01234567
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeER    .    . .   st1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     D===eeeER .    . .   st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,2]     .D=====eeeER   . .   st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,3]     .D========eeeER. .   st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,4]     . D==========eeeER   st1	{ v1.8h }, [x27], #16
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: 1.     1     4.0    0.0    0.0       st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT:        1     6.2    0.2    0.0       <total>
+
+# CHECK:      [53] Code Region - G54
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.67
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01234567
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeER    .    . .   st1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,1]     D===eeeER .    . .   st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,2]     .D=====eeeER   . .   st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D========eeeER. .   st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,4]     . D==========eeeER   st1	{ v1.4h }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: 1.     1     4.0    0.0    0.0       st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT:        1     6.2    0.2    0.0       <total>
+
+# CHECK:      [54] Code Region - G55
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.62
+# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .  .   st1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     D===eeeER .    .  .   st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,2]     .D=====eeeER   .  .   st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D========eeeER.  .   st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,4]     . D==========eeeeER   st1	{ v1.1d, v2.1d }, [x27], #16
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 1.     1     4.0    0.0    0.0       st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT:        1     6.2    0.2    0.0       <total>
+
+# CHECK:      [55] Code Region - G56
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2003
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012
+
+# CHECK:      [0,0]     DeeeeER   .    .    . .   st1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,1]     D====eeeeER    .    . .   st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,2]     .D=======eeeeER.    . .   st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,3]     .D===========eeeeER . .   st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,4]     . D==============eeeeER   st1	{ v1.8b, v2.8b }, [x27], #16
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT:        1     8.2    0.2    0.0       <total>
+
+# CHECK:      [56] Code Region - G57
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2003
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012
+
+# CHECK:      [0,0]     DeeeeER   .    .    . .   st1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,1]     D====eeeeER    .    . .   st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,2]     .D=======eeeeER.    . .   st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D===========eeeeER . .   st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,4]     . D==============eeeeER   st1	{ v1.2s, v2.2s }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT:        1     8.2    0.2    0.0       <total>
+
+# CHECK:      [57] Code Region - G58
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2003
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012
+
+# CHECK:      [0,0]     DeeeeER   .    .    . .   st1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     D====eeeeER    .    . .   st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,2]     .D=======eeeeER.    . .   st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .D===========eeeeER . .   st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,4]     . D==============eeeeER   st1	{ v1.16b, v2.16b }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT:        1     8.2    0.2    0.0       <total>
+
+# CHECK:      [58] Code Region - G59
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.40
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    . .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    . .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,2]     .D=========eeeeeER  .    . .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,3]     .D==============eeeeeER  . .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,4]     . D==================eeeeeER   st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 2.     1     10.0   0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 4.     1     19.0   0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT:        1     10.2   0.2    0.0       <total>
+
+# CHECK:      [59] Code Region - G60
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.40
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    . .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    . .   st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,2]     .D=========eeeeeER  .    . .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,3]     .D==============eeeeeER  . .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,4]     . D==================eeeeeER   st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 2.     1     10.0   0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 4.     1     19.0   0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT:        1     10.2   0.2    0.0       <total>
+
+# CHECK:      [60] Code Region - G61
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.40
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    . .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    . .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,2]     .D=========eeeeeER  .    . .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .D==============eeeeeER  . .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,4]     . D==================eeeeeER   st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 2.     1     10.0   0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 4.     1     19.0   0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT:        1     10.2   0.2    0.0       <total>
+
+# CHECK:      [61] Code Region - G62
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2903
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.34
+# CHECK-NEXT: IPC:               0.17
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    ..   st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=====eeeeeeER .    .    .    ..   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,2]     .D==========eeeeeeER.    .    ..   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,3]     .D================eeeeeeER    ..   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,4]     . D=====================eeeeeeER   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 2.     1     11.0   0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 3.     1     17.0   0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 4.     1     22.0   0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT:        1     11.4   0.2    0.0       <total>
+
+# CHECK:      [62] Code Region - G63
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.33
+# CHECK-NEXT: IPC:               0.17
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    . .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,1]     D======eeeeeeER.    .    .    . .   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,2]     .D===========eeeeeeER    .    . .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,3]     .D=================eeeeeeER   . .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,4]     . D======================eeeeeeER   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 2.     1     12.0   0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 3.     1     18.0   0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 4.     1     23.0   0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT:        1     12.2   0.2    0.0       <total>
+
+# CHECK:      [63] Code Region - G64
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.33
+# CHECK-NEXT: IPC:               0.17
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    . .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     D======eeeeeeER.    .    .    . .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,2]     .D===========eeeeeeER    .    . .   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=================eeeeeeER   . .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,4]     . D======================eeeeeeER   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 2.     1     12.0   0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     18.0   0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 4.     1     23.0   0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT:        1     12.2   0.2    0.0       <total>
+
+# CHECK:      [64] Code Region - G65
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2103
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.48
+# CHECK-NEXT: IPC:               0.24
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .  .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     D======eeeeeeER.    .  .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,2]     .D===========eeeER  .  .   st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,3]     .D==============eeeER  .   st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,4]     . D================eeeER   st1	{ v1.b }[0], [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 2.     1     12.0   0.0    0.0       st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: 4.     1     17.0   0.0    0.0       st1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT:        1     10.4   0.2    0.0       <total>
+
+# CHECK:      [65] Code Region - G66
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.67
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01234567
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeER    .    . .   st1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     D===eeeER .    . .   st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,2]     .D=====eeeER   . .   st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,3]     .D========eeeER. .   st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,4]     . D==========eeeER   st1	{ v1.h }[4], [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: 1.     1     4.0    0.0    0.0       st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT:        1     6.2    0.2    0.0       <total>
+
+# CHECK:      [66] Code Region - G67
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.62
+# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .  .   st1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,1]     D===eeeER .    .  .   st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,2]     .D=====eeeER   .  .   st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,3]     .D========eeeER.  .   st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,4]     . D==========eeeeER   st2	{ v1.2d, v2.2d }, [x27], #32
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: 1.     1     4.0    0.0    0.0       st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: 3.     1     9.0    0.0    0.0       st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: 4.     1     11.0   0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT:        1     6.2    0.2    0.0       <total>
+
+# CHECK:      [67] Code Region - G68
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2003
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012
+
+# CHECK:      [0,0]     DeeeeER   .    .    . .   st2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,1]     D====eeeeER    .    . .   st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,2]     .D=======eeeeER.    . .   st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,3]     .D===========eeeeER . .   st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,4]     . D==============eeeeER   st2	{ v1.8h, v2.8h }, [x27], #32
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT:        1     8.2    0.2    0.0       <total>
+
+# CHECK:      [68] Code Region - G69
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2003
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012
+
+# CHECK:      [0,0]     DeeeeER   .    .    . .   st2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,1]     D====eeeeER    .    . .   st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,2]     .D=======eeeeER.    . .   st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .D===========eeeeER . .   st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,4]     . D==============eeeeER   st2	{ v1.4s, v2.4s }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT:        1     8.2    0.2    0.0       <total>
+
+# CHECK:      [69] Code Region - G70
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2003
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012
+
+# CHECK:      [0,0]     DeeeeER   .    .    . .   st2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     D====eeeeER    .    . .   st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,2]     .D=======eeeeER.    . .   st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .D===========eeeeER . .   st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,4]     . D==============eeeeER   st2	{ v1.b, v2.b }[8], [x27], #2
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT:        1     8.2    0.2    0.0       <total>
+
+# CHECK:      [70] Code Region - G71
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2003
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012
+
+# CHECK:      [0,0]     DeeeeER   .    .    . .   st2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D====eeeeER    .    . .   st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,2]     .D=======eeeeER.    . .   st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,3]     .D===========eeeeER . .   st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,4]     . D==============eeeeER   st2	{ v1.h, v2.h }[0], [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT:        1     8.2    0.2    0.0       <total>
+
+# CHECK:      [71] Code Region - G72
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2003
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012
+
+# CHECK:      [0,0]     DeeeeER   .    .    . .   st2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     D====eeeeER    .    . .   st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,2]     .D=======eeeeER.    . .   st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .D===========eeeeER . .   st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,4]     . D==============eeeeER   st2	{ v1.d, v2.d }[0], [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT:        1     8.2    0.2    0.0       <total>
+
+# CHECK:      [72] Code Region - G73
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      300
+# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.40
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01234567
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    . .   st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,1]     D=====eeeeeER  . .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,2]     .D=========eeeeeER   st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 2.     1     10.0   0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT:        1     5.7    0.3    0.0       <total>
+
+# CHECK:      [73] Code Region - G74
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.40
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    . .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    . .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,2]     .D=========eeeeeER  .    . .   st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,3]     .D==============eeeeeER  . .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,4]     . D==================eeeeeER   st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 2.     1     10.0   0.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 4.     1     19.0   0.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT:        1     10.2   0.2    0.0       <total>
+
+# CHECK:      [74] Code Region - G75
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.40
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    . .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    . .   st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,2]     .D=========eeeeeER  .    . .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .D==============eeeeeER  . .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,4]     . D==================eeeeeER   st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 2.     1     10.0   0.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 4.     1     19.0   0.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT:        1     10.2   0.2    0.0       <total>
+
+# CHECK:      [75] Code Region - G76
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.40
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    . .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    . .   st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,2]     .D=========eeeeeER  .    . .   st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,3]     .D==============eeeeeER  . .   st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,4]     . D==================eeeeeER   st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 2.     1     10.0   0.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 4.     1     19.0   0.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT:        1     10.2   0.2    0.0       <total>
+
+# CHECK:      [76] Code Region - G77
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.40
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    . .   st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    . .   st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,2]     .D=========eeeeeER  .    . .   st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .D==============eeeeeER  . .   st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,4]     . D==================eeeeeER   st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 2.     1     10.0   0.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 4.     1     19.0   0.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT:        1     10.2   0.2    0.0       <total>
+
+# CHECK:      [77] Code Region - G78
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3103
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.32
+# CHECK-NEXT: IPC:               0.16
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .  .   st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .  .   st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,2]     .D=========eeeeeER  .    .    .  .   st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .D==============eeeeeeeeER    .  .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,4]     . D=====================eeeeeeeeER   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 2.     1     10.0   0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 4.     1     22.0   0.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT:        1     10.8   0.2    0.0       <total>
+
+# CHECK:      [78] Code Region - G79
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      4003
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.25
+# CHECK-NEXT: IPC:               0.12
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,1]     D========eeeeeeeeER .    .    .    .    . .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,2]     .D===============eeeeeeeeER   .    .    . .   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,3]     .D=======================eeeeeeeeER.    . .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,4]     . D==============================eeeeeeeeER   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 2.     1     16.0   0.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 3.     1     24.0   0.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 4.     1     31.0   0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT:        1     16.2   0.2    0.0       <total>
+
+# CHECK:      [79] Code Region - G80
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      4003
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.25
+# CHECK-NEXT: IPC:               0.12
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     D========eeeeeeeeER .    .    .    .    . .   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,2]     .D===============eeeeeeeeER   .    .    . .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=======================eeeeeeeeER.    . .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,4]     . D==============================eeeeeeeeER   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 2.     1     16.0   0.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     24.0   0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 4.     1     31.0   0.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT:        1     16.2   0.2    0.0       <total>
+
+# CHECK:      [80] Code Region - G81
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3403
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: IPC:               0.15
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    ..   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     D========eeeeeeeeER .    .    .    ..   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,2]     .D===============eeeeeeER.    .    ..   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,3]     .D=====================eeeeeeER    ..   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,4]     . D==========================eeeeeeER   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 1.     1     9.0    0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 2.     1     16.0   0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 3.     1     22.0   0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 4.     1     27.0   0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT:        1     15.0   0.2    0.0       <total>
+
+# CHECK:      [81] Code Region - G82
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.33
+# CHECK-NEXT: IPC:               0.17
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .    . .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     D======eeeeeeER.    .    .    . .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,2]     .D===========eeeeeeER    .    . .   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,3]     .D=================eeeeeeER   . .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,4]     . D======================eeeeeeER   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 2.     1     12.0   0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 3.     1     18.0   0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 4.     1     23.0   0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT:        1     12.2   0.2    0.0       <total>
+
+# CHECK:      [82] Code Region - G83
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      2403
+# CHECK-NEXT: Total uOps:        800
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.33
+# CHECK-NEXT: IPC:               0.17
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    ..   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,1]     D======eeeeeeER.    .    ..   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,2]     .D===========eeeeeeER    ..   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,3]     .D=================eeeeeeER   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 2.     1     12.0   0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 3.     1     18.0   0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT:        1     9.5    0.3    0.0       <total>
+
+# CHECK:      [83] Code Region - G84
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      403
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.49
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     0123456
+
+# CHECK:      [0,0]     DeeER..   stp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,1]     .D=eeER   stp	d1, d2, [x27], #496
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       stp	s1, s2, [x27], #248
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       stp	d1, d2, [x27], #496
+# CHECK-NEXT:        1     1.5    0.5    0.0       <total>
+
+# CHECK:      [84] Code Region - G85
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      903
+# CHECK-NEXT: Total uOps:        1400
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.55
+# CHECK-NEXT: IPC:               0.55
+# CHECK-NEXT: Block RThroughput: 9.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    ..   stp	q1, q2, [x27], #992
+# CHECK-NEXT: [0,1]     .D=eeER   ..   stp	s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,2]     . D==eeER ..   stp	d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,3]     .  D===eeER.   stp	q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,4]     .   D====eER   stp	w1, w2, [x27], #248
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       stp	q1, q2, [x27], #992
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       stp	s1, s2, [x27, #248]!
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       stp	d1, d2, [x27, #496]!
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       stp	q1, q2, [x27, #992]!
+# CHECK-NEXT: 4.     1     5.0    0.0    0.0       stp	w1, w2, [x27], #248
+# CHECK-NEXT:        1     3.0    0.2    0.0       <total>
+
+# CHECK:      [85] Code Region - G86
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      503
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               0.99
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     01234567
+
+# CHECK:      [0,0]     DeER . .   stp	x1, x2, [x27], #496
+# CHECK-NEXT: [0,1]     D=eER. .   stp	w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,2]     .D=eER .   stp	x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,3]     .D==eER.   str	b1, [x27], #254
+# CHECK-NEXT: [0,4]     . D==eER   str	h1, [x27], #254
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       stp	x1, x2, [x27], #496
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       stp	w1, w2, [x27, #248]!
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       stp	x1, x2, [x27, #496]!
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       str	b1, [x27], #254
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       str	h1, [x27], #254
+# CHECK-NEXT:        1     2.2    0.2    0.0       <total>
+
+# CHECK:      [86] Code Region - G87
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      503
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               0.99
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     01234567
+
+# CHECK:      [0,0]     DeER . .   str	s1, [x27], #254
+# CHECK-NEXT: [0,1]     D=eER. .   str	d1, [x27], #254
+# CHECK-NEXT: [0,2]     .D=eER .   str	q1, [x27], #254
+# CHECK-NEXT: [0,3]     .D==eER.   str	b1, [x27, #254]!
+# CHECK-NEXT: [0,4]     . D==eER   str	h1, [x27, #254]!
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       str	s1, [x27], #254
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       str	d1, [x27], #254
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       str	q1, [x27], #254
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       str	b1, [x27, #254]!
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       str	h1, [x27, #254]!
+# CHECK-NEXT:        1     2.2    0.2    0.0       <total>
+
+# CHECK:      [87] Code Region - G88
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      503
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               0.99
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     01234567
+
+# CHECK:      [0,0]     DeER . .   str	s1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eER. .   str	d1, [x27, #254]!
+# CHECK-NEXT: [0,2]     .D=eER .   str	q1, [x27, #254]!
+# CHECK-NEXT: [0,3]     .D==eER.   str	w1, [x27], #254
+# CHECK-NEXT: [0,4]     . D==eER   str	x1, [x27], #254
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       str	s1, [x27, #254]!
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       str	d1, [x27, #254]!
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       str	q1, [x27, #254]!
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       str	w1, [x27], #254
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       str	x1, [x27], #254
+# CHECK-NEXT:        1     2.2    0.2    0.0       <total>
+
+# CHECK:      [88] Code Region - G89
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      500
+# CHECK-NEXT: Total Cycles:      503
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               0.99
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     01234567
+
+# CHECK:      [0,0]     DeER . .   str	w1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eER. .   str	x1, [x27, #254]!
+# CHECK-NEXT: [0,2]     .D=eER .   strb	w1, [x27], #254
+# CHECK-NEXT: [0,3]     .D==eER.   strb	w1, [x27, #254]!
+# CHECK-NEXT: [0,4]     . D==eER   strh	w1, [x27], #254
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       str	w1, [x27, #254]!
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       str	x1, [x27, #254]!
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       strb	w1, [x27], #254
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       strb	w1, [x27, #254]!
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       strh	w1, [x27], #254
+# CHECK-NEXT:        1     2.2    0.2    0.0       <total>
+
+# CHECK:      [89] Code Region - G90
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      100
+# CHECK-NEXT: Total Cycles:      103
+# CHECK-NEXT: Total uOps:        200
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.94
+# CHECK-NEXT: IPC:               0.97
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     0123
+
+# CHECK:      [0,0]     DeER   strh	w1, [x27, #254]!
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       strh	w1, [x27, #254]!
+
+# CHECK:      [90] Code Region - G91
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      404
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.99
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     01234567
+
+# CHECK:      [0,0]     DeeeeER.   ldr	x1, [x27], #254
+# CHECK-NEXT: [0,1]     D=eeeeER   ldr	x2, [x1], #254
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	x1, [x27], #254
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       ldr	x2, [x1], #254
+# CHECK-NEXT:        1     1.5    0.5    0.0       <total>


        


More information about the llvm-commits mailing list