[llvm-branch-commits] [flang] [llvm] [flang][OpenMP] Map basic `local` specifiers to `private` clauses (PR #142735)
Kareem Ergawy via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Jun 11 00:43:51 PDT 2025
https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/142735
>From 32ac7dc2d21843091116b636777c174830cd2dd0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Pettersson?= <bjorn.a.pettersson at ericsson.com>
Date: Wed, 11 Jun 2025 09:24:03 +0200
Subject: [PATCH 1/9] [test][AArch64] Adjust vector insertion lit tests
(#143101)
The test cases test_insert_v16i8_insert_2_undef_base and
test_insert_v16i8_insert_2_undef_base_different_valeus in
CodeGen/AArch64/arm64-vector-insertion.ll were leaving element 8 in the
vector as "undef" without any real explanation. It kind of looked like a
typo as the input IR looked like this
%v.8 = insertelement <16 x i8> %v.7, i8 %a, i32 8
%v.10 = insertelement <16 x i8> %v.7, i8 %a, i32 10
leaving %v.8 as unused.
This patch is cleaning up the tests a bit by adding separate test cases
to validate what is happening when skipping insert at index 8, while
amending the original test cases to use %v.8 instead of %v.7 when
creating %v.10.
---
.../CodeGen/AArch64/arm64-vector-insertion.ll | 69 ++++++++++++++++++-
1 file changed, 67 insertions(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll
index 94074d1689f6a..ff28c7817d143 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll
@@ -66,6 +66,35 @@ define <16 x i8> @test_insert_v16i8_insert_2_undef_base(i8 %a) {
%v.6 = insertelement <16 x i8> %v.4, i8 %a, i32 6
%v.7 = insertelement <16 x i8> %v.6, i8 %a, i32 7
%v.8 = insertelement <16 x i8> %v.7, i8 %a, i32 8
+ %v.10 = insertelement <16 x i8> %v.8, i8 %a, i32 10
+ %v.11 = insertelement <16 x i8> %v.10, i8 %a, i32 11
+ %v.12 = insertelement <16 x i8> %v.11, i8 %a, i32 12
+ %v.13 = insertelement <16 x i8> %v.12, i8 %a, i32 13
+ %v.14 = insertelement <16 x i8> %v.13, i8 %a, i32 14
+ %v.15 = insertelement <16 x i8> %v.14, i8 %a, i32 15
+ ret <16 x i8> %v.15
+}
+
+; Similar to above, but we leave element 8 as undef. One interesting part with
+; this test case is that %a may be poison, so simply inserting %a also at
+; index 8 would make the result vector more poisonous.
+define <16 x i8> @test_insert_v16i8_insert_2_undef_base_skip8(i32 %a0) {
+; CHECK-LABEL: test_insert_v16i8_insert_2_undef_base_skip8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsr w8, w0, #5
+; CHECK-NEXT: dup.16b v0, w8
+; CHECK-NEXT: mov.b v0[5], wzr
+; CHECK-NEXT: mov.b v0[9], wzr
+; CHECK-NEXT: ret
+ %a1 = lshr exact i32 %a0, 5
+ %a = trunc i32 %a1 to i8
+ %v.0 = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef> , i8 %a, i32 0
+ %v.1 = insertelement <16 x i8> %v.0, i8 %a, i32 1
+ %v.2 = insertelement <16 x i8> %v.1, i8 %a, i32 2
+ %v.3 = insertelement <16 x i8> %v.2, i8 %a, i32 3
+ %v.4 = insertelement <16 x i8> %v.3, i8 %a, i32 4
+ %v.6 = insertelement <16 x i8> %v.4, i8 %a, i32 6
+ %v.7 = insertelement <16 x i8> %v.6, i8 %a, i32 7
%v.10 = insertelement <16 x i8> %v.7, i8 %a, i32 10
%v.11 = insertelement <16 x i8> %v.10, i8 %a, i32 11
%v.12 = insertelement <16 x i8> %v.11, i8 %a, i32 12
@@ -75,8 +104,8 @@ define <16 x i8> @test_insert_v16i8_insert_2_undef_base(i8 %a) {
ret <16 x i8> %v.15
}
-define <16 x i8> @test_insert_v16i8_insert_2_undef_base_different_valeus(i8 %a, i8 %b) {
-; CHECK-LABEL: test_insert_v16i8_insert_2_undef_base_different_valeus:
+define <16 x i8> @test_insert_v16i8_insert_2_undef_base_different_values(i8 %a, i8 %b) {
+; CHECK-LABEL: test_insert_v16i8_insert_2_undef_base_different_values:
; CHECK: // %bb.0:
; CHECK-NEXT: dup.16b v0, w0
; CHECK-NEXT: mov.b v0[2], w1
@@ -94,6 +123,42 @@ define <16 x i8> @test_insert_v16i8_insert_2_undef_base_different_valeus(i8 %a,
%v.6 = insertelement <16 x i8> %v.4, i8 %a, i32 6
%v.7 = insertelement <16 x i8> %v.6, i8 %b, i32 7
%v.8 = insertelement <16 x i8> %v.7, i8 %a, i32 8
+ %v.10 = insertelement <16 x i8> %v.8, i8 %a, i32 10
+ %v.11 = insertelement <16 x i8> %v.10, i8 %a, i32 11
+ %v.12 = insertelement <16 x i8> %v.11, i8 %b, i32 12
+ %v.13 = insertelement <16 x i8> %v.12, i8 %a, i32 13
+ %v.14 = insertelement <16 x i8> %v.13, i8 %a, i32 14
+ %v.15 = insertelement <16 x i8> %v.14, i8 %b, i32 15
+ ret <16 x i8> %v.15
+}
+
+; Similar to above, but we leave element 8 as undef. One interesting part with
+; this test case is that %a and %b may be poison, so simply inserting %a or %b
+; at index 8 would make the result vector more poisonous.
+define <16 x i8> @test_insert_v16i8_insert_2_undef_base_different_values_skip8(i32 %a0, i32 %b0) {
+; CHECK-LABEL: test_insert_v16i8_insert_2_undef_base_different_values_skip8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsr w8, w0, #5
+; CHECK-NEXT: dup.16b v0, w8
+; CHECK-NEXT: lsr w8, w1, #5
+; CHECK-NEXT: mov.b v0[2], w8
+; CHECK-NEXT: mov.b v0[5], wzr
+; CHECK-NEXT: mov.b v0[7], w8
+; CHECK-NEXT: mov.b v0[9], wzr
+; CHECK-NEXT: mov.b v0[12], w8
+; CHECK-NEXT: mov.b v0[15], w8
+; CHECK-NEXT: ret
+ %a1 = lshr exact i32 %a0, 5
+ %a = trunc i32 %a1 to i8
+ %b1 = lshr exact i32 %b0, 5
+ %b = trunc i32 %b1 to i8
+ %v.0 = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef> , i8 %a, i32 0
+ %v.1 = insertelement <16 x i8> %v.0, i8 %a, i32 1
+ %v.2 = insertelement <16 x i8> %v.1, i8 %b, i32 2
+ %v.3 = insertelement <16 x i8> %v.2, i8 %a, i32 3
+ %v.4 = insertelement <16 x i8> %v.3, i8 %a, i32 4
+ %v.6 = insertelement <16 x i8> %v.4, i8 %a, i32 6
+ %v.7 = insertelement <16 x i8> %v.6, i8 %b, i32 7
%v.10 = insertelement <16 x i8> %v.7, i8 %a, i32 10
%v.11 = insertelement <16 x i8> %v.10, i8 %a, i32 11
%v.12 = insertelement <16 x i8> %v.11, i8 %b, i32 12
>From 686ec6cfe86367c43dccd83d7e6e2bac7e6a73a0 Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <paschalis.mpeis at arm.com>
Date: Wed, 11 Jun 2025 08:24:10 +0100
Subject: [PATCH 2/9] [BOLT][AArch64] Fix adr-relaxation.s test (#143151)
On some AArch64 machines the splitting was inconsistent.
This causes cold `foo` to have a `mov` instruction before adrp.
```
<foo.cold.0>:
mov x0, #0x0 // =0
adrp x1, 0x600000 <_start>
add x1, x1, #0x14
ret
```
This patch removes the `mov` instruction right above .L2, making
splitting deterministic.
---
bolt/test/AArch64/adr-relaxation.s | 1 -
1 file changed, 1 deletion(-)
diff --git a/bolt/test/AArch64/adr-relaxation.s b/bolt/test/AArch64/adr-relaxation.s
index a643a62339ba3..864650c3287d8 100644
--- a/bolt/test/AArch64/adr-relaxation.s
+++ b/bolt/test/AArch64/adr-relaxation.s
@@ -34,7 +34,6 @@ foo:
.cfi_startproc
cmp x1, x11
b.hi .L2
- mov x0, #0x0
.L2:
# CHECK-FOO: <foo.cold.0>:
# CHECK-FOO-NEXT: adrp
>From 521e6ce5c8fdfc72cccc1accd78a59f1a5e2805a Mon Sep 17 00:00:00 2001
From: Baranov Victor <bar.victor.2002 at gmail.com>
Date: Wed, 11 Jun 2025 10:25:29 +0300
Subject: [PATCH 3/9] [CI] Add mention of LLVM Developer Policy in email-check
message (NFC) (#143300)
As of now, it may be hard for people to get the truth from the long Discourse
discussion, so a link to the official document may be enough to convince them
to change their email from private to public.
---
.github/workflows/email-check.yaml | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/.github/workflows/email-check.yaml b/.github/workflows/email-check.yaml
index f4481d5cf5583..904ad718f97dd 100644
--- a/.github/workflows/email-check.yaml
+++ b/.github/workflows/email-check.yaml
@@ -32,7 +32,8 @@ jobs:
COMMENT: >-
⚠️ We detected that you are using a GitHub private e-mail address to contribute to the repo.<br/>
Please turn off [Keep my email addresses private](https://github.com/settings/emails) setting in your account.<br/>
- See [LLVM Discourse](https://discourse.llvm.org/t/hidden-emails-on-github-should-we-do-something-about-it) for more information.
+ See [LLVM Developer Policy](https://llvm.org/docs/DeveloperPolicy.html#email-addresses) and
+ [LLVM Discourse](https://discourse.llvm.org/t/hidden-emails-on-github-should-we-do-something-about-it) for more information.
run: |
cat << EOF > comments
[{"body" : "$COMMENT"}]
>From 17f1dac805d388596be5e8c316c0f14b3222da4e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 11 Jun 2025 08:12:42 +0100
Subject: [PATCH 4/9] [X86] Add test coverage showing failure to merge "zero
input passthrough" behaviour for BSF instructions on x86_64 targets
---
llvm/test/CodeGen/X86/bsf.ll | 452 +++++++++++++++++++++++++++++++++++
1 file changed, 452 insertions(+)
create mode 100644 llvm/test/CodeGen/X86/bsf.ll
diff --git a/llvm/test/CodeGen/X86/bsf.ll b/llvm/test/CodeGen/X86/bsf.ll
new file mode 100644
index 0000000000000..58929115baf54
--- /dev/null
+++ b/llvm/test/CodeGen/X86/bsf.ll
@@ -0,0 +1,452 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64
+
+define i8 @cmov_bsf8(i8 %x, i8 %y) nounwind {
+; X86-LABEL: cmov_bsf8:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: testb %al, %al
+; X86-NEXT: je .LBB0_1
+; X86-NEXT: # %bb.2:
+; X86-NEXT: orl $256, %eax # imm = 0x100
+; X86-NEXT: rep bsfl %eax, %eax
+; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_1:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: retl
+;
+; X64-LABEL: cmov_bsf8:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: orl $256, %eax # imm = 0x100
+; X64-NEXT: rep bsfl %eax, %eax
+; X64-NEXT: testb %dil, %dil
+; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+ %1 = tail call i8 @llvm.cttz.i8(i8 %x, i1 false)
+ %2 = icmp eq i8 %x, 0
+ %3 = select i1 %2, i8 %y, i8 %1
+ ret i8 %3
+}
+
+define i8 @cmov_bsf8_undef(i8 %x, i8 %y) nounwind {
+; X86-LABEL: cmov_bsf8_undef:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: testb %al, %al
+; X86-NEXT: je .LBB1_1
+; X86-NEXT: # %bb.2:
+; X86-NEXT: rep bsfl %eax, %eax
+; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: retl
+; X86-NEXT: .LBB1_1:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: retl
+;
+; X64-LABEL: cmov_bsf8_undef:
+; X64: # %bb.0:
+; X64-NEXT: rep bsfl %edi, %eax
+; X64-NEXT: testb %dil, %dil
+; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+ %1 = tail call i8 @llvm.cttz.i8(i8 %x, i1 true)
+ %2 = icmp eq i8 %x, 0
+ %3 = select i1 %2, i8 %y, i8 %1
+ ret i8 %3
+}
+
+define i16 @cmov_bsf16(i16 %x, i16 %y) nounwind {
+; X86-LABEL: cmov_bsf16:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: testw %ax, %ax
+; X86-NEXT: je .LBB2_1
+; X86-NEXT: # %bb.2:
+; X86-NEXT: orl $65536, %eax # imm = 0x10000
+; X86-NEXT: rep bsfl %eax, %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: retl
+; X86-NEXT: .LBB2_1:
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: retl
+;
+; X64-LABEL: cmov_bsf16:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: orl $65536, %eax # imm = 0x10000
+; X64-NEXT: rep bsfl %eax, %eax
+; X64-NEXT: testw %di, %di
+; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+ %1 = tail call i16 @llvm.cttz.i16(i16 %x, i1 false)
+ %2 = icmp eq i16 %x, 0
+ %3 = select i1 %2, i16 %y, i16 %1
+ ret i16 %3
+}
+
+define i16 @cmov_bsf16_undef(i16 %x, i16 %y) nounwind {
+; X86-LABEL: cmov_bsf16_undef:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: testw %ax, %ax
+; X86-NEXT: je .LBB3_1
+; X86-NEXT: # %bb.2:
+; X86-NEXT: rep bsfl %eax, %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: retl
+; X86-NEXT: .LBB3_1:
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: retl
+;
+; X64-LABEL: cmov_bsf16_undef:
+; X64: # %bb.0:
+; X64-NEXT: rep bsfl %edi, %eax
+; X64-NEXT: testw %di, %di
+; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+ %1 = tail call i16 @llvm.cttz.i16(i16 %x, i1 true)
+ %2 = icmp eq i16 %x, 0
+ %3 = select i1 %2, i16 %y, i16 %1
+ ret i16 %3
+}
+
+define i32 @cmov_bsf32(i32 %x, i32 %y) nounwind {
+; X86-LABEL: cmov_bsf32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: je .LBB4_1
+; X86-NEXT: # %bb.2: # %cond.false
+; X86-NEXT: rep bsfl %ecx, %eax
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: jne .LBB4_5
+; X86-NEXT: .LBB4_4:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: .LBB4_5: # %cond.end
+; X86-NEXT: retl
+; X86-NEXT: .LBB4_1:
+; X86-NEXT: movl $32, %eax
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: je .LBB4_4
+; X86-NEXT: jmp .LBB4_5
+;
+; X64-LABEL: cmov_bsf32:
+; X64: # %bb.0:
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: bsfl %edi, %eax
+; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: retq
+ %1 = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
+ %2 = icmp eq i32 %x, 0
+ %3 = select i1 %2, i32 %y, i32 %1
+ ret i32 %3
+}
+
+define i32 @cmov_bsf32_undef(i32 %x, i32 %y) nounwind {
+; X86-LABEL: cmov_bsf32_undef:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: je .LBB5_1
+; X86-NEXT: # %bb.2:
+; X86-NEXT: rep bsfl %eax, %eax
+; X86-NEXT: retl
+; X86-NEXT: .LBB5_1:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: cmov_bsf32_undef:
+; X64: # %bb.0:
+; X64-NEXT: bsfl %edi, %eax
+; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: retq
+ %1 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
+ %2 = icmp eq i32 %x, 0
+ %3 = select i1 %2, i32 %y, i32 %1
+ ret i32 %3
+}
+
+define i64 @cmov_bsf64(i64 %x, i64 %y) nounwind {
+; X86-LABEL: cmov_bsf64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: je .LBB6_1
+; X86-NEXT: # %bb.2: # %cond.false
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: jne .LBB6_3
+; X86-NEXT: # %bb.4: # %cond.false
+; X86-NEXT: rep bsfl %ecx, %eax
+; X86-NEXT: addl $32, %eax
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: je .LBB6_6
+; X86-NEXT: jmp .LBB6_7
+; X86-NEXT: .LBB6_1:
+; X86-NEXT: movl $64, %eax
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: jne .LBB6_7
+; X86-NEXT: .LBB6_6:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: .LBB6_7: # %cond.end
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+; X86-NEXT: .LBB6_3:
+; X86-NEXT: rep bsfl %esi, %eax
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: je .LBB6_6
+; X86-NEXT: jmp .LBB6_7
+;
+; X64-LABEL: cmov_bsf64:
+; X64: # %bb.0:
+; X64-NEXT: movl $64, %eax
+; X64-NEXT: bsfq %rdi, %rax
+; X64-NEXT: cmoveq %rsi, %rax
+; X64-NEXT: retq
+ %1 = tail call i64 @llvm.cttz.i64(i64 %x, i1 false)
+ %2 = icmp eq i64 %x, 0
+ %3 = select i1 %2, i64 %y, i64 %1
+ ret i64 %3
+}
+
+define i64 @cmov_bsf64_undef(i64 %x, i64 %y) nounwind {
+; X86-LABEL: cmov_bsf64_undef:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: je .LBB7_5
+; X86-NEXT: # %bb.1: # %select.false.sink
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: jne .LBB7_2
+; X86-NEXT: # %bb.3: # %select.false.sink
+; X86-NEXT: rep bsfl %eax, %eax
+; X86-NEXT: addl $32, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: retl
+; X86-NEXT: .LBB7_5: # %select.end
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: retl
+; X86-NEXT: .LBB7_2:
+; X86-NEXT: rep bsfl %ecx, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: retl
+;
+; X64-LABEL: cmov_bsf64_undef:
+; X64: # %bb.0:
+; X64-NEXT: bsfq %rdi, %rax
+; X64-NEXT: cmoveq %rsi, %rax
+; X64-NEXT: retq
+ %1 = tail call i64 @llvm.cttz.i64(i64 %x, i1 true)
+ %2 = icmp eq i64 %x, 0
+ %3 = select i1 %2, i64 %y, i64 %1
+ ret i64 %3
+}
+
+define i128 @cmov_bsf128(i128 %x, i128 %y) nounwind {
+; X86-LABEL: cmov_bsf128:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: orl %ebp, %edx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: je .LBB8_1
+; X86-NEXT: # %bb.2: # %cond.false
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: jne .LBB8_3
+; X86-NEXT: # %bb.4: # %cond.false
+; X86-NEXT: rep bsfl %edi, %esi
+; X86-NEXT: addl $32, %esi
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: je .LBB8_7
+; X86-NEXT: .LBB8_6:
+; X86-NEXT: rep bsfl %eax, %edx
+; X86-NEXT: jmp .LBB8_8
+; X86-NEXT: .LBB8_1:
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: xorl %ebp, %ebp
+; X86-NEXT: movl $128, %esi
+; X86-NEXT: jmp .LBB8_11
+; X86-NEXT: .LBB8_3:
+; X86-NEXT: rep bsfl %ecx, %esi
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: jne .LBB8_6
+; X86-NEXT: .LBB8_7: # %cond.false
+; X86-NEXT: rep bsfl %ebp, %edx
+; X86-NEXT: addl $32, %edx
+; X86-NEXT: .LBB8_8: # %cond.false
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: jne .LBB8_10
+; X86-NEXT: # %bb.9: # %cond.false
+; X86-NEXT: addl $64, %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: .LBB8_10: # %cond.false
+; X86-NEXT: xorl %ebp, %ebp
+; X86-NEXT: .LBB8_11: # %cond.end
+; X86-NEXT: xorl %ebx, %ebx
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: jne .LBB8_13
+; X86-NEXT: # %bb.12:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: .LBB8_13: # %cond.end
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %edx, 12(%eax)
+; X86-NEXT: movl %ebx, 8(%eax)
+; X86-NEXT: movl %ebp, 4(%eax)
+; X86-NEXT: movl %esi, (%eax)
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+;
+; X64-LABEL: cmov_bsf128:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: orq %rsi, %rax
+; X64-NEXT: je .LBB8_2
+; X64-NEXT: # %bb.1: # %select.false.sink
+; X64-NEXT: rep bsfq %rdi, %rcx
+; X64-NEXT: movl $64, %eax
+; X64-NEXT: rep bsfq %rsi, %rax
+; X64-NEXT: addq $64, %rax
+; X64-NEXT: testq %rdi, %rdi
+; X64-NEXT: cmovneq %rcx, %rax
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: retq
+; X64-NEXT: .LBB8_2: # %select.end
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: movq %rcx, %rdx
+; X64-NEXT: retq
+ %1 = tail call i128 @llvm.cttz.i128(i128 %x, i1 false)
+ %2 = icmp eq i128 %x, 0
+ %3 = select i1 %2, i128 %y, i128 %1
+ ret i128 %3
+}
+
+define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind {
+; X86-LABEL: cmov_bsf128_undef:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: orl %ebx, %ebp
+; X86-NEXT: orl %edi, %ebp
+; X86-NEXT: je .LBB9_11
+; X86-NEXT: # %bb.1: # %select.false.sink
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: jne .LBB9_2
+; X86-NEXT: # %bb.3: # %select.false.sink
+; X86-NEXT: rep bsfl %ecx, %edi
+; X86-NEXT: addl $32, %edi
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: je .LBB9_6
+; X86-NEXT: .LBB9_5:
+; X86-NEXT: rep bsfl %ebx, %esi
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: je .LBB9_8
+; X86-NEXT: jmp .LBB9_9
+; X86-NEXT: .LBB9_11: # %select.end
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: movl %esi, 4(%eax)
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: jmp .LBB9_10
+; X86-NEXT: .LBB9_2:
+; X86-NEXT: rep bsfl %edx, %edi
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: jne .LBB9_5
+; X86-NEXT: .LBB9_6: # %select.false.sink
+; X86-NEXT: rep bsfl %esi, %esi
+; X86-NEXT: addl $32, %esi
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: jne .LBB9_9
+; X86-NEXT: .LBB9_8: # %select.false.sink
+; X86-NEXT: addl $64, %esi
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: .LBB9_9: # %select.false.sink
+; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: movl $0, 12(%eax)
+; X86-NEXT: movl $0, 8(%eax)
+; X86-NEXT: movl $0, 4(%eax)
+; X86-NEXT: .LBB9_10: # %select.false.sink
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+;
+; X64-LABEL: cmov_bsf128_undef:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: orq %rsi, %rax
+; X64-NEXT: je .LBB9_2
+; X64-NEXT: # %bb.1: # %select.false.sink
+; X64-NEXT: rep bsfq %rdi, %rcx
+; X64-NEXT: rep bsfq %rsi, %rax
+; X64-NEXT: addq $64, %rax
+; X64-NEXT: testq %rdi, %rdi
+; X64-NEXT: cmovneq %rcx, %rax
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: retq
+; X64-NEXT: .LBB9_2: # %select.end
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: movq %rcx, %rdx
+; X64-NEXT: retq
+ %1 = tail call i128 @llvm.cttz.i128(i128 %x, i1 true)
+ %2 = icmp eq i128 %x, 0
+ %3 = select i1 %2, i128 %y, i128 %1
+ ret i128 %3
+}
+
+declare i8 @llvm.cttz.i8(i8, i1)
+declare i16 @llvm.cttz.i16(i16, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
+declare i128 @llvm.cttz.i128(i128, i1)
>From a72bcda1434c72f9db6687565a361479e0dde572 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 11 Jun 2025 08:24:10 +0100
Subject: [PATCH 5/9] [X86] add test coverage for #143606
---
.../X86/vector-shuffle-combining-avx512vl.ll | 28 +++++++++++++++++++
1 file changed, 28 insertions(+)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vl.ll
index 15c82f169c86e..d5aa7588925d8 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vl.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vl.ll
@@ -137,3 +137,31 @@ define void @PR142995(ptr %p0, ptr %p1, ptr %p2) nounwind #0 {
}
declare <5 x i32> @llvm.masked.load.v5i32.p0(ptr captures(none), i32 immarg, <5 x i1>, <5 x i32>)
declare <64 x i32> @llvm.masked.load.v64i32.p0(ptr captures(none), i32 immarg, <64 x i1>, <64 x i32>)
+
+define <8 x double> @PR143606(ptr %px, ptr %py) {
+; X86-LABEL: PR143606:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: vmovapd (%ecx), %ymm0
+; X86-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0],mem[1,2],ymm0[3]
+; X86-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],mem[0],ymm0[2],mem[3]
+; X86-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: PR143606:
+; X64: # %bb.0:
+; X64-NEXT: vmovapd (%rdi), %ymm0
+; X64-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0],mem[1,2],ymm0[3]
+; X64-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],mem[0],ymm0[2],mem[3]
+; X64-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; X64-NEXT: retq
+ %x = load <4 x double>, ptr %px, align 32
+ %y.lo = load <4 x double>, ptr %py, align 32
+ %py.hi = getelementptr inbounds nuw i8, ptr %py, i64 32
+ %y.hi = load <4 x double>, ptr %py.hi, align 32
+ %lo = shufflevector <4 x double> %x, <4 x double> %y.lo, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+ %hi = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %x, <4 x i64> <i64 1, i64 4, i64 2, i64 7>, <4 x double> %y.hi)
+ %res = shufflevector <4 x double> %lo, <4 x double> %hi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
>From e9bd1aee6537508970614fd79a4f076ba4ed93d0 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 11 Jun 2025 08:30:09 +0100
Subject: [PATCH 6/9] [X86] bmi-select-distrib.ll - remove unused check
prefixes and pull out PR comments above tests. NFC
---
llvm/test/CodeGen/X86/bmi-select-distrib.ll | 31 +++++++++------------
1 file changed, 13 insertions(+), 18 deletions(-)
diff --git a/llvm/test/CodeGen/X86/bmi-select-distrib.ll b/llvm/test/CodeGen/X86/bmi-select-distrib.ll
index 49beda516d508..e5696ded4fbf1 100644
--- a/llvm/test/CodeGen/X86/bmi-select-distrib.ll
+++ b/llvm/test/CodeGen/X86/bmi-select-distrib.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov,+sse2,+bmi | FileCheck %s --check-prefixes=X86,X86-BMI
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov,+sse2,+bmi,+bmi2 | FileCheck %s --check-prefixes=X86,X86-BMI2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64,X64-BMI
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefixes=X64,X64-BMI2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov,+sse2,+bmi | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov,+sse2,+bmi,+bmi2 | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefixes=X64
-define i32 @and_select_neg_to_blsi1(i1 %a0, i32 inreg %a1) nounwind {
; PR131587
+define i32 @and_select_neg_to_blsi1(i1 %a0, i32 inreg %a1) nounwind {
; X86-LABEL: and_select_neg_to_blsi1:
; X86: # %bb.0:
; X86-NEXT: blsil %eax, %ecx
@@ -25,8 +25,8 @@ define i32 @and_select_neg_to_blsi1(i1 %a0, i32 inreg %a1) nounwind {
ret i32 %ret
}
-define i32 @and_select_neg_to_blsi2(i1 %a0, i32 inreg %a1) nounwind {
; PR131587
+define i32 @and_select_neg_to_blsi2(i1 %a0, i32 inreg %a1) nounwind {
; X86-LABEL: and_select_neg_to_blsi2:
; X86: # %bb.0:
; X86-NEXT: blsil %eax, %ecx
@@ -46,8 +46,8 @@ define i32 @and_select_neg_to_blsi2(i1 %a0, i32 inreg %a1) nounwind {
ret i32 %ret
}
-define i32 @and_select_neg_to_blsi3(i1 %a0, i32 inreg %a1) nounwind {
; PR131587
+define i32 @and_select_neg_to_blsi3(i1 %a0, i32 inreg %a1) nounwind {
; X86-LABEL: and_select_neg_to_blsi3:
; X86: # %bb.0:
; X86-NEXT: blsil %eax, %ecx
@@ -67,8 +67,8 @@ define i32 @and_select_neg_to_blsi3(i1 %a0, i32 inreg %a1) nounwind {
ret i32 %ret
}
-define i64 @and_select_neg_to_blsi_i64(i1 %a0, i64 %a1) nounwind {
; PR131587
+define i64 @and_select_neg_to_blsi_i64(i1 %a0, i64 %a1) nounwind {
; X86-LABEL: and_select_neg_to_blsi_i64:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
@@ -283,8 +283,8 @@ define i32 @and_select_neg_different_op(i1 %a0, i32 inreg %a1, i32 inreg %a2) no
ret i32 %ret
}
-define i32 @and_select_sub_1_to_blsr1(i1 %a0, i32 inreg %a1) nounwind {
; PR133848
+define i32 @and_select_sub_1_to_blsr1(i1 %a0, i32 inreg %a1) nounwind {
; X86-LABEL: and_select_sub_1_to_blsr1:
; X86: # %bb.0:
; X86-NEXT: blsrl %eax, %ecx
@@ -304,8 +304,8 @@ define i32 @and_select_sub_1_to_blsr1(i1 %a0, i32 inreg %a1) nounwind {
ret i32 %ret
}
-define i32 @and_select_sub_1_to_blsr2(i1 %a0, i32 inreg %a1) nounwind {
; PR133848
+define i32 @and_select_sub_1_to_blsr2(i1 %a0, i32 inreg %a1) nounwind {
; X86-LABEL: and_select_sub_1_to_blsr2:
; X86: # %bb.0:
; X86-NEXT: blsrl %eax, %ecx
@@ -325,8 +325,8 @@ define i32 @and_select_sub_1_to_blsr2(i1 %a0, i32 inreg %a1) nounwind {
ret i32 %ret
}
-define i32 @and_select_sub_1_to_blsr3(i1 %a0, i32 inreg %a1) nounwind {
; PR133848
+define i32 @and_select_sub_1_to_blsr3(i1 %a0, i32 inreg %a1) nounwind {
; X86-LABEL: and_select_sub_1_to_blsr3:
; X86: # %bb.0:
; X86-NEXT: blsrl %eax, %ecx
@@ -346,8 +346,8 @@ define i32 @and_select_sub_1_to_blsr3(i1 %a0, i32 inreg %a1) nounwind {
ret i32 %ret
}
-define i32 @and_select_sub_1_to_blsr4(i1 %a0, i32 inreg %a1) nounwind {
; PR133848
+define i32 @and_select_sub_1_to_blsr4(i1 %a0, i32 inreg %a1) nounwind {
; X86-LABEL: and_select_sub_1_to_blsr4:
; X86: # %bb.0:
; X86-NEXT: blsrl %eax, %ecx
@@ -392,8 +392,8 @@ define i32 @and_sub_1_select_orig(i1 %a0, i32 inreg %a1) nounwind {
ret i32 %ret
}
-define i64 @and_select_sub_1_to_blsr_i64(i1 %a0, i64 %a1) nounwind {
; PR133848
+define i64 @and_select_sub_1_to_blsr_i64(i1 %a0, i64 %a1) nounwind {
; X86-LABEL: and_select_sub_1_to_blsr_i64:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
@@ -863,8 +863,3 @@ define i32 @xor_select_sub_1_different_op(i1 %a0, i32 inreg %a1, i32 inreg %a2)
%ret = xor i32 %a1, %bls
ret i32 %ret
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; X64-BMI: {{.*}}
-; X64-BMI2: {{.*}}
-; X86-BMI: {{.*}}
-; X86-BMI2: {{.*}}
>From 13115276d0d12b0d9bf952abdc19f04866db16a8 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Wed, 11 Jun 2025 08:32:55 +0100
Subject: [PATCH 7/9] Revert "[AArch64][GlobalISel] Expand 64bit extracts to
128bit to allow more patterns (#142904)"
This reverts commit 61cdba602abe67761ab2bbf12bf85710dfa963f4 due to verifier
issues.
---
.../AArch64/GISel/AArch64RegisterBankInfo.cpp | 32 +--
.../GlobalISel/regbank-extract-vector-elt.mir | 4 +-
llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll | 3 -
llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll | 3 -
llvm/test/CodeGen/AArch64/abs.ll | 1 -
llvm/test/CodeGen/AArch64/arm64-neon-copy.ll | 13 +-
.../AArch64/arm64-neon-simd-ldst-one.ll | 45 ++--
llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll | 55 +++--
llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll | 1 -
llvm/test/CodeGen/AArch64/bswap.ll | 1 -
llvm/test/CodeGen/AArch64/concat-vector.ll | 7 +-
llvm/test/CodeGen/AArch64/double_reduct.ll | 18 +-
llvm/test/CodeGen/AArch64/f16-instructions.ll | 12 +-
llvm/test/CodeGen/AArch64/faddsub.ll | 4 +-
llvm/test/CodeGen/AArch64/fcopysign.ll | 4 +-
llvm/test/CodeGen/AArch64/fcvt.ll | 14 +-
llvm/test/CodeGen/AArch64/fdiv.ll | 2 +-
llvm/test/CodeGen/AArch64/fminimummaximum.ll | 4 +-
llvm/test/CodeGen/AArch64/fminmax.ll | 4 +-
llvm/test/CodeGen/AArch64/fmla.ll | 6 +-
llvm/test/CodeGen/AArch64/fmul.ll | 2 +-
.../test/CodeGen/AArch64/fptosi-sat-vector.ll | 1 -
.../test/CodeGen/AArch64/fptoui-sat-vector.ll | 1 -
llvm/test/CodeGen/AArch64/fptrunc.ll | 4 +-
llvm/test/CodeGen/AArch64/fsqrt.ll | 2 +-
llvm/test/CodeGen/AArch64/insertextract.ll | 45 ++--
llvm/test/CodeGen/AArch64/itofp.ll | 20 +-
llvm/test/CodeGen/AArch64/llvm.exp10.ll | 33 ++-
llvm/test/CodeGen/AArch64/popcount.ll | 8 +-
llvm/test/CodeGen/AArch64/ptradd.ll | 1 -
llvm/test/CodeGen/AArch64/shift.ll | 6 -
llvm/test/CodeGen/AArch64/store.ll | 15 +-
.../AArch64/vec-combine-compare-to-bitmask.ll | 228 +++++++++++++-----
.../CodeGen/AArch64/vecreduce-fadd-strict.ll | 7 +-
.../vecreduce-fmax-legalization-nan.ll | 26 +-
.../AArch64/vecreduce-fmax-legalization.ll | 26 +-
.../CodeGen/AArch64/vecreduce-fmaximum.ll | 26 +-
.../AArch64/vecreduce-fmin-legalization.ll | 26 +-
.../CodeGen/AArch64/vecreduce-fminimum.ll | 26 +-
.../CodeGen/AArch64/vecreduce-fmul-strict.ll | 29 ++-
llvm/test/CodeGen/AArch64/vecreduce-fmul.ll | 121 ++++++----
.../AArch64/vecreduce-umax-legalization.ll | 15 +-
llvm/test/CodeGen/AArch64/vector-lrint.ll | 25 +-
43 files changed, 592 insertions(+), 334 deletions(-)
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 53c7a00a7f9f0..31954e7954c03 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -399,26 +399,6 @@ void AArch64RegisterBankInfo::applyMappingImpl(
MI.getOperand(1).setReg(ConstReg);
return applyDefaultMapping(OpdMapper);
}
- case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
- // SDAG will promote a 64bit G_EXTRACT_VECTOR_ELT to 128 to reduce the
- // number of duplicate lane-extract patterns needed. Do the same here so
- // that selection will operate on the larger vectors.
- Register Src = MI.getOperand(1).getReg();
- LLT SrcTy = MRI.getType(Src);
- assert(SrcTy.getSizeInBits() == 64 && "Expected 64-bit source vector");
- LLT DstTy = SrcTy.multiplyElements(2);
- Builder.setInsertPt(*MI.getParent(), MI.getIterator());
- auto Undef = Builder.buildUndef(SrcTy);
- auto Concat = Builder.buildConcatVectors(DstTy, {Src, Undef.getReg(0)});
- MRI.setRegBank(Undef.getReg(0), getRegBank(AArch64::FPRRegBankID));
- MRI.setRegBank(Concat.getReg(0), getRegBank(AArch64::FPRRegBankID));
- for (MachineInstr &Ext :
- make_early_inc_range(MRI.use_nodbg_instructions(Src))) {
- if (Ext.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT)
- Ext.getOperand(1).setReg(Concat.getReg(0));
- }
- return applyDefaultMapping(OpdMapper);
- }
default:
llvm_unreachable("Don't know how to handle that operation");
}
@@ -1034,20 +1014,14 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
break;
}
- case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
+ case TargetOpcode::G_EXTRACT_VECTOR_ELT:
// Destination and source need to be FPRs.
OpRegBankIdx[0] = PMI_FirstFPR;
OpRegBankIdx[1] = PMI_FirstFPR;
- // Index needs to be a GPR constant.
+
+ // Index needs to be a GPR.
OpRegBankIdx[2] = PMI_FirstGPR;
- // SDAG will promote a 64bit G_EXTRACT_VECTOR_ELT to 128 to reduce the
- // number of duplicate lane-extract patterns needed. Do the same here so
- // that selection will operate on the larger vectors.
- LLT Ty = MRI.getType(MI.getOperand(1).getReg());
- if (!Ty.isScalable() && Ty.getSizeInBits() == 64)
- MappingID = CustomMappingID;
break;
- }
case TargetOpcode::G_INSERT_VECTOR_ELT:
OpRegBankIdx[0] = PMI_FirstFPR;
OpRegBankIdx[1] = PMI_FirstFPR;
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-extract-vector-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-extract-vector-elt.mir
index 4e569e0bc7e5f..35bc36d472b1a 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-extract-vector-elt.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-extract-vector-elt.mir
@@ -94,9 +94,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr(<4 x s16>) = COPY $d0
; CHECK-NEXT: [[C:%[0-9]+]]:gpr(s64) = G_CONSTANT i64 1
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:fpr(<4 x s16>) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:fpr(<8 x s16>) = G_CONCAT_VECTORS [[COPY]](<4 x s16>), [[DEF]](<4 x s16>)
- ; CHECK-NEXT: [[EVEC:%[0-9]+]]:fpr(s16) = G_EXTRACT_VECTOR_ELT [[CONCAT_VECTORS]](<8 x s16>), [[C]](s64)
+ ; CHECK-NEXT: [[EVEC:%[0-9]+]]:fpr(s16) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s16>), [[C]](s64)
; CHECK-NEXT: $h0 = COPY [[EVEC]](s16)
; CHECK-NEXT: RET_ReallyLR implicit $h0
%0:_(<4 x s16>) = COPY $d0
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
index 287344bdbd29f..7f922c0047553 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
@@ -70,9 +70,6 @@ define <1 x i32> @test_bitf_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
;
; CHECK-GI-LABEL: test_bitf_v1i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: fmov w8, s2
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: fmov w10, s0
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
index 73fcee56506f9..b8eb8269d605c 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
@@ -70,9 +70,6 @@ define <1 x i32> @test_bit_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
;
; CHECK-GI-LABEL: test_bit_v1i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-GI-NEXT: fmov w8, s2
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: fmov w10, s0
diff --git a/llvm/test/CodeGen/AArch64/abs.ll b/llvm/test/CodeGen/AArch64/abs.ll
index 470d68a805718..0f56d25a47b2a 100644
--- a/llvm/test/CodeGen/AArch64/abs.ll
+++ b/llvm/test/CodeGen/AArch64/abs.ll
@@ -243,7 +243,6 @@ define <1 x i32> @abs_v1i32(<1 x i32> %a){
;
; CHECK-GI-LABEL: abs_v1i32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: cmp w8, #0
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index 60af49d867be7..367105f783817 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1215,7 +1215,6 @@ define <8 x i8> @testDUP.v1i8(<1 x i8> %a) {
;
; CHECK-GI-LABEL: testDUP.v1i8:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: dup v0.8b, w8
; CHECK-GI-NEXT: ret
@@ -1711,7 +1710,7 @@ define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 {
; CHECK-GI-NEXT: mov v2.16b, v1.16b
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: adrp x8, .LCPI127_0
-; CHECK-GI-NEXT: mov b1, v0.b[0]
+; CHECK-GI-NEXT: mov v1.b[0], v0.b[0]
; CHECK-GI-NEXT: mov v1.b[1], v0.b[1]
; CHECK-GI-NEXT: mov v1.b[2], v0.b[2]
; CHECK-GI-NEXT: mov v1.b[3], v0.b[3]
@@ -1818,7 +1817,7 @@ define <16 x i8> @test_concat_v16i8_v8i8_v8i8(<8 x i8> %x, <8 x i8> %y) #0 {
; CHECK-GI-LABEL: test_concat_v16i8_v8i8_v8i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov b2, v0.b[0]
+; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: mov v2.b[1], v0.b[1]
; CHECK-GI-NEXT: mov v2.b[2], v0.b[2]
@@ -1904,7 +1903,7 @@ define <8 x i16> @test_concat_v8i16_v4i16_v8i16(<4 x i16> %x, <8 x i16> %y) #0 {
; CHECK-GI-NEXT: mov v2.16b, v1.16b
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: adrp x8, .LCPI131_0
-; CHECK-GI-NEXT: mov h1, v0.h[0]
+; CHECK-GI-NEXT: mov v1.h[0], v0.h[0]
; CHECK-GI-NEXT: mov v1.h[1], v0.h[1]
; CHECK-GI-NEXT: mov v1.h[2], v0.h[2]
; CHECK-GI-NEXT: mov v1.h[3], v0.h[3]
@@ -1975,7 +1974,7 @@ define <8 x i16> @test_concat_v8i16_v4i16_v4i16(<4 x i16> %x, <4 x i16> %y) #0 {
; CHECK-GI-LABEL: test_concat_v8i16_v4i16_v4i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov h2, v0.h[0]
+; CHECK-GI-NEXT: mov v2.h[0], v0.h[0]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: mov v2.h[1], v0.h[1]
; CHECK-GI-NEXT: mov v2.h[2], v0.h[2]
@@ -2037,7 +2036,7 @@ define <4 x i32> @test_concat_v4i32_v2i32_v4i32(<2 x i32> %x, <4 x i32> %y) #0 {
; CHECK-GI-NEXT: mov v2.16b, v1.16b
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: adrp x8, .LCPI135_0
-; CHECK-GI-NEXT: mov s1, v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI135_0]
; CHECK-GI-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b
@@ -2243,7 +2242,6 @@ define <8 x i8> @concat_vector_v8i8(<1 x i8> %a) {
;
; CHECK-GI-LABEL: concat_vector_v8i8:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: dup v0.8b, w8
; CHECK-GI-NEXT: ret
@@ -2270,7 +2268,6 @@ define <16 x i8> @concat_vector_v16i8(<1 x i8> %a) {
;
; CHECK-GI-LABEL: concat_vector_v16i8:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: dup v0.16b, w8
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll b/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
index ac6f041ccd70d..f47c06e1ba4cb 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
@@ -614,11 +614,16 @@ entry:
}
define void @test_vst1_lane0_s16(ptr %a, <4 x i16> %b) {
-; CHECK-LABEL: test_vst1_lane0_s16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: str h0, [x0]
-; CHECK-NEXT: ret
+; CHECK-GI-LABEL: test_vst1_lane0_s16:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str h0, [x0]
+; CHECK-GI-NEXT: ret
+;
+; CHECK-SD-LABEL: test_vst1_lane0_s16:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: str h0, [x0]
+; CHECK-SD-NEXT: ret
entry:
%0 = extractelement <4 x i16> %b, i32 0
store i16 %0, ptr %a, align 2
@@ -638,11 +643,16 @@ entry:
}
define void @test_vst1_lane0_s32(ptr %a, <2 x i32> %b) {
-; CHECK-LABEL: test_vst1_lane0_s32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: str s0, [x0]
-; CHECK-NEXT: ret
+; CHECK-GI-LABEL: test_vst1_lane0_s32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str s0, [x0]
+; CHECK-GI-NEXT: ret
+;
+; CHECK-SD-LABEL: test_vst1_lane0_s32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: str s0, [x0]
+; CHECK-SD-NEXT: ret
entry:
%0 = extractelement <2 x i32> %b, i32 0
store i32 %0, ptr %a, align 4
@@ -673,11 +683,16 @@ entry:
}
define void @test_vst1_lane0_f32(ptr %a, <2 x float> %b) {
-; CHECK-LABEL: test_vst1_lane0_f32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: str s0, [x0]
-; CHECK-NEXT: ret
+; CHECK-GI-LABEL: test_vst1_lane0_f32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str s0, [x0]
+; CHECK-GI-NEXT: ret
+;
+; CHECK-SD-LABEL: test_vst1_lane0_f32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: str s0, [x0]
+; CHECK-SD-NEXT: ret
entry:
%0 = extractelement <2 x float> %b, i32 0
store float %0, ptr %a, align 4
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll b/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
index 1f8ac792d75f5..cb14adc00df00 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
@@ -663,14 +663,24 @@ entry:
}
define i32 @test_vqrdmlahs_lane_s32(i32 %a, i32 %b, <2 x i32> %c) {
-; CHECK-LABEL: test_vqrdmlahs_lane_s32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov s1, w0
-; CHECK-NEXT: fmov s2, w1
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: sqrdmlah s1, s2, v0.s[1]
-; CHECK-NEXT: fmov w0, s1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_vqrdmlahs_lane_s32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov s1, w0
+; CHECK-SD-NEXT: fmov s2, w1
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: sqrdmlah s1, s2, v0.s[1]
+; CHECK-SD-NEXT: fmov w0, s1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_vqrdmlahs_lane_s32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: fmov s1, w0
+; CHECK-GI-NEXT: fmov s2, w1
+; CHECK-GI-NEXT: mov s0, v0.s[1]
+; CHECK-GI-NEXT: sqrdmlah s1, s2, s0
+; CHECK-GI-NEXT: fmov w0, s1
+; CHECK-GI-NEXT: ret
entry:
%vget_lane = extractelement <2 x i32> %c, i64 1
%vqrdmlahs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 %a, i32 %b, i32 %vget_lane) #4
@@ -803,14 +813,24 @@ entry:
}
define i32 @test_vqrdmlshs_lane_s32(i32 %a, i32 %b, <2 x i32> %c) {
-; CHECK-LABEL: test_vqrdmlshs_lane_s32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov s1, w0
-; CHECK-NEXT: fmov s2, w1
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: sqrdmlsh s1, s2, v0.s[1]
-; CHECK-NEXT: fmov w0, s1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_vqrdmlshs_lane_s32:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: fmov s1, w0
+; CHECK-SD-NEXT: fmov s2, w1
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: sqrdmlsh s1, s2, v0.s[1]
+; CHECK-SD-NEXT: fmov w0, s1
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_vqrdmlshs_lane_s32:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: fmov s1, w0
+; CHECK-GI-NEXT: fmov s2, w1
+; CHECK-GI-NEXT: mov s0, v0.s[1]
+; CHECK-GI-NEXT: sqrdmlsh s1, s2, s0
+; CHECK-GI-NEXT: fmov w0, s1
+; CHECK-GI-NEXT: ret
entry:
%vget_lane = extractelement <2 x i32> %c, i64 1
%vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %vget_lane) #4
@@ -847,6 +867,3 @@ entry:
%vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %vgetq_lane) #4
ret i32 %vqrdmlshs_s32.i
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-GI: {{.*}}
-; CHECK-SD: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
index eccf918f74312..d4cc154ac6afc 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
@@ -271,7 +271,6 @@ define half @test_vcvt_f16_f32(<1 x float> %x) {
;
; GISEL-LABEL: test_vcvt_f16_f32:
; GISEL: // %bb.0:
-; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
; GISEL-NEXT: fcvt h0, s0
; GISEL-NEXT: ret
%tmp = fptrunc <1 x float> %x to <1 x half>
diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index 9ae4782b52bd9..898958fb4993f 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -207,7 +207,6 @@ define <1 x i32> @bswap_v1i32(<1 x i32> %a){
;
; CHECK-GI-LABEL: bswap_v1i32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: rev w8, w8
; CHECK-GI-NEXT: fmov s0, w8
diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll
index 1e8dd0c78043a..acf15f1bd1178 100644
--- a/llvm/test/CodeGen/AArch64/concat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/concat-vector.ll
@@ -13,10 +13,11 @@ define <4 x i8> @concat1(<2 x i8> %A, <2 x i8> %B) {
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov w8, v0.s[1]
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: mov w9, v1.s[1]
; CHECK-GI-NEXT: mov v0.h[1], w8
-; CHECK-GI-NEXT: mov w8, v1.s[1]
-; CHECK-GI-NEXT: mov v0.h[2], v1.h[0]
-; CHECK-GI-NEXT: mov v0.h[3], w8
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: mov v0.h[2], w8
+; CHECK-GI-NEXT: mov v0.h[3], w9
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
%v4i8 = shufflevector <2 x i8> %A, <2 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/AArch64/double_reduct.ll b/llvm/test/CodeGen/AArch64/double_reduct.ll
index 2d146bf9aae89..f30895db2c098 100644
--- a/llvm/test/CodeGen/AArch64/double_reduct.ll
+++ b/llvm/test/CodeGen/AArch64/double_reduct.ll
@@ -65,8 +65,10 @@ define float @fmul_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v1.2s
; CHECK-GI-NEXT: fmul v1.2s, v2.2s, v3.2s
-; CHECK-GI-NEXT: fmul s0, s0, v0.s[1]
-; CHECK-GI-NEXT: fmul s1, s1, v1.s[1]
+; CHECK-GI-NEXT: mov s2, v0.s[1]
+; CHECK-GI-NEXT: mov s3, v1.s[1]
+; CHECK-GI-NEXT: fmul s0, s0, s2
+; CHECK-GI-NEXT: fmul s1, s1, s3
; CHECK-GI-NEXT: fmul s0, s0, s1
; CHECK-GI-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a)
@@ -90,8 +92,10 @@ define float @fmul_f32_same(<4 x float> %a, <4 x float> %b) {
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s
-; CHECK-GI-NEXT: fmul s0, s0, v0.s[1]
-; CHECK-GI-NEXT: fmul s1, s1, v1.s[1]
+; CHECK-GI-NEXT: mov s2, v0.s[1]
+; CHECK-GI-NEXT: mov s3, v1.s[1]
+; CHECK-GI-NEXT: fmul s0, s0, s2
+; CHECK-GI-NEXT: fmul s1, s1, s3
; CHECK-GI-NEXT: fmul s0, s0, s1
; CHECK-GI-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
@@ -918,8 +922,10 @@ define float @nested_mul_f32(<4 x float> %a, <4 x float> %b, float %c, float %d)
; CHECK-GI-NEXT: mov d5, v1.d[1]
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v4.2s
; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v5.2s
-; CHECK-GI-NEXT: fmul s0, s0, v0.s[1]
-; CHECK-GI-NEXT: fmul s1, s1, v1.s[1]
+; CHECK-GI-NEXT: mov s4, v0.s[1]
+; CHECK-GI-NEXT: mov s5, v1.s[1]
+; CHECK-GI-NEXT: fmul s0, s0, s4
+; CHECK-GI-NEXT: fmul s1, s1, s5
; CHECK-GI-NEXT: fmul s0, s0, s2
; CHECK-GI-NEXT: fmul s1, s1, s3
; CHECK-GI-NEXT: fmul s0, s0, s1
diff --git a/llvm/test/CodeGen/AArch64/f16-instructions.ll b/llvm/test/CodeGen/AArch64/f16-instructions.ll
index aa120f2643950..adc536da26f26 100644
--- a/llvm/test/CodeGen/AArch64/f16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/f16-instructions.ll
@@ -1496,7 +1496,7 @@ define half @test_copysign(half %a, half %b) #0 {
; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0
; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1
; CHECK-CVT-GI-NEXT: bif v0.8b, v1.8b, v2.8b
-; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $q0
+; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $d0
; CHECK-CVT-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: test_copysign:
@@ -1505,7 +1505,7 @@ define half @test_copysign(half %a, half %b) #0 {
; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 def $d0
; CHECK-FP16-GI-NEXT: // kill: def $h1 killed $h1 def $d1
; CHECK-FP16-GI-NEXT: bif v0.8b, v1.8b, v2.8b
-; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 killed $q0
+; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 killed $d0
; CHECK-FP16-GI-NEXT: ret
%r = call half @llvm.copysign.f16(half %a, half %b)
ret half %r
@@ -1536,7 +1536,7 @@ define half @test_copysign_f32(half %a, float %b) #0 {
; CHECK-CVT-GI-NEXT: mvni v2.4h, #128, lsl #8
; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0
; CHECK-CVT-GI-NEXT: bif v0.8b, v1.8b, v2.8b
-; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $q0
+; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $d0
; CHECK-CVT-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: test_copysign_f32:
@@ -1545,7 +1545,7 @@ define half @test_copysign_f32(half %a, float %b) #0 {
; CHECK-FP16-GI-NEXT: mvni v2.4h, #128, lsl #8
; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 def $d0
; CHECK-FP16-GI-NEXT: bif v0.8b, v1.8b, v2.8b
-; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 killed $q0
+; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 killed $d0
; CHECK-FP16-GI-NEXT: ret
%tb = fptrunc float %b to half
%r = call half @llvm.copysign.f16(half %a, half %tb)
@@ -1577,7 +1577,7 @@ define half @test_copysign_f64(half %a, double %b) #0 {
; CHECK-CVT-GI-NEXT: mvni v2.4h, #128, lsl #8
; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0
; CHECK-CVT-GI-NEXT: bif v0.8b, v1.8b, v2.8b
-; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $q0
+; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $d0
; CHECK-CVT-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: test_copysign_f64:
@@ -1586,7 +1586,7 @@ define half @test_copysign_f64(half %a, double %b) #0 {
; CHECK-FP16-GI-NEXT: mvni v2.4h, #128, lsl #8
; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 def $d0
; CHECK-FP16-GI-NEXT: bif v0.8b, v1.8b, v2.8b
-; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 killed $q0
+; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 killed $d0
; CHECK-FP16-GI-NEXT: ret
%tb = fptrunc double %b to half
%r = call half @llvm.copysign.f16(half %a, half %tb)
diff --git a/llvm/test/CodeGen/AArch64/faddsub.ll b/llvm/test/CodeGen/AArch64/faddsub.ll
index 943073e2a603e..b15579199a059 100644
--- a/llvm/test/CodeGen/AArch64/faddsub.ll
+++ b/llvm/test/CodeGen/AArch64/faddsub.ll
@@ -196,7 +196,7 @@ define <7 x half> @fadd_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT: mov h0, v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
@@ -537,7 +537,7 @@ define <7 x half> @fsub_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT: mov h0, v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll
index 7ac1f37af2e0b..3a5f7e2cd6b29 100644
--- a/llvm/test/CodeGen/AArch64/fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/fcopysign.ll
@@ -33,7 +33,7 @@ define float @copysign_f32(float %a, float %b) {
; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $d0
; CHECK-GI-NEXT: // kill: def $s1 killed $s1 def $d1
; CHECK-GI-NEXT: bif v0.8b, v1.8b, v2.8b
-; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $d0
; CHECK-GI-NEXT: ret
entry:
%c = call float @llvm.copysign.f32(float %a, float %b)
@@ -56,7 +56,7 @@ define half @copysign_f16(half %a, half %b) {
; CHECK-GI-NEXT: // kill: def $h0 killed $h0 def $d0
; CHECK-GI-NEXT: // kill: def $h1 killed $h1 def $d1
; CHECK-GI-NEXT: bif v0.8b, v1.8b, v2.8b
-; CHECK-GI-NEXT: // kill: def $h0 killed $h0 killed $q0
+; CHECK-GI-NEXT: // kill: def $h0 killed $h0 killed $d0
; CHECK-GI-NEXT: ret
entry:
%c = call half @llvm.copysign.f16(half %a, half %b)
diff --git a/llvm/test/CodeGen/AArch64/fcvt.ll b/llvm/test/CodeGen/AArch64/fcvt.ll
index 2c512de413aeb..b408e9c1bd4e6 100644
--- a/llvm/test/CodeGen/AArch64/fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt.ll
@@ -169,7 +169,7 @@ define <7 x half> @ceil_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
; CHECK-GI-NOFP16-NEXT: frintp v2.4s, v2.4s
@@ -468,7 +468,7 @@ define <7 x half> @floor_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
; CHECK-GI-NOFP16-NEXT: frintm v2.4s, v2.4s
@@ -767,7 +767,7 @@ define <7 x half> @nearbyint_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
; CHECK-GI-NOFP16-NEXT: frinti v2.4s, v2.4s
@@ -1066,7 +1066,7 @@ define <7 x half> @roundeven_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
; CHECK-GI-NOFP16-NEXT: frintn v2.4s, v2.4s
@@ -1365,7 +1365,7 @@ define <7 x half> @rint_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
; CHECK-GI-NOFP16-NEXT: frintx v2.4s, v2.4s
@@ -1664,7 +1664,7 @@ define <7 x half> @round_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
; CHECK-GI-NOFP16-NEXT: frinta v2.4s, v2.4s
@@ -1963,7 +1963,7 @@ define <7 x half> @trunc_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
; CHECK-GI-NOFP16-NEXT: frintz v2.4s, v2.4s
diff --git a/llvm/test/CodeGen/AArch64/fdiv.ll b/llvm/test/CodeGen/AArch64/fdiv.ll
index d232ca4d9c131..5bdccccc62b99 100644
--- a/llvm/test/CodeGen/AArch64/fdiv.ll
+++ b/llvm/test/CodeGen/AArch64/fdiv.ll
@@ -199,7 +199,7 @@ define <7 x half> @fdiv_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
; CHECK-GI-NOFP16-NEXT: fdiv v1.4s, v0.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov h0, v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
diff --git a/llvm/test/CodeGen/AArch64/fminimummaximum.ll b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
index 1c7c55d12a864..fb12f8acf1745 100644
--- a/llvm/test/CodeGen/AArch64/fminimummaximum.ll
+++ b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
@@ -672,7 +672,7 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s
; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT: mov h0, v2.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0]
; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h
; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1]
@@ -770,7 +770,7 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s
; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT: mov h0, v2.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0]
; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h
; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1]
diff --git a/llvm/test/CodeGen/AArch64/fminmax.ll b/llvm/test/CodeGen/AArch64/fminmax.ll
index da9b57223cff7..64f0da8b4cd0f 100644
--- a/llvm/test/CodeGen/AArch64/fminmax.ll
+++ b/llvm/test/CodeGen/AArch64/fminmax.ll
@@ -672,7 +672,7 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s
; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT: mov h0, v2.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0]
; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h
; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1]
@@ -770,7 +770,7 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s
; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6]
; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT: mov h0, v2.h[0]
+; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0]
; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h
; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h
; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1]
diff --git a/llvm/test/CodeGen/AArch64/fmla.ll b/llvm/test/CodeGen/AArch64/fmla.ll
index ef59209b69921..a37aabb0b5384 100644
--- a/llvm/test/CodeGen/AArch64/fmla.ll
+++ b/llvm/test/CodeGen/AArch64/fmla.ll
@@ -268,7 +268,7 @@ define <7 x half> @fma_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v2.h[6]
-; CHECK-GI-NOFP16-NEXT: mov h0, v5.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v5.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v6.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v4.4h
@@ -873,7 +873,7 @@ define <7 x half> @fmuladd_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v0.4s
; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[6]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov h0, v3.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v5.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
@@ -1358,7 +1358,7 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v0.4s
; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[6]
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT: mov h0, v3.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v5.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1]
diff --git a/llvm/test/CodeGen/AArch64/fmul.ll b/llvm/test/CodeGen/AArch64/fmul.ll
index 51eba5666f681..bd3d1353e643e 100644
--- a/llvm/test/CodeGen/AArch64/fmul.ll
+++ b/llvm/test/CodeGen/AArch64/fmul.ll
@@ -196,7 +196,7 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b) {
; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6]
; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT: mov h0, v2.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0]
; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h
; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1]
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index bcebbf4982eaa..9c21d2bf083a2 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -31,7 +31,6 @@ define <1 x i32> @test_signed_v1f32_v1i32(<1 x float> %f) {
;
; CHECK-GI-LABEL: test_signed_v1f32_v1i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: fcvtzs w8, s0
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index 38895eb7bd761..44847a41287d6 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -31,7 +31,6 @@ define <1 x i32> @test_unsigned_v1f32_v1i32(<1 x float> %f) {
;
; CHECK-GI-LABEL: test_unsigned_v1f32_v1i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: fcvtzu w8, s0
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll
index a428c95c90387..1f84c944d7c16 100644
--- a/llvm/test/CodeGen/AArch64/fptrunc.ll
+++ b/llvm/test/CodeGen/AArch64/fptrunc.ll
@@ -263,7 +263,7 @@ define <3 x float> @fptrunc_v3f64_v3f32(<3 x double> %a) {
; CHECK-GI-NEXT: fcvt s2, d2
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d
-; CHECK-GI-NEXT: mov s0, v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
; CHECK-GI-NEXT: mov v0.s[2], v2.s[0]
; CHECK-GI-NEXT: ret
@@ -354,7 +354,7 @@ define <2 x half> @fptrunc_v2f32_v2f16(<2 x float> %a) {
; CHECK-GI-LABEL: fptrunc_v2f32_v2f16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov s1, v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fsqrt.ll b/llvm/test/CodeGen/AArch64/fsqrt.ll
index 1e888a4c0e193..6c5fd8e52b017 100644
--- a/llvm/test/CodeGen/AArch64/fsqrt.ll
+++ b/llvm/test/CodeGen/AArch64/fsqrt.ll
@@ -203,7 +203,7 @@ define <7 x half> @sqrt_v7f16(<7 x half> %a) {
; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v2.4h
; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: fsqrt v2.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[0]
+; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0]
; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1]
; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2]
; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s
diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll
index 1af36ccaefa30..5c89316e5f570 100644
--- a/llvm/test/CodeGen/AArch64/insertextract.ll
+++ b/llvm/test/CodeGen/AArch64/insertextract.ll
@@ -1478,11 +1478,16 @@ entry:
}
define float @extract_v2f32_0(<2 x float> %a, i32 %c) {
-; CHECK-LABEL: extract_v2f32_0:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: extract_v2f32_0:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v2f32_0:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $d0
+; CHECK-GI-NEXT: ret
entry:
%d = extractelement <2 x float> %a, i32 0
ret float %d
@@ -1681,11 +1686,16 @@ entry:
}
define half @extract_v4f16_0(<4 x half> %a, i32 %c) {
-; CHECK-LABEL: extract_v4f16_0:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: // kill: def $h0 killed $h0 killed $q0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: extract_v4f16_0:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: // kill: def $h0 killed $h0 killed $q0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v4f16_0:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $h0 killed $h0 killed $d0
+; CHECK-GI-NEXT: ret
entry:
%d = extractelement <4 x half> %a, i32 0
ret half %d
@@ -2149,11 +2159,16 @@ entry:
}
define i32 @extract_v2i32_0(<2 x i32> %a, i32 %c) {
-; CHECK-LABEL: extract_v2i32_0:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: extract_v2i32_0:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: extract_v2i32_0:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
entry:
%d = extractelement <2 x i32> %a, i32 0
ret i32 %d
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index 5ec30b6e8a667..e8194b9bd9b27 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -4378,7 +4378,7 @@ define <3 x float> @stofp_v3i64_v3f32(<3 x i64> %a) {
; CHECK-GI-NEXT: scvtf v0.2d, v0.2d
; CHECK-GI-NEXT: fcvtn v2.2s, v2.2d
; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d
-; CHECK-GI-NEXT: mov s0, v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
; CHECK-GI-NEXT: mov v0.s[2], v2.s[0]
; CHECK-GI-NEXT: ret
@@ -4415,7 +4415,7 @@ define <3 x float> @utofp_v3i64_v3f32(<3 x i64> %a) {
; CHECK-GI-NEXT: ucvtf v0.2d, v0.2d
; CHECK-GI-NEXT: fcvtn v2.2s, v2.2d
; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d
-; CHECK-GI-NEXT: mov s0, v1.s[0]
+; CHECK-GI-NEXT: mov v0.s[0], v1.s[0]
; CHECK-GI-NEXT: mov v0.s[1], v1.s[1]
; CHECK-GI-NEXT: mov v0.s[2], v2.s[0]
; CHECK-GI-NEXT: ret
@@ -6393,7 +6393,7 @@ define <2 x half> @stofp_v2i64_v2f16(<2 x i64> %a) {
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: scvtf v0.2d, v0.2d
; CHECK-GI-NOFP16-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0]
; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: ret
@@ -6439,7 +6439,7 @@ define <2 x half> @utofp_v2i64_v2f16(<2 x i64> %a) {
; CHECK-GI-NOFP16: // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT: ucvtf v0.2d, v0.2d
; CHECK-GI-NOFP16-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0]
; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: ret
@@ -7375,7 +7375,7 @@ define <2 x half> @stofp_v2i32_v2f16(<2 x i32> %a) {
; CHECK-GI-LABEL: stofp_v2i32_v2f16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: scvtf v0.2s, v0.2s
-; CHECK-GI-NEXT: mov s1, v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NEXT: ret
@@ -7395,7 +7395,7 @@ define <2 x half> @utofp_v2i32_v2f16(<2 x i32> %a) {
; CHECK-GI-LABEL: utofp_v2i32_v2f16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ucvtf v0.2s, v0.2s
-; CHECK-GI-NEXT: mov s1, v0.s[0]
+; CHECK-GI-NEXT: mov v1.s[0], v0.s[0]
; CHECK-GI-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NEXT: ret
@@ -7602,7 +7602,7 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) {
; CHECK-GI-NOFP16-NEXT: shl v0.2s, v0.2s, #16
; CHECK-GI-NOFP16-NEXT: sshr v0.2s, v0.2s, #16
; CHECK-GI-NOFP16-NEXT: scvtf v0.2s, v0.2s
-; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0]
; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: ret
@@ -7637,7 +7637,7 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) {
; CHECK-GI-NOFP16-NEXT: movi d1, #0x00ffff0000ffff
; CHECK-GI-NOFP16-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-GI-NOFP16-NEXT: ucvtf v0.2s, v0.2s
-; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0]
; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: ret
@@ -8124,7 +8124,7 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) {
; CHECK-GI-NOFP16-NEXT: shl v0.2s, v0.2s, #24
; CHECK-GI-NOFP16-NEXT: sshr v0.2s, v0.2s, #24
; CHECK-GI-NOFP16-NEXT: scvtf v0.2s, v0.2s
-; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0]
; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: ret
@@ -8175,7 +8175,7 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) {
; CHECK-GI-NOFP16-NEXT: movi d1, #0x0000ff000000ff
; CHECK-GI-NOFP16-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-GI-NOFP16-NEXT: ucvtf v0.2s, v0.2s
-; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[0]
+; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0]
; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1]
; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s
; CHECK-GI-NOFP16-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/llvm.exp10.ll b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
index 9d165556f1c73..c1ea891bc86e7 100644
--- a/llvm/test/CodeGen/AArch64/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
@@ -301,17 +301,28 @@ define float @exp10_f32(float %x) {
}
define <1 x float> @exp10_v1f32(<1 x float> %x) {
-; CHECK-LABEL: exp10_v1f32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT: bl exp10f
-; CHECK-NEXT: // kill: def $s0 killed $s0 def $d0
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT: ret
+; SDAG-LABEL: exp10_v1f32:
+; SDAG: // %bb.0:
+; SDAG-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; SDAG-NEXT: .cfi_def_cfa_offset 16
+; SDAG-NEXT: .cfi_offset w30, -16
+; SDAG-NEXT: // kill: def $d0 killed $d0 def $q0
+; SDAG-NEXT: // kill: def $s0 killed $s0 killed $q0
+; SDAG-NEXT: bl exp10f
+; SDAG-NEXT: // kill: def $s0 killed $s0 def $d0
+; SDAG-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: exp10_v1f32:
+; GISEL: // %bb.0:
+; GISEL-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; GISEL-NEXT: .cfi_def_cfa_offset 16
+; GISEL-NEXT: .cfi_offset w30, -16
+; GISEL-NEXT: // kill: def $s0 killed $s0 killed $d0
+; GISEL-NEXT: bl exp10f
+; GISEL-NEXT: // kill: def $s0 killed $s0 def $d0
+; GISEL-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; GISEL-NEXT: ret
%r = call <1 x float> @llvm.exp10.v1f32(<1 x float> %x)
ret <1 x float> %r
}
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index eded13a6b3669..c158d8ad93b05 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -655,9 +655,7 @@ define i32 @ctpop_into_extract(ptr %p) {
; CHECKO0-NEXT: // implicit-def: $d2
; CHECKO0-NEXT: fmov s2, w8
; CHECKO0-NEXT: ldr d0, [x0]
-; CHECKO0-NEXT: // implicit-def: $q1
-; CHECKO0-NEXT: fmov d1, d0
-; CHECKO0-NEXT: // kill: def $s1 killed $s1 killed $q1
+; CHECKO0-NEXT: fmov s1, s0
; CHECKO0-NEXT: fmov w8, s1
; CHECKO0-NEXT: fmov s1, w8
; CHECKO0-NEXT: // kill: def $d1 killed $s1
@@ -727,9 +725,7 @@ define i32 @ctpop_into_extract(ptr %p) {
; GISELO0-NEXT: // implicit-def: $d2
; GISELO0-NEXT: fmov s2, w8
; GISELO0-NEXT: ldr d0, [x0]
-; GISELO0-NEXT: // implicit-def: $q1
-; GISELO0-NEXT: fmov d1, d0
-; GISELO0-NEXT: // kill: def $s1 killed $s1 killed $q1
+; GISELO0-NEXT: fmov s1, s0
; GISELO0-NEXT: fmov w8, s1
; GISELO0-NEXT: fmov s1, w8
; GISELO0-NEXT: // kill: def $d1 killed $s1
diff --git a/llvm/test/CodeGen/AArch64/ptradd.ll b/llvm/test/CodeGen/AArch64/ptradd.ll
index 4a1c50b67ed7b..28a8f4303765b 100644
--- a/llvm/test/CodeGen/AArch64/ptradd.ll
+++ b/llvm/test/CodeGen/AArch64/ptradd.ll
@@ -51,7 +51,6 @@ define <1 x ptr> @vector_gep_v1i32(<1 x ptr> %b, <1 x i32> %off) {
;
; CHECK-GI-LABEL: vector_gep_v1i32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: fmov x9, d0
; CHECK-GI-NEXT: add x8, x9, w8, sxtw
diff --git a/llvm/test/CodeGen/AArch64/shift.ll b/llvm/test/CodeGen/AArch64/shift.ll
index 1652eb70b0625..9827cb3526f99 100644
--- a/llvm/test/CodeGen/AArch64/shift.ll
+++ b/llvm/test/CodeGen/AArch64/shift.ll
@@ -595,8 +595,6 @@ define <1 x i32> @shl_v1i32(<1 x i32> %0, <1 x i32> %1){
;
; CHECK-GI-LABEL: shl_v1i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: lsl w8, w8, w9
@@ -773,8 +771,6 @@ define <1 x i32> @ashr_v1i32(<1 x i32> %0, <1 x i32> %1){
;
; CHECK-GI-LABEL: ashr_v1i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: asr w8, w8, w9
@@ -947,8 +943,6 @@ define <1 x i32> @lshr_v1i32(<1 x i32> %0, <1 x i32> %1){
;
; CHECK-GI-LABEL: lshr_v1i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: lsr w8, w8, w9
diff --git a/llvm/test/CodeGen/AArch64/store.ll b/llvm/test/CodeGen/AArch64/store.ll
index 0fe1ef5039929..3a9f12b838702 100644
--- a/llvm/test/CodeGen/AArch64/store.ll
+++ b/llvm/test/CodeGen/AArch64/store.ll
@@ -167,11 +167,16 @@ define void @store_v16i16(<16 x i16> %a, ptr %ptr){
}
define void @store_v1i32(<1 x i32> %a, ptr %ptr){
-; CHECK-LABEL: store_v1i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: str s0, [x0]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: store_v1i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: str s0, [x0]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: store_v1i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: str s0, [x0]
+; CHECK-GI-NEXT: ret
store <1 x i32> %a, ptr %ptr
ret void
}
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
index 63e26a25f4e27..77483ebb2235c 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
@@ -930,85 +930,195 @@ define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind {
; CHECK-GI-LABEL: vector_to_vector_cast:
; CHECK-GI: ; %bb.0:
; CHECK-GI-NEXT: sub sp, sp, #16
-; CHECK-GI-NEXT: umov.b w10, v0[1]
-; CHECK-GI-NEXT: umov.b w9, v0[1]
+; CHECK-GI-NEXT: umov.b w8, v0[1]
; CHECK-GI-NEXT: mov d1, v0[1]
-; CHECK-GI-NEXT: umov.b w8, v0[0]
-; CHECK-GI-NEXT: umov.b w11, v0[0]
-; CHECK-GI-NEXT: umov.b w12, v0[2]
-; CHECK-GI-NEXT: umov.b w13, v0[2]
+; CHECK-GI-NEXT: umov.b w10, v0[1]
+; CHECK-GI-NEXT: umov.b w9, v0[0]
+; CHECK-GI-NEXT: umov.b w13, v0[0]
+; CHECK-GI-NEXT: umov.b w14, v0[2]
; CHECK-GI-NEXT: umov.b w15, v0[3]
+; CHECK-GI-NEXT: umov.b w11, v0[2]
; CHECK-GI-NEXT: umov.b w16, v0[4]
-; CHECK-GI-NEXT: umov.b w14, v0[3]
+; CHECK-GI-NEXT: umov.b w17, v0[5]
+; CHECK-GI-NEXT: umov.b w12, v0[3]
+; CHECK-GI-NEXT: and w8, w8, #0x1
; CHECK-GI-NEXT: and w10, w10, #0x1
-; CHECK-GI-NEXT: and w9, w9, #0x1
-; CHECK-GI-NEXT: bfi w8, w10, #1, #31
-; CHECK-GI-NEXT: umov.b w10, v1[1]
-; CHECK-GI-NEXT: and w12, w12, #0x1
-; CHECK-GI-NEXT: bfi w11, w9, #1, #31
-; CHECK-GI-NEXT: umov.b w9, v1[0]
-; CHECK-GI-NEXT: and w13, w13, #0x1
-; CHECK-GI-NEXT: orr w8, w8, w12, lsl #2
-; CHECK-GI-NEXT: umov.b w12, v1[2]
+; CHECK-GI-NEXT: umov.b w0, v1[1]
+; CHECK-GI-NEXT: bfi w9, w8, #1, #31
+; CHECK-GI-NEXT: bfi w13, w10, #1, #31
+; CHECK-GI-NEXT: and w14, w14, #0x1
+; CHECK-GI-NEXT: umov.b w8, v1[0]
+; CHECK-GI-NEXT: umov.b w10, v1[2]
; CHECK-GI-NEXT: and w15, w15, #0x1
-; CHECK-GI-NEXT: orr w11, w11, w13, lsl #2
-; CHECK-GI-NEXT: umov.b w13, v0[5]
+; CHECK-GI-NEXT: orr w13, w13, w14, lsl #2
+; CHECK-GI-NEXT: umov.b w14, v1[3]
+; CHECK-GI-NEXT: and w11, w11, #0x1
+; CHECK-GI-NEXT: and w0, w0, #0x1
; CHECK-GI-NEXT: and w16, w16, #0x1
-; CHECK-GI-NEXT: orr w8, w8, w15, lsl #3
-; CHECK-GI-NEXT: umov.b w15, v1[3]
+; CHECK-GI-NEXT: orr w9, w9, w11, lsl #2
+; CHECK-GI-NEXT: orr w13, w13, w15, lsl #3
+; CHECK-GI-NEXT: umov.b w15, v1[4]
+; CHECK-GI-NEXT: umov.b w11, v0[6]
+; CHECK-GI-NEXT: bfi w8, w0, #1, #31
; CHECK-GI-NEXT: and w10, w10, #0x1
-; CHECK-GI-NEXT: bfi w9, w10, #1, #31
-; CHECK-GI-NEXT: umov.b w10, v0[6]
+; CHECK-GI-NEXT: and w17, w17, #0x1
+; CHECK-GI-NEXT: orr w13, w13, w16, lsl #4
; CHECK-GI-NEXT: and w14, w14, #0x1
-; CHECK-GI-NEXT: orr w8, w8, w16, lsl #4
-; CHECK-GI-NEXT: umov.b w16, v1[4]
+; CHECK-GI-NEXT: umov.b w0, v0[7]
+; CHECK-GI-NEXT: orr w8, w8, w10, lsl #2
+; CHECK-GI-NEXT: umov.b w10, v1[5]
+; CHECK-GI-NEXT: umov.b w16, v1[6]
+; CHECK-GI-NEXT: orr w13, w13, w17, lsl #5
+; CHECK-GI-NEXT: umov.b w17, v0[4]
+; CHECK-GI-NEXT: and w15, w15, #0x1
+; CHECK-GI-NEXT: orr w8, w8, w14, lsl #3
; CHECK-GI-NEXT: and w12, w12, #0x1
-; CHECK-GI-NEXT: orr w9, w9, w12, lsl #2
+; CHECK-GI-NEXT: and w11, w11, #0x1
+; CHECK-GI-NEXT: umov.b w14, v1[7]
+; CHECK-GI-NEXT: orr w9, w9, w12, lsl #3
+; CHECK-GI-NEXT: orr w11, w13, w11, lsl #6
+; CHECK-GI-NEXT: orr w8, w8, w15, lsl #4
+; CHECK-GI-NEXT: umov.b w15, v0[5]
+; CHECK-GI-NEXT: and w10, w10, #0x1
+; CHECK-GI-NEXT: and w0, w0, #0x1
+; CHECK-GI-NEXT: and w12, w17, #0x1
+; CHECK-GI-NEXT: umov.b w13, v0[1]
+; CHECK-GI-NEXT: orr w8, w8, w10, lsl #5
+; CHECK-GI-NEXT: and w16, w16, #0x1
+; CHECK-GI-NEXT: orr w9, w9, w12, lsl #4
+; CHECK-GI-NEXT: umov.b w10, v0[0]
+; CHECK-GI-NEXT: orr w11, w11, w0, lsl #7
+; CHECK-GI-NEXT: and w14, w14, #0x1
+; CHECK-GI-NEXT: and w12, w15, #0x1
+; CHECK-GI-NEXT: umov.b w15, v0[2]
+; CHECK-GI-NEXT: orr w8, w8, w16, lsl #6
+; CHECK-GI-NEXT: orr w9, w9, w12, lsl #5
+; CHECK-GI-NEXT: umov.b w12, v0[6]
+; CHECK-GI-NEXT: strb w11, [sp, #8]
+; CHECK-GI-NEXT: and w11, w13, #0x1
+; CHECK-GI-NEXT: umov.b w13, v0[3]
+; CHECK-GI-NEXT: orr w8, w8, w14, lsl #7
+; CHECK-GI-NEXT: umov.b w14, v0[7]
+; CHECK-GI-NEXT: ldr b0, [sp, #8]
+; CHECK-GI-NEXT: bfi w10, w11, #1, #31
+; CHECK-GI-NEXT: and w11, w15, #0x1
+; CHECK-GI-NEXT: strb w8, [sp, #9]
+; CHECK-GI-NEXT: umov.b w15, v0[4]
+; CHECK-GI-NEXT: and w8, w12, #0x1
+; CHECK-GI-NEXT: orr w10, w10, w11, lsl #2
+; CHECK-GI-NEXT: orr w8, w9, w8, lsl #6
+; CHECK-GI-NEXT: and w9, w13, #0x1
+; CHECK-GI-NEXT: umov.b w11, v0[1]
+; CHECK-GI-NEXT: orr w9, w10, w9, lsl #3
+; CHECK-GI-NEXT: umov.b w10, v0[5]
+; CHECK-GI-NEXT: umov.b w12, v0[0]
+; CHECK-GI-NEXT: and w13, w14, #0x1
+; CHECK-GI-NEXT: umov.b w16, v0[2]
+; CHECK-GI-NEXT: umov.b w17, v0[3]
+; CHECK-GI-NEXT: and w14, w15, #0x1
+; CHECK-GI-NEXT: umov.b w15, v0[2]
+; CHECK-GI-NEXT: orr w8, w8, w13, lsl #7
+; CHECK-GI-NEXT: orr w9, w9, w14, lsl #4
+; CHECK-GI-NEXT: umov.b w13, v0[6]
+; CHECK-GI-NEXT: and w11, w11, #0x1
+; CHECK-GI-NEXT: umov.b w14, v0[3]
+; CHECK-GI-NEXT: strb w8, [sp, #10]
+; CHECK-GI-NEXT: and w8, w10, #0x1
+; CHECK-GI-NEXT: bfi w12, w11, #1, #31
+; CHECK-GI-NEXT: orr w8, w9, w8, lsl #5
+; CHECK-GI-NEXT: umov.b w10, v0[4]
+; CHECK-GI-NEXT: and w9, w15, #0x1
+; CHECK-GI-NEXT: umov.b w11, v0[7]
+; CHECK-GI-NEXT: umov.b w15, v0[1]
+; CHECK-GI-NEXT: orr w9, w12, w9, lsl #2
+; CHECK-GI-NEXT: umov.b w12, v0[5]
; CHECK-GI-NEXT: and w13, w13, #0x1
-; CHECK-GI-NEXT: umov.b w12, v0[4]
-; CHECK-GI-NEXT: orr w8, w8, w13, lsl #5
-; CHECK-GI-NEXT: umov.b w13, v1[5]
+; CHECK-GI-NEXT: and w14, w14, #0x1
+; CHECK-GI-NEXT: orr w8, w8, w13, lsl #6
+; CHECK-GI-NEXT: umov.b w13, v0[0]
+; CHECK-GI-NEXT: orr w9, w9, w14, lsl #3
+; CHECK-GI-NEXT: and w10, w10, #0x1
+; CHECK-GI-NEXT: umov.b w14, v0[6]
+; CHECK-GI-NEXT: and w11, w11, #0x1
+; CHECK-GI-NEXT: and w15, w15, #0x1
+; CHECK-GI-NEXT: umov.b w0, v0[3]
+; CHECK-GI-NEXT: orr w9, w9, w10, lsl #4
+; CHECK-GI-NEXT: and w10, w12, #0x1
+; CHECK-GI-NEXT: umov.b w12, v0[7]
+; CHECK-GI-NEXT: orr w8, w8, w11, lsl #7
+; CHECK-GI-NEXT: bfi w13, w15, #1, #31
+; CHECK-GI-NEXT: and w11, w16, #0x1
+; CHECK-GI-NEXT: orr w9, w9, w10, lsl #5
+; CHECK-GI-NEXT: and w10, w14, #0x1
+; CHECK-GI-NEXT: umov.b w14, v0[4]
+; CHECK-GI-NEXT: strb w8, [sp, #11]
+; CHECK-GI-NEXT: umov.b w15, v0[1]
+; CHECK-GI-NEXT: umov.b w16, v0[3]
+; CHECK-GI-NEXT: orr w8, w9, w10, lsl #6
+; CHECK-GI-NEXT: orr w9, w13, w11, lsl #2
+; CHECK-GI-NEXT: and w10, w12, #0x1
+; CHECK-GI-NEXT: and w11, w17, #0x1
+; CHECK-GI-NEXT: umov.b w12, v0[5]
+; CHECK-GI-NEXT: umov.b w17, v0[0]
+; CHECK-GI-NEXT: orr w8, w8, w10, lsl #7
+; CHECK-GI-NEXT: orr w9, w9, w11, lsl #3
+; CHECK-GI-NEXT: umov.b w10, v0[1]
+; CHECK-GI-NEXT: and w11, w14, #0x1
+; CHECK-GI-NEXT: umov.b w14, v0[0]
; CHECK-GI-NEXT: and w15, w15, #0x1
-; CHECK-GI-NEXT: orr w9, w9, w15, lsl #3
+; CHECK-GI-NEXT: orr w9, w9, w11, lsl #4
+; CHECK-GI-NEXT: umov.b w11, v0[2]
+; CHECK-GI-NEXT: umov.b w13, v0[6]
+; CHECK-GI-NEXT: and w12, w12, #0x1
+; CHECK-GI-NEXT: bfi w17, w15, #1, #31
+; CHECK-GI-NEXT: umov.b w15, v0[5]
+; CHECK-GI-NEXT: orr w9, w9, w12, lsl #5
; CHECK-GI-NEXT: and w10, w10, #0x1
-; CHECK-GI-NEXT: umov.b w15, v0[7]
-; CHECK-GI-NEXT: orr w8, w8, w10, lsl #6
-; CHECK-GI-NEXT: umov.b w10, v1[6]
-; CHECK-GI-NEXT: and w16, w16, #0x1
-; CHECK-GI-NEXT: orr w9, w9, w16, lsl #4
-; CHECK-GI-NEXT: umov.b w16, v0[5]
-; CHECK-GI-NEXT: orr w11, w11, w14, lsl #3
+; CHECK-GI-NEXT: umov.b w12, v0[2]
+; CHECK-GI-NEXT: bfi w14, w10, #1, #31
+; CHECK-GI-NEXT: umov.b w10, v0[4]
+; CHECK-GI-NEXT: ldr b1, [sp, #9]
+; CHECK-GI-NEXT: and w11, w11, #0x1
; CHECK-GI-NEXT: and w13, w13, #0x1
-; CHECK-GI-NEXT: umov.b w14, v1[7]
+; CHECK-GI-NEXT: strb w8, [sp, #12]
+; CHECK-GI-NEXT: orr w11, w14, w11, lsl #2
+; CHECK-GI-NEXT: and w14, w16, #0x1
+; CHECK-GI-NEXT: umov.b w16, v0[4]
; CHECK-GI-NEXT: and w12, w12, #0x1
-; CHECK-GI-NEXT: orr w9, w9, w13, lsl #5
-; CHECK-GI-NEXT: umov.b w13, v0[6]
-; CHECK-GI-NEXT: orr w11, w11, w12, lsl #4
+; CHECK-GI-NEXT: and w15, w15, #0x1
+; CHECK-GI-NEXT: orr w9, w9, w13, lsl #6
+; CHECK-GI-NEXT: orr w11, w11, w14, lsl #3
+; CHECK-GI-NEXT: orr w12, w17, w12, lsl #2
; CHECK-GI-NEXT: and w10, w10, #0x1
-; CHECK-GI-NEXT: and w12, w15, #0x1
+; CHECK-GI-NEXT: and w17, w0, #0x1
+; CHECK-GI-NEXT: umov.b w0, v0[5]
+; CHECK-GI-NEXT: umov.b w14, v0[6]
+; CHECK-GI-NEXT: orr w10, w11, w10, lsl #4
+; CHECK-GI-NEXT: orr w12, w12, w17, lsl #3
+; CHECK-GI-NEXT: umov.b w11, v0[7]
+; CHECK-GI-NEXT: and w16, w16, #0x1
+; CHECK-GI-NEXT: umov.b w17, v0[6]
+; CHECK-GI-NEXT: orr w10, w10, w15, lsl #5
; CHECK-GI-NEXT: umov.b w15, v0[7]
-; CHECK-GI-NEXT: orr w9, w9, w10, lsl #6
-; CHECK-GI-NEXT: and w10, w16, #0x1
-; CHECK-GI-NEXT: orr w8, w8, w12, lsl #7
-; CHECK-GI-NEXT: orr w10, w11, w10, lsl #5
-; CHECK-GI-NEXT: and w11, w14, #0x1
+; CHECK-GI-NEXT: orr w12, w12, w16, lsl #4
+; CHECK-GI-NEXT: and w16, w0, #0x1
+; CHECK-GI-NEXT: umov.b w0, v0[7]
+; CHECK-GI-NEXT: and w14, w14, #0x1
+; CHECK-GI-NEXT: orr w12, w12, w16, lsl #5
+; CHECK-GI-NEXT: orr w10, w10, w14, lsl #6
+; CHECK-GI-NEXT: and w11, w11, #0x1
+; CHECK-GI-NEXT: and w13, w17, #0x1
; CHECK-GI-NEXT: orr w9, w9, w11, lsl #7
-; CHECK-GI-NEXT: and w11, w13, #0x1
-; CHECK-GI-NEXT: strb w8, [sp, #8]
-; CHECK-GI-NEXT: orr w8, w10, w11, lsl #6
-; CHECK-GI-NEXT: ldr b0, [sp, #8]
-; CHECK-GI-NEXT: strb w9, [sp, #9]
-; CHECK-GI-NEXT: and w9, w15, #0x1
-; CHECK-GI-NEXT: ldr b1, [sp, #9]
-; CHECK-GI-NEXT: orr w8, w8, w9, lsl #7
; CHECK-GI-NEXT: mov.s v0[1], v1[0]
-; CHECK-GI-NEXT: strb w8, [sp, #10]
-; CHECK-GI-NEXT: strb w8, [sp, #11]
+; CHECK-GI-NEXT: orr w11, w12, w13, lsl #6
+; CHECK-GI-NEXT: and w12, w15, #0x1
; CHECK-GI-NEXT: ; kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: strb w8, [sp, #12]
-; CHECK-GI-NEXT: strb w8, [sp, #13]
+; CHECK-GI-NEXT: orr w8, w10, w12, lsl #7
+; CHECK-GI-NEXT: and w10, w0, #0x1
+; CHECK-GI-NEXT: strb w9, [sp, #13]
+; CHECK-GI-NEXT: orr w9, w11, w10, lsl #7
; CHECK-GI-NEXT: strb w8, [sp, #14]
-; CHECK-GI-NEXT: strb w8, [sp, #15]
+; CHECK-GI-NEXT: strb w9, [sp, #15]
; CHECK-GI-NEXT: add sp, sp, #16
; CHECK-GI-NEXT: ret
%bc = bitcast <16 x i1> %arg to <2 x i8>
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
index bd68b213ec988..1164e02a16c9e 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
@@ -79,10 +79,11 @@ define half @add_HalfH(<4 x half> %bin.rdx) {
; CHECK-GI-FP16-LABEL: add_HalfH:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT: mov h1, v0.h[2]
-; CHECK-GI-FP16-NEXT: faddp h2, v0.2h
+; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT: mov h2, v0.h[2]
+; CHECK-GI-FP16-NEXT: fadd h1, h0, h1
; CHECK-GI-FP16-NEXT: mov h0, v0.h[3]
-; CHECK-GI-FP16-NEXT: fadd h1, h2, h1
+; CHECK-GI-FP16-NEXT: fadd h1, h1, h2
; CHECK-GI-FP16-NEXT: fadd h0, h1, h0
; CHECK-GI-FP16-NEXT: ret
%r = call half @llvm.vector.reduce.fadd.f16.v4f16(half -0.0, <4 x half> %bin.rdx)
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll
index 1906ca9defa40..1d295a30a994b 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll
@@ -44,11 +44,27 @@ define half @test_v1f16(<1 x half> %a) nounwind {
}
define float @test_v1f32(<1 x float> %a) nounwind {
-; CHECK-LABEL: test_v1f32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT: ret
+; CHECK-NOFP-SD-LABEL: test_v1f32:
+; CHECK-NOFP-SD: // %bb.0:
+; CHECK-NOFP-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NOFP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-NOFP-SD-NEXT: ret
+;
+; CHECK-FP-SD-LABEL: test_v1f32:
+; CHECK-FP-SD: // %bb.0:
+; CHECK-FP-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-FP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-FP-SD-NEXT: ret
+;
+; CHECK-NOFP-GI-LABEL: test_v1f32:
+; CHECK-NOFP-GI: // %bb.0:
+; CHECK-NOFP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0
+; CHECK-NOFP-GI-NEXT: ret
+;
+; CHECK-FP-GI-LABEL: test_v1f32:
+; CHECK-FP-GI: // %bb.0:
+; CHECK-FP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0
+; CHECK-FP-GI-NEXT: ret
%b = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a)
ret float %b
}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
index 152eb66ebcdfe..ee2af110c84cd 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
@@ -44,11 +44,27 @@ define half @test_v1f16(<1 x half> %a) nounwind {
}
define float @test_v1f32(<1 x float> %a) nounwind {
-; CHECK-LABEL: test_v1f32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT: ret
+; CHECK-NOFP-SD-LABEL: test_v1f32:
+; CHECK-NOFP-SD: // %bb.0:
+; CHECK-NOFP-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NOFP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-NOFP-SD-NEXT: ret
+;
+; CHECK-FP-SD-LABEL: test_v1f32:
+; CHECK-FP-SD: // %bb.0:
+; CHECK-FP-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-FP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-FP-SD-NEXT: ret
+;
+; CHECK-NOFP-GI-LABEL: test_v1f32:
+; CHECK-NOFP-GI: // %bb.0:
+; CHECK-NOFP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0
+; CHECK-NOFP-GI-NEXT: ret
+;
+; CHECK-FP-GI-LABEL: test_v1f32:
+; CHECK-FP-GI: // %bb.0:
+; CHECK-FP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0
+; CHECK-FP-GI-NEXT: ret
%b = call nnan float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a)
ret float %b
}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll
index a1b7118d8080d..be61f9b521795 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll
@@ -40,11 +40,27 @@ define half @test_v1f16(<1 x half> %a) nounwind {
}
define float @test_v1f32(<1 x float> %a) nounwind {
-; CHECK-LABEL: test_v1f32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT: ret
+; CHECK-NOFP-SD-LABEL: test_v1f32:
+; CHECK-NOFP-SD: // %bb.0:
+; CHECK-NOFP-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NOFP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-NOFP-SD-NEXT: ret
+;
+; CHECK-FP-SD-LABEL: test_v1f32:
+; CHECK-FP-SD: // %bb.0:
+; CHECK-FP-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-FP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-FP-SD-NEXT: ret
+;
+; CHECK-NOFP-GI-LABEL: test_v1f32:
+; CHECK-NOFP-GI: // %bb.0:
+; CHECK-NOFP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0
+; CHECK-NOFP-GI-NEXT: ret
+;
+; CHECK-FP-GI-LABEL: test_v1f32:
+; CHECK-FP-GI: // %bb.0:
+; CHECK-FP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0
+; CHECK-FP-GI-NEXT: ret
%b = call float @llvm.vector.reduce.fmaximum.v1f32(<1 x float> %a)
ret float %b
}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
index d5f999add22c2..300081dc3ec40 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
@@ -44,11 +44,27 @@ define half @test_v1f16(<1 x half> %a) nounwind {
}
define float @test_v1f32(<1 x float> %a) nounwind {
-; CHECK-LABEL: test_v1f32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT: ret
+; CHECK-NOFP-SD-LABEL: test_v1f32:
+; CHECK-NOFP-SD: // %bb.0:
+; CHECK-NOFP-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NOFP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-NOFP-SD-NEXT: ret
+;
+; CHECK-FP-SD-LABEL: test_v1f32:
+; CHECK-FP-SD: // %bb.0:
+; CHECK-FP-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-FP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-FP-SD-NEXT: ret
+;
+; CHECK-NOFP-GI-LABEL: test_v1f32:
+; CHECK-NOFP-GI: // %bb.0:
+; CHECK-NOFP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0
+; CHECK-NOFP-GI-NEXT: ret
+;
+; CHECK-FP-GI-LABEL: test_v1f32:
+; CHECK-FP-GI: // %bb.0:
+; CHECK-FP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0
+; CHECK-FP-GI-NEXT: ret
%b = call nnan float @llvm.vector.reduce.fmin.v1f32(<1 x float> %a)
ret float %b
}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll b/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll
index 719cac8f33028..e735f670ced0c 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll
@@ -40,11 +40,27 @@ define half @test_v1f16(<1 x half> %a) nounwind {
}
define float @test_v1f32(<1 x float> %a) nounwind {
-; CHECK-LABEL: test_v1f32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT: ret
+; CHECK-NOFP-SD-LABEL: test_v1f32:
+; CHECK-NOFP-SD: // %bb.0:
+; CHECK-NOFP-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NOFP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-NOFP-SD-NEXT: ret
+;
+; CHECK-FP-SD-LABEL: test_v1f32:
+; CHECK-FP-SD: // %bb.0:
+; CHECK-FP-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-FP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-FP-SD-NEXT: ret
+;
+; CHECK-NOFP-GI-LABEL: test_v1f32:
+; CHECK-NOFP-GI: // %bb.0:
+; CHECK-NOFP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0
+; CHECK-NOFP-GI-NEXT: ret
+;
+; CHECK-FP-GI-LABEL: test_v1f32:
+; CHECK-FP-GI: // %bb.0:
+; CHECK-FP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0
+; CHECK-FP-GI-NEXT: ret
%b = call float @llvm.vector.reduce.fminimum.v1f32(<1 x float> %a)
ret float %b
}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
index e22a5a4af4fae..e1b21705c95f3 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
@@ -5,11 +5,18 @@
; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
define float @mul_HalfS(<2 x float> %bin.rdx) {
-; CHECK-LABEL: mul_HalfS:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: fmul s0, s0, v0.s[1]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: mul_HalfS:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: mul_HalfS:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov s1, v0.s[1]
+; CHECK-GI-NEXT: fmul s0, s0, s1
+; CHECK-GI-NEXT: ret
%r = call float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %bin.rdx)
ret float %r
}
@@ -72,9 +79,12 @@ define half @mul_HalfH(<4 x half> %bin.rdx) {
; CHECK-GI-FP16-LABEL: mul_HalfH:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT: fmul h1, h0, v0.h[1]
-; CHECK-GI-FP16-NEXT: fmul h1, h1, v0.h[2]
-; CHECK-GI-FP16-NEXT: fmul h0, h1, v0.h[3]
+; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT: mov h2, v0.h[2]
+; CHECK-GI-FP16-NEXT: fmul h1, h0, h1
+; CHECK-GI-FP16-NEXT: mov h0, v0.h[3]
+; CHECK-GI-FP16-NEXT: fmul h1, h1, h2
+; CHECK-GI-FP16-NEXT: fmul h0, h1, h0
; CHECK-GI-FP16-NEXT: ret
%r = call half @llvm.vector.reduce.fmul.f16.v4f16(half 1.0, <4 x half> %bin.rdx)
ret half %r
@@ -465,6 +475,3 @@ declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-GI: {{.*}}
-; CHECK-SD: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll
index 5fd705b07ca3b..2429cf4b4597a 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll
@@ -5,11 +5,18 @@
; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
define float @mul_HalfS(<2 x float> %bin.rdx) {
-; CHECK-LABEL: mul_HalfS:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: fmul s0, s0, v0.s[1]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: mul_HalfS:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: mul_HalfS:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov s1, v0.s[1]
+; CHECK-GI-NEXT: fmul s0, s0, s1
+; CHECK-GI-NEXT: ret
%r = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %bin.rdx)
ret float %r
}
@@ -44,17 +51,20 @@ define half @mul_HalfH(<4 x half> %bin.rdx) {
; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: mov d1, v0.d[1]
; CHECK-GI-NOFP16-NEXT: fmul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NOFP16-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1]
+; CHECK-GI-NOFP16-NEXT: fmul s0, s0, s1
; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: mul_HalfH:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT: mov h1, v0.h[3]
-; CHECK-GI-FP16-NEXT: fmul h2, h0, v0.h[1]
-; CHECK-GI-FP16-NEXT: fmul h0, h1, v0.h[2]
-; CHECK-GI-FP16-NEXT: fmul h0, h2, h0
+; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT: mov h2, v0.h[2]
+; CHECK-GI-FP16-NEXT: mov h3, v0.h[3]
+; CHECK-GI-FP16-NEXT: fmul h0, h0, h1
+; CHECK-GI-FP16-NEXT: fmul h1, h2, h3
+; CHECK-GI-FP16-NEXT: fmul h0, h0, h1
; CHECK-GI-FP16-NEXT: ret
%r = call fast half @llvm.vector.reduce.fmul.f16.v4f16(half 1.0, <4 x half> %bin.rdx)
ret half %r
@@ -105,7 +115,8 @@ define half @mul_H(<8 x half> %bin.rdx) {
; CHECK-GI-NOFP16-NEXT: fmul v0.4s, v1.4s, v0.4s
; CHECK-GI-NOFP16-NEXT: mov d1, v0.d[1]
; CHECK-GI-NOFP16-NEXT: fmul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NOFP16-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1]
+; CHECK-GI-NOFP16-NEXT: fmul s0, s0, s1
; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
; CHECK-GI-NOFP16-NEXT: ret
;
@@ -113,10 +124,12 @@ define half @mul_H(<8 x half> %bin.rdx) {
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: mov d1, v0.d[1]
; CHECK-GI-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h
-; CHECK-GI-FP16-NEXT: mov h1, v0.h[3]
-; CHECK-GI-FP16-NEXT: fmul h2, h0, v0.h[1]
-; CHECK-GI-FP16-NEXT: fmul h0, h1, v0.h[2]
-; CHECK-GI-FP16-NEXT: fmul h0, h2, h0
+; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT: mov h2, v0.h[2]
+; CHECK-GI-FP16-NEXT: mov h3, v0.h[3]
+; CHECK-GI-FP16-NEXT: fmul h0, h0, h1
+; CHECK-GI-FP16-NEXT: fmul h1, h2, h3
+; CHECK-GI-FP16-NEXT: fmul h0, h0, h1
; CHECK-GI-FP16-NEXT: ret
%r = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %bin.rdx)
ret half %r
@@ -134,7 +147,8 @@ define float @mul_S(<4 x float> %bin.rdx) {
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-GI-NEXT: mov s1, v0.s[1]
+; CHECK-GI-NEXT: fmul s0, s0, s1
; CHECK-GI-NEXT: ret
%r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %bin.rdx)
ret float %r
@@ -206,7 +220,8 @@ define half @mul_2H(<16 x half> %bin.rdx) {
; CHECK-GI-NOFP16-NEXT: fmul v0.4s, v0.4s, v1.4s
; CHECK-GI-NOFP16-NEXT: mov d1, v0.d[1]
; CHECK-GI-NOFP16-NEXT: fmul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NOFP16-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1]
+; CHECK-GI-NOFP16-NEXT: fmul s0, s0, s1
; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
; CHECK-GI-NOFP16-NEXT: ret
;
@@ -215,10 +230,12 @@ define half @mul_2H(<16 x half> %bin.rdx) {
; CHECK-GI-FP16-NEXT: fmul v0.8h, v0.8h, v1.8h
; CHECK-GI-FP16-NEXT: mov d1, v0.d[1]
; CHECK-GI-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h
-; CHECK-GI-FP16-NEXT: mov h1, v0.h[3]
-; CHECK-GI-FP16-NEXT: fmul h2, h0, v0.h[1]
-; CHECK-GI-FP16-NEXT: fmul h0, h1, v0.h[2]
-; CHECK-GI-FP16-NEXT: fmul h0, h2, h0
+; CHECK-GI-FP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT: mov h2, v0.h[2]
+; CHECK-GI-FP16-NEXT: mov h3, v0.h[3]
+; CHECK-GI-FP16-NEXT: fmul h0, h0, h1
+; CHECK-GI-FP16-NEXT: fmul h1, h2, h3
+; CHECK-GI-FP16-NEXT: fmul h0, h0, h1
; CHECK-GI-FP16-NEXT: ret
%r = call fast half @llvm.vector.reduce.fmul.f16.v16f16(half 1.0, <16 x half> %bin.rdx)
ret half %r
@@ -238,7 +255,8 @@ define float @mul_2S(<8 x float> %bin.rdx) {
; CHECK-GI-NEXT: fmul v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NEXT: fmul s0, s0, v0.s[1]
+; CHECK-GI-NEXT: mov s1, v0.s[1]
+; CHECK-GI-NEXT: fmul s0, s0, s1
; CHECK-GI-NEXT: ret
%r = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %bin.rdx)
ret float %r
@@ -271,8 +289,9 @@ define float @mul_S_init_42(<4 x float> %bin.rdx) {
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: mov w8, #1109917696 // =0x42280000
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT: mov s1, v0.s[1]
+; CHECK-GI-NEXT: fmul s0, s0, s1
; CHECK-GI-NEXT: fmov s1, w8
-; CHECK-GI-NEXT: fmul s0, s0, v0.s[1]
; CHECK-GI-NEXT: fmul s0, s0, s1
; CHECK-GI-NEXT: ret
%r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 42.0, <4 x float> %bin.rdx)
@@ -338,8 +357,10 @@ define half @fmul_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) {
; CHECK-GI-NOFP16-NEXT: mov d3, v1.d[1]
; CHECK-GI-NOFP16-NEXT: fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NOFP16-NEXT: fmul v1.2s, v1.2s, v3.2s
-; CHECK-GI-NOFP16-NEXT: fmul s0, s0, v0.s[1]
-; CHECK-GI-NOFP16-NEXT: fmul s1, s1, v1.s[1]
+; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[1]
+; CHECK-GI-NOFP16-NEXT: mov s3, v1.s[1]
+; CHECK-GI-NOFP16-NEXT: fmul s0, s0, s2
+; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s3
; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
@@ -354,14 +375,18 @@ define half @fmul_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) {
; CHECK-GI-FP16-NEXT: mov d3, v1.d[1]
; CHECK-GI-FP16-NEXT: fmul v0.4h, v0.4h, v2.4h
; CHECK-GI-FP16-NEXT: fmul v1.4h, v1.4h, v3.4h
-; CHECK-GI-FP16-NEXT: mov h2, v0.h[3]
-; CHECK-GI-FP16-NEXT: mov h3, v1.h[3]
-; CHECK-GI-FP16-NEXT: fmul h4, h0, v0.h[1]
-; CHECK-GI-FP16-NEXT: fmul h0, h2, v0.h[2]
-; CHECK-GI-FP16-NEXT: fmul h2, h1, v1.h[1]
-; CHECK-GI-FP16-NEXT: fmul h1, h3, v1.h[2]
-; CHECK-GI-FP16-NEXT: fmul h0, h4, h0
-; CHECK-GI-FP16-NEXT: fmul h1, h2, h1
+; CHECK-GI-FP16-NEXT: mov h2, v0.h[1]
+; CHECK-GI-FP16-NEXT: mov h3, v0.h[2]
+; CHECK-GI-FP16-NEXT: mov h4, v0.h[3]
+; CHECK-GI-FP16-NEXT: mov h5, v1.h[1]
+; CHECK-GI-FP16-NEXT: mov h6, v1.h[2]
+; CHECK-GI-FP16-NEXT: mov h7, v1.h[3]
+; CHECK-GI-FP16-NEXT: fmul h0, h0, h2
+; CHECK-GI-FP16-NEXT: fmul h2, h3, h4
+; CHECK-GI-FP16-NEXT: fmul h1, h1, h5
+; CHECK-GI-FP16-NEXT: fmul h3, h6, h7
+; CHECK-GI-FP16-NEXT: fmul h0, h0, h2
+; CHECK-GI-FP16-NEXT: fmul h1, h1, h3
; CHECK-GI-FP16-NEXT: fmul h0, h0, h1
; CHECK-GI-FP16-NEXT: ret
%r1 = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %a)
@@ -389,8 +414,10 @@ define float @fmul_reduct_reassoc_v8f32(<8 x float> %a, <8 x float> %b) {
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s
-; CHECK-GI-NEXT: fmul s0, s0, v0.s[1]
-; CHECK-GI-NEXT: fmul s1, s1, v1.s[1]
+; CHECK-GI-NEXT: mov s2, v0.s[1]
+; CHECK-GI-NEXT: mov s3, v1.s[1]
+; CHECK-GI-NEXT: fmul s0, s0, s2
+; CHECK-GI-NEXT: fmul s1, s1, s3
; CHECK-GI-NEXT: fmul s0, s0, s1
; CHECK-GI-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a)
@@ -414,8 +441,10 @@ define float @fmul_reduct_reassoc_v4f32(<4 x float> %a, <4 x float> %b) {
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s
-; CHECK-GI-NEXT: fmul s0, s0, v0.s[1]
-; CHECK-GI-NEXT: fmul s1, s1, v1.s[1]
+; CHECK-GI-NEXT: mov s2, v0.s[1]
+; CHECK-GI-NEXT: mov s3, v1.s[1]
+; CHECK-GI-NEXT: fmul s0, s0, s2
+; CHECK-GI-NEXT: fmul s1, s1, s3
; CHECK-GI-NEXT: fmul s0, s0, s1
; CHECK-GI-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
@@ -442,10 +471,12 @@ define float @fmul_reduct_reassoc_v4f32_init(float %i, <4 x float> %a, <4 x floa
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s
; CHECK-GI-NEXT: mov d3, v2.d[1]
-; CHECK-GI-NEXT: fmul s1, s1, v1.s[1]
+; CHECK-GI-NEXT: mov s4, v1.s[1]
; CHECK-GI-NEXT: fmul v2.2s, v2.2s, v3.2s
+; CHECK-GI-NEXT: fmul s1, s1, s4
+; CHECK-GI-NEXT: mov s3, v2.s[1]
; CHECK-GI-NEXT: fmul s0, s0, s1
-; CHECK-GI-NEXT: fmul s1, s2, v2.s[1]
+; CHECK-GI-NEXT: fmul s1, s2, s3
; CHECK-GI-NEXT: fmul s0, s0, s1
; CHECK-GI-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %i, <4 x float> %a)
@@ -471,8 +502,10 @@ define float @fmul_reduct_reassoc_v4v8f32(<4 x float> %a, <8 x float> %b) {
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s
-; CHECK-GI-NEXT: fmul s0, s0, v0.s[1]
-; CHECK-GI-NEXT: fmul s1, s1, v1.s[1]
+; CHECK-GI-NEXT: mov s2, v0.s[1]
+; CHECK-GI-NEXT: mov s3, v1.s[1]
+; CHECK-GI-NEXT: fmul s0, s0, s2
+; CHECK-GI-NEXT: fmul s1, s1, s3
; CHECK-GI-NEXT: fmul s0, s0, s1
; CHECK-GI-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
@@ -523,8 +556,10 @@ define float @fmul_reduct_reassoc_v4f32_extrause(<4 x float> %a, <4 x float> %b)
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s
-; CHECK-GI-NEXT: fmul s0, s0, v0.s[1]
-; CHECK-GI-NEXT: fmul s1, s1, v1.s[1]
+; CHECK-GI-NEXT: mov s2, v0.s[1]
+; CHECK-GI-NEXT: mov s3, v1.s[1]
+; CHECK-GI-NEXT: fmul s0, s0, s2
+; CHECK-GI-NEXT: fmul s1, s1, s3
; CHECK-GI-NEXT: fmul s1, s0, s1
; CHECK-GI-NEXT: fmul s0, s1, s0
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
index d5c040e09945b..0806f7da5c89c 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
@@ -57,11 +57,16 @@ define i24 @test_v1i24(<1 x i24> %a) nounwind {
}
define i32 @test_v1i32(<1 x i32> %a) nounwind {
-; CHECK-LABEL: test_v1i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_v1i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_v1i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
%b = call i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> %a)
ret i32 %b
}
diff --git a/llvm/test/CodeGen/AArch64/vector-lrint.ll b/llvm/test/CodeGen/AArch64/vector-lrint.ll
index 53456c4c81ccc..602643264e7be 100644
--- a/llvm/test/CodeGen/AArch64/vector-lrint.ll
+++ b/llvm/test/CodeGen/AArch64/vector-lrint.ll
@@ -755,13 +755,20 @@ define <1 x iXLen> @lrint_v1f32(<1 x float> %x) {
; CHECK-i32-NEXT: fcvtzs v0.2s, v0.2s
; CHECK-i32-NEXT: ret
;
-; CHECK-i64-LABEL: lrint_v1f32:
-; CHECK-i64: // %bb.0:
-; CHECK-i64-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-i64-NEXT: frintx s0, s0
-; CHECK-i64-NEXT: fcvtzs x8, s0
-; CHECK-i64-NEXT: fmov d0, x8
-; CHECK-i64-NEXT: ret
+; CHECK-i64-SD-LABEL: lrint_v1f32:
+; CHECK-i64-SD: // %bb.0:
+; CHECK-i64-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-i64-SD-NEXT: frintx s0, s0
+; CHECK-i64-SD-NEXT: fcvtzs x8, s0
+; CHECK-i64-SD-NEXT: fmov d0, x8
+; CHECK-i64-SD-NEXT: ret
+;
+; CHECK-i64-GI-LABEL: lrint_v1f32:
+; CHECK-i64-GI: // %bb.0:
+; CHECK-i64-GI-NEXT: frintx s0, s0
+; CHECK-i64-GI-NEXT: fcvtzs x8, s0
+; CHECK-i64-GI-NEXT: fmov d0, x8
+; CHECK-i64-GI-NEXT: ret
%a = call <1 x iXLen> @llvm.lrint.v1iXLen.v1f32(<1 x float> %x)
ret <1 x iXLen> %a
}
@@ -1328,7 +1335,3 @@ define <32 x iXLen> @lrint_v32f64(<32 x double> %x) {
ret <32 x iXLen> %a
}
declare <32 x iXLen> @llvm.lrint.v32iXLen.v32f64(<32 x double>)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-i32-GI: {{.*}}
-; CHECK-i64-GI: {{.*}}
-; CHECK-i64-SD: {{.*}}
>From 14c11e4bcb262496981a2948af11a3f9e9de23ef Mon Sep 17 00:00:00 2001
From: Adrian Vogelsgesang <avogelsgesang at salesforce.com>
Date: Wed, 11 Jun 2025 09:39:31 +0200
Subject: [PATCH 8/9] [coro][NFC] Move switch basic block to beginning of
coroutine (#143626)
This makes the code flow when reading the LLVM IR of a split coroutine a
bit more natural. It does not change anything from an end-user
perspective but makes debugging the CoroSplit pass slightly easier.
---
llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index f9a6c70fedc2d..cebe44581b061 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -703,6 +703,7 @@ void coro::BaseCloner::replaceEntryBlock() {
auto *SwitchBB =
cast<BasicBlock>(VMap[Shape.SwitchLowering.ResumeEntryBlock]);
Builder.CreateBr(SwitchBB);
+ SwitchBB->moveAfter(Entry);
break;
}
case coro::ABI::Async:
>From b5b074273c35072f76a0683ea155e024c411ae04 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Wed, 4 Jun 2025 00:54:37 -0500
Subject: [PATCH 9/9] [flang][OpenMP] Map basic `local` specifiers to `private`
clauses
Starts the effort to map `do concurrent` locality specifiers to OpenMP
clauses. This PR adds support for basic specifiers (no `init` or `copy`
regions yet).
---
.../OpenMP/DoConcurrentConversion.cpp | 55 ++++++++++++++++++-
.../locality_specifiers_simple.mlir | 48 ++++++++++++++++
2 files changed, 101 insertions(+), 2 deletions(-)
create mode 100644 flang/test/Transforms/DoConcurrent/locality_specifiers_simple.mlir
diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index 0fdb302fe10ca..283c3052c166c 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -7,9 +7,11 @@
//===----------------------------------------------------------------------===//
#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/Todo.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/OpenMP/Passes.h"
#include "flang/Optimizer/OpenMP/Utils.h"
+#include "flang/Support/OpenMP-utils.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
#include "mlir/IR/IRMapping.h"
@@ -308,10 +310,47 @@ class DoConcurrentConversion
fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
const mlir::omp::LoopNestOperands &clauseOps,
bool isComposite) const {
+ mlir::omp::WsloopOperands wsloopClauseOps;
+
+    // For `local` (and `local_init`) operands, emit corresponding `private`
+ // clauses and attach these clauses to the workshare loop.
+ if (!loop.getLocalOperands().empty())
+ for (auto [op, sym, arg] : llvm::zip_equal(
+ loop.getLocalOperands(),
+ loop.getLocalSymsAttr().getAsRange<mlir::SymbolRefAttr>(),
+ loop.getRegionLocalArgs())) {
+ auto localizer = mlir::SymbolTable::lookupNearestSymbolFrom<
+ fir::LocalitySpecifierOp>(loop, sym);
+ if (localizer.getLocalitySpecifierType() ==
+ fir::LocalitySpecifierType::LocalInit)
+ TODO(localizer.getLoc(),
+ "local_init conversion is not supported yet");
+
+ if (!localizer.getInitRegion().empty())
+ TODO(localizer.getLoc(),
+ "non-empty `init` regions are not supported yet");
+
+ auto oldIP = rewriter.saveInsertionPoint();
+ rewriter.setInsertionPointAfter(localizer);
+ auto privatizer = rewriter.create<mlir::omp::PrivateClauseOp>(
+ localizer.getLoc(), sym.getLeafReference().str() + ".omp",
+ localizer.getTypeAttr().getValue(),
+ mlir::omp::DataSharingClauseType::Private);
+ rewriter.restoreInsertionPoint(oldIP);
+
+ wsloopClauseOps.privateVars.push_back(op);
+ wsloopClauseOps.privateSyms.push_back(
+ mlir::SymbolRefAttr::get(privatizer));
+ }
- auto wsloopOp = rewriter.create<mlir::omp::WsloopOp>(loop.getLoc());
+ auto wsloopOp =
+ rewriter.create<mlir::omp::WsloopOp>(loop.getLoc(), wsloopClauseOps);
wsloopOp.setComposite(isComposite);
- rewriter.createBlock(&wsloopOp.getRegion());
+
+ Fortran::common::openmp::EntryBlockArgs wsloopArgs;
+ wsloopArgs.priv.vars = wsloopClauseOps.privateVars;
+ Fortran::common::openmp::genEntryBlock(rewriter, wsloopArgs,
+ wsloopOp.getRegion());
auto loopNestOp =
rewriter.create<mlir::omp::LoopNestOp>(loop.getLoc(), clauseOps);
@@ -324,6 +363,18 @@ class DoConcurrentConversion
rewriter.setInsertionPointToEnd(&loopNestOp.getRegion().back());
rewriter.create<mlir::omp::YieldOp>(loop->getLoc());
+ // `local` region arguments are transferred/cloned from the `do concurrent`
+ // loop to the loopnest op when the region is cloned above. Instead, these
+ // region arguments should be on the workshare loop's region.
+ for (auto [wsloopArg, loopNestArg] :
+ llvm::zip_equal(wsloopOp.getRegion().getArguments(),
+ loopNestOp.getRegion().getArguments().drop_front(
+ clauseOps.loopLowerBounds.size())))
+ rewriter.replaceAllUsesWith(loopNestArg, wsloopArg);
+
+ for (unsigned i = 0; i < loop.getLocalVars().size(); ++i)
+ loopNestOp.getRegion().eraseArgument(clauseOps.loopLowerBounds.size());
+
return loopNestOp;
}
diff --git a/flang/test/Transforms/DoConcurrent/locality_specifiers_simple.mlir b/flang/test/Transforms/DoConcurrent/locality_specifiers_simple.mlir
new file mode 100644
index 0000000000000..160c1df040680
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/locality_specifiers_simple.mlir
@@ -0,0 +1,48 @@
+// Tests mapping `local` locality specifier to `private` clauses for a simple
+// case (no `init` or `copy` regions).
+
+// RUN: fir-opt --omp-do-concurrent-conversion="map-to=host" %s | FileCheck %s
+
+fir.local {type = local} @_QFlocal_spec_translationElocal_var_private_f32 : f32
+
+func.func @_QPlocal_spec_translation() {
+ %3 = fir.alloca f32 {bindc_name = "local_var", uniq_name = "_QFlocal_spec_translationElocal_var"}
+ %4:2 = hlfir.declare %3 {uniq_name = "_QFlocal_spec_translationElocal_var"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+
+ %c4_i32 = arith.constant 4 : index
+ %c11_i32 = arith.constant 11 : index
+ %c1 = arith.constant 1 : index
+
+ fir.do_concurrent {
+ %7 = fir.alloca i32 {bindc_name = "i"}
+ %8:2 = hlfir.declare %7 {uniq_name = "_QFlocal_spec_translationEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+ fir.do_concurrent.loop (%arg0) = (%c4_i32) to (%c11_i32) step (%c1)
+ local(@_QFlocal_spec_translationElocal_var_private_f32 %4#0 -> %arg1 : !fir.ref<f32>) {
+ %9 = fir.convert %arg0 : (index) -> i32
+ fir.store %9 to %8#0 : !fir.ref<i32>
+
+ %10:2 = hlfir.declare %arg1 {uniq_name = "_QFlocal_spec_translationElocal_var"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+ %cst = arith.constant 4.200000e+01 : f32
+ hlfir.assign %cst to %10#0 : f32, !fir.ref<f32>
+ }
+ }
+ return
+}
+
+// CHECK: omp.private {type = private} @[[PRIVATIZER:.*local_spec_translationElocal_var.*.omp]] : f32
+
+// CHECK: func.func @_QPlocal_spec_translation
+// CHECK: %[[LOCAL_VAR:.*]] = fir.alloca f32 {bindc_name = "local_var", {{.*}}}
+// CHECK: %[[LOCAL_VAR_DECL:.*]]:2 = hlfir.declare %[[LOCAL_VAR]]
+// CHECK: omp.parallel {
+// CHECK: omp.wsloop private(@[[PRIVATIZER]] %[[LOCAL_VAR_DECL]]#0 -> %[[LOCAL_ARG:.*]] : !fir.ref<f32>) {
+// CHECK: omp.loop_nest {{.*}} {
+// CHECK: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[LOCAL_ARG]]
+// CHECK: %[[C42:.*]] = arith.constant
+// CHECK: hlfir.assign %[[C42]] to %[[PRIV_DECL]]#0
+// CHECK: omp.yield
+// CHECK: }
+// CHECK: }
+// CHECK: omp.terminator
+// CHECK: }
More information about the llvm-branch-commits
mailing list