[llvm] [AArch64] Fold COPY(y:gpr, DUP(x:fpr, i)) -> UMOV(y:gpr, x:fpr, i) (PR #89017)
Dhruv Chawla via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 16 21:15:04 PDT 2024
https://github.com/dc03-work created https://github.com/llvm/llvm-project/pull/89017
This patch adds a peephole optimization for a codegen pattern caused by RegBankSelect restricting G_EXTRACT_VECTOR_ELT to FPR registers for both its input and output. This restriction can generate a COPY from FPR to GPR when, for example, the output register of the G_EXTRACT_VECTOR_ELT is used in a branch condition.
This was noticed when looking at codegen differences between SDAG and GI for the s1279 kernel in the TSVC benchmark.
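For illustration, here is a minimal IR sketch of the kind of input that exposes the extra FPR-to-GPR copy under GlobalISel. It is not taken from the patch's test suite; the function name and the exact register assignments are hypothetical, and it would be compiled with something like llc -mtriple=aarch64 -global-isel:

; Hypothetical reduced example: the extracted lane only feeds GPR-side
; users (a compare and branch), so GlobalISel previously emitted an FPR
; lane move followed by an FPR->GPR copy.
define i32 @extract_feeds_branch(<4 x i32> %v) {
entry:
  %e = extractelement <4 x i32> %v, i64 1
  %c = icmp eq i32 %e, 0
  br i1 %c, label %zero, label %nonzero
zero:
  ret i32 0
nonzero:
  ret i32 %e
}

Before the peephole, GlobalISel lowers the extract to something like "mov s1, v0.s[1]" followed by "fmov w8, s1"; with the fold, the pair becomes a single "mov w8, v0.s[1]" (UMOV), matching what SelectionDAG already produces, as the updated tests below show.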
From 034a11a5d8e150858f35e3b722d51007d2177754 Mon Sep 17 00:00:00 2001
From: Dhruv Chawla <dhruvc at nvidia.com>
Date: Wed, 10 Apr 2024 10:50:41 +0530
Subject: [PATCH] [AArch64] Fold COPY(y:gpr, DUP(x:fpr, i)) -> UMOV(y:gpr,
x:fpr, i)
This patch adds a peephole optimization for a codegen pattern caused by
RegBankSelect restricting G_EXTRACT_VECTOR_ELT to FPR registers for both
its input and output. This restriction can generate a COPY from FPR to
GPR when, for example, the output register of the G_EXTRACT_VECTOR_ELT
is used in a branch condition.
This was noticed when looking at codegen differences between SDAG and GI
for the s1279 kernel in the TSVC benchmark.
---
.../Target/AArch64/AArch64MIPeepholeOpt.cpp | 46 ++++++++++
llvm/test/CodeGen/AArch64/aarch64-mulv.ll | 83 ++++++-------------
llvm/test/CodeGen/AArch64/arm64-neon-copy.ll | 9 +-
llvm/test/CodeGen/AArch64/insertextract.ll | 13 ++-
llvm/test/CodeGen/AArch64/reduce-and.ll | 28 +++----
llvm/test/CodeGen/AArch64/reduce-or.ll | 28 +++----
llvm/test/CodeGen/AArch64/reduce-xor.ll | 28 +++----
7 files changed, 118 insertions(+), 117 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index 22da7ddef98a2a..ef1ec90d739a80 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -61,6 +61,12 @@
// %6:fpr128 = IMPLICIT_DEF
// %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
//
+// 8. %129:fpr32 = DUPi32 %167:fpr128, 3
+// %173:gpr32 = COPY %129:fpr32
+// ==>
+// %173:gpr32 = UMOVvi32 %167:fpr128, 3
+// Similar peephole for 64-bit moves.
+//
//===----------------------------------------------------------------------===//
#include "AArch64ExpandImm.h"
@@ -128,6 +134,7 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
bool visitINSvi64lane(MachineInstr &MI);
bool visitFMOVDr(MachineInstr &MI);
+ bool visitCOPY(MachineInstr &MI);
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override {
@@ -690,6 +697,42 @@ bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) {
return true;
}
+bool AArch64MIPeepholeOpt::visitCOPY(MachineInstr &MI) {
+ // Optimize COPY of FPR extract into GPR regbank to UMOV
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+
+ if (!Dst.isVirtual() || !Src.isVirtual())
+ return false;
+
+ auto TryMatchDUP = [&](const TargetRegisterClass *GPRRegClass,
+ const TargetRegisterClass *FPRRegClass, unsigned DUP,
+ unsigned UMOV) {
+ if (MRI->getRegClassOrNull(Dst) != GPRRegClass ||
+ MRI->getRegClassOrNull(Src) != FPRRegClass)
+ return false;
+
+ MachineInstr *SrcMI = MRI->getUniqueVRegDef(Src);
+ if (!SrcMI || SrcMI->getOpcode() != DUP || !MRI->hasOneUse(Src))
+ return false;
+
+ Register DupSrc = SrcMI->getOperand(1).getReg();
+ int64_t DupImm = SrcMI->getOperand(2).getImm();
+
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(UMOV), Dst)
+ .addReg(DupSrc)
+ .addImm(DupImm);
+ SrcMI->eraseFromParent();
+ MI.eraseFromParent();
+ return true;
+ };
+
+ return TryMatchDUP(&AArch64::GPR32RegClass, &AArch64::FPR32RegClass,
+ AArch64::DUPi32, AArch64::UMOVvi32) ||
+ TryMatchDUP(&AArch64::GPR64RegClass, &AArch64::FPR64RegClass,
+ AArch64::DUPi64, AArch64::UMOVvi64);
+}
+
bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -771,6 +814,9 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
case AArch64::FMOVDr:
Changed |= visitFMOVDr(MI);
break;
+ case AArch64::COPY:
+ Changed |= visitCOPY(MI);
+ break;
}
}
}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-mulv.ll b/llvm/test/CodeGen/AArch64/aarch64-mulv.ll
index 7b7ca9d8ffc2db..b324b896529fd9 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-mulv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-mulv.ll
@@ -25,22 +25,13 @@ declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>)
declare i128 @llvm.vector.reduce.mul.v2i128(<2 x i128>)
define i8 @mulv_v2i8(<2 x i8> %a) {
-; CHECK-SD-LABEL: mulv_v2i8:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: mul w0, w9, w8
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: mulv_v2i8:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: mul w0, w8, w9
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: mulv_v2i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mul w0, w9, w8
+; CHECK-NEXT: ret
entry:
%arg1 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> %a)
ret i8 %arg1
@@ -230,22 +221,13 @@ entry:
}
define i16 @mulv_v2i16(<2 x i16> %a) {
-; CHECK-SD-LABEL: mulv_v2i16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: mul w0, w9, w8
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: mulv_v2i16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: mul w0, w8, w9
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: mulv_v2i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mul w0, w9, w8
+; CHECK-NEXT: ret
entry:
%arg1 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> %a)
ret i16 %arg1
@@ -372,22 +354,13 @@ entry:
}
define i32 @mulv_v2i32(<2 x i32> %a) {
-; CHECK-SD-LABEL: mulv_v2i32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: mul w0, w9, w8
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: mulv_v2i32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: mul w0, w8, w9
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: mulv_v2i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mul w0, w9, w8
+; CHECK-NEXT: ret
entry:
%arg1 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %a)
ret i32 %arg1
@@ -424,10 +397,9 @@ define i32 @mulv_v4i32(<4 x i32> %a) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: mul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: mul w0, w8, w9
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: fmov w9, s0
+; CHECK-GI-NEXT: mul w0, w9, w8
; CHECK-GI-NEXT: ret
entry:
%arg1 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a)
@@ -452,10 +424,9 @@ define i32 @mulv_v8i32(<8 x i32> %a) {
; CHECK-GI-NEXT: mul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT: mul v1.2s, v1.2s, v3.2s
; CHECK-GI-NEXT: mul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: mul w0, w8, w9
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: fmov w9, s0
+; CHECK-GI-NEXT: mul w0, w9, w8
; CHECK-GI-NEXT: ret
entry:
%arg1 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %a)
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index 749d6071c98d7c..bdc39a214ec851 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1488,8 +1488,7 @@ define <4 x i16> @test_dup_v2i32_v4i16(<2 x i32> %a) {
; CHECK-GI-LABEL: test_dup_v2i32_v4i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov s0, v0.s[1]
-; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: mov w8, v0.s[1]
; CHECK-GI-NEXT: dup v0.4h, w8
; CHECK-GI-NEXT: ret
entry:
@@ -1510,8 +1509,7 @@ define <8 x i16> @test_dup_v4i32_v8i16(<4 x i32> %a) {
;
; CHECK-GI-LABEL: test_dup_v4i32_v8i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov s0, v0.s[3]
-; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: mov w8, v0.s[3]
; CHECK-GI-NEXT: dup v0.8h, w8
; CHECK-GI-NEXT: ret
entry:
@@ -1626,8 +1624,7 @@ define <4 x i16> @test_dup_v4i32_v4i16(<4 x i32> %a) {
;
; CHECK-GI-LABEL: test_dup_v4i32_v4i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov s0, v0.s[1]
-; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: mov w8, v0.s[1]
; CHECK-GI-NEXT: dup v0.4h, w8
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll
index c6b2d07231bf86..8b82004388b095 100644
--- a/llvm/test/CodeGen/AArch64/insertextract.ll
+++ b/llvm/test/CodeGen/AArch64/insertextract.ll
@@ -983,13 +983,12 @@ define <3 x i32> @insert_v3i32_0(<3 x i32> %a, i32 %b, i32 %c) {
;
; CHECK-GI-LABEL: insert_v3i32_0:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: mov s2, v0.s[2]
-; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov v0.s[1], w8
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: fmov s1, w0
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v1.s[1], w8
+; CHECK-GI-NEXT: mov v1.s[2], w9
+; CHECK-GI-NEXT: mov v0.16b, v1.16b
; CHECK-GI-NEXT: ret
entry:
%d = insertelement <3 x i32> %a, i32 %b, i32 0
diff --git a/llvm/test/CodeGen/AArch64/reduce-and.ll b/llvm/test/CodeGen/AArch64/reduce-and.ll
index 62ad45b212967a..31502e452efebc 100644
--- a/llvm/test/CodeGen/AArch64/reduce-and.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-and.ll
@@ -30,10 +30,9 @@ define i1 @test_redand_v2i1(<2 x i1> %a) {
; GISEL-LABEL: test_redand_v2i1:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov s1, v0.s[1]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: and w8, w8, w9
+; GISEL-NEXT: mov w8, v0.s[1]
+; GISEL-NEXT: fmov w9, s0
+; GISEL-NEXT: and w8, w9, w8
; GISEL-NEXT: and w0, w8, #0x1
; GISEL-NEXT: ret
%or_result = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %a)
@@ -457,10 +456,9 @@ define i32 @test_redand_v2i32(<2 x i32> %a) {
; GISEL-LABEL: test_redand_v2i32:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov s1, v0.s[1]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: and w0, w8, w9
+; GISEL-NEXT: mov w8, v0.s[1]
+; GISEL-NEXT: fmov w9, s0
+; GISEL-NEXT: and w0, w9, w8
; GISEL-NEXT: ret
%and_result = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a)
ret i32 %and_result
@@ -480,10 +478,9 @@ define i32 @test_redand_v4i32(<4 x i32> %a) {
; GISEL: // %bb.0:
; GISEL-NEXT: mov d1, v0.d[1]
; GISEL-NEXT: and v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: mov s1, v0.s[1]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: and w0, w8, w9
+; GISEL-NEXT: mov w8, v0.s[1]
+; GISEL-NEXT: fmov w9, s0
+; GISEL-NEXT: and w0, w9, w8
; GISEL-NEXT: ret
%and_result = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
ret i32 %and_result
@@ -505,10 +502,9 @@ define i32 @test_redand_v8i32(<8 x i32> %a) {
; GISEL-NEXT: and v0.16b, v0.16b, v1.16b
; GISEL-NEXT: mov d1, v0.d[1]
; GISEL-NEXT: and v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: mov s1, v0.s[1]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: and w0, w8, w9
+; GISEL-NEXT: mov w8, v0.s[1]
+; GISEL-NEXT: fmov w9, s0
+; GISEL-NEXT: and w0, w9, w8
; GISEL-NEXT: ret
%and_result = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %a)
ret i32 %and_result
diff --git a/llvm/test/CodeGen/AArch64/reduce-or.ll b/llvm/test/CodeGen/AArch64/reduce-or.ll
index 20c498d36fdea4..708668959f9fbf 100644
--- a/llvm/test/CodeGen/AArch64/reduce-or.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-or.ll
@@ -30,10 +30,9 @@ define i1 @test_redor_v2i1(<2 x i1> %a) {
; GISEL-LABEL: test_redor_v2i1:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov s1, v0.s[1]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: orr w8, w8, w9
+; GISEL-NEXT: mov w8, v0.s[1]
+; GISEL-NEXT: fmov w9, s0
+; GISEL-NEXT: orr w8, w9, w8
; GISEL-NEXT: and w0, w8, #0x1
; GISEL-NEXT: ret
%or_result = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %a)
@@ -459,10 +458,9 @@ define i32 @test_redor_v2i32(<2 x i32> %a) {
; GISEL-LABEL: test_redor_v2i32:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov s1, v0.s[1]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: orr w0, w8, w9
+; GISEL-NEXT: mov w8, v0.s[1]
+; GISEL-NEXT: fmov w9, s0
+; GISEL-NEXT: orr w0, w9, w8
; GISEL-NEXT: ret
%or_result = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a)
ret i32 %or_result
@@ -482,10 +480,9 @@ define i32 @test_redor_v4i32(<4 x i32> %a) {
; GISEL: // %bb.0:
; GISEL-NEXT: mov d1, v0.d[1]
; GISEL-NEXT: orr v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: mov s1, v0.s[1]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: orr w0, w8, w9
+; GISEL-NEXT: mov w8, v0.s[1]
+; GISEL-NEXT: fmov w9, s0
+; GISEL-NEXT: orr w0, w9, w8
; GISEL-NEXT: ret
%or_result = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
ret i32 %or_result
@@ -507,10 +504,9 @@ define i32 @test_redor_v8i32(<8 x i32> %a) {
; GISEL-NEXT: orr v0.16b, v0.16b, v1.16b
; GISEL-NEXT: mov d1, v0.d[1]
; GISEL-NEXT: orr v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: mov s1, v0.s[1]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: orr w0, w8, w9
+; GISEL-NEXT: mov w8, v0.s[1]
+; GISEL-NEXT: fmov w9, s0
+; GISEL-NEXT: orr w0, w9, w8
; GISEL-NEXT: ret
%or_result = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a)
ret i32 %or_result
diff --git a/llvm/test/CodeGen/AArch64/reduce-xor.ll b/llvm/test/CodeGen/AArch64/reduce-xor.ll
index b8ca99e003b627..a902c06711f2c5 100644
--- a/llvm/test/CodeGen/AArch64/reduce-xor.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-xor.ll
@@ -27,10 +27,9 @@ define i1 @test_redxor_v2i1(<2 x i1> %a) {
; GISEL-LABEL: test_redxor_v2i1:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov s1, v0.s[1]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: eor w8, w8, w9
+; GISEL-NEXT: mov w8, v0.s[1]
+; GISEL-NEXT: fmov w9, s0
+; GISEL-NEXT: eor w8, w9, w8
; GISEL-NEXT: and w0, w8, #0x1
; GISEL-NEXT: ret
%or_result = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %a)
@@ -448,10 +447,9 @@ define i32 @test_redxor_v2i32(<2 x i32> %a) {
; GISEL-LABEL: test_redxor_v2i32:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov s1, v0.s[1]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: eor w0, w8, w9
+; GISEL-NEXT: mov w8, v0.s[1]
+; GISEL-NEXT: fmov w9, s0
+; GISEL-NEXT: eor w0, w9, w8
; GISEL-NEXT: ret
%xor_result = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a)
ret i32 %xor_result
@@ -471,10 +469,9 @@ define i32 @test_redxor_v4i32(<4 x i32> %a) {
; GISEL: // %bb.0:
; GISEL-NEXT: mov d1, v0.d[1]
; GISEL-NEXT: eor v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: mov s1, v0.s[1]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: eor w0, w8, w9
+; GISEL-NEXT: mov w8, v0.s[1]
+; GISEL-NEXT: fmov w9, s0
+; GISEL-NEXT: eor w0, w9, w8
; GISEL-NEXT: ret
%xor_result = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
ret i32 %xor_result
@@ -496,10 +493,9 @@ define i32 @test_redxor_v8i32(<8 x i32> %a) {
; GISEL-NEXT: eor v0.16b, v0.16b, v1.16b
; GISEL-NEXT: mov d1, v0.d[1]
; GISEL-NEXT: eor v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: mov s1, v0.s[1]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: eor w0, w8, w9
+; GISEL-NEXT: mov w8, v0.s[1]
+; GISEL-NEXT: fmov w9, s0
+; GISEL-NEXT: eor w0, w9, w8
; GISEL-NEXT: ret
%xor_result = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %a)
ret i32 %xor_result