[llvm] [clang] [clang-tools-extra] [MCP] Enhance MCP copy Instruction removal for special case (PR #70778)
via cfe-commits
cfe-commits at lists.llvm.org
Thu Nov 16 07:14:34 PST 2023
https://github.com/LWenH updated https://github.com/llvm/llvm-project/pull/70778
>From a42f48a44c614f2c996f3f4cb0561e2f7ab35d6f Mon Sep 17 00:00:00 2001
From: LWenH <924105575 at qq.com>
Date: Tue, 31 Oct 2023 16:33:41 +0800
Subject: [PATCH 1/8] add pre-commit test for the later MCP patch
---
llvm/test/CodeGen/RISCV/machine-cp.mir | 34 ++++++++++++++++++++++++++
1 file changed, 34 insertions(+)
diff --git a/llvm/test/CodeGen/RISCV/machine-cp.mir b/llvm/test/CodeGen/RISCV/machine-cp.mir
index f3674f89cd918b4..9c04abf492a1475 100644
--- a/llvm/test/CodeGen/RISCV/machine-cp.mir
+++ b/llvm/test/CodeGen/RISCV/machine-cp.mir
@@ -9,6 +9,10 @@
entry:
ret void
}
+ define void @bar() {
+ entry:
+ ret void
+ }
...
---
name: foo
@@ -21,6 +25,7 @@ body: |
; RV32-NEXT: renamable $v4_v5_v6_v7_v8_v9_v10_v11 = COPY killed renamable $v0_v1_v2_v3_v4_v5_v6_v7
; RV32-NEXT: renamable $v28 = COPY renamable $v8, implicit killed $v28_v29_v30, implicit-def $v28_v29_v30
; RV32-NEXT: PseudoRET implicit $v28
+ ;
; RV64-LABEL: name: foo
; RV64: liveins: $v28_v29_v30, $v8_v9, $v1
; RV64-NEXT: {{ $}}
@@ -32,3 +37,32 @@ body: |
renamable $v28 = COPY renamable $v8, implicit killed $v28_v29_v30, implicit-def $v28_v29_v30
PseudoRET implicit $v28
...
+---
+name: bar
+body: |
+ bb.0.entry:
+ liveins: $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17, $x28, $x29, $x30, $x31
+ ; RV32-LABEL: name: bar
+ ; RV32: liveins: $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17, $x28, $x29, $x30, $x31
+ ; RV32-NEXT: {{ $}}
+ ; RV32-NEXT: $v0 = COPY renamable $v8
+ ; RV32-NEXT: renamable $v14m2 = PseudoVLE32_V_M2_MASK undef renamable $v14m2, renamable $x15, $v0, -1, 5 /* e32 */, 1 /* ta, mu */, implicit $vl, implicit $vtype
+ ; RV32-NEXT: early-clobber renamable $v9 = PseudoVMSLE_VI_M2 killed renamable $v10m2, -1, -1, 5 /* e32 */, implicit $vl, implicit $vtype
+ ; RV32-NEXT: $v0 = COPY killed renamable $v8
+ ; RV32-NEXT: PseudoVSE32_V_M2_MASK killed renamable $v14m2, renamable $x9, $v0, -1, 5 /* e32 */, implicit $vl, implicit $vtype
+ ;
+ ; RV64-LABEL: name: bar
+ ; RV64: liveins: $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17, $x28, $x29, $x30, $x31
+ ; RV64-NEXT: {{ $}}
+ ; RV64-NEXT: $v0 = COPY renamable $v8
+ ; RV64-NEXT: renamable $v14m2 = PseudoVLE32_V_M2_MASK undef renamable $v14m2, renamable $x15, $v0, -1, 5 /* e32 */, 1 /* ta, mu */, implicit $vl, implicit $vtype
+ ; RV64-NEXT: early-clobber renamable $v9 = PseudoVMSLE_VI_M2 killed renamable $v10m2, -1, -1, 5 /* e32 */, implicit $vl, implicit $vtype
+ ; RV64-NEXT: $v0 = COPY killed renamable $v8
+ ; RV64-NEXT: PseudoVSE32_V_M2_MASK killed renamable $v14m2, renamable $x9, $v0, -1, 5 /* e32 */, implicit $vl, implicit $vtype
+ $v0 = COPY killed renamable $v9; example.cpp:14:22
+ $v0 = COPY renamable $v8; example.cpp:12:25
+ renamable $v14m2 = PseudoVLE32_V_M2_MASK undef renamable $v14m2, renamable $x15, $v0, -1, 5, 1, implicit $vl, implicit $vtype; example.cpp:12:25
+ early-clobber renamable $v9 = PseudoVMSLE_VI_M2 killed renamable $v10m2, -1, -1, 5, implicit $vl, implicit $vtype; example.cpp:9:22
+ $v0 = COPY killed renamable $v8; example.cpp:12:22
+ PseudoVSE32_V_M2_MASK killed renamable $v14m2, renamable $x9, $v0, -1, 5, implicit $vl, implicit $vtype; example.cpp:12:22
+...
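The bar test above captures the pattern this series targets: $v0 is copied from $v8 twice, and the instructions in between only read $v0 or define the unrelated $v9, so the second copy is a nop. A rough standalone C++ analogy of the same dataflow (illustrative names only, not LLVM API):

int v0, v8, v9, v14;
int maskedLoad(int mask);   // stands in for PseudoVLE32_V_M2_MASK
int compareLanes();         // stands in for PseudoVMSLE_VI_M2

void bar() {
  v0 = v9;              // $v0 = COPY killed renamable $v9
  v0 = v8;              // $v0 = COPY renamable $v8
  v14 = maskedLoad(v0); // reads $v0; leaves $v0 and $v8 intact
  v9 = compareLanes();  // early-clobber def of $v9, unrelated to $v0/$v8
  v0 = v8;              // nop copy: $v0 already holds $v8's value
}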
>From 516d95d53b00d1478aca2ad3f180eb6a3dc3d24b Mon Sep 17 00:00:00 2001
From: LWenH <924105575 at qq.com>
Date: Tue, 31 Oct 2023 17:39:04 +0800
Subject: [PATCH 2/8] mcp: further enhance the chance of redundant copy removal
---
llvm/lib/CodeGen/MachineCopyPropagation.cpp | 25 +++++++++++++++++++++
llvm/test/CodeGen/RISCV/machine-cp.mir | 2 --
2 files changed, 25 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index a032b31a1fc7c62..b0640b48121febd 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -719,6 +719,7 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
LLVM_DEBUG(dbgs() << "MCP: ForwardCopyPropagateBlock " << MBB.getName()
<< "\n");
+ const MachineInstr *LastMI = nullptr;
for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
// Analyze copies (which don't overlap themselves).
std::optional<DestSourcePair> CopyOperands =
@@ -735,6 +736,27 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
MCRegister Def = RegDef.asMCReg();
MCRegister Src = RegSrc.asMCReg();
+ // The target may lose further opportunities to remove redundant
+ // copy instructions; consider the following sequence:
+ // L1: r0 = COPY r9 <- TrackMI
+ // L2: r0 = COPY r8 <- TrackMI
+ // L3: use r0 <- Remove L2 from MaybeDeadCopies
+ // L4: early-clobber r9 <- Invalidate L2 in Tracker
+ // L5: r0 = COPY r8 <- Missed removal chance
+ // L6: use r0 <- Missed chance to remove L5
+ if (LastMI) {
+ std::optional<DestSourcePair> PrevCopyOperands =
+ isCopyInstr(*LastMI, *TII, UseCopyInstr);
+ if (PrevCopyOperands) {
+ Register PrevRegDef = PrevCopyOperands->Destination->getReg();
+ // We can remove the previous copy from the tracker directly.
+ if (TRI->isSubRegisterEq(RegDef, PrevRegDef)) {
+ Tracker.invalidateRegister(PrevRegDef.asMCReg(), *TRI, *TII,
+ UseCopyInstr);
+ }
+ }
+ }
+
// The two copies cancel out and the source of the first copy
// hasn't been overridden, eliminate the second one. e.g.
// %ecx = COPY %eax
@@ -795,6 +817,7 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
}
Tracker.trackCopy(&MI, *TRI, *TII, UseCopyInstr);
+ LastMI = &MI;
continue;
}
@@ -874,6 +897,8 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
// Any previous copy definition or reading the Defs is no longer available.
for (MCRegister Reg : Defs)
Tracker.clobberRegister(Reg, *TRI, *TII, UseCopyInstr);
+
+ LastMI = &MI;
}
// If MBB doesn't have successors, delete the copies whose defs are not used.
diff --git a/llvm/test/CodeGen/RISCV/machine-cp.mir b/llvm/test/CodeGen/RISCV/machine-cp.mir
index 9c04abf492a1475..7523332a23c6839 100644
--- a/llvm/test/CodeGen/RISCV/machine-cp.mir
+++ b/llvm/test/CodeGen/RISCV/machine-cp.mir
@@ -48,7 +48,6 @@ body: |
; RV32-NEXT: $v0 = COPY renamable $v8
; RV32-NEXT: renamable $v14m2 = PseudoVLE32_V_M2_MASK undef renamable $v14m2, renamable $x15, $v0, -1, 5 /* e32 */, 1 /* ta, mu */, implicit $vl, implicit $vtype
; RV32-NEXT: early-clobber renamable $v9 = PseudoVMSLE_VI_M2 killed renamable $v10m2, -1, -1, 5 /* e32 */, implicit $vl, implicit $vtype
- ; RV32-NEXT: $v0 = COPY killed renamable $v8
; RV32-NEXT: PseudoVSE32_V_M2_MASK killed renamable $v14m2, renamable $x9, $v0, -1, 5 /* e32 */, implicit $vl, implicit $vtype
;
; RV64-LABEL: name: bar
@@ -57,7 +56,6 @@ body: |
; RV64-NEXT: $v0 = COPY renamable $v8
; RV64-NEXT: renamable $v14m2 = PseudoVLE32_V_M2_MASK undef renamable $v14m2, renamable $x15, $v0, -1, 5 /* e32 */, 1 /* ta, mu */, implicit $vl, implicit $vtype
; RV64-NEXT: early-clobber renamable $v9 = PseudoVMSLE_VI_M2 killed renamable $v10m2, -1, -1, 5 /* e32 */, implicit $vl, implicit $vtype
- ; RV64-NEXT: $v0 = COPY killed renamable $v8
; RV64-NEXT: PseudoVSE32_V_M2_MASK killed renamable $v14m2, renamable $x9, $v0, -1, 5 /* e32 */, implicit $vl, implicit $vtype
$v0 = COPY killed renamable $v9; example.cpp:14:22
$v0 = COPY renamable $v8; example.cpp:12:25
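To make the sequence in the new comment concrete, here is a minimal standalone model of the two pieces of tracker state involved (a deliberate simplification with illustrative names; the real CopyTracker is keyed by register units):

#include <iostream>
#include <map>
#include <set>
#include <string>

int main() {
  // Forward records: which source last copied into each destination.
  std::map<std::string, std::string> CopiedFrom;        // Def -> Src
  // Reverse records: which destinations each source is known to define.
  std::map<std::string, std::set<std::string>> DefRegs; // Src -> {Defs}

  auto trackCopy = [&](const std::string &Def, const std::string &Src) {
    CopiedFrom[Def] = Src;
    DefRegs[Src].insert(Def);
  };

  trackCopy("r0", "r9"); // L1: r0 = COPY r9
  trackCopy("r0", "r8"); // L2: r0 = COPY r8 (DefRegs["r9"] still lists r0!)

  // L4: early-clobber r9 invalidates every destination that r9 is recorded
  // to define, which wrongly wipes the still-valid L2 information for r0.
  for (const std::string &Def : DefRegs["r9"])
    CopiedFrom.erase(Def);

  // L5: r0 = COPY r8 -- with no surviving record, the nop copy goes unseen.
  std::cout << (CopiedFrom.count("r0") ? "nop copy detected" : "missed removal")
            << '\n';
}

Run as written, this prints "missed removal": the stale reverse record left behind at L2 is exactly what the later revisions of this patch clean up.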
>From f2046a6d9f96e3d5b9dcf1648e065ccd9f9a9bad Mon Sep 17 00:00:00 2001
From: LWenH <924105575 at qq.com>
Date: Fri, 3 Nov 2023 14:29:26 +0800
Subject: [PATCH 3/8] [MCP] address review comments and make it more general
---
llvm/lib/CodeGen/MachineCopyPropagation.cpp | 91 +++++++++++++------
llvm/test/CodeGen/RISCV/machine-cp.mir | 12 +--
.../RISCV/rvv/fixed-vectors-nearbyint-vp.ll | 1 -
3 files changed, 71 insertions(+), 33 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index b0640b48121febd..738b148df7b5b75 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -184,6 +184,56 @@ class CopyTracker {
}
}
+ /// Clobber \p Reg first, and then remove the corresponding COPY
+ /// record pair from the tracker. We need to locate and remove
+ /// the COPY instruction that defines \p Reg, as well as the
+ /// record in the tracker that makes Src define \p Reg.
+ void eraseRegMIPair(MCRegister Reg, const TargetRegisterInfo &TRI,
+ const TargetInstrInfo &TII, bool UseCopyInstr) {
+ for (MCRegUnit Unit : TRI.regunits(Reg)) {
+ auto I = Copies.find(Unit);
+
+ if (I != Copies.end()) {
+ // When we clobber the source of a copy, we need to clobber everything
+ // it defined.
+ markRegsUnavailable(I->second.DefRegs, TRI);
+ // When we clobber the destination of a copy, we need to clobber the
+ // whole register it defined.
+ if (MachineInstr *MI = I->second.MI) {
+ std::optional<DestSourcePair> CopyOperands =
+ isCopyInstr(*MI, TII, UseCopyInstr);
+
+ MCRegister Src = CopyOperands->Source->getReg().asMCReg();
+ MCRegister Def = CopyOperands->Destination->getReg().asMCReg();
+
+ markRegsUnavailable(Def, TRI);
+
+ for (MCRegUnit SrcUnit : TRI.regunits(Src)) {
+ auto SrcCopy = Copies.find(SrcUnit);
+ if (SrcCopy != Copies.end() && SrcCopy->second.LastSeenUseInCopy) {
+ // If Src only defines Reg, erase SrcCopy directly.
+ if (SrcCopy->second.DefRegs.size() == 1) {
+ Copies.erase(SrcCopy);
+ } else {
+ // If Src defines multiple values, we only need
+ // to erase the Unit in DefRegs.
+ for (auto itr = SrcCopy->second.DefRegs.begin();
+ itr != SrcCopy->second.DefRegs.end(); itr++) {
+ if (*itr == Unit) {
+ SrcCopy->second.DefRegs.erase(itr);
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+ // Now we can erase the copy.
+ Copies.erase(I);
+ }
+ }
+ }
+
/// Add this copy's registers into the tracker's copy maps.
void trackCopy(MachineInstr *MI, const TargetRegisterInfo &TRI,
const TargetInstrInfo &TII, bool UseCopyInstr) {
@@ -719,7 +769,6 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
LLVM_DEBUG(dbgs() << "MCP: ForwardCopyPropagateBlock " << MBB.getName()
<< "\n");
- const MachineInstr *LastMI = nullptr;
for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
// Analyze copies (which don't overlap themselves).
std::optional<DestSourcePair> CopyOperands =
@@ -736,27 +785,6 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
MCRegister Def = RegDef.asMCReg();
MCRegister Src = RegSrc.asMCReg();
- // The target may lose further opportunities to remove redundant
- // copy instructions; consider the following sequence:
- // L1: r0 = COPY r9 <- TrackMI
- // L2: r0 = COPY r8 <- TrackMI
- // L3: use r0 <- Remove L2 from MaybeDeadCopies
- // L4: early-clobber r9 <- Invalidate L2 in Tracker
- // L5: r0 = COPY r8 <- Missed removal chance
- // L6: use r0 <- Missed chance to remove L5
- if (LastMI) {
- std::optional<DestSourcePair> PrevCopyOperands =
- isCopyInstr(*LastMI, *TII, UseCopyInstr);
- if (PrevCopyOperands) {
- Register PrevRegDef = PrevCopyOperands->Destination->getReg();
- // We can remove the previous copy from the tracker directly.
- if (TRI->isSubRegisterEq(RegDef, PrevRegDef)) {
- Tracker.invalidateRegister(PrevRegDef.asMCReg(), *TRI, *TII,
- UseCopyInstr);
- }
- }
- }
-
// The two copies cancel out and the source of the first copy
// hasn't been overridden, eliminate the second one. e.g.
// %ecx = COPY %eax
@@ -806,7 +834,21 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
// %xmm2 = copy %xmm0
// ...
// %xmm2 = copy %xmm9
- Tracker.clobberRegister(Def, *TRI, *TII, UseCopyInstr);
+
+ // While we do need to clobber the register here, simply clobbering it
+ // is not sufficient. We also need to remove the COPY record pair for
+ // 'Def' in the tracker. Failing to do so might cause the target to miss
+ // some opportunities to eliminate redundant copy instructions.
+
+ // Consider the following sequence:
+ // L1: r0 = COPY r9 <- TrackMI
+ // L2: r0 = COPY r8 <- TrackMI
+ // L3: use r0 <- Remove L2 from MaybeDeadCopies
+ // L4: early-clobber r9 <- Invalidate L2 in Tracker
+ // L5: r0 = COPY r8 <- Missed removal chance
+ // L6: use r0 <- Missed chance to remove L5
+ Tracker.eraseRegMIPair(Def, *TRI, *TII, UseCopyInstr);
+
for (const MachineOperand &MO : MI.implicit_operands()) {
if (!MO.isReg() || !MO.isDef())
continue;
@@ -817,8 +859,6 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
}
Tracker.trackCopy(&MI, *TRI, *TII, UseCopyInstr);
- LastMI = &MI;
-
continue;
}
}
@@ -898,7 +938,6 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
for (MCRegister Reg : Defs)
Tracker.clobberRegister(Reg, *TRI, *TII, UseCopyInstr);
- LastMI = &MI;
}
// If MBB doesn't have successors, delete the copies whose defs are not used.
diff --git a/llvm/test/CodeGen/RISCV/machine-cp.mir b/llvm/test/CodeGen/RISCV/machine-cp.mir
index 7523332a23c6839..14ae069e5ef707b 100644
--- a/llvm/test/CodeGen/RISCV/machine-cp.mir
+++ b/llvm/test/CodeGen/RISCV/machine-cp.mir
@@ -57,10 +57,10 @@ body: |
; RV64-NEXT: renamable $v14m2 = PseudoVLE32_V_M2_MASK undef renamable $v14m2, renamable $x15, $v0, -1, 5 /* e32 */, 1 /* ta, mu */, implicit $vl, implicit $vtype
; RV64-NEXT: early-clobber renamable $v9 = PseudoVMSLE_VI_M2 killed renamable $v10m2, -1, -1, 5 /* e32 */, implicit $vl, implicit $vtype
; RV64-NEXT: PseudoVSE32_V_M2_MASK killed renamable $v14m2, renamable $x9, $v0, -1, 5 /* e32 */, implicit $vl, implicit $vtype
- $v0 = COPY killed renamable $v9; example.cpp:14:22
- $v0 = COPY renamable $v8; example.cpp:12:25
- renamable $v14m2 = PseudoVLE32_V_M2_MASK undef renamable $v14m2, renamable $x15, $v0, -1, 5, 1, implicit $vl, implicit $vtype; example.cpp:12:25
- early-clobber renamable $v9 = PseudoVMSLE_VI_M2 killed renamable $v10m2, -1, -1, 5, implicit $vl, implicit $vtype; example.cpp:9:22
- $v0 = COPY killed renamable $v8; example.cpp:12:22
- PseudoVSE32_V_M2_MASK killed renamable $v14m2, renamable $x9, $v0, -1, 5, implicit $vl, implicit $vtype; example.cpp:12:22
+ $v0 = COPY killed renamable $v9
+ $v0 = COPY renamable $v8
+ renamable $v14m2 = PseudoVLE32_V_M2_MASK undef renamable $v14m2, renamable $x15, $v0, -1, 5, 1, implicit $vl, implicit $vtype
+ early-clobber renamable $v9 = PseudoVMSLE_VI_M2 killed renamable $v10m2, -1, -1, 5, implicit $vl, implicit $vtype
+ $v0 = COPY killed renamable $v8
+ PseudoVSE32_V_M2_MASK killed renamable $v14m2, renamable $x9, $v0, -1, 5, implicit $vl, implicit $vtype
...
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
index d9958f4aae35003..5407eadb160bdef 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
@@ -637,7 +637,6 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vfabs.v v16, v24, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vmv1r.v v0, v1
; CHECK-NEXT: vmflt.vf v1, v16, fa5, v0.t
; CHECK-NEXT: frflags a0
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
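In the same simplified two-map model as above, what eraseRegMIPair adds over a plain clobber looks roughly like this (an assumption-level sketch, not the real tracker API): when the destination of a tracked copy is overwritten, the stale "Src defines Def" reverse record is dropped along with the forward record.

#include <map>
#include <set>
#include <string>

void clobberDest(const std::string &Def,
                 std::map<std::string, std::string> &CopiedFrom,
                 std::map<std::string, std::set<std::string>> &DefRegs) {
  auto It = CopiedFrom.find(Def);
  if (It == CopiedFrom.end())
    return;
  auto Rev = DefRegs.find(It->second);
  if (Rev != DefRegs.end()) {
    Rev->second.erase(Def);   // erase only the record for Def
    if (Rev->second.empty())
      DefRegs.erase(Rev);     // mirrors Copies.erase(SrcCopy) in the patch
  }
  CopiedFrom.erase(It);       // mirrors the plain clobber of Def
}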
>From a358989dbb8177da32f496c3ef6a2d889f109fec Mon Sep 17 00:00:00 2001
From: LWenH <924105575 at qq.com>
Date: Fri, 3 Nov 2023 14:48:42 +0800
Subject: [PATCH 4/8] fix code formatting issue
---
llvm/lib/CodeGen/MachineCopyPropagation.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index 738b148df7b5b75..0685bf1f08031e5 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -937,7 +937,6 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
// Any previous copy definition or reading the Defs is no longer available.
for (MCRegister Reg : Defs)
Tracker.clobberRegister(Reg, *TRI, *TII, UseCopyInstr);
-
}
// If MBB doesn't have successors, delete the copies whose defs are not used.
>From ffd7b77171883670b5778db80b0f41bf81b709b1 Mon Sep 17 00:00:00 2001
From: LWenH <924105575 at qq.com>
Date: Sat, 4 Nov 2023 22:51:13 +0800
Subject: [PATCH 5/8] [mcp] fix inconsistent behavior in x86 and amdgpu
backend tests
---
llvm/lib/CodeGen/MachineCopyPropagation.cpp | 30 ++++++++++---------
llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 2 --
llvm/test/CodeGen/X86/shift-i128.ll | 1 -
llvm/test/CodeGen/X86/shift-i256.ll | 1 -
.../vector-interleaved-load-i16-stride-7.ll | 2 +-
.../vector-interleaved-load-i8-stride-8.ll | 1 -
.../X86/wide-scalar-shift-legalization.ll | 15 ++--------
7 files changed, 20 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index 0685bf1f08031e5..4972bcdb250501c 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -185,9 +185,9 @@ class CopyTracker {
}
/// Clobber \p Reg first, and then remove the corresponding COPY
- /// record pair from the tracker. We need to locate and remove
+ /// record pair from the tracker's copy maps. We need to locate and remove
/// the COPY instruction that defines \p Reg, as well as the
- /// record in the tracker that makes Src define \p Reg.
+ /// record that makes Src define \p Reg.
void eraseRegMIPair(MCRegister Reg, const TargetRegisterInfo &TRI,
const TargetInstrInfo &TII, bool UseCopyInstr) {
for (MCRegUnit Unit : TRI.regunits(Reg)) {
@@ -208,27 +208,29 @@ class CopyTracker {
markRegsUnavailable(Def, TRI);
+ // At this point, we need to locate the record in the copy maps that
+ // uses Src to define Def, and remove it from the tracker.
for (MCRegUnit SrcUnit : TRI.regunits(Src)) {
auto SrcCopy = Copies.find(SrcUnit);
if (SrcCopy != Copies.end() && SrcCopy->second.LastSeenUseInCopy) {
- // If Src only defines Reg, erase SrcCopy directly.
- if (SrcCopy->second.DefRegs.size() == 1) {
- Copies.erase(SrcCopy);
- } else {
- // If Src defines multiple values, we only need
- // to erase the Unit in DefRegs.
- for (auto itr = SrcCopy->second.DefRegs.begin();
- itr != SrcCopy->second.DefRegs.end(); itr++) {
- if (*itr == Unit) {
- SrcCopy->second.DefRegs.erase(itr);
- break;
+ // If Src defines multiple values, we only need
+ // to erase that record in DefRegs.
+ for (auto itr = SrcCopy->second.DefRegs.begin();
+ itr != SrcCopy->second.DefRegs.end(); itr++) {
+ if (*itr == Def) {
+ SrcCopy->second.DefRegs.erase(itr);
+ // If DefRegs becomes empty after removal, we can directly
+ // remove SrcCopy from the tracker's copy maps.
+ if (!SrcCopy->second.DefRegs.size()) {
+ Copies.erase(SrcCopy);
}
+ break;
}
}
}
}
}
- // Now we can erase the copy.
+ // Now we can erase the copy that defines Reg.
Copies.erase(I);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 2a1488652d887a4..cb197fb2465a48e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -6329,7 +6329,6 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v2, s42
; GFX8-NEXT: v_and_b32_e32 v24, 1, v20
; GFX8-NEXT: v_lshrrev_b16_e64 v26, 7, s24
-; GFX8-NEXT: v_mov_b32_e32 v25, v1
; GFX8-NEXT: v_mov_b32_e32 v3, s43
; GFX8-NEXT: s_add_u32 s42, s0, 0x60
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[24:27]
@@ -6395,7 +6394,6 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v6, s42
; GFX8-NEXT: v_and_b32_e32 v10, 1, v23
; GFX8-NEXT: v_mov_b32_e32 v13, 0
-; GFX8-NEXT: v_mov_b32_e32 v11, v1
; GFX8-NEXT: v_mov_b32_e32 v7, s43
; GFX8-NEXT: flat_store_dwordx4 v[6:7], v[10:13]
; GFX8-NEXT: v_lshrrev_b16_e64 v6, 5, s24
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 1fe8d834dbcddb1..cb9c54ae495e263 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -347,7 +347,6 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-NEXT: movl %edx, %ecx
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; i686-NEXT: shrdl %cl, %eax, (%esp) # 4-byte Folded Spill
-; i686-NEXT: movl %edx, %ecx
; i686-NEXT: shrl %cl, %esi
; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
; i686-NEXT: movl %esi, 28(%ecx)
diff --git a/llvm/test/CodeGen/X86/shift-i256.ll b/llvm/test/CodeGen/X86/shift-i256.ll
index 0e4e706669300c3..e1466aebf422589 100644
--- a/llvm/test/CodeGen/X86/shift-i256.ll
+++ b/llvm/test/CodeGen/X86/shift-i256.ll
@@ -78,7 +78,6 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; CHECK-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; CHECK-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
; CHECK-NEXT: movl 28(%esp,%ebp), %edx
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
index 0771fcea0714cda..584d96270ef0476 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
@@ -14979,7 +14979,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3,4,5],xmm0[6],xmm13[7]
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm12, %ymm7
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7]
-; AVX512DQ-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,1,0,1]
; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5,6,7,8,9,10],ymm14[11],ymm13[12,13,14,15]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
index 78a2729caf5a4cd..5d707790f1c1e66 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
@@ -11289,7 +11289,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm10
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm9
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm15
-; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm31
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
index 24475360cbbc46a..96d4d72c773239a 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -1845,7 +1845,6 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx
@@ -2485,7 +2484,6 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, (%esp) # 4-byte Folded Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%ebx), %edx
@@ -3129,7 +3127,6 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx
@@ -3562,7 +3559,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r15, %r11
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rdi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8
@@ -4197,7 +4193,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
@@ -4879,7 +4874,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r15, %r11
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%r10), %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r12, %rsi
@@ -5534,7 +5528,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
@@ -6233,7 +6226,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r15, %r11
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rdi), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8
@@ -6872,7 +6864,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
@@ -7360,9 +7351,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ALL: {{.*}}
-; X86: {{.*}}
-; X86-NO-SHLD: {{.*}}
-; X86-SHLD: {{.*}}
; X64: {{.*}}
; X64-NO-SHLD: {{.*}}
; X64-SHLD: {{.*}}
+; X86: {{.*}}
+; X86-NO-SHLD: {{.*}}
+; X86-SHLD: {{.*}}
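One detail worth calling out in the revised loop: it erases from DefRegs while iterating, which is safe only because it breaks immediately after the erase. The same find-then-erase idiom on a plain sequence container (assuming DefRegs behaves like one; in-tree it is a small vector of registers):

#include <algorithm>
#include <vector>

void eraseDef(std::vector<unsigned> &DefRegs, unsigned Def) {
  auto It = std::find(DefRegs.begin(), DefRegs.end(), Def);
  if (It != DefRegs.end())
    DefRegs.erase(It); // iterator now invalid; stop here, like the break above
}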
>From 6fcd047d51acb4716aed9a9953904f7978f71d73 Mon Sep 17 00:00:00 2001
From: LWenH <924105575 at qq.com>
Date: Mon, 6 Nov 2023 21:40:07 +0800
Subject: [PATCH 6/8] [mcp] address comments and reuse the clobberRegister
function
---
llvm/lib/CodeGen/MachineCopyPropagation.cpp | 73 +++++++--------------
1 file changed, 25 insertions(+), 48 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index 4972bcdb250501c..72aa8fdc768b0b8 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -163,36 +163,10 @@ class CopyTracker {
/// Clobber a single register, removing it from the tracker's copy maps.
void clobberRegister(MCRegister Reg, const TargetRegisterInfo &TRI,
- const TargetInstrInfo &TII, bool UseCopyInstr) {
+ const TargetInstrInfo &TII, bool UseCopyInstr,
+ bool CleanUp = false) {
for (MCRegUnit Unit : TRI.regunits(Reg)) {
auto I = Copies.find(Unit);
- if (I != Copies.end()) {
- // When we clobber the source of a copy, we need to clobber everything
- // it defined.
- markRegsUnavailable(I->second.DefRegs, TRI);
- // When we clobber the destination of a copy, we need to clobber the
- // whole register it defined.
- if (MachineInstr *MI = I->second.MI) {
- std::optional<DestSourcePair> CopyOperands =
- isCopyInstr(*MI, TII, UseCopyInstr);
- markRegsUnavailable({CopyOperands->Destination->getReg().asMCReg()},
- TRI);
- }
- // Now we can erase the copy.
- Copies.erase(I);
- }
- }
- }
-
- /// Clobber \p Reg first, and then remove the corresponding COPY
- /// record pair from the tracker's copy maps. We need to locate and remove
- /// the COPY instruction that defines \p Reg, as well as the
- /// record that makes Src define \p Reg.
- void eraseRegMIPair(MCRegister Reg, const TargetRegisterInfo &TRI,
- const TargetInstrInfo &TII, bool UseCopyInstr) {
- for (MCRegUnit Unit : TRI.regunits(Reg)) {
- auto I = Copies.find(Unit);
-
if (I != Copies.end()) {
// When we clobber the source of a copy, we need to clobber everything
// it defined.
@@ -203,34 +177,37 @@ class CopyTracker {
std::optional<DestSourcePair> CopyOperands =
isCopyInstr(*MI, TII, UseCopyInstr);
- MCRegister Src = CopyOperands->Source->getReg().asMCReg();
MCRegister Def = CopyOperands->Destination->getReg().asMCReg();
markRegsUnavailable(Def, TRI);
- // At this point, we need to locate the record in the copy maps that
- // uses Src to define Def, and remove it from the tracker.
- for (MCRegUnit SrcUnit : TRI.regunits(Src)) {
- auto SrcCopy = Copies.find(SrcUnit);
- if (SrcCopy != Copies.end() && SrcCopy->second.LastSeenUseInCopy) {
- // If Src defines multiple values, we only need
- // to erase that record in DefRegs.
- for (auto itr = SrcCopy->second.DefRegs.begin();
- itr != SrcCopy->second.DefRegs.end(); itr++) {
- if (*itr == Def) {
- SrcCopy->second.DefRegs.erase(itr);
- // If DefRegs becomes empty after removal, we can directly
- // remove SrcCopy from the tracker's copy maps.
- if (!SrcCopy->second.DefRegs.size()) {
- Copies.erase(SrcCopy);
+ // If the CleanUp flag is specified, we will also locate the record in the
+ // copy maps that uses Src to define Def, and remove it from the tracker.
+ if (CleanUp) {
+ MCRegister Src = CopyOperands->Source->getReg().asMCReg();
+ for (MCRegUnit SrcUnit : TRI.regunits(Src)) {
+ auto SrcCopy = Copies.find(SrcUnit);
+ if (SrcCopy != Copies.end() &&
+ SrcCopy->second.LastSeenUseInCopy) {
+ // If SrcCopy defines multiple values, we only need
+ // to erase the record for Def in DefRegs.
+ for (auto itr = SrcCopy->second.DefRegs.begin();
+ itr != SrcCopy->second.DefRegs.end(); itr++) {
+ if (*itr == Def) {
+ SrcCopy->second.DefRegs.erase(itr);
+ // If DefRegs becomes empty after removal, we can directly
+ // remove SrcCopy from the tracker's copy maps.
+ if (SrcCopy->second.DefRegs.empty()) {
+ Copies.erase(SrcCopy);
+ }
+ break;
}
- break;
}
}
}
}
}
- // Now we can erase the copy that defines Reg.
+ // Now we can erase the copy.
Copies.erase(I);
}
}
@@ -849,7 +826,7 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
// L4: early-clobber r9 <- Invalidate L2 in Tracker
// L5: r0 = COPY r8 <- Missed removal chance
// L6: use r0 <- Missed chance to remove L5
- Tracker.eraseRegMIPair(Def, *TRI, *TII, UseCopyInstr);
+ Tracker.clobberRegister(Def, *TRI, *TII, UseCopyInstr, /*CleanUp=*/true);
for (const MachineOperand &MO : MI.implicit_operands()) {
if (!MO.isReg() || !MO.isDef())
@@ -857,7 +834,7 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
MCRegister Reg = MO.getReg().asMCReg();
if (!Reg)
continue;
- Tracker.clobberRegister(Reg, *TRI, *TII, UseCopyInstr);
+ Tracker.clobberRegister(Reg, *TRI, *TII, UseCopyInstr, /*CleanUp=*/true);
}
Tracker.trackCopy(&MI, *TRI, *TII, UseCopyInstr);
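Reduced to its signature, the shape of this revision is a default-off flag (parameters simplified below; the real function also takes TRI, TII and UseCopyInstr): every pre-existing call site keeps the shallow clobber unchanged, while the two copy-handling sites opt into the deeper cleanup.

// Sketch only, not the in-tree signature.
void clobberRegister(unsigned Reg, bool CleanUp = false) { /* ... */ }

void callSites() {
  clobberRegister(9);                   // untouched callers: old behavior
  clobberRegister(0, /*CleanUp=*/true); // copy-destination sites: also scrub
                                        // the stale Src->Def records
}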
>From d7a892186739729dc9eb1a92302b7fef5a70805b Mon Sep 17 00:00:00 2001
From: LWenH <924105575 at qq.com>
Date: Mon, 6 Nov 2023 22:13:20 +0800
Subject: [PATCH 7/8] fix code formatting issue
---
llvm/lib/CodeGen/MachineCopyPropagation.cpp | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index 72aa8fdc768b0b8..cb6d35e2106b73b 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -826,7 +826,8 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
// L4: early-clobber r9 <- Invalidate L2 in Tracker
// L5: r0 = COPY r8 <- Missed removal chance
// L6: use r0 <- Missed chance to remove L5
- Tracker.clobberRegister(Def, *TRI, *TII, UseCopyInstr, /*CleanUp=*/true);
+ Tracker.clobberRegister(Def, *TRI, *TII, UseCopyInstr,
+ /*CleanUp=*/true);
for (const MachineOperand &MO : MI.implicit_operands()) {
if (!MO.isReg() || !MO.isDef())
@@ -834,7 +835,8 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
MCRegister Reg = MO.getReg().asMCReg();
if (!Reg)
continue;
- Tracker.clobberRegister(Reg, *TRI, *TII, UseCopyInstr, /*CleanUp=*/true);
+ Tracker.clobberRegister(Reg, *TRI, *TII, UseCopyInstr,
+ /*CleanUp=*/true);
}
Tracker.trackCopy(&MI, *TRI, *TII, UseCopyInstr);
>From 642d6bbd3f7cd3a1ec668b64ff830302972e8b2c Mon Sep 17 00:00:00 2001
From: LWenH <924105575 at qq.com>
Date: Thu, 16 Nov 2023 22:59:16 +0800
Subject: [PATCH 8/8] [mcp] make clobber register function more generic and
adapt more test cases
---
llvm/lib/CodeGen/MachineCopyPropagation.cpp | 69 ++++++++-----------
llvm/test/CodeGen/X86/shift-i128.ll | 3 -
.../X86/smulo-128-legalisation-lowering.ll | 2 +-
.../vector-interleaved-load-i16-stride-7.ll | 2 +-
.../vector-interleaved-load-i64-stride-7.ll | 8 +--
.../vector-interleaved-load-i8-stride-5.ll | 3 +-
.../vector-interleaved-load-i8-stride-7.ll | 26 ++++---
.../vector-interleaved-load-i8-stride-8.ll | 3 +-
.../vector-interleaved-store-i16-stride-7.ll | 3 +-
.../X86/wide-scalar-shift-legalization.ll | 3 +-
10 files changed, 52 insertions(+), 70 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index cb6d35e2106b73b..abd6e64de0097b8 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -163,8 +163,7 @@ class CopyTracker {
/// Clobber a single register, removing it from the tracker's copy maps.
void clobberRegister(MCRegister Reg, const TargetRegisterInfo &TRI,
- const TargetInstrInfo &TII, bool UseCopyInstr,
- bool CleanUp = false) {
+ const TargetInstrInfo &TII, bool UseCopyInstr) {
for (MCRegUnit Unit : TRI.regunits(Reg)) {
auto I = Copies.find(Unit);
if (I != Copies.end()) {
@@ -178,30 +177,37 @@ class CopyTracker {
isCopyInstr(*MI, TII, UseCopyInstr);
MCRegister Def = CopyOperands->Destination->getReg().asMCReg();
+ MCRegister Src = CopyOperands->Source->getReg().asMCReg();
markRegsUnavailable(Def, TRI);
- // If the CleanUp flag is specified, we will also locate the record in the
- // copy maps that uses Src to define Def, and remove it from the tracker.
- if (CleanUp) {
- MCRegister Src = CopyOperands->Source->getReg().asMCReg();
- for (MCRegUnit SrcUnit : TRI.regunits(Src)) {
- auto SrcCopy = Copies.find(SrcUnit);
- if (SrcCopy != Copies.end() &&
- SrcCopy->second.LastSeenUseInCopy) {
- // If SrcCopy defines multiple values, we only need
- // to erase the record for Def in DefRegs.
- for (auto itr = SrcCopy->second.DefRegs.begin();
- itr != SrcCopy->second.DefRegs.end(); itr++) {
- if (*itr == Def) {
- SrcCopy->second.DefRegs.erase(itr);
- // If DefRegs becomes empty after removal, we can directly
- // remove SrcCopy from the tracker's copy maps.
- if (SrcCopy->second.DefRegs.empty()) {
- Copies.erase(SrcCopy);
- }
- break;
+ // Since we clobber the destination of a copy, the semantics of Src's
+ // "DefRegs" containing Def are no longer valid. We also need
+ // to remove the record from the copy maps that indicates Src defined
+ // Def. Failing to do so might cause the target to miss some
+ // opportunities to further eliminate redundant copy instructions.
+ // Consider the following sequence during the
+ // ForwardCopyPropagateBlock procedure:
+ // L1: r0 = COPY r9 <- TrackMI
+ // L2: r0 = COPY r8 <- TrackMI (remove "r9 defines r0" from tracker)
+ // L3: use r0 <- Remove L2 from MaybeDeadCopies
+ // L4: early-clobber r9 <- Clobber r9 (L2 is still valid in tracker)
+ // L5: r0 = COPY r8 <- Remove NopCopy
+ for (MCRegUnit SrcUnit : TRI.regunits(Src)) {
+ auto SrcCopy = Copies.find(SrcUnit);
+ if (SrcCopy != Copies.end() && SrcCopy->second.LastSeenUseInCopy) {
+ // If SrcCopy defines multiple values, we only need
+ // to erase the record for Def in DefRegs.
+ for (auto itr = SrcCopy->second.DefRegs.begin();
+ itr != SrcCopy->second.DefRegs.end(); itr++) {
+ if (*itr == Def) {
+ SrcCopy->second.DefRegs.erase(itr);
+ // If DefRegs becomes empty after removal, we can directly
+ // remove SrcCopy from the tracker's copy maps.
+ if (SrcCopy->second.DefRegs.empty()) {
+ Copies.erase(SrcCopy);
}
+ break;
}
}
}
@@ -813,21 +819,7 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
// %xmm2 = copy %xmm0
// ...
// %xmm2 = copy %xmm9
-
- // While we do need to clobber the register here, simply clobbering it
- // is not sufficient. We also need to remove the COPY record pair for
- // 'Def' in the tracker. Failing to do so might cause the target to miss
- // some opportunities to eliminate redundant copy instructions.
-
- // Consider the following sequence:
- // L1: r0 = COPY r9 <- TrackMI
- // L2: r0 = COPY r8 <- TrackMI
- // L3: use r0 <- Remove L2 from MaybeDeadCopies
- // L4: early-clobber r9 <- Invalidate L2 in Tracker
- // L5: r0 = COPY r8 <- Missed removal chance
- // L6: use r0 <- Missed chance to remove L5
- Tracker.clobberRegister(Def, *TRI, *TII, UseCopyInstr,
- /*CleanUp=*/true);
+ Tracker.clobberRegister(Def, *TRI, *TII, UseCopyInstr);
for (const MachineOperand &MO : MI.implicit_operands()) {
if (!MO.isReg() || !MO.isDef())
@@ -835,8 +827,7 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
MCRegister Reg = MO.getReg().asMCReg();
if (!Reg)
continue;
- Tracker.clobberRegister(Reg, *TRI, *TII, UseCopyInstr,
- /*CleanUp=*/true);
+ Tracker.clobberRegister(Reg, *TRI, *TII, UseCopyInstr);
}
Tracker.trackCopy(&MI, *TRI, *TII, UseCopyInstr);
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index cb9c54ae495e263..4fbe05cd1b2f2f7 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -488,7 +488,6 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; i686-NEXT: shrdl %cl, %esi, %ebx
-; i686-NEXT: movl %edx, %ecx
; i686-NEXT: sarl %cl, %ebp
; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
; i686-NEXT: movl %ebp, 28(%ecx)
@@ -622,11 +621,9 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; i686-NEXT: shll %cl, %edi
; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %ecx, %edi
; i686-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; i686-NEXT: negl %ebp
; i686-NEXT: movl 64(%esp,%ebp), %esi
-; i686-NEXT: movl %edi, %ecx
; i686-NEXT: # kill: def $cl killed $cl killed $ecx
; i686-NEXT: movl (%esp), %edi # 4-byte Reload
; i686-NEXT: shldl %cl, %edi, %esi
diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
index abab313f4b12e73..b2b5bcc5b44b2ce 100644
--- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -1201,7 +1201,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %ebp, %ebx
+; X86-NEXT: addl %edx, %ebx
; X86-NEXT: adcl $0, %ebp
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl %ecx, %esi
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
index 584d96270ef0476..a041467c634feb6 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
@@ -15734,7 +15734,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm12
; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm0[1],xmm12[2,3,4,5],xmm0[6],xmm12[7]
; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6,7]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm17
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm17
; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm13
; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14
; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
index 864597f14e320ac..7c777fb5a94b1ab 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
@@ -5534,7 +5534,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm10
; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm9
+; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm9
; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm8
; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1
; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm7, %zmm3
@@ -5979,7 +5979,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm10
; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9
-; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm9
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm9
; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm8
; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1
; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm3
@@ -6424,7 +6424,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm10
; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9
-; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm9
+; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm9
; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm8
; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1
; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm7, %zmm3
@@ -6869,7 +6869,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm10
; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9
-; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm9
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm9
; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm8
; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1
; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm3
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
index 29e3247e1451a5f..3213be8703f463d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
@@ -2476,7 +2476,6 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
; SSE-NEXT: psllq $48, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm7, %xmm4
; SSE-NEXT: movdqa %xmm7, %xmm1
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; SSE-NEXT: pandn %xmm5, %xmm1
@@ -2533,7 +2532,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm8, %xmm1
; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm7, %xmm0
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; SSE-NEXT: pandn %xmm4, %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
index 3b13741aaf7d5b1..39875a96326dd98 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
@@ -1024,8 +1024,8 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,65535]
; SSE-NEXT: movdqa %xmm9, %xmm7
; SSE-NEXT: pand %xmm14, %xmm7
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm6, %xmm15
+; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm5, %xmm15
; SSE-NEXT: pand %xmm14, %xmm15
; SSE-NEXT: movdqa %xmm11, %xmm3
; SSE-NEXT: pandn %xmm8, %xmm3
@@ -2148,7 +2148,6 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa %xmm5, %xmm9
; SSE-NEXT: pand %xmm13, %xmm9
; SSE-NEXT: por %xmm0, %xmm9
-; SSE-NEXT: movdqa %xmm6, %xmm3
; SSE-NEXT: movdqa %xmm6, %xmm0
; SSE-NEXT: pand %xmm13, %xmm0
; SSE-NEXT: pandn %xmm10, %xmm13
@@ -2185,7 +2184,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: pandn %xmm6, %xmm2
; SSE-NEXT: por %xmm10, %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535]
@@ -5451,19 +5450,19 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pand %xmm14, %xmm6
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: movdqa %xmm14, %xmm3
; SSE-NEXT: movdqa %xmm11, %xmm6
; SSE-NEXT: pandn %xmm11, %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm0, %xmm5
+; SSE-NEXT: pand %xmm14, %xmm5
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: pand %xmm0, %xmm3
+; SSE-NEXT: pand %xmm14, %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pandn %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
@@ -9965,7 +9964,6 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm3, %ymm6
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm3
-; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, %xmm14
; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3]
; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm8
@@ -9994,12 +9992,12 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm5, %ymm25, %ymm2
-; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm14[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm13[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm11, %zmm5
; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm10, %zmm3, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm7
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm7
; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
index 5d707790f1c1e66..e4acb33faade0e5 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
@@ -11212,7 +11212,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm9
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm15
-; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm23
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm17, %zmm9
@@ -11301,7 +11300,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm9
-; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm11
+; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm11
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm15
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index 473ac8a546f9048..0c0ecc5ea3c7505 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -1343,10 +1343,9 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm3[3,3]
; SSE-NEXT: movdqa %xmm15, %xmm10
; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
-; SSE-NEXT: movdqa %xmm5, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3]
; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm9[0,2]
; SSE-NEXT: andps %xmm8, %xmm1
; SSE-NEXT: orps %xmm6, %xmm1
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
index 96d4d72c773239a..f84131dfc879701 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -5194,7 +5194,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: notl %edx
; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -5205,7 +5205,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
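With this final revision the cleanup is unconditional: re-tracking a destination scrubs the stale reverse record up front, so the early clobber can no longer wipe valid state. Folding that behavior into trackCopy in the earlier standalone model (same illustrative names and simplifications) now reports the nop copy:

#include <iostream>
#include <map>
#include <set>
#include <string>

int main() {
  std::map<std::string, std::string> CopiedFrom;        // Def -> Src
  std::map<std::string, std::set<std::string>> DefRegs; // Src -> {Defs}

  auto trackCopy = [&](const std::string &Def, const std::string &Src) {
    auto It = CopiedFrom.find(Def);
    if (It != CopiedFrom.end()) {  // Def already held a tracked copy:
      auto &Old = DefRegs[It->second];
      Old.erase(Def);              // scrub the stale reverse record
      if (Old.empty())
        DefRegs.erase(It->second);
    }
    CopiedFrom[Def] = Src;
    DefRegs[Src].insert(Def);
  };

  trackCopy("r0", "r9"); // L1
  trackCopy("r0", "r8"); // L2: also removes r0 from DefRegs["r9"]
  for (const std::string &Def : DefRegs["r9"])
    CopiedFrom.erase(Def);         // L4: no longer touches r0
  std::cout << (CopiedFrom.count("r0") && CopiedFrom["r0"] == "r8"
                    ? "L5 recognized as nop copy"
                    : "missed removal")
            << '\n';
}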