[llvm] [MCP] Bug fix for reverted patch #70778 (PR #74239)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Dec 3 04:53:50 PST 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-backend-x86
Author: Vettel (LWenH)
<details>
<summary>Changes</summary>
This patch try to reimplement [#<!-- -->70778](https://github.com/llvm/llvm-project/pull/70778), and fix issue: https://github.com/llvm/llvm-project/issues/73512.
In [#<!-- -->70778](https://github.com/llvm/llvm-project/pull/70778) , when we clobbering a register (**Def**), we will also locate and remove the entry in the Copy Maps that indicates **Src** defined **Def**. If the "**DefRegs**" is being empty after removal, we will directly remove the corresponding **SrcCopy** entry from the Copy Maps.
However, since the **SrcCopy** may also have referred the **Src** as the destination of another copy instruction, directly removing it is unsafe. We need to enhance the criteria for removing empty records to ensure safety.
---
Patch is 26.60 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/74239.diff
13 Files Affected:
- (modified) llvm/lib/CodeGen/MachineCopyPropagation.cpp (+40-2)
- (added) llvm/test/CodeGen/AMDGPU/mcp-implicit-clobber.mir (+26)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll (-1)
- (modified) llvm/test/CodeGen/X86/shift-i128.ll (-4)
- (modified) llvm/test/CodeGen/X86/shift-i256.ll (-1)
- (modified) llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll (+1-2)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll (+12-14)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll (+1-3)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll (+1-2)
- (modified) llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll (+4-14)
``````````diff
diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index a032b31a1fc7c..51e944d0279f2 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -175,8 +175,46 @@ class CopyTracker {
if (MachineInstr *MI = I->second.MI) {
std::optional<DestSourcePair> CopyOperands =
isCopyInstr(*MI, TII, UseCopyInstr);
- markRegsUnavailable({CopyOperands->Destination->getReg().asMCReg()},
- TRI);
+
+ MCRegister Def = CopyOperands->Destination->getReg().asMCReg();
+ MCRegister Src = CopyOperands->Source->getReg().asMCReg();
+
+ markRegsUnavailable(Def, TRI);
+
+ // Since we clobber the destination of a copy, the semantic of Src's
+ // "DefRegs" to contain Def is no longer effectual. We will also need
+ // to remove the record from the copy maps that indicates Src defined
+ // Def. Failing to do so might cause the target to miss some
+ // opportunities to further eliminate redundant copy instructions.
+ // Consider the following sequence during the
+ // ForwardCopyPropagateBlock procedure:
+ // L1: r0 = COPY r9 <- TrackMI
+ // L2: r0 = COPY r8 <- TrackMI (Remove r9 defined r0 from tracker)
+ // L3: use r0 <- Remove L2 from MaybeDeadCopies
+ // L4: early-clobber r9 <- Clobber r9 (L2 is still valid in tracker)
+ // L5: r0 = COPY r8 <- Remove NopCopy
+ for (MCRegUnit SrcUnit : TRI.regunits(Src)) {
+ auto SrcCopy = Copies.find(SrcUnit);
+ if (SrcCopy != Copies.end() && SrcCopy->second.LastSeenUseInCopy) {
+ // If SrcCopy defines multiple values, we only need
+ // to erase the record for Def in DefRegs.
+ for (auto itr = SrcCopy->second.DefRegs.begin();
+ itr != SrcCopy->second.DefRegs.end(); itr++) {
+ if (*itr == Def) {
+ SrcCopy->second.DefRegs.erase(itr);
+ // If DefReg becomes empty after removal, we can remove the
+ // SrcCopy from the tracker's copy maps. We only remove those
+ // entries solely record the Def is defined by Src. If an
+ // entry also contains the definition record of other Def'
+ // registers, it cannot be cleared.
+ if (SrcCopy->second.DefRegs.empty() && !SrcCopy->second.MI) {
+ Copies.erase(SrcCopy);
+ }
+ break;
+ }
+ }
+ }
+ }
}
// Now we can erase the copy.
Copies.erase(I);
diff --git a/llvm/test/CodeGen/AMDGPU/mcp-implicit-clobber.mir b/llvm/test/CodeGen/AMDGPU/mcp-implicit-clobber.mir
new file mode 100644
index 0000000000000..6e613243e38c5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mcp-implicit-clobber.mir
@@ -0,0 +1,26 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -march=amdgcn -mcpu=gfx900 %s -o - -run-pass machine-cp -verify-machineinstrs | FileCheck %s
+
+# The MachineCopyPropagation Pass should not treat the subsequent
+# instruction "$sgpr2_sgpr3 = COPY $sgpr6_sgpr7" as a NopCopy.
+# For detailed information, please refer to issue 73512.
+---
+name: foo
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5, $sgpr6_sgpr7
+
+ ; CHECK-LABEL: name: foo
+ ; CHECK: liveins: $sgpr4_sgpr5, $sgpr6_sgpr7
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $sgpr2_sgpr3 = COPY $sgpr6_sgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr0
+ ; CHECK-NEXT: $sgpr2_sgpr3 = COPY $sgpr6_sgpr7
+ ; CHECK-NEXT: S_NOP 0, implicit $sgpr2_sgpr3
+ $sgpr2_sgpr3 = COPY $sgpr6_sgpr7
+ $sgpr0 = COPY $sgpr3
+ S_NOP 0, implicit-def $sgpr0
+ $sgpr3 = COPY killed $sgpr5
+ $sgpr2_sgpr3 = COPY $sgpr6_sgpr7
+ S_NOP 0, implicit $sgpr2_sgpr3
+...
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
index d9958f4aae350..5407eadb160bd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
@@ -637,7 +637,6 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vfabs.v v16, v24, v0.t
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vmv1r.v v0, v1
; CHECK-NEXT: vmflt.vf v1, v16, fa5, v0.t
; CHECK-NEXT: frflags a0
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 1fe8d834dbcdd..4fbe05cd1b2f2 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -347,7 +347,6 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-NEXT: movl %edx, %ecx
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; i686-NEXT: shrdl %cl, %eax, (%esp) # 4-byte Folded Spill
-; i686-NEXT: movl %edx, %ecx
; i686-NEXT: shrl %cl, %esi
; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
; i686-NEXT: movl %esi, 28(%ecx)
@@ -489,7 +488,6 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; i686-NEXT: shrdl %cl, %esi, %ebx
-; i686-NEXT: movl %edx, %ecx
; i686-NEXT: sarl %cl, %ebp
; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx
; i686-NEXT: movl %ebp, 28(%ecx)
@@ -623,11 +621,9 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; i686-NEXT: shll %cl, %edi
; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT: movl %ecx, %edi
; i686-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; i686-NEXT: negl %ebp
; i686-NEXT: movl 64(%esp,%ebp), %esi
-; i686-NEXT: movl %edi, %ecx
; i686-NEXT: # kill: def $cl killed $cl killed $ecx
; i686-NEXT: movl (%esp), %edi # 4-byte Reload
; i686-NEXT: shldl %cl, %edi, %esi
diff --git a/llvm/test/CodeGen/X86/shift-i256.ll b/llvm/test/CodeGen/X86/shift-i256.ll
index 0e4e706669300..e1466aebf4225 100644
--- a/llvm/test/CodeGen/X86/shift-i256.ll
+++ b/llvm/test/CodeGen/X86/shift-i256.ll
@@ -78,7 +78,6 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; CHECK-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; CHECK-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
; CHECK-NEXT: movl 28(%esp,%ebp), %edx
diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
index abab313f4b12e..b2b5bcc5b44b2 100644
--- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -1201,7 +1201,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %ebp, %ebx
+; X86-NEXT: addl %edx, %ebx
; X86-NEXT: adcl $0, %ebp
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl %ecx, %esi
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
index 893bff29b21e4..74347f3881aa9 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
@@ -14828,7 +14828,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3,4,5],xmm0[6],xmm13[7]
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm12, %ymm7
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7]
-; AVX512DQ-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,1,0,1]
; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5,6,7,8,9,10],ymm14[11],ymm13[12,13,14,15]
@@ -15583,7 +15583,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm12
; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm0[1],xmm12[2,3,4,5],xmm0[6],xmm12[7]
; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6,7]
-; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm17
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm17
; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm13
; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14
; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
index 7d9c056716cee..0db749af9a196 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
@@ -4196,7 +4196,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm10
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9
-; AVX512BW-NEXT: vpermt2q %zmm8, %zmm29, %zmm9
+; AVX512BW-NEXT: vpermt2q %zmm5, %zmm29, %zmm9
; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm8
; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1
; AVX512BW-NEXT: vpermt2q %zmm12, %zmm7, %zmm3
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
index 4d5c8fe891e81..7947dd0ff373c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
@@ -2476,7 +2476,6 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
; SSE-NEXT: psllq $48, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm7, %xmm4
; SSE-NEXT: movdqa %xmm7, %xmm1
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; SSE-NEXT: pandn %xmm5, %xmm1
@@ -2533,7 +2532,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm8, %xmm1
; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: movdqa %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm7, %xmm0
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; SSE-NEXT: pandn %xmm4, %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
index 0e3f8121db1de..cf1b0f2bc9860 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
@@ -1024,8 +1024,8 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,65535]
; SSE-NEXT: movdqa %xmm9, %xmm7
; SSE-NEXT: pand %xmm14, %xmm7
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm6, %xmm15
+; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm5, %xmm15
; SSE-NEXT: pand %xmm14, %xmm15
; SSE-NEXT: movdqa %xmm11, %xmm3
; SSE-NEXT: pandn %xmm8, %xmm3
@@ -2148,7 +2148,6 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa %xmm5, %xmm9
; SSE-NEXT: pand %xmm13, %xmm9
; SSE-NEXT: por %xmm0, %xmm9
-; SSE-NEXT: movdqa %xmm6, %xmm3
; SSE-NEXT: movdqa %xmm6, %xmm0
; SSE-NEXT: pand %xmm13, %xmm0
; SSE-NEXT: pandn %xmm10, %xmm13
@@ -2185,7 +2184,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: pandn %xmm6, %xmm2
; SSE-NEXT: por %xmm10, %xmm2
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535]
@@ -5451,19 +5450,19 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pand %xmm14, %xmm6
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: movdqa %xmm14, %xmm3
; SSE-NEXT: movdqa %xmm11, %xmm6
; SSE-NEXT: pandn %xmm11, %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm0, %xmm5
+; SSE-NEXT: pand %xmm14, %xmm5
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: pand %xmm0, %xmm3
+; SSE-NEXT: pand %xmm14, %xmm3
; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pandn %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
@@ -9965,7 +9964,6 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm3, %ymm6
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm3
-; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, %xmm14
; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3]
; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm8
@@ -9994,12 +9992,12 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm5, %ymm25, %ymm2
-; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm14[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm13[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm11, %zmm5
; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm10, %zmm3, %zmm5
-; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm7
+; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm7
; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
index 0c2df82fd1be5..f2133b9e42d30 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
@@ -11212,7 +11212,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm9
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm15
-; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm23
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm17, %zmm9
@@ -11289,7 +11288,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm10
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm9
; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm15
-; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm31
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
@@ -11302,7 +11300,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm9
-; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm11
+; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm11
; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm15
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index 5934b80893ce3..2e4f293587064 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -1343,10 +1343,9 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm3[3,3]
; SSE-NEXT: movdqa %xmm15, %xmm10
; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
-; SSE-NEXT: movdqa %xmm5, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3]
; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm9[0,2]
; SSE-NEXT: andps %xmm8, %xmm1
; SSE-NEXT: orps %xmm6, %xmm1
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
index 24475360cbbc4..f84131dfc8797 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
+++ b/llvm/t...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/74239
More information about the llvm-commits
mailing list