[llvm] [DAG] SimplifyDemandedBits - ignore SRL node if we're just demanding known sign bits (PR #114805)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 4 06:46:08 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-nvptx
Author: Simon Pilgrim (RKSimon)
Changes:
Check whether we are only demanding (shifted) sign bits from an SRL node that are also sign bits in the source node.
We must not demand any of the upper zero bits that the SRL shifts in (up to the maximum shift amount), and everything from the lowest demanded bit upwards must already be sign bits in the source.
Same fold as #114389, which added this for SimplifyMultipleUseDemandedBits.
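To make the two bounds concrete (this harness is mine, not part of the patch): a minimal standalone C++ sketch that brute-forces the claim on 8-bit values. Whenever the demanded mask has at least MaxSA leading zeros and its trailing zeros cover the source's non-sign bits, `srl x, s` and `x` agree on every demanded bit for all shifts up to MaxSA, so replacing the SRL with its source is safe.

```cpp
// Standalone brute-force check (not part of the patch): under the two
// bounds above, every demanded bit of (x >> s) equals the same bit of x.
#include <cstdint>
#include <cstdio>

// Count how many top bits of an 8-bit value are copies of the sign bit.
static unsigned numSignBits(uint8_t X) {
  unsigned N = 1;
  unsigned Top = (X >> 7) & 1;
  for (int I = 6; I >= 0 && (((unsigned)(X >> I) & 1) == Top); --I)
    ++N;
  return N;
}

int main() {
  const unsigned BitWidth = 8;
  for (unsigned MaxSA = 0; MaxSA < BitWidth; ++MaxSA)
    for (unsigned NumSign = 1; NumSign <= BitWidth; ++NumSign) {
      unsigned Lo = BitWidth - NumSign; // DemandedBits.countr_zero() >= Lo
      unsigned Hi = BitWidth - MaxSA;   // DemandedBits.countl_zero() >= MaxSA
      if (Lo >= Hi)
        continue; // the bounds leave no demandable bits
      // Demand the whole legal window [Lo, Hi).
      uint32_t Demanded = ((1u << Hi) - 1) & ~((1u << Lo) - 1);
      for (uint32_t X = 0; X < 256; ++X) {
        if (numSignBits((uint8_t)X) < NumSign)
          continue; // source must have at least NumSign sign bits
        for (unsigned S = 0; S <= MaxSA; ++S)
          if (((X >> S) ^ X) & Demanded) {
            printf("counterexample: x=0x%02x s=%u\n", (unsigned)X, S);
            return 1;
          }
      }
    }
  puts("fold holds for all 8-bit cases");
}
```

The harness prints "fold holds for all 8-bit cases", matching the reasoning in the patch: every demanded result bit and its source bit both lie in the sign-bit region, so both are copies of the sign bit and the `TLO.CombineTo(Op, Op0)` replacement is bit-accurate on the demanded bits.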
---
Patch is 43.18 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114805.diff
3 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+15)
- (modified) llvm/test/CodeGen/NVPTX/load-store.ll (+72-72)
- (modified) llvm/test/CodeGen/X86/scmp.ll (+290-313)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index f21233abfa4f5d..a16ec19e7a6888 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2030,6 +2030,21 @@ bool TargetLowering::SimplifyDemandedBits(
Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
}
+ // If we are only demanding sign bits then we can use the shift source
+ // directly.
+ if (std::optional<uint64_t> MaxSA =
+ TLO.DAG.getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1)) {
+ unsigned ShAmt = *MaxSA;
+ // Must already be signbits in DemandedBits bounds, and can't demand any
+ // shifted in zeroes.
+ if (DemandedBits.countl_zero() >= ShAmt) {
+ unsigned NumSignBits =
+ TLO.DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
+ if (DemandedBits.countr_zero() >= (BitWidth - NumSignBits))
+ return TLO.CombineTo(Op, Op0);
+ }
+ }
+
// Try to match AVG patterns (after shift simplification).
if (SDValue AVG = combineShiftToAVG(Op, TLO, *this, DemandedBits,
DemandedElts, Depth + 1))
diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll
index a4be81a1973a33..ee289c4faab506 100644
--- a/llvm/test/CodeGen/NVPTX/load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store.ll
@@ -167,25 +167,25 @@ define void @generic_4xi8(ptr %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xi8_param_0];
; CHECK-NEXT: ld.u32 %r1, [%rd1];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
-; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 13120;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
-; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
-; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
-; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8;
+; CHECK-NEXT: cvt.u32.u16 %r10, %rs8;
+; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520;
; CHECK-NEXT: st.u32 [%rd1], %r12;
; CHECK-NEXT: ret;
%a.load = load <4 x i8>, ptr %a
@@ -511,25 +511,25 @@ define void @generic_volatile_4xi8(ptr %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xi8_param_0];
; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
-; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 13120;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
-; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
-; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
-; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8;
+; CHECK-NEXT: cvt.u32.u16 %r10, %rs8;
+; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520;
; CHECK-NEXT: st.volatile.u32 [%rd1], %r12;
; CHECK-NEXT: ret;
%a.load = load volatile <4 x i8>, ptr %a
@@ -1416,25 +1416,25 @@ define void @global_4xi8(ptr addrspace(1) %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [global_4xi8_param_0];
; CHECK-NEXT: ld.global.u32 %r1, [%rd1];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
-; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 13120;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
-; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
-; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
-; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8;
+; CHECK-NEXT: cvt.u32.u16 %r10, %rs8;
+; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520;
; CHECK-NEXT: st.global.u32 [%rd1], %r12;
; CHECK-NEXT: ret;
%a.load = load <4 x i8>, ptr addrspace(1) %a
@@ -1741,25 +1741,25 @@ define void @global_volatile_4xi8(ptr addrspace(1) %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xi8_param_0];
; CHECK-NEXT: ld.volatile.global.u32 %r1, [%rd1];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
-; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 13120;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
-; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
-; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
-; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8;
+; CHECK-NEXT: cvt.u32.u16 %r10, %rs8;
+; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520;
; CHECK-NEXT: st.volatile.global.u32 [%rd1], %r12;
; CHECK-NEXT: ret;
%a.load = load volatile <4 x i8>, ptr addrspace(1) %a
@@ -2788,25 +2788,25 @@ define void @shared_4xi8(ptr addrspace(3) %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xi8_param_0];
; CHECK-NEXT: ld.shared.u32 %r1, [%rd1];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
-; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 13120;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
-; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
-; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
-; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8;
+; CHECK-NEXT: cvt.u32.u16 %r10, %rs8;
+; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520;
; CHECK-NEXT: st.shared.u32 [%rd1], %r12;
; CHECK-NEXT: ret;
%a.load = load <4 x i8>, ptr addrspace(3) %a
@@ -3113,25 +3113,25 @@ define void @shared_volatile_4xi8(ptr addrspace(3) %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi8_param_0];
; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
-; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 13120;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
-; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
-; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
-; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8;
+; CHECK-NEXT: cvt.u32.u16 %r10, %rs8;
+; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520;
; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r12;
; CHECK-NEXT: ret;
%a.load = load volatile <4 x i8>, ptr addrspace(3) %a
@@ -4018,25 +4018,25 @@ define void @local_4xi8(ptr addrspace(5) %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi8_param_0];
; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
-; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 13120;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
-; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
-; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
-; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8;
+; CHECK-NEXT: cvt.u32.u16 %r10, %rs8;
+; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520;
; CHECK-NEXT: st.local.u32 [%rd1], %r12;
; CHECK-NEXT: ret;
%a.load = load <4 x i8>, ptr addrspace(5) %a
@@ -4343,25 +4343,25 @@ define void @local_volatile_4xi8(ptr addrspace(5) %a) {
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi8_param_0];
; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
-; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
+; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
-; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
-; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8;
+; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 13120;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
-; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8;
-; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
-; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
-; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8;
+; CHECK-NEXT: cvt.u32.u16 %r10, %rs8;
+; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520;
; CHECK-NEXT: st.local.u32 [%rd1], %r12;
; CHECK-NEXT: ret;
%a.load = load volatile <4 x i8>, ptr addrspace(5) %a
diff --git a/llvm/test/CodeGen/X86/scmp.ll b/llvm/test/CodeGen/X86/scmp.ll
index 537e05310dbea8..874913629e9e3f 100644
--- a/llvm/test/CodeGen/X86/scmp.ll
+++ b/llvm/test/CodeGen/X86/scmp.ll
@@ -1764,153 +1764,146 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
; SSE2-NEXT: pushq %r12
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
-; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
-; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
-; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
-; SSE2-NEXT: addb %dil, %dil
-; SSE2-NEXT: sarb %dil
-; SSE2-NEXT: addb %sil, %sil
-; SSE2-NEXT: sarb %sil
-; SSE2-NEXT: cmpb %dil, %sil
-; SSE2-NEXT: setl %sil
-; SSE2-NEXT: setg %dil
-; SSE2-NEXT: subb %sil, %dil
-; SSE2-NEXT: movsbq %dil, %rdi
-; SSE2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq %rdi, (%rax)
-; SSE2-NEXT: sarq $63, %rdi
-; SSE2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: addb %r11b, %r11b
-; SSE2-NEXT: sarb %r11b
-; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; SSE2-NEXT: addb %r15b, %r15b
+; SSE2-NEXT: sarb %r15b
; SSE2-NEXT: addb %sil, %sil
; SSE2-NEXT: sarb %sil
-; SSE2-NEXT: cmpb %r11b, %sil
+; SSE2-NEXT: cmpb %r15b, %sil
; SSE2-NEXT: setl %sil
-; SSE2-NEXT: setg %r11b
-; SSE2-NEXT: subb %sil, %r11b
-; SSE2-NEXT: movsbq %r11b, %r11
-; SSE2-NEXT: movq %r11, %r14
-; SSE2-NEXT: sarq $63, %r14
-; SSE2-NEXT: addb %r12b, %r12b
-; SSE2-NEXT: sarb %r12b
+; SSE2-NEXT: setg %r15b
+; SSE2-NEXT: subb %sil, %r15b
+; SSE2-NEXT: movsbq %r15b, %rsi
+; SSE2-NEXT: movq %rsi, (%rax)
+; SSE2-NEXT: movq %rsi, %xmm0
+; SSE2-NEXT: sarq $63, %rsi
+; SSE2-NEXT: addb %r14b, %r14b
+; SSE2-NEXT: sarb %r14b
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; SSE2-NEXT: addb %r15b, %r15b
+; SSE2-NEXT: sarb %r15b
+; SSE2-NEXT: cmpb %r14b, %r15b
+; SSE2-NEXT: setl %r14b
+; SSE2-NEXT: setg %r15b
+; SSE2-NEXT: subb %r14b, %r15b
+; SSE2-NEXT: movsbq %r15b, %r14
+; SSE2-NEXT: movq %r14, %r15
+; SSE2-NEXT: sarq $63, %r15
+; SSE2-NEXT: addb %bpl, %bpl
+; SSE2-NEXT: sarb %bpl
; SSE2-NEXT: addb %dl, %dl
; SSE2-NEXT: sarb %dl
-; SSE2-NEXT: cmpb %r12b, %dl
+; SSE2-NEXT: cmpb %bpl, %dl
; SSE2-NEXT: setl %dl
-; SSE2-NEXT: setg %sil
-; SSE2-NEXT: subb %dl, %sil
-; SSE2-NEXT: movsbq %sil, %r13
-; SSE2-NEXT: movq %r13, %rdi
-; SSE2-NEXT: sarq $63, %rdi
-; SSE2-NEXT: addb %r15b, %r15b
-; SSE2-NEXT: sarb %r15b
+; SSE2-NEXT: setg %bpl
+; SSE2-NEXT: subb %dl, %bpl
+; SSE2-NEXT: movsbq %bpl, %rdx
+; SSE2-NEXT: movq %rdx, %r12
+; SSE2-NEXT: sarq $63, %r12
+; SSE2-NEXT: addb %bl, %bl
+; SSE2-NEXT: sarb %bl
; SSE2-NEXT: addb %cl, %cl
; SSE2-NEXT: sarb %cl
-; SSE2-NEXT: cmpb %r15b, %cl
+; SSE2-NEXT: cmpb %bl, %cl
; SSE2-NEXT: setl %cl
-; SSE2-NEXT: setg %dl
-; SSE2-NEXT: subb %cl, %dl
-; SSE2-NEXT: movsbq %dl, %r15
-; SSE2-NEXT: movq %r15, %rcx
+; SSE2-NEXT: setg %bl
+; SSE2-NEXT: subb %cl, %bl
+; SSE2-NEXT: movsbq %bl, %rbx
+; SSE2-NEXT: movq %rbx, %rcx
; SSE2-NEXT: sarq $63, %rcx
-; SSE2-NEXT: addb %bpl, %bpl
-; SSE2-NEXT: sarb %bpl
+; SSE2-NEXT: addb %r11b, %r11b
+; SSE2-NEXT: sarb %r11b
; SSE2-NEXT: addb %r8b, %r8b
; SSE2-NEXT: sarb %r8b
-; SSE2-NEXT: cmpb %bpl, %r8b
-; SSE2-NEXT: setl %dl
-; SSE2-NEXT: setg %r8b
-; SSE2-NEXT: subb %dl, %r8b
-; SSE2-NEXT: movsbq %r8b, %r8
-; SSE2-NEXT: movq %r8, %r12
-; SSE2-NEXT: sarq $63, %r12
-; SSE2-NEXT: addb %bl, %bl
-; SSE2-NEXT: sarb %bl
-; SSE2-NEXT: addb %r9b, %r9b
-; SSE2-NEXT: sarb %r9b
-; SSE2-NEXT: cmpb %bl, %r9b
-; SSE2-NEXT: setl %dl
-; SSE2-NEXT: setg %r9b
-; SSE2-NEXT: subb %dl, %r9b
-; SSE2-NEXT: movsbq %r9b, %rsi
-; SSE2-NEXT: movq %rsi, %r9
-; SSE2-NEXT: sarq $63, %r9
+; SSE2-NEXT: cmpb %r11b, %r8b
+; SSE2-NEXT: setl %r8b
+; SSE2-NEXT: setg %r11b
+; SSE2-NEXT: subb %r8b, %r11b
+; SSE2-NEXT: movsbq %r11b, %r8
+; SSE2-NEXT: movq %r8, %r11
+; SSE2-NEXT: sarq $63, %r11
; SSE2-NEXT: addb %r10b, %r10b
; SSE2-NEXT: sarb %r10b
-; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
-; SSE2-NEXT: addb %dl, %dl
-; SSE2-NEXT: sarb %dl
-; SSE2-NEXT: cmpb %r10b, %dl
-; SSE2-NEXT: setl %dl
+; SSE2-NEXT: addb %r9b, %r9b
+; SSE2-NEXT: sarb %r9b
+; SSE2-NEXT: cmpb %r10b, %r9b
+; SSE2-NEXT: setl %r9b
; SSE2-NEXT: setg %r10b
-; SSE2-NEXT: subb %dl, %r10b
-; SSE2-NEXT: movsbq %r10b, %r10
-; SSE2-NEXT: movq %r10, %rdx
-; SSE2-NEXT: sarq $63, %rdx
-; SSE2-NEXT: movl %edx, 96(%rax)
+; SSE2-NEXT: subb %r9b, %r10b
+; SSE2-NEXT: movsbq %r10b, %r9
+; SSE2-NEXT: movq %r9, %r10
+; SSE2-NEXT: sarq $63, %r10
+; SSE2-NEXT: addb %dil, %dil
+; SSE2-NEXT: sarb %dil
+; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
+; SSE2-NEXT: addb %bpl, %bpl
+; SSE2-NEXT: sarb %bpl
+; SSE2-NEXT: cmpb %dil, %bpl
+; SSE2-NEXT: setl %dil
+; SSE2-NEXT: setg %bpl
+; SSE2-NEXT: subb %dil, %bpl
+; SSE2-NEXT: movsbq %bpl, %rdi
+; SSE2-NEXT: movq %rdi, %r13
+; SSE2-NEXT: sarq $63, %r13
+; SSE2-NEXT: movl %r13d, 96(%rax)
; SSE2-NEXT: movabsq $2251799813685247, %rbp # imm = 0x7FFFFFFFFFFFF
-; SSE2-NEXT: andq %rdx, %rbp
-; SSE2-NEXT: shldq $62, %r10, %rdx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
-; SSE2-NEXT: # xmm0 = mem[0],zero
-; SSE2-NEXT: movq %r9, %rbx
-; SSE2-NEXT: shldq $20, %rsi, %rbx
-; SSE2-NEXT: movq %rdx, 88(%rax)
-; SSE2-NEXT: movq %r12, %rdx
-; SSE2-NEXT: shldq $31, %r8, %rdx
-; SSE2-NEXT: movq %rbx, 64(%rax)
-; SSE2-NEXT: movq %rcx, %rbx
-; SSE2-NEXT: shldq $42, %r15, %rbx
-; SSE2-NEXT: movq %rdx, 48(%rax)
-; SSE2-NEXT: movq %rbx, 32(%rax)
-; SSE2-NEXT: movabsq $9007199254738944, %rbx # imm = 0x1FFFFFFFFFF800
-; SSE2-NEXT: andq %rdi, %rbx
-; SSE2-NEXT: shldq $53, %r13, %rdi
-; SSE2-NEXT: movq %rdi, 16(%rax)
-; SSE2-NEXT: movq %rbp, %rdx
-; SSE2-NEXT: shrq $48, %rdx
-; SSE2-NEXT: movb %dl, 102(%rax)
+; SSE2-NEXT: andq %r13, %rbp
+; SSE2-NEXT: shldq $62, %rdi, %r13
+; SSE2-NEXT: movq %r13, 88(%rax)
+; SSE2-NEXT: movq %r1...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/114805
More information about the llvm-commits mailing list