[llvm] [X86] Improve transform for add-like nodes to `add` (PR #83691)

via llvm-commits llvm-commits at lists.llvm.org
Sat Mar 2 12:56:58 PST 2024


llvmbot wrote:



@llvm/pr-subscribers-backend-x86

Author: None (goldsteinn)

<details>
<summary>Changes</summary>

We previously did this transform only in tablegen, by which point the
`disjoint` flag had already been dropped from the `or`.
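
For context: `or disjoint` asserts that the two operands have no set bits in common, so the `or` computes the same value as an `add`, and an `add` can in turn be selected as a three-operand `lea`. A minimal IR sketch of the kind of input this affects (illustrative only, not taken from the patch's tests):

```llvm
; %hi has only bits >= 4 set and %lo only bits < 4, so the `or` is
; disjoint and equivalent to `add` (and thus selectable as `lea`).
define i32 @or_disjoint_example(i32 %x, i32 %y) {
  %hi = shl i32 %x, 4
  %lo = and i32 %y, 15
  %r = or disjoint i32 %hi, %lo
  ret i32 %r
}
```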

---

Patch is 337.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/83691.diff


115 Files Affected:

- (modified) llvm/lib/Target/X86/X86ISelDAGToDAG.cpp (+16-1) 
- (modified) llvm/lib/Target/X86/X86InstrCompiler.td (+18-1) 
- (modified) llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll (+7-4) 
- (modified) llvm/test/CodeGen/X86/3addr-or.ll (+3-3) 
- (modified) llvm/test/CodeGen/X86/addcarry2.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/and-or-fold.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/andimm8.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/atomic-unordered.ll (+1-1) 
- (modified) llvm/test/CodeGen/X86/avx512-calling-conv.ll (+146-146) 
- (modified) llvm/test/CodeGen/X86/avx512-insert-extract.ll (+17-17) 
- (modified) llvm/test/CodeGen/X86/avx512-vec-cmp.ll (+4-4) 
- (modified) llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll (+32-32) 
- (modified) llvm/test/CodeGen/X86/bfloat.ll (+64-64) 
- (modified) llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll (+11-11) 
- (modified) llvm/test/CodeGen/X86/bitcast-setcc-128.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/bitcast-setcc-256.ll (+4-4) 
- (modified) llvm/test/CodeGen/X86/bitcast-setcc-512.ll (+20-20) 
- (modified) llvm/test/CodeGen/X86/bitcast-vector-bool.ll (+41-41) 
- (modified) llvm/test/CodeGen/X86/bitreverse.ll (+65-65) 
- (modified) llvm/test/CodeGen/X86/bitselect.ll (+12-11) 
- (modified) llvm/test/CodeGen/X86/bool-math.ll (+6-6) 
- (modified) llvm/test/CodeGen/X86/bool-vector.ll (+6-6) 
- (modified) llvm/test/CodeGen/X86/bswap.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/bswap_tree2.ll (+8-8) 
- (modified) llvm/test/CodeGen/X86/buildvec-insertvec.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/clz.ll (+6-6) 
- (modified) llvm/test/CodeGen/X86/combine-bitreverse.ll (+18-18) 
- (modified) llvm/test/CodeGen/X86/combine-bswap.ll (+1-1) 
- (modified) llvm/test/CodeGen/X86/combine-fneg.ll (+4-4) 
- (modified) llvm/test/CodeGen/X86/combine-rotates.ll (+1-1) 
- (modified) llvm/test/CodeGen/X86/commute-two-addr.ll (+62-13) 
- (modified) llvm/test/CodeGen/X86/dagcombine-select.ll (+1-1) 
- (modified) llvm/test/CodeGen/X86/dagcombine-shifts.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/disable-shrink-store.ll (+1-1) 
- (modified) llvm/test/CodeGen/X86/extract-bits.ll (+64-64) 
- (modified) llvm/test/CodeGen/X86/fold-masked-merge.ll (+26-22) 
- (modified) llvm/test/CodeGen/X86/fp128-i128.ll (+4-4) 
- (modified) llvm/test/CodeGen/X86/fpenv.ll (+13-13) 
- (modified) llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll (+17-17) 
- (modified) llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll (+17-17) 
- (modified) llvm/test/CodeGen/X86/fshl.ll (+10-10) 
- (modified) llvm/test/CodeGen/X86/fshr.ll (+11-11) 
- (modified) llvm/test/CodeGen/X86/funnel-shift.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/half.ll (+3-3) 
- (modified) llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll (+14-14) 
- (modified) llvm/test/CodeGen/X86/inline-spiller-impdef-on-implicit-def-regression.ll (+1-1) 
- (modified) llvm/test/CodeGen/X86/insert.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/is_fpclass-fp80.ll (+3-3) 
- (modified) llvm/test/CodeGen/X86/is_fpclass.ll (+6-6) 
- (modified) llvm/test/CodeGen/X86/kshift.ll (+8-8) 
- (modified) llvm/test/CodeGen/X86/limited-prec.ll (+9-9) 
- (modified) llvm/test/CodeGen/X86/llvm.frexp.ll (+14-14) 
- (modified) llvm/test/CodeGen/X86/load-chain.ll (+1-1) 
- (modified) llvm/test/CodeGen/X86/load-combine.ll (+46-41) 
- (modified) llvm/test/CodeGen/X86/load-local-v3i1.ll (+4-4) 
- (modified) llvm/test/CodeGen/X86/load-local-v3i129.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/load-local-v4i5.ll (+3-3) 
- (modified) llvm/test/CodeGen/X86/logic-shift.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll (+4-4) 
- (modified) llvm/test/CodeGen/X86/madd.ll (+3-3) 
- (modified) llvm/test/CodeGen/X86/masked_compressstore.ll (+3-3) 
- (modified) llvm/test/CodeGen/X86/masked_expandload.ll (+3-3) 
- (modified) llvm/test/CodeGen/X86/masked_load.ll (+3-3) 
- (modified) llvm/test/CodeGen/X86/masked_store.ll (+9-9) 
- (modified) llvm/test/CodeGen/X86/masked_store_trunc.ll (+3-3) 
- (modified) llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll (+3-3) 
- (modified) llvm/test/CodeGen/X86/masked_store_trunc_usat.ll (+3-3) 
- (modified) llvm/test/CodeGen/X86/memset-inline.ll (+1-1) 
- (modified) llvm/test/CodeGen/X86/movmsk-cmp.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/mul128.ll (+1-1) 
- (modified) llvm/test/CodeGen/X86/no-wide-load.ll (+1-1) 
- (modified) llvm/test/CodeGen/X86/or-lea.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/pr20011.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/pr23664.ll (+1-1) 
- (modified) llvm/test/CodeGen/X86/pr27202.ll (+1-1) 
- (modified) llvm/test/CodeGen/X86/pr28173.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/pr35636.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/pr35763.ll (+1-1) 
- (modified) llvm/test/CodeGen/X86/pr43820.ll (+16-16) 
- (modified) llvm/test/CodeGen/X86/pr47299.ll (+5-5) 
- (modified) llvm/test/CodeGen/X86/pr62653.ll (+38-36) 
- (modified) llvm/test/CodeGen/X86/pr69965.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/pr77459.ll (+17-17) 
- (modified) llvm/test/CodeGen/X86/promote-vec3.ll (+1-1) 
- (modified) llvm/test/CodeGen/X86/rev16.ll (+5-5) 
- (modified) llvm/test/CodeGen/X86/rotate-extract.ll (+8-8) 
- (modified) llvm/test/CodeGen/X86/select.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/select_const.ll (+3-3) 
- (modified) llvm/test/CodeGen/X86/setcc-fsh.ll (+1-1) 
- (modified) llvm/test/CodeGen/X86/shrink-compare-pgso.ll (+1-1) 
- (modified) llvm/test/CodeGen/X86/shrink-compare.ll (+1-1) 
- (modified) llvm/test/CodeGen/X86/smul_fix.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/smul_fix_sat.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/split-store.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll (+52-52) 
- (modified) llvm/test/CodeGen/X86/umul_fix_sat.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll (+6-6) 
- (modified) llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll (+35-34) 
- (modified) llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll (+28-21) 
- (modified) llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll (+3-3) 
- (modified) llvm/test/CodeGen/X86/vector-bitreverse.ll (+20-20) 
- (modified) llvm/test/CodeGen/X86/vector-compare-all_of.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/vector-compare-results.ll (+35-35) 
- (modified) llvm/test/CodeGen/X86/vector-pcmp.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/vector-sext.ll (+4-4) 
- (modified) llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll (+2-2) 
- (modified) llvm/test/CodeGen/X86/vector-shuffle-v1.ll (+4-4) 
- (modified) llvm/test/CodeGen/X86/vector-trunc.ll (+1-1) 
- (modified) llvm/test/CodeGen/X86/vector-zext.ll (+6-6) 
- (modified) llvm/test/CodeGen/X86/xor-lea.ll (+2-2) 


``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 5cbd9ab4dc2d6c..9f34a4e1870f71 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -5294,11 +5294,26 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
       return;
     if (tryVPTERNLOG(Node))
       return;
-
     [[fallthrough]];
   case ISD::ADD:
     if (Opcode == ISD::ADD && matchBitExtract(Node))
       return;
+
+    // Convert add-like nodes to `add` before final selection. Do this before
+    // we drop flags like `disjoint`.
+    // NB: Conversion to `add` is preferable so we can use `lea` in codegen.
+    if (Opcode != ISD::ADD && NVT.isScalarInteger() &&
+        (Opcode == ISD::OR ||
+         (NVT == MVT::i8 || NVT == MVT::i16 || NVT == MVT::i32)) &&
+        CurDAG->isADDLike(SDValue(Node, 0))) {
+      SDValue AsAdd = CurDAG->getNode(ISD::ADD, SDLoc(Node), NVT,
+                                      Node->getOperand(0), Node->getOperand(1));
+      ReplaceUses(SDValue(Node, 0), AsAdd);
+      CurDAG->RemoveDeadNode(Node);
+      Node = AsAdd.getNode();
+      Opcode = ISD::ADD;
+    }
     [[fallthrough]];
   case ISD::SUB: {
     // Try to avoid folding immediates with multiple uses for optsize.
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index f393f86e64aadd..b31c5be87a5839 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1574,7 +1574,24 @@ def : Pat<(or (and GR64:$dst, -65536),
 
 def : Pat<(or (and GR32:$dst, -65536), 
               (i32 (zextloadi16 addr:$src))),
-          (INSERT_SUBREG (i32 (COPY $dst)), (MOV16rm  i16mem:$src), sub_16bit)>; 
+          (INSERT_SUBREG (i32 (COPY $dst)), (MOV16rm  i16mem:$src), sub_16bit)>;
+
+// We convert a disjoint `or` to `add`, so these patterns need to handle `add` as well.
+def : Pat<(add (and GR64:$dst, -256), 
+               (i64 (zextloadi8 addr:$src))),
+          (INSERT_SUBREG (i64 (COPY $dst)), (MOV8rm  i8mem:$src), sub_8bit)>; 
+
+def : Pat<(add (and GR32:$dst, -256), 
+               (i32 (zextloadi8 addr:$src))),
+          (INSERT_SUBREG (i32 (COPY $dst)), (MOV8rm  i8mem:$src), sub_8bit)>; 
+
+def : Pat<(add (and GR64:$dst, -65536), 
+               (i64 (zextloadi16 addr:$src))),
+          (INSERT_SUBREG (i64 (COPY $dst)), (MOV16rm  i16mem:$src), sub_16bit)>;
+
+def : Pat<(add (and GR32:$dst, -65536), 
+               (i32 (zextloadi16 addr:$src))),
+          (INSERT_SUBREG (i32 (COPY $dst)), (MOV16rm  i16mem:$src), sub_16bit)>;     
 
 // To avoid needing to materialize an immediate in a register, use a 32-bit and
 // with implicit zero-extension instead of a 64-bit and if the immediate has at
diff --git a/llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll b/llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll
index 609be3bb2e54f0..50e736ac68d29e 100644
--- a/llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll
+++ b/llvm/test/CodeGen/X86/2009-05-23-dagcombine-shifts.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc < %s | FileCheck %s
 
 ; Check that the shr(shl X, 56), 48) is not mistakenly turned into
@@ -16,11 +17,13 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 target triple = "x86_64-unknown-linux-gnu"
 
 define i64 @foo(i64 %b) nounwind readnone {
-entry:
 ; CHECK-LABEL: foo:
-; CHECK: movsbq %dil, %rax
-; CHECK: shlq $8, %rax
-; CHECK: orq $1, %rax
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movsbq %dil, %rax
+; CHECK-NEXT:    shlq $8, %rax
+; CHECK-NEXT:    incq %rax
+; CHECK-NEXT:    retq
+entry:
 	%shl = shl i64 %b, 56		; <i64> [#uses=1]
 	%shr = ashr i64 %shl, 48		; <i64> [#uses=1]
 	%add5 = or i64 %shr, 1		; <i64> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/3addr-or.ll b/llvm/test/CodeGen/X86/3addr-or.ll
index 65f6d2b4123e8e..1f466afcadc9ca 100644
--- a/llvm/test/CodeGen/X86/3addr-or.ll
+++ b/llvm/test/CodeGen/X86/3addr-or.ll
@@ -24,7 +24,7 @@ define i64 @test2(i8 %A, i8 %B) nounwind {
 ; CHECK-NEXT:    andl $48, %edi
 ; CHECK-NEXT:    movzbl %sil, %eax
 ; CHECK-NEXT:    shrl $4, %eax
-; CHECK-NEXT:    orl %edi, %eax
+; CHECK-NEXT:    addl %edi, %eax
 ; CHECK-NEXT:    retq
   %C = zext i8 %A to i64
   %D = shl i64 %C, 4
@@ -42,7 +42,7 @@ define void @test3(i32 %x, ptr %P) nounwind readnone ssp {
 ; CHECK-LABEL: test3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    shll $5, %edi
-; CHECK-NEXT:    orl $3, %edi
+; CHECK-NEXT:    addl $3, %edi
 ; CHECK-NEXT:    movl %edi, (%rsi)
 ; CHECK-NEXT:    retq
   %t0 = shl i32 %x, 5
@@ -71,7 +71,7 @@ define void @test5(i32 %a, i32 %b, ptr nocapture %P) nounwind ssp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    andl $6, %edi
 ; CHECK-NEXT:    andl $16, %esi
-; CHECK-NEXT:    orl %edi, %esi
+; CHECK-NEXT:    addl %edi, %esi
 ; CHECK-NEXT:    movl %esi, (%rdx)
 ; CHECK-NEXT:    retq
   %and = and i32 %a, 6
diff --git a/llvm/test/CodeGen/X86/addcarry2.ll b/llvm/test/CodeGen/X86/addcarry2.ll
index 0338577dbddc2b..1a5d0f4fe45416 100644
--- a/llvm/test/CodeGen/X86/addcarry2.ll
+++ b/llvm/test/CodeGen/X86/addcarry2.ll
@@ -138,7 +138,7 @@ define void @adc_load_store_32_127(ptr inreg %x, ptr inreg %x2, i32 inreg %y) no
 ; X64-NEXT:    movl (%rdi), %eax # encoding: [0x8b,0x07]
 ; X64-NEXT:    shlq $32, %rax # encoding: [0x48,0xc1,0xe0,0x20]
 ; X64-NEXT:    movl %edx, %ecx # encoding: [0x89,0xd1]
-; X64-NEXT:    orq %rax, %rcx # encoding: [0x48,0x09,0xc1]
+; X64-NEXT:    addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
 ; X64-NEXT:    movabsq $545460846593, %rax # encoding: [0x48,0xb8,0x01,0x00,0x00,0x00,0x7f,0x00,0x00,0x00]
 ; X64-NEXT:    # imm = 0x7F00000001
 ; X64-NEXT:    xorl %edx, %edx # encoding: [0x31,0xd2]
@@ -178,7 +178,7 @@ define void @adc_load_store_32_128(ptr inreg %x, ptr inreg %x2, i32 inreg %y) no
 ; X64-NEXT:    movl (%rdi), %eax # encoding: [0x8b,0x07]
 ; X64-NEXT:    shlq $32, %rax # encoding: [0x48,0xc1,0xe0,0x20]
 ; X64-NEXT:    movl %edx, %ecx # encoding: [0x89,0xd1]
-; X64-NEXT:    orq %rax, %rcx # encoding: [0x48,0x09,0xc1]
+; X64-NEXT:    addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
 ; X64-NEXT:    movabsq $549755813889, %rax # encoding: [0x48,0xb8,0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00]
 ; X64-NEXT:    # imm = 0x8000000001
 ; X64-NEXT:    xorl %edx, %edx # encoding: [0x31,0xd2]
diff --git a/llvm/test/CodeGen/X86/and-or-fold.ll b/llvm/test/CodeGen/X86/and-or-fold.ll
index 1bb5fdeebac71c..4071b364a25c3b 100644
--- a/llvm/test/CodeGen/X86/and-or-fold.ll
+++ b/llvm/test/CodeGen/X86/and-or-fold.ll
@@ -45,7 +45,7 @@ define i32 @test1(i32 %x, i16 %y) {
 ; DARWIN-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; DARWIN-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; DARWIN-NEXT:    shll $16, %eax
-; DARWIN-NEXT:    orl %ecx, %eax
+; DARWIN-NEXT:    addl %ecx, %eax
 ; DARWIN-NEXT:    andl $16711807, %eax ## imm = 0xFF007F
 ; DARWIN-NEXT:    retl
 ;
@@ -54,7 +54,7 @@ define i32 @test1(i32 %x, i16 %y) {
 ; DARWIN-OPT-NEXT:    andl $127, %esi
 ; DARWIN-OPT-NEXT:    movzbl %dil, %eax
 ; DARWIN-OPT-NEXT:    shll $16, %eax
-; DARWIN-OPT-NEXT:    orl %esi, %eax
+; DARWIN-OPT-NEXT:    addl %esi, %eax
 ; DARWIN-OPT-NEXT:    retq
   %tmp1 = zext i16 %y to i32
   %tmp2 = and i32 %tmp1, 127
diff --git a/llvm/test/CodeGen/X86/andimm8.ll b/llvm/test/CodeGen/X86/andimm8.ll
index 6242d4f4c222bb..506e28300e71b0 100644
--- a/llvm/test/CodeGen/X86/andimm8.ll
+++ b/llvm/test/CodeGen/X86/andimm8.ll
@@ -29,7 +29,7 @@ define void @foo(i64 %zed, ptr %x) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x08]
 ; X86-NEXT:    andl $-4, %ecx # encoding: [0x83,0xe1,0xfc]
-; X86-NEXT:    orl $2, %ecx # encoding: [0x83,0xc9,0x02]
+; X86-NEXT:    addl $2, %ecx # encoding: [0x83,0xc1,0x02]
 ; X86-NEXT:    movl %edx, 4(%eax) # encoding: [0x89,0x50,0x04]
 ; X86-NEXT:    movl %ecx, (%eax) # encoding: [0x89,0x08]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -37,7 +37,7 @@ define void @foo(i64 %zed, ptr %x) nounwind {
 ; X64-LABEL: foo:
 ; X64:       # %bb.0:
 ; X64-NEXT:    andq $-4, %rdi # encoding: [0x48,0x83,0xe7,0xfc]
-; X64-NEXT:    orq $2, %rdi # encoding: [0x48,0x83,0xcf,0x02]
+; X64-NEXT:    addq $2, %rdi # encoding: [0x48,0x83,0xc7,0x02]
 ; X64-NEXT:    movq %rdi, (%rsi) # encoding: [0x48,0x89,0x3e]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %t1 = and i64 %zed, -4
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index df123be53474f0..903951dd5a8cff 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -2359,7 +2359,7 @@ define i16 @load_combine(ptr %p) {
 ; CHECK-O3-NEXT:    movzbl (%rdi), %ecx
 ; CHECK-O3-NEXT:    movzbl 1(%rdi), %eax
 ; CHECK-O3-NEXT:    shll $8, %eax
-; CHECK-O3-NEXT:    orl %ecx, %eax
+; CHECK-O3-NEXT:    addl %ecx, %eax
 ; CHECK-O3-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-O3-NEXT:    retq
   %v1 = load atomic i8, ptr %p unordered, align 2
diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
index b39b089faa2a5e..b4c37a2e34d95d 100644
--- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
@@ -910,13 +910,13 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL-NEXT:    kandw %k2, %k0, %k0
 ; KNL-NEXT:    kmovw %r10d, %k2
 ; KNL-NEXT:    kandw %k1, %k2, %k1
-; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    kmovw %k1, %esi
 ; KNL-NEXT:    kshiftrw $1, %k0, %k1
 ; KNL-NEXT:    kmovw %k1, %r9d
 ; KNL-NEXT:    kshiftrw $2, %k0, %k1
 ; KNL-NEXT:    kmovw %k1, %r8d
 ; KNL-NEXT:    kshiftrw $3, %k0, %k1
-; KNL-NEXT:    kmovw %k1, %esi
+; KNL-NEXT:    kmovw %k1, %edx
 ; KNL-NEXT:    kshiftrw $4, %k0, %k1
 ; KNL-NEXT:    kmovw %k1, %edi
 ; KNL-NEXT:    kshiftrw $5, %k0, %k1
@@ -928,9 +928,9 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL-NEXT:    kshiftrw $8, %k0, %k1
 ; KNL-NEXT:    kmovw %k1, %ebp
 ; KNL-NEXT:    kshiftrw $9, %k0, %k1
-; KNL-NEXT:    kmovw %k1, %r14d
-; KNL-NEXT:    kshiftrw $10, %k0, %k1
 ; KNL-NEXT:    kmovw %k1, %r11d
+; KNL-NEXT:    kshiftrw $10, %k0, %k1
+; KNL-NEXT:    kmovw %k1, %r14d
 ; KNL-NEXT:    kshiftrw $11, %k0, %k1
 ; KNL-NEXT:    kmovw %k1, %r15d
 ; KNL-NEXT:    kshiftrw $12, %k0, %k1
@@ -938,25 +938,25 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL-NEXT:    kshiftrw $13, %k0, %k1
 ; KNL-NEXT:    kmovw %k1, %r13d
 ; KNL-NEXT:    kshiftrw $14, %k0, %k1
-; KNL-NEXT:    andl $1, %edx
-; KNL-NEXT:    movb %dl, 2(%rax)
-; KNL-NEXT:    kmovw %k0, %edx
-; KNL-NEXT:    andl $1, %edx
+; KNL-NEXT:    andl $1, %esi
+; KNL-NEXT:    movb %sil, 2(%rax)
+; KNL-NEXT:    kmovw %k0, %esi
+; KNL-NEXT:    andl $1, %esi
 ; KNL-NEXT:    andl $1, %r9d
-; KNL-NEXT:    leal (%rdx,%r9,2), %r9d
-; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    leal (%rsi,%r9,2), %r9d
+; KNL-NEXT:    kmovw %k1, %esi
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    andl $1, %r8d
 ; KNL-NEXT:    leal (%r9,%r8,4), %r9d
 ; KNL-NEXT:    kmovw %k0, %r8d
-; KNL-NEXT:    andl $1, %esi
-; KNL-NEXT:    leal (%r9,%rsi,8), %esi
+; KNL-NEXT:    andl $1, %edx
+; KNL-NEXT:    leal (%r9,%rdx,8), %edx
 ; KNL-NEXT:    andl $1, %edi
 ; KNL-NEXT:    shll $4, %edi
-; KNL-NEXT:    orl %esi, %edi
 ; KNL-NEXT:    andl $1, %ecx
 ; KNL-NEXT:    shll $5, %ecx
-; KNL-NEXT:    orl %edi, %ecx
+; KNL-NEXT:    addl %edi, %ecx
+; KNL-NEXT:    addl %edx, %ecx
 ; KNL-NEXT:    andl $1, %r10d
 ; KNL-NEXT:    shll $6, %r10d
 ; KNL-NEXT:    andl $1, %ebx
@@ -965,28 +965,28 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL-NEXT:    andl $1, %ebp
 ; KNL-NEXT:    shll $8, %ebp
 ; KNL-NEXT:    orl %ebx, %ebp
-; KNL-NEXT:    andl $1, %r14d
-; KNL-NEXT:    shll $9, %r14d
-; KNL-NEXT:    orl %ebp, %r14d
 ; KNL-NEXT:    andl $1, %r11d
-; KNL-NEXT:    shll $10, %r11d
-; KNL-NEXT:    orl %r14d, %r11d
+; KNL-NEXT:    shll $9, %r11d
+; KNL-NEXT:    orl %ebp, %r11d
 ; KNL-NEXT:    orl %ecx, %r11d
+; KNL-NEXT:    andl $1, %r14d
+; KNL-NEXT:    shll $10, %r14d
 ; KNL-NEXT:    andl $1, %r15d
 ; KNL-NEXT:    shll $11, %r15d
+; KNL-NEXT:    orl %r14d, %r15d
 ; KNL-NEXT:    andl $1, %r12d
 ; KNL-NEXT:    shll $12, %r12d
 ; KNL-NEXT:    orl %r15d, %r12d
 ; KNL-NEXT:    andl $1, %r13d
 ; KNL-NEXT:    shll $13, %r13d
 ; KNL-NEXT:    orl %r12d, %r13d
-; KNL-NEXT:    andl $1, %edx
-; KNL-NEXT:    shll $14, %edx
-; KNL-NEXT:    orl %r13d, %edx
+; KNL-NEXT:    andl $1, %esi
+; KNL-NEXT:    shll $14, %esi
+; KNL-NEXT:    orl %r13d, %esi
+; KNL-NEXT:    orl %r11d, %esi
 ; KNL-NEXT:    andl $1, %r8d
 ; KNL-NEXT:    shll $15, %r8d
-; KNL-NEXT:    orl %edx, %r8d
-; KNL-NEXT:    orl %r11d, %r8d
+; KNL-NEXT:    orl %esi, %r8d
 ; KNL-NEXT:    movw %r8w, (%rax)
 ; KNL-NEXT:    popq %rbx
 ; KNL-NEXT:    popq %r12
@@ -1223,13 +1223,13 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; SKX-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 4-byte Reload
 ; SKX-NEXT:    kandd %k1, %k0, %k0
 ; SKX-NEXT:    kshiftrd $16, %k0, %k1
-; SKX-NEXT:    kmovd %k1, %edx
+; SKX-NEXT:    kmovd %k1, %esi
 ; SKX-NEXT:    kshiftrd $1, %k0, %k1
 ; SKX-NEXT:    kmovd %k1, %r9d
 ; SKX-NEXT:    kshiftrd $2, %k0, %k1
 ; SKX-NEXT:    kmovd %k1, %r8d
 ; SKX-NEXT:    kshiftrd $3, %k0, %k1
-; SKX-NEXT:    kmovd %k1, %esi
+; SKX-NEXT:    kmovd %k1, %edx
 ; SKX-NEXT:    kshiftrd $4, %k0, %k1
 ; SKX-NEXT:    kmovd %k1, %edi
 ; SKX-NEXT:    kshiftrd $5, %k0, %k1
@@ -1241,9 +1241,9 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; SKX-NEXT:    kshiftrd $8, %k0, %k1
 ; SKX-NEXT:    kmovd %k1, %ebp
 ; SKX-NEXT:    kshiftrd $9, %k0, %k1
-; SKX-NEXT:    kmovd %k1, %r14d
-; SKX-NEXT:    kshiftrd $10, %k0, %k1
 ; SKX-NEXT:    kmovd %k1, %r11d
+; SKX-NEXT:    kshiftrd $10, %k0, %k1
+; SKX-NEXT:    kmovd %k1, %r14d
 ; SKX-NEXT:    kshiftrd $11, %k0, %k1
 ; SKX-NEXT:    kmovd %k1, %r15d
 ; SKX-NEXT:    kshiftrd $12, %k0, %k1
@@ -1251,25 +1251,25 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; SKX-NEXT:    kshiftrd $13, %k0, %k1
 ; SKX-NEXT:    kmovd %k1, %r13d
 ; SKX-NEXT:    kshiftrd $14, %k0, %k1
-; SKX-NEXT:    andl $1, %edx
-; SKX-NEXT:    movb %dl, 2(%rax)
-; SKX-NEXT:    kmovd %k0, %edx
-; SKX-NEXT:    andl $1, %edx
+; SKX-NEXT:    andl $1, %esi
+; SKX-NEXT:    movb %sil, 2(%rax)
+; SKX-NEXT:    kmovd %k0, %esi
+; SKX-NEXT:    andl $1, %esi
 ; SKX-NEXT:    andl $1, %r9d
-; SKX-NEXT:    leal (%rdx,%r9,2), %r9d
-; SKX-NEXT:    kmovd %k1, %edx
+; SKX-NEXT:    leal (%rsi,%r9,2), %r9d
+; SKX-NEXT:    kmovd %k1, %esi
 ; SKX-NEXT:    kshiftrd $15, %k0, %k0
 ; SKX-NEXT:    andl $1, %r8d
 ; SKX-NEXT:    leal (%r9,%r8,4), %r9d
 ; SKX-NEXT:    kmovd %k0, %r8d
-; SKX-NEXT:    andl $1, %esi
-; SKX-NEXT:    leal (%r9,%rsi,8), %esi
+; SKX-NEXT:    andl $1, %edx
+; SKX-NEXT:    leal (%r9,%rdx,8), %edx
 ; SKX-NEXT:    andl $1, %edi
 ; SKX-NEXT:    shll $4, %edi
-; SKX-NEXT:    orl %esi, %edi
 ; SKX-NEXT:    andl $1, %ecx
 ; SKX-NEXT:    shll $5, %ecx
-; SKX-NEXT:    orl %edi, %ecx
+; SKX-NEXT:    addl %edi, %ecx
+; SKX-NEXT:    addl %edx, %ecx
 ; SKX-NEXT:    andl $1, %r10d
 ; SKX-NEXT:    shll $6, %r10d
 ; SKX-NEXT:    andl $1, %ebx
@@ -1278,28 +1278,28 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; SKX-NEXT:    andl $1, %ebp
 ; SKX-NEXT:    shll $8, %ebp
 ; SKX-NEXT:    orl %ebx, %ebp
-; SKX-NEXT:    andl $1, %r14d
-; SKX-NEXT:    shll $9, %r14d
-; SKX-NEXT:    orl %ebp, %r14d
 ; SKX-NEXT:    andl $1, %r11d
-; SKX-NEXT:    shll $10, %r11d
-; SKX-NEXT:    orl %r14d, %r11d
+; SKX-NEXT:    shll $9, %r11d
+; SKX-NEXT:    orl %ebp, %r11d
 ; SKX-NEXT:    orl %ecx, %r11d
+; SKX-NEXT:    andl $1, %r14d
+; SKX-NEXT:    shll $10, %r14d
 ; SKX-NEXT:    andl $1, %r15d
 ; SKX-NEXT:    shll $11, %r15d
+; SKX-NEXT:    orl %r14d, %r15d
 ; SKX-NEXT:    andl $1, %r12d
 ; SKX-NEXT:    shll $12, %r12d
 ; SKX-NEXT:    orl %r15d, %r12d
 ; SKX-NEXT:    andl $1, %r13d
 ; SKX-NEXT:    shll $13, %r13d
 ; SKX-NEXT:    orl %r12d, %r13d
-; SKX-NEXT:    andl $1, %edx
-; SKX-NEXT:    shll $14, %edx
-; SKX-NEXT:    orl %r13d, %edx
+; SKX-NEXT:    andl $1, %esi
+; SKX-NEXT:    shll $14, %esi
+; SKX-NEXT:    orl %r13d, %esi
+; SKX-NEXT:    orl %r11d, %esi
 ; SKX-NEXT:    andl $1, %r8d
 ; SKX-NEXT:    shll $15, %r8d
-; SKX-NEXT:    orl %edx, %r8d
-; SKX-NEXT:    orl %r11d, %r8d
+; SKX-NEXT:    orl %esi, %r8d
 ; SKX-NEXT:    movw %r8w, (%rax)
 ; SKX-NEXT:    popq %rbx
 ; SKX-NEXT:    popq %r12
@@ -1556,9 +1556,9 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL_X32-NEXT:    kshiftrw $1, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %ebp
 ; KNL_X32-NEXT:    kshiftrw $2, %k0, %k1
-; KNL_X32-NEXT:    kmovw %k1, %esi
-; KNL_X32-NEXT:    kshiftrw $3, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %edi
+; KNL_X32-NEXT:    kshiftrw $3, %k0, %k1
+; KNL_X32-NEXT:    kmovw %k1, %esi
 ; KNL_X32-NEXT:    kshiftrw $4, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %edx
 ; KNL_X32-NEXT:    kshiftrw $5, %k0, %k1
@@ -1569,67 +1569,67 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL_X32-NEXT:    kmovw %k0, %ebx
 ; KNL_X32-NEXT:    andl $1, %ebx
 ; KNL_X32-NEXT:    andl $1, %ebp
-; KNL_X32-NEXT:    leal (%ebx,%ebp,2), %ebx
-; KNL_X32-NEXT:    kmovw %k1, %ebp
+; KNL_X32-NEXT:    leal (%ebx,%ebp,2), %ebp
+; KNL_X32-NEXT:    kmovw %k1, %ebx
 ; KNL_X32-NEXT:    kshiftrw $7, %k0, %k1
-; KNL_X32-NEXT:    andl $1, %esi
-; KNL_X32-NEXT:    leal (%ebx,%esi,4), %ebx
-; KNL_X32-NEXT:    kmovw %k1, %esi
-; KNL_X32-NEXT:    kshiftrw $8, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %edi
-; KNL_X32-NEXT:    leal (%ebx,%edi,8), %ebx
+; KNL_X32-NEXT:    leal (%ebp,%edi,4), %ebp
 ; KNL_X32-NEXT:    kmovw %k1, %edi
+; KNL_X32-NEXT:    kshiftrw $8, %k0, %k1
+; KNL_X32-NEXT:    andl $1, %esi
+; KNL_X32-NEXT:    leal (%ebp,%esi,8), %ebp
+; KNL_X32-NEXT:    kmovw %k1, %esi
 ; KNL_X32-NEXT:    kshiftrw $9, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %edx
 ; KNL_X32-NEXT:    shll $4, %edx
-; KNL_X32-NEXT:    orl %ebx, %edx
-; KNL_X32-NEXT:    kmovw %k1, %ebx
-; KNL_X32-NEXT:    kshiftrw $10, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %ecx
 ; KNL_X32-NEXT:    shll $5, %ecx
-; KNL_X32-NEXT:    orl %edx, %ecx
+; KNL_X32-NEXT:    addl %edx, %ecx
 ; KNL_X32-NEXT:    kmovw %k1, %edx
-; KNL_X32-NEXT:    kshiftrw $11, %k0, %k1
-; KNL_X32-NEXT:    andl $1, %ebp
-; KNL_X32-NEXT:    shll $6, %ebp
-; KNL_X32-NEXT:    andl $1, %esi
-; KNL_X32-NEXT:    shll $7, %esi
-; KNL_X32-NEXT:    orl %ebp, %esi
+; KNL_X32-NEXT:    kshiftrw $10, %k0, %k1
+; KNL_X32-NEXT:    addl %ebp, %ecx
 ; KNL_X32-NEXT:    kmovw %k1, %ebp
-; KNL_X32-NEXT:    kshiftrw $12, %k0, %k1
-; KNL_X32-NEXT:    andl $1, %edi
-; KNL_X32-NEXT:    shll $8, %edi
-; KNL_X32-NEXT:    orl %esi, %edi
-; KNL_X32-NEXT:    kmovw %k1, %esi
-; KNL_X32-NEXT:    kshiftrw $13, %k0, %k1
+; KNL_X32-NEXT:    kshiftrw $11, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %ebx
-; KNL_X32-NEXT:    shll $9, %ebx
-; KNL_X32-NEXT:    orl %edi, %ebx
+; KNL_X32-NEXT:    shll $6, %ebx
+; KNL_X32-NEXT:    andl $1, %edi
+; KNL_X32-NEXT:    shll $7, %edi
+; KNL_X32-NEXT:    orl %ebx, %edi
+; KNL_X32-NEXT:    kmovw %k1, %ebx
+; KNL_X32-NEXT:    kshiftrw $12, %k0, %k1
+; KNL_X32-NEXT:    andl $1, %esi
+; KNL_X32-NEXT:    shll $8, %esi
+; KNL_X32-NEXT:    orl %edi, %esi
 ; KNL_X32-NEXT:    kmovw %k1, %edi
-; KNL_X32-NEXT:    kshiftrw $14, %k0, %k1
+; KNL_X32-NEXT:    kshiftrw $13, %k0, %k1
 ; KNL_X32-NEXT:    andl $1, %edx
-; KNL_X32-NEXT:    shll $10, %edx
-; KNL_X32-NEXT:    orl %ebx, %edx
-; KNL_X32-NEXT:    kmovw %k1, %ebx
-; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
+; KNL_X32-NEXT:    shll $9, %edx
+; KNL_X32-NEXT:    orl %esi, %edx
+; KNL_X32-NEXT:    kmovw %k1, %esi
+; KNL_X32-NEXT:    kshiftrw $14, %k0, %k1
 ; KNL_X32-NEXT:    orl %ecx, %edx
-; KNL_X32-NEXT:    kmovw %k0, %ecx
+; KNL_X32-NEXT:    kmovw %k1, %ecx
+; KNL_X32-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL_X32-NEXT:    andl $1, %ebp
-; KNL_X32-NEXT:    shll $11, %ebp
-; KNL_X32-NEXT:    andl $1, %esi
-; KNL_X32-NEXT:    shll $12, %esi
-; KNL_X32-NEXT:    orl %ebp, %esi
-; KNL_X32-NEXT:    andl $1, %edi
-; KNL_X32-NEXT:    shll $13, %edi
-; KNL_X32-NEXT:    orl %esi, %edi
+; KNL_X32-NEXT:    shll $10, %ebp
 ; KNL_X32-NEXT:    andl $1, %ebx
-; KNL_X32-NEXT:    shll $14, %ebx
-; KNL_X32-NEXT:    orl %edi, %ebx
+; KNL_X32-NEXT:    shll $11, %ebx
+; KNL_X...
[truncated]

``````````
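
For context on the new `X86InstrCompiler.td` patterns: once a disjoint `or` is converted to `add`, a merge of a masked value with a zero-extended narrow load reaches instruction selection as an `add`, so the existing `or`-based `INSERT_SUBREG` patterns need `add` twins. A hedged IR sketch of such an input (illustrative only, not taken from the patch's tests):

```llvm
; The `and` clears the low 8 bits, so the `or` is disjoint; after this
; patch it reaches isel as `add` and still selects to a byte load into
; the low subregister (`MOV8rm` + `INSERT_SUBREG`).
define i64 @insert_low_byte(i64 %x, ptr %p) {
  %hi = and i64 %x, -256
  %b = load i8, ptr %p
  %lo = zext i8 %b to i64
  %r = or disjoint i64 %hi, %lo
  ret i64 %r
}
```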

</details>


https://github.com/llvm/llvm-project/pull/83691

