[llvm] cdb7b80 - [DAGCombiner] fold or (xor x, y), ? patterns

via llvm-commits <llvm-commits at lists.llvm.org>
Tue Nov 22 17:28:16 PST 2022


Author: chenglin.bi
Date: 2022-11-23T09:28:10+08:00
New Revision: cdb7b804f665f4d250f6bad3941bcea68024e0a7

URL: https://github.com/llvm/llvm-project/commit/cdb7b804f665f4d250f6bad3941bcea68024e0a7
DIFF: https://github.com/llvm/llvm-project/commit/cdb7b804f665f4d250f6bad3941bcea68024e0a7.diff

LOG: [DAGCombiner] fold or (xor x, y), ? patterns

or (xor x, y), x --> or x, y
or (xor x, y), y --> or x, y
or (xor x, y), (and x, y) --> or x, y
or (xor x, y), (or x, y) --> or x, y
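
These are all instances of the same identity: once x ^ y is in hand,
OR'ing it with x, with y, with x & y, or with x | y always yields
x | y. As a quick standalone sanity check (not part of this commit),
the four folds can be verified exhaustively over 8-bit values:

#include <cstdio>

int main() {
  for (unsigned x = 0; x < 256; ++x) {
    for (unsigned y = 0; y < 256; ++y) {
      unsigned o = x | y;
      // The four folds above, checked literally.
      if (((x ^ y) | x) != o || ((x ^ y) | y) != o ||
          ((x ^ y) | (x & y)) != o || ((x ^ y) | (x | y)) != o) {
        std::printf("mismatch at x=%u y=%u\n", x, y);
        return 1;
      }
    }
  }
  std::puts("all four folds hold for every 8-bit pair");
  return 0;
}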

Reviewed By: foad

Differential Revision: https://reviews.llvm.org/D138401

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/test/CodeGen/AMDGPU/fshl.ll
    llvm/test/CodeGen/AMDGPU/xor3-i1-const.ll
    llvm/test/CodeGen/X86/avx512-mask-op.ll
    llvm/test/CodeGen/X86/avx512bw-mask-op.ll
    llvm/test/CodeGen/X86/avx512dq-mask-op.ll
    llvm/test/CodeGen/X86/combine-sra-load.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index cae40db5896b8..85a92d51d3b37 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7004,6 +7004,24 @@ static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1,
       return DAG.getNode(ISD::OR, SDLoc(N), VT, N01, N1);
   }
 
+  if (N0.getOpcode() == ISD::XOR) {
+    // fold or (xor x, y), x --> or x, y
+    //      or (xor x, y), (x and/or y) --> or x, y
+    SDValue N00 = N0.getOperand(0);
+    SDValue N01 = N0.getOperand(1);
+    if (N00 == N1)
+      return DAG.getNode(ISD::OR, SDLoc(N), VT, N01, N1);
+    if (N01 == N1)
+      return DAG.getNode(ISD::OR, SDLoc(N), VT, N00, N1);
+
+    if (N1.getOpcode() == ISD::AND || N1.getOpcode() == ISD::OR) {
+      SDValue N10 = N1.getOperand(0);
+      SDValue N11 = N1.getOperand(1);
+      if ((N00 == N10 && N01 == N11) || (N00 == N11 && N01 == N10))
+        return DAG.getNode(ISD::OR, SDLoc(N), VT, N00, N01);
+    }
+  }
+
   if (SDValue R = foldLogicOfShifts(N, N0, N1, DAG))
     return R;
 

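Note that visitOR tries this commutative helper with the operands in
both orders, so only the "or (xor x, y), ?" orientation has to be
matched above; the swapped "or ?, (xor x, y)" forms are covered by the
second call. The call site has roughly this shape (paraphrased for
context, not part of this diff):

  // Try the commutative OR folds with both operand orders so each
  // pattern only needs to be written one way.
  if (SDValue Combined = visitORCommutative(DAG, N0, N1, N))
    return Combined;
  if (SDValue Combined = visitORCommutative(DAG, N1, N0, N))
    return Combined;
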
diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index 61a065f19ef3e..ed94e9ac7f3c6 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -704,11 +704,10 @@ define amdgpu_kernel void @orxor2or1(i32 addrspace(1)* %in, i32 %a, i32 %b) {
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_lshl_b32 s0, s2, 7
-; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_xor_b32 s1, s0, s3
-; SI-NEXT:    s_or_b32 s0, s0, s1
+; SI-NEXT:    s_or_b32 s0, s3, s0
 ; SI-NEXT:    s_cmp_eq_u32 s0, 0
 ; SI-NEXT:    s_cselect_b32 s0, s2, s3
+; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -718,8 +717,7 @@ define amdgpu_kernel void @orxor2or1(i32 addrspace(1)* %in, i32 %a, i32 %b) {
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshl_b32 s4, s2, 7
-; VI-NEXT:    s_xor_b32 s5, s4, s3
-; VI-NEXT:    s_or_b32 s4, s4, s5
+; VI-NEXT:    s_or_b32 s4, s3, s4
 ; VI-NEXT:    s_cmp_eq_u32 s4, 0
 ; VI-NEXT:    s_cselect_b32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -734,8 +732,7 @@ define amdgpu_kernel void @orxor2or1(i32 addrspace(1)* %in, i32 %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s4, s2, 7
-; GFX9-NEXT:    s_xor_b32 s5, s4, s3
-; GFX9-NEXT:    s_or_b32 s4, s4, s5
+; GFX9-NEXT:    s_or_b32 s4, s3, s4
 ; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX9-NEXT:    s_cselect_b32 s2, s2, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
@@ -744,15 +741,14 @@ define amdgpu_kernel void @orxor2or1(i32 addrspace(1)* %in, i32 %a, i32 %b) {
 ;
 ; R600-LABEL: orxor2or1:
 ; R600:       ; %bb.0:
-; R600-NEXT:    ALU 6, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
 ; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
 ; R600-NEXT:    CF_END
 ; R600-NEXT:    PAD
 ; R600-NEXT:    ALU clause starting at 4:
 ; R600-NEXT:     LSHL * T0.W, KC0[2].Z, literal.x,
 ; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
-; R600-NEXT:     XOR_INT * T1.W, PV.W, KC0[2].W,
-; R600-NEXT:     OR_INT * T0.W, T0.W, PV.W,
+; R600-NEXT:     OR_INT * T0.W, KC0[2].W, PV.W,
 ; R600-NEXT:     CNDE_INT T0.X, PV.W, KC0[2].Z, KC0[2].W,
 ; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
@@ -763,8 +759,7 @@ define amdgpu_kernel void @orxor2or1(i32 addrspace(1)* %in, i32 %a, i32 %b) {
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_lshl_b32 s4, s2, 7
-; GFX10-NEXT:    s_xor_b32 s5, s4, s3
-; GFX10-NEXT:    s_or_b32 s4, s4, s5
+; GFX10-NEXT:    s_or_b32 s4, s3, s4
 ; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX10-NEXT:    s_cselect_b32 s2, s2, s3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s2
@@ -777,11 +772,10 @@ define amdgpu_kernel void @orxor2or1(i32 addrspace(1)* %in, i32 %a, i32 %b) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_lshl_b32 s4, s2, 7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s5, s4, s3
-; GFX11-NEXT:    s_or_b32 s4, s4, s5
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s4, s3, s4
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
 ; GFX11-NEXT:    s_cselect_b32 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)

diff --git a/llvm/test/CodeGen/AMDGPU/xor3-i1-const.ll b/llvm/test/CodeGen/AMDGPU/xor3-i1-const.ll
index b01412ba2e666..3b0592388a747 100644
--- a/llvm/test/CodeGen/AMDGPU/xor3-i1-const.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor3-i1-const.ll
@@ -5,17 +5,11 @@
 define amdgpu_ps float @xor3_i1_const(float inreg %arg1, i32 inreg %arg2) {
 ; GCN-LABEL: xor3_i1_const:
 ; GCN:       ; %bb.0: ; %main_body
-; GCN-NEXT:    s_mov_b32 m0, s1
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x42640000
-; GCN-NEXT:    v_cmp_nlt_f32_e64 s[2:3], s0, 0
-; GCN-NEXT:    v_interp_p2_f32 v0, v0, attr0.x
-; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, s0, v1
-; GCN-NEXT:    v_cmp_gt_f32_e64 s[0:1], 0, v0
-; GCN-NEXT:    s_or_b64 s[2:3], s[2:3], vcc
-; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
-; GCN-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x42640000
+; GCN-NEXT:    v_cmp_lt_f32_e64 s[2:3], s0, 0
+; GCN-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
+; GCN-NEXT:    s_and_b64 s[0:1], s[2:3], vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[0:1]
 ; GCN-NEXT:    ; return to shader part epilog
 main_body:
   %tmp26 = fcmp nsz olt float %arg1, 0.000000e+00

diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index f9d6ac8e9db12..81cfa94d8b1f1 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -151,21 +151,14 @@ define i16 @mand16(i16 %x, i16 %y) {
 ; CHECK-LABEL: mand16:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andl %esi, %eax
-; CHECK-NEXT:    xorl %esi, %edi
-; CHECK-NEXT:    orl %edi, %eax
+; CHECK-NEXT:    orl %esi, %eax
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
 ;
 ; X86-LABEL: mand16:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    andl %ecx, %edx
-; X86-NEXT:    xorl %ecx, %eax
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    ## kill: def $ax killed $ax killed $eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orw {{[0-9]+}}(%esp), %ax
 ; X86-NEXT:    retl
   %ma = bitcast i16 %x to <16 x i1>
   %mb = bitcast i16 %y to <16 x i1>
@@ -181,9 +174,7 @@ define i16 @mand16_mem(ptr %x, ptr %y) {
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    kmovw (%rdi), %k0
 ; KNL-NEXT:    kmovw (%rsi), %k1
-; KNL-NEXT:    kandw %k1, %k0, %k2
-; KNL-NEXT:    kxorw %k1, %k0, %k0
-; KNL-NEXT:    korw %k0, %k2, %k0
+; KNL-NEXT:    korw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; KNL-NEXT:    retq
@@ -192,9 +183,7 @@ define i16 @mand16_mem(ptr %x, ptr %y) {
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    kmovw (%rdi), %k0
 ; SKX-NEXT:    kmovw (%rsi), %k1
-; SKX-NEXT:    kandw %k1, %k0, %k2
-; SKX-NEXT:    kxorw %k1, %k0, %k0
-; SKX-NEXT:    korw %k0, %k2, %k0
+; SKX-NEXT:    korw %k1, %k0, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; SKX-NEXT:    retq
@@ -203,9 +192,7 @@ define i16 @mand16_mem(ptr %x, ptr %y) {
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    kmovw (%rdi), %k0
 ; AVX512BW-NEXT:    kmovw (%rsi), %k1
-; AVX512BW-NEXT:    kandw %k1, %k0, %k2
-; AVX512BW-NEXT:    kxorw %k1, %k0, %k0
-; AVX512BW-NEXT:    korw %k0, %k2, %k0
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; AVX512BW-NEXT:    retq
@@ -214,9 +201,7 @@ define i16 @mand16_mem(ptr %x, ptr %y) {
 ; AVX512DQ:       ## %bb.0:
 ; AVX512DQ-NEXT:    kmovw (%rdi), %k0
 ; AVX512DQ-NEXT:    kmovw (%rsi), %k1
-; AVX512DQ-NEXT:    kandw %k1, %k0, %k2
-; AVX512DQ-NEXT:    kxorw %k1, %k0, %k0
-; AVX512DQ-NEXT:    korw %k0, %k2, %k0
+; AVX512DQ-NEXT:    korw %k1, %k0, %k0
 ; AVX512DQ-NEXT:    kmovw %k0, %eax
 ; AVX512DQ-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; AVX512DQ-NEXT:    retq
@@ -227,9 +212,7 @@ define i16 @mand16_mem(ptr %x, ptr %y) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    kmovw (%ecx), %k0
 ; X86-NEXT:    kmovw (%eax), %k1
-; X86-NEXT:    kandw %k1, %k0, %k2
-; X86-NEXT:    kxorw %k1, %k0, %k0
-; X86-NEXT:    korw %k0, %k2, %k0
+; X86-NEXT:    korw %k1, %k0, %k0
 ; X86-NEXT:    kmovd %k0, %eax
 ; X86-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl

diff --git a/llvm/test/CodeGen/X86/avx512bw-mask-op.ll b/llvm/test/CodeGen/X86/avx512bw-mask-op.ll
index fda40ee095897..2a26264483613 100644
--- a/llvm/test/CodeGen/X86/avx512bw-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-mask-op.ll
@@ -79,9 +79,7 @@ define i32 @mand32(i32 %x, i32 %y) {
 ; CHECK-LABEL: mand32:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andl %esi, %eax
-; CHECK-NEXT:    xorl %esi, %edi
-; CHECK-NEXT:    orl %edi, %eax
+; CHECK-NEXT:    orl %esi, %eax
 ; CHECK-NEXT:    retq
   %ma = bitcast i32 %x to <32 x i1>
   %mb = bitcast i32 %y to <32 x i1>
@@ -97,9 +95,7 @@ define i32 @mand32_mem(ptr %x, ptr %y) {
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovd (%rdi), %k0
 ; CHECK-NEXT:    kmovd (%rsi), %k1
-; CHECK-NEXT:    kandd %k1, %k0, %k2
-; CHECK-NEXT:    kxord %k1, %k0, %k0
-; CHECK-NEXT:    kord %k0, %k2, %k0
+; CHECK-NEXT:    kord %k1, %k0, %k0
 ; CHECK-NEXT:    kmovd %k0, %eax
 ; CHECK-NEXT:    retq
   %ma = load <32 x i1>, ptr %x
@@ -115,9 +111,7 @@ define i64 @mand64(i64 %x, i64 %y) {
 ; CHECK-LABEL: mand64:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    andq %rsi, %rax
-; CHECK-NEXT:    xorq %rsi, %rdi
-; CHECK-NEXT:    orq %rdi, %rax
+; CHECK-NEXT:    orq %rsi, %rax
 ; CHECK-NEXT:    retq
   %ma = bitcast i64 %x to <64 x i1>
   %mb = bitcast i64 %y to <64 x i1>
@@ -133,9 +127,7 @@ define i64 @mand64_mem(ptr %x, ptr %y) {
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovq (%rdi), %k0
 ; CHECK-NEXT:    kmovq (%rsi), %k1
-; CHECK-NEXT:    kandq %k1, %k0, %k2
-; CHECK-NEXT:    kxorq %k1, %k0, %k0
-; CHECK-NEXT:    korq %k0, %k2, %k0
+; CHECK-NEXT:    korq %k1, %k0, %k0
 ; CHECK-NEXT:    kmovq %k0, %rax
 ; CHECK-NEXT:    retq
   %ma = load <64 x i1>, ptr %x

diff --git a/llvm/test/CodeGen/X86/avx512dq-mask-op.ll b/llvm/test/CodeGen/X86/avx512dq-mask-op.ll
index d33751644a9e1..041a86aff53fb 100644
--- a/llvm/test/CodeGen/X86/avx512dq-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512dq-mask-op.ll
@@ -33,9 +33,8 @@ define i8 @mand8(i8 %x, i8 %y) {
 ; CHECK-LABEL: mand8:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andb %sil, %al
-; CHECK-NEXT:    xorb %sil, %dil
-; CHECK-NEXT:    orb %dil, %al
+; CHECK-NEXT:    orl %esi, %eax
+; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %ma = bitcast i8 %x to <8 x i1>
   %mb = bitcast i8 %y to <8 x i1>
@@ -51,9 +50,7 @@ define i8 @mand8_mem(ptr %x, ptr %y) {
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovb (%rdi), %k0
 ; CHECK-NEXT:    kmovb (%rsi), %k1
-; CHECK-NEXT:    kandb %k1, %k0, %k2
-; CHECK-NEXT:    kxorb %k1, %k0, %k0
-; CHECK-NEXT:    korb %k0, %k2, %k0
+; CHECK-NEXT:    korb %k1, %k0, %k0
 ; CHECK-NEXT:    kmovd %k0, %eax
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq

diff --git a/llvm/test/CodeGen/X86/combine-sra-load.ll b/llvm/test/CodeGen/X86/combine-sra-load.ll
index 4606f38332594..aa175f07154e2 100644
--- a/llvm/test/CodeGen/X86/combine-sra-load.ll
+++ b/llvm/test/CodeGen/X86/combine-sra-load.ll
@@ -93,11 +93,11 @@ define i32 @sra_to_sextload_multiple_sra_uses(ptr %p) {
 ; CHECK-NEXT:    movswl 2(%rdi), %ecx
 ; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    xorl $6, %eax
-; CHECK-NEXT:    orl %ecx, %eax
+; CHECK-NEXT:    imull %ecx, %eax
 ; CHECK-NEXT:    retq
   %load = load i32, ptr %p
   %shift = ashr i32 %load, 16
   %use1 = xor i32 %shift, 6
-  %use2 = or i32 %shift, %use1
+  %use2 = mul i32 %shift, %use1
   ret i32 %use2
 }

