[llvm] r373138 - [X86] Add broadcast load unfolding support for VPTESTMD/Q and VPTESTNMD/Q.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 27 18:56:36 PDT 2019


Author: ctopper
Date: Fri Sep 27 18:56:36 2019
New Revision: 373138

URL: http://llvm.org/viewvc/llvm-project?rev=373138&view=rev
Log:
[X86] Add broadcast load unfolding support for VPTESTMD/Q and VPTESTNMD/Q.

Modified:
    llvm/trunk/lib/Target/X86/X86InstrFoldTables.cpp
    llvm/trunk/test/CodeGen/X86/avx512-broadcast-unfold.ll

Modified: llvm/trunk/lib/Target/X86/X86InstrFoldTables.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrFoldTables.cpp?rev=373138&r1=373137&r2=373138&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrFoldTables.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86InstrFoldTables.cpp Fri Sep 27 18:56:36 2019
@@ -5374,6 +5374,18 @@ static const X86MemoryFoldTableEntry Bro
   { X86::VPORQZ128rr,    X86::VPORQZ128rmb,    TB_BCAST_Q },
   { X86::VPORQZ256rr,    X86::VPORQZ256rmb,    TB_BCAST_Q },
   { X86::VPORQZrr,       X86::VPORQZrmb,       TB_BCAST_Q },
+  { X86::VPTESTMDZ128rr, X86::VPTESTMDZ128rmb, TB_BCAST_D },
+  { X86::VPTESTMDZ256rr, X86::VPTESTMDZ256rmb, TB_BCAST_D },
+  { X86::VPTESTMDZrr,    X86::VPTESTMDZrmb,    TB_BCAST_D },
+  { X86::VPTESTMQZ128rr, X86::VPTESTMQZ128rmb, TB_BCAST_Q },
+  { X86::VPTESTMQZ256rr, X86::VPTESTMQZ256rmb, TB_BCAST_Q },
+  { X86::VPTESTMQZrr,    X86::VPTESTMQZrmb,    TB_BCAST_Q },
+  { X86::VPTESTNMDZ128rr,X86::VPTESTNMDZ128rmb,TB_BCAST_D },
+  { X86::VPTESTNMDZ256rr,X86::VPTESTNMDZ256rmb,TB_BCAST_D },
+  { X86::VPTESTNMDZrr,   X86::VPTESTNMDZrmb,   TB_BCAST_D },
+  { X86::VPTESTNMQZ128rr,X86::VPTESTNMQZ128rmb,TB_BCAST_Q },
+  { X86::VPTESTNMQZ256rr,X86::VPTESTNMQZ256rmb,TB_BCAST_Q },
+  { X86::VPTESTNMQZrr,   X86::VPTESTNMQZrmb,   TB_BCAST_Q },
   { X86::VPXORDZ128rr,   X86::VPXORDZ128rmb,   TB_BCAST_D },
   { X86::VPXORDZ256rr,   X86::VPXORDZ256rmb,   TB_BCAST_D },
   { X86::VPXORDZrr,      X86::VPXORDZrmb,      TB_BCAST_D },

Modified: llvm/trunk/test/CodeGen/X86/avx512-broadcast-unfold.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-broadcast-unfold.ll?rev=373138&r1=373137&r2=373138&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-broadcast-unfold.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-broadcast-unfold.ll Fri Sep 27 18:56:36 2019
@@ -4483,3 +4483,153 @@ define void @bcast_unfold_cmp_v8f32_refo
 12:                                               ; preds = %2
   ret void
 }
+
+define void @bcast_unfold_ptestm_v4i32(i32* %arg) { ; in-place loop: lanes where (x & 2) != 0 are replaced with 3
+; CHECK-LABEL: bcast_unfold_ptestm_v4i32:
+; CHECK:       # %bb.0: # %bb
+; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB127_1: # %bb1
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %xmm1
+; CHECK-NEXT:    vptestmd %xmm0, %xmm1, %k1
+; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1 {%k1}
+; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
+; CHECK-NEXT:    addq $16, %rax
+; CHECK-NEXT:    jne .LBB127_1
+; CHECK-NEXT:  # %bb.2: # %bb10
+; CHECK-NEXT:    retq
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb1, %bb
+  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] ; element index: 0 on entry, %tmp8 on the back edge
+  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp ; address of element %tmp of %arg
+  %tmp3 = bitcast i32* %tmp2 to <4 x i32>* ; view that address as a <4 x i32> pointer
+  %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4 ; load 4 lanes with only 4-byte alignment
+  %tmp4b = and <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2> ; mask every lane with the splat constant 2
+  %tmp5 = icmp ne <4 x i32> %tmp4b, zeroinitializer ; lane is true when the masked value is nonzero
+  %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4 ; true lanes become 3, others keep the loaded value
+  %tmp7 = bitcast i32* %tmp2 to <4 x i32>* ; same address as %tmp3, recomputed for the store
+  store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4 ; write the result back in place
+  %tmp8 = add i64 %tmp, 4 ; advance by one 4-element vector
+  %tmp9 = icmp eq i64 %tmp8, 1024 ; done once the index reaches 1024
+  br i1 %tmp9, label %bb10, label %bb1 ; exit to %bb10 when done, otherwise loop again
+
+bb10:                                             ; preds = %bb1
+  ret void
+}
+
+define void @bcast_unfold_ptestnm_v4i32(i32* %arg) { ; in-place loop: lanes where (x & 2) == 0 are replaced with 3
+; CHECK-LABEL: bcast_unfold_ptestnm_v4i32:
+; CHECK:       # %bb.0: # %bb
+; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB128_1: # %bb1
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %xmm1
+; CHECK-NEXT:    vptestnmd %xmm0, %xmm1, %k1
+; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1 {%k1}
+; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
+; CHECK-NEXT:    addq $16, %rax
+; CHECK-NEXT:    jne .LBB128_1
+; CHECK-NEXT:  # %bb.2: # %bb10
+; CHECK-NEXT:    retq
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb1, %bb
+  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] ; element index: 0 on entry, %tmp8 on the back edge
+  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp ; address of element %tmp of %arg
+  %tmp3 = bitcast i32* %tmp2 to <4 x i32>* ; view that address as a <4 x i32> pointer
+  %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4 ; load 4 lanes with only 4-byte alignment
+  %tmp4b = and <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2> ; mask every lane with the splat constant 2
+  %tmp5 = icmp eq <4 x i32> %tmp4b, zeroinitializer ; lane is true when the masked value is zero
+  %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4 ; true lanes become 3, others keep the loaded value
+  %tmp7 = bitcast i32* %tmp2 to <4 x i32>* ; same address as %tmp3, recomputed for the store
+  store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4 ; write the result back in place
+  %tmp8 = add i64 %tmp, 4 ; advance by one 4-element vector
+  %tmp9 = icmp eq i64 %tmp8, 1024 ; done once the index reaches 1024
+  br i1 %tmp9, label %bb10, label %bb1 ; exit to %bb10 when done, otherwise loop again
+
+bb10:                                             ; preds = %bb1
+  ret void
+}
+
+define void @bcast_unfold_ptestm_v4i64(i64* %arg) { ; in-place loop: lanes where (x & 2) != 0 are replaced with 3
+; CHECK-LABEL: bcast_unfold_ptestm_v4i64:
+; CHECK:       # %bb.0: # %bb
+; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB129_1: # %bb1
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %ymm1
+; CHECK-NEXT:    vptestmq %ymm0, %ymm1, %k1
+; CHECK-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
+; CHECK-NEXT:    addq $32, %rax
+; CHECK-NEXT:    jne .LBB129_1
+; CHECK-NEXT:  # %bb.2: # %bb10
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb1, %bb
+  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] ; element index: 0 on entry, %tmp8 on the back edge
+  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp ; address of element %tmp of %arg
+  %tmp3 = bitcast i64* %tmp2 to <4 x i64>* ; view that address as a <4 x i64> pointer
+  %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8 ; load 4 lanes with only 8-byte alignment
+  %tmp4b = and <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2> ; mask every lane with the splat constant 2
+  %tmp5 = icmp ne <4 x i64> %tmp4b, zeroinitializer ; lane is true when the masked value is nonzero
+  %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4 ; true lanes become 3, others keep the loaded value
+  %tmp7 = bitcast i64* %tmp2 to <4 x i64>* ; same address as %tmp3, recomputed for the store
+  store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8 ; write the result back in place
+  %tmp8 = add i64 %tmp, 4 ; advance by one 4-element vector
+  %tmp9 = icmp eq i64 %tmp8, 1024 ; done once the index reaches 1024
+  br i1 %tmp9, label %bb10, label %bb1 ; exit to %bb10 when done, otherwise loop again
+
+bb10:                                             ; preds = %bb1
+  ret void
+}
+
+define void @bcast_unfold_ptestnm_v4i64(i64* %arg) { ; in-place loop: lanes where (x & 2) == 0 are replaced with 3
+; CHECK-LABEL: bcast_unfold_ptestnm_v4i64:
+; CHECK:       # %bb.0: # %bb
+; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
+; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB130_1: # %bb1
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %ymm1
+; CHECK-NEXT:    vptestnmq %ymm0, %ymm1, %k1
+; CHECK-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
+; CHECK-NEXT:    addq $32, %rax
+; CHECK-NEXT:    jne .LBB130_1
+; CHECK-NEXT:  # %bb.2: # %bb10
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb1, %bb
+  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] ; element index: 0 on entry, %tmp8 on the back edge
+  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp ; address of element %tmp of %arg
+  %tmp3 = bitcast i64* %tmp2 to <4 x i64>* ; view that address as a <4 x i64> pointer
+  %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8 ; load 4 lanes with only 8-byte alignment
+  %tmp4b = and <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2> ; mask every lane with the splat constant 2
+  %tmp5 = icmp eq <4 x i64> %tmp4b, zeroinitializer ; lane is true when the masked value is zero
+  %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4 ; true lanes become 3, others keep the loaded value
+  %tmp7 = bitcast i64* %tmp2 to <4 x i64>* ; same address as %tmp3, recomputed for the store
+  store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8 ; write the result back in place
+  %tmp8 = add i64 %tmp, 4 ; advance by one 4-element vector
+  %tmp9 = icmp eq i64 %tmp8, 1024 ; done once the index reaches 1024
+  br i1 %tmp9, label %bb10, label %bb1 ; exit to %bb10 when done, otherwise loop again
+
+bb10:                                             ; preds = %bb1
+  ret void
+}




More information about the llvm-commits mailing list