[llvm] [X86] Allow handling of i128/256/512 AND/OR/XOR bitlogic on the FPU (PR #171616)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 10 05:18:41 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
Changes
If the scalar integer sources can be transferred to the FPU for free, perform the bitlogic op as an SSE/AVX operation instead of splitting it into 64-bit scalar ops.
Uses the mayFoldIntoVector helper added in #171589.
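For illustration, a minimal sketch of the kind of IR this affects, adapted from the test patterns below (the function name is made up and this snippet is not part of the patch): when mayFoldIntoVector succeeds for both operands, an i128 XOR of two loads can lower to a single 128-bit pxor/vpxor rather than two 64-bit scalar xors.

```llvm
; Illustrative only: i128 bitlogic whose operands are vector-friendly
; (here, plain loads). With this patch the xor can be performed as one
; 128-bit vector XOR instead of being split into two xorq instructions.
define i128 @xor_i128_loads(ptr %p, ptr %q) {
  %a = load i128, ptr %p, align 16
  %b = load i128, ptr %q, align 16
  %r = xor i128 %a, %b
  ret i128 %r
}
```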
---
Patch is 69.34 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/171616.diff
6 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+29)
- (modified) llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll (+194-64)
- (modified) llvm/test/CodeGen/X86/pr166744.ll (+8-22)
- (modified) llvm/test/CodeGen/X86/ptest.ll (+137-68)
- (modified) llvm/test/CodeGen/X86/setcc-wide-types.ll (+481-562)
- (modified) llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll (+60-33)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3b3b20edbbe84..67f46c61cbeac 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1142,6 +1142,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
+ setOperationAction(ISD::AND, MVT::i128, Custom);
+ setOperationAction(ISD::OR, MVT::i128, Custom);
+ setOperationAction(ISD::XOR, MVT::i128, Custom);
+
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
@@ -1481,6 +1485,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
+ setOperationAction(ISD::AND, MVT::i256, Custom);
+ setOperationAction(ISD::OR, MVT::i256, Custom);
+ setOperationAction(ISD::XOR, MVT::i256, Custom);
+
// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
// even though v8i16 is a legal type.
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
@@ -1836,6 +1844,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.hasDQI())
setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
+ setOperationAction(ISD::AND, MVT::i512, Custom);
+ setOperationAction(ISD::OR, MVT::i512, Custom);
+ setOperationAction(ISD::XOR, MVT::i512, Custom);
+
for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
@@ -33919,6 +33931,23 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case X86ISD::CVTPS2PH:
Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
return;
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ assert((VT == MVT::i128 || VT == MVT::i256 || VT == MVT::i512) &&
+ "Unexpected VT!");
+ // See if this is free to perform on the FPU to avoid splitting.
+ MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+ if (!mayFoldIntoVector(N0, Subtarget) || !mayFoldIntoVector(N1, Subtarget))
+ return;
+ SDValue Op = DAG.getNode(Opc, dl, VecVT, DAG.getBitcast(VecVT, N0),
+ DAG.getBitcast(VecVT, N1));
+ Results.push_back(DAG.getBitcast(VT, Op));
+ return;
+ }
case ISD::CTPOP: {
assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
// If we have at most 32 active bits, then perform as i32 CTPOP.
diff --git a/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll b/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll
index 6d4be7dbe6349..d9158c4af18fa 100644
--- a/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll
+++ b/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll
@@ -621,17 +621,41 @@ define void @vec256_double(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
}
define void @vec256_i128(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
-; ALL-LABEL: vec256_i128:
-; ALL: # %bb.0:
-; ALL-NEXT: movq (%rdi), %rax
-; ALL-NEXT: movq 8(%rdi), %rcx
-; ALL-NEXT: notq %rcx
-; ALL-NEXT: notq %rax
-; ALL-NEXT: movq %rax, (%rsi)
-; ALL-NEXT: movq %rcx, 8(%rsi)
-; ALL-NEXT: movq %rcx, 24(%rsi)
-; ALL-NEXT: movq %rax, 16(%rsi)
-; ALL-NEXT: retq
+; SCALAR-LABEL: vec256_i128:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: movq (%rdi), %rax
+; SCALAR-NEXT: movq 8(%rdi), %rcx
+; SCALAR-NEXT: notq %rcx
+; SCALAR-NEXT: notq %rax
+; SCALAR-NEXT: movq %rax, (%rsi)
+; SCALAR-NEXT: movq %rcx, 8(%rsi)
+; SCALAR-NEXT: movq %rcx, 24(%rsi)
+; SCALAR-NEXT: movq %rax, 16(%rsi)
+; SCALAR-NEXT: retq
+;
+; SSE-LABEL: vec256_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: pxor (%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, (%rsi)
+; SSE-NEXT: movdqa %xmm0, 16(%rsi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: vec256_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, 16(%rsi)
+; AVX-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: vec256_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, 16(%rsi)
+; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512-NEXT: retq
%in.elt.not = load i128, ptr %in.elt.ptr, align 64
%in.elt = xor i128 %in.elt.not, -1
%out.elt0.ptr = getelementptr i128, ptr %out.vec.ptr, i64 0
@@ -1034,19 +1058,46 @@ define void @vec384_double(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
}
define void @vec384_i128(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
-; ALL-LABEL: vec384_i128:
-; ALL: # %bb.0:
-; ALL-NEXT: movq (%rdi), %rax
-; ALL-NEXT: movq 8(%rdi), %rcx
-; ALL-NEXT: notq %rcx
-; ALL-NEXT: notq %rax
-; ALL-NEXT: movq %rax, (%rsi)
-; ALL-NEXT: movq %rcx, 8(%rsi)
-; ALL-NEXT: movq %rcx, 24(%rsi)
-; ALL-NEXT: movq %rax, 16(%rsi)
-; ALL-NEXT: movq %rcx, 40(%rsi)
-; ALL-NEXT: movq %rax, 32(%rsi)
-; ALL-NEXT: retq
+; SCALAR-LABEL: vec384_i128:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: movq (%rdi), %rax
+; SCALAR-NEXT: movq 8(%rdi), %rcx
+; SCALAR-NEXT: notq %rcx
+; SCALAR-NEXT: notq %rax
+; SCALAR-NEXT: movq %rax, (%rsi)
+; SCALAR-NEXT: movq %rcx, 8(%rsi)
+; SCALAR-NEXT: movq %rcx, 24(%rsi)
+; SCALAR-NEXT: movq %rax, 16(%rsi)
+; SCALAR-NEXT: movq %rcx, 40(%rsi)
+; SCALAR-NEXT: movq %rax, 32(%rsi)
+; SCALAR-NEXT: retq
+;
+; SSE-LABEL: vec384_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: pxor (%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, (%rsi)
+; SSE-NEXT: movdqa %xmm0, 16(%rsi)
+; SSE-NEXT: movdqa %xmm0, 32(%rsi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: vec384_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rsi)
+; AVX-NEXT: vmovdqa %xmm0, 32(%rsi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: vec384_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512-NEXT: vmovdqa %xmm0, 16(%rsi)
+; AVX512-NEXT: vmovdqa %xmm0, 32(%rsi)
+; AVX512-NEXT: retq
%in.elt.not = load i128, ptr %in.elt.ptr, align 64
%in.elt = xor i128 %in.elt.not, -1
%out.elt0.ptr = getelementptr i128, ptr %out.vec.ptr, i64 0
@@ -1559,21 +1610,60 @@ define void @vec512_double(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
}
define void @vec512_i128(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
-; ALL-LABEL: vec512_i128:
-; ALL: # %bb.0:
-; ALL-NEXT: movq (%rdi), %rax
-; ALL-NEXT: movq 8(%rdi), %rcx
-; ALL-NEXT: notq %rcx
-; ALL-NEXT: notq %rax
-; ALL-NEXT: movq %rax, (%rsi)
-; ALL-NEXT: movq %rcx, 8(%rsi)
-; ALL-NEXT: movq %rcx, 24(%rsi)
-; ALL-NEXT: movq %rax, 16(%rsi)
-; ALL-NEXT: movq %rcx, 40(%rsi)
-; ALL-NEXT: movq %rax, 32(%rsi)
-; ALL-NEXT: movq %rcx, 56(%rsi)
-; ALL-NEXT: movq %rax, 48(%rsi)
-; ALL-NEXT: retq
+; SCALAR-LABEL: vec512_i128:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: movq (%rdi), %rax
+; SCALAR-NEXT: movq 8(%rdi), %rcx
+; SCALAR-NEXT: notq %rcx
+; SCALAR-NEXT: notq %rax
+; SCALAR-NEXT: movq %rax, (%rsi)
+; SCALAR-NEXT: movq %rcx, 8(%rsi)
+; SCALAR-NEXT: movq %rcx, 24(%rsi)
+; SCALAR-NEXT: movq %rax, 16(%rsi)
+; SCALAR-NEXT: movq %rcx, 40(%rsi)
+; SCALAR-NEXT: movq %rax, 32(%rsi)
+; SCALAR-NEXT: movq %rcx, 56(%rsi)
+; SCALAR-NEXT: movq %rax, 48(%rsi)
+; SCALAR-NEXT: retq
+;
+; SSE-LABEL: vec512_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: pxor (%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, (%rsi)
+; SSE-NEXT: movdqa %xmm0, 16(%rsi)
+; SSE-NEXT: movdqa %xmm0, 32(%rsi)
+; SSE-NEXT: movdqa %xmm0, 48(%rsi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vec512_i128:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps %ymm0, (%rsi)
+; AVX1-NEXT: vmovaps %ymm0, 32(%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vec512_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX2-NEXT: vmovdqa %ymm0, 32(%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vec512_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpxor (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512-NEXT: vmovdqa64 %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%in.elt.not = load i128, ptr %in.elt.ptr, align 64
%in.elt = xor i128 %in.elt.not, -1
%out.elt0.ptr = getelementptr i128, ptr %out.vec.ptr, i64 0
@@ -1588,25 +1678,71 @@ define void @vec512_i128(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
}
define void @vec512_i256(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
-; ALL-LABEL: vec512_i256:
-; ALL: # %bb.0:
-; ALL-NEXT: movq 16(%rdi), %rax
-; ALL-NEXT: movq 24(%rdi), %rcx
-; ALL-NEXT: movq (%rdi), %rdx
-; ALL-NEXT: movq 8(%rdi), %rdi
-; ALL-NEXT: notq %rdi
-; ALL-NEXT: notq %rdx
-; ALL-NEXT: notq %rcx
-; ALL-NEXT: notq %rax
-; ALL-NEXT: movq %rax, 16(%rsi)
-; ALL-NEXT: movq %rcx, 24(%rsi)
-; ALL-NEXT: movq %rdx, (%rsi)
-; ALL-NEXT: movq %rdi, 8(%rsi)
-; ALL-NEXT: movq %rax, 48(%rsi)
-; ALL-NEXT: movq %rcx, 56(%rsi)
-; ALL-NEXT: movq %rdx, 32(%rsi)
-; ALL-NEXT: movq %rdi, 40(%rsi)
-; ALL-NEXT: retq
+; SCALAR-LABEL: vec512_i256:
+; SCALAR: # %bb.0:
+; SCALAR-NEXT: movq 16(%rdi), %rax
+; SCALAR-NEXT: movq 24(%rdi), %rcx
+; SCALAR-NEXT: movq (%rdi), %rdx
+; SCALAR-NEXT: movq 8(%rdi), %rdi
+; SCALAR-NEXT: notq %rdi
+; SCALAR-NEXT: notq %rdx
+; SCALAR-NEXT: notq %rcx
+; SCALAR-NEXT: notq %rax
+; SCALAR-NEXT: movq %rax, 16(%rsi)
+; SCALAR-NEXT: movq %rcx, 24(%rsi)
+; SCALAR-NEXT: movq %rdx, (%rsi)
+; SCALAR-NEXT: movq %rdi, 8(%rsi)
+; SCALAR-NEXT: movq %rax, 48(%rsi)
+; SCALAR-NEXT: movq %rcx, 56(%rsi)
+; SCALAR-NEXT: movq %rdx, 32(%rsi)
+; SCALAR-NEXT: movq %rdi, 40(%rsi)
+; SCALAR-NEXT: retq
+;
+; SSE-LABEL: vec512_i256:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: pxor %xmm0, %xmm1
+; SSE-NEXT: pxor 16(%rdi), %xmm0
+; SSE-NEXT: movdqa %xmm0, 16(%rsi)
+; SSE-NEXT: movdqa %xmm1, (%rsi)
+; SSE-NEXT: movdqa %xmm1, 32(%rsi)
+; SSE-NEXT: movdqa %xmm0, 48(%rsi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vec512_i256:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
+; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, 16(%rsi)
+; AVX1-NEXT: vmovaps %xmm0, (%rsi)
+; AVX1-NEXT: vextractf128 $1, %ymm0, 48(%rsi)
+; AVX1-NEXT: vmovaps %xmm0, 32(%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vec512_i256:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, 16(%rsi)
+; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX2-NEXT: vextracti128 $1, %ymm0, 48(%rsi)
+; AVX2-NEXT: vmovdqa %xmm0, 32(%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vec512_i256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vpxor (%rdi), %ymm0, %ymm0
+; AVX512-NEXT: vextracti128 $1, %ymm0, 16(%rsi)
+; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512-NEXT: vextracti128 $1, %ymm0, 48(%rsi)
+; AVX512-NEXT: vmovdqa %xmm0, 32(%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%in.elt.not = load i256, ptr %in.elt.ptr, align 64
%in.elt = xor i256 %in.elt.not, -1
%out.elt0.ptr = getelementptr i256, ptr %out.vec.ptr, i64 0
@@ -1616,14 +1752,8 @@ define void @vec512_i256(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind {
ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; AVX1: {{.*}}
-; AVX2: {{.*}}
-; AVX512: {{.*}}
; AVX512BW: {{.*}}
; AVX512F: {{.*}}
-; SCALAR: {{.*}}
-; SSE: {{.*}}
; SSE2: {{.*}}
; SSE2-ONLY: {{.*}}
; SSE3: {{.*}}
diff --git a/llvm/test/CodeGen/X86/pr166744.ll b/llvm/test/CodeGen/X86/pr166744.ll
index ffdb68c7a6c01..8ecdc064e4dfb 100644
--- a/llvm/test/CodeGen/X86/pr166744.ll
+++ b/llvm/test/CodeGen/X86/pr166744.ll
@@ -14,18 +14,11 @@ define i1 @PR166744(ptr %v, i64 %idx, i1 zeroext %b) {
; POSTRA-NEXT: btrl %esi, %ecx
; POSTRA-NEXT: orl %ecx, %edx
; POSTRA-NEXT: movl %edx, (%rdi,%rax,4)
-; POSTRA-NEXT: movq 16(%rdi), %rax
-; POSTRA-NEXT: movq (%rdi), %rcx
-; POSTRA-NEXT: movq 24(%rdi), %rdx
-; POSTRA-NEXT: movq 8(%rdi), %rsi
-; POSTRA-NEXT: orq 56(%rdi), %rdx
-; POSTRA-NEXT: orq 40(%rdi), %rsi
-; POSTRA-NEXT: orq 48(%rdi), %rax
-; POSTRA-NEXT: orq 32(%rdi), %rcx
-; POSTRA-NEXT: orq %rdx, %rsi
-; POSTRA-NEXT: orq %rax, %rcx
-; POSTRA-NEXT: orq %rsi, %rcx
+; POSTRA-NEXT: vmovdqu (%rdi), %ymm0
+; POSTRA-NEXT: vpor 32(%rdi), %ymm0, %ymm0
+; POSTRA-NEXT: vptest %ymm0, %ymm0
; POSTRA-NEXT: setne %al
+; POSTRA-NEXT: vzeroupper
; POSTRA-NEXT: retq
;
; NOPOSTRA-LABEL: PR166744:
@@ -38,18 +31,11 @@ define i1 @PR166744(ptr %v, i64 %idx, i1 zeroext %b) {
; NOPOSTRA-NEXT: shlxl %eax, %edx, %eax
; NOPOSTRA-NEXT: orl %ecx, %eax
; NOPOSTRA-NEXT: movl %eax, (%rdi,%rsi)
-; NOPOSTRA-NEXT: movq 16(%rdi), %rax
-; NOPOSTRA-NEXT: movq (%rdi), %rcx
-; NOPOSTRA-NEXT: movq 8(%rdi), %rdx
-; NOPOSTRA-NEXT: movq 24(%rdi), %rsi
-; NOPOSTRA-NEXT: orq 56(%rdi), %rsi
-; NOPOSTRA-NEXT: orq 40(%rdi), %rdx
-; NOPOSTRA-NEXT: orq 48(%rdi), %rax
-; NOPOSTRA-NEXT: orq 32(%rdi), %rcx
-; NOPOSTRA-NEXT: orq %rsi, %rdx
-; NOPOSTRA-NEXT: orq %rax, %rcx
-; NOPOSTRA-NEXT: orq %rdx, %rcx
+; NOPOSTRA-NEXT: vmovdqu (%rdi), %ymm0
+; NOPOSTRA-NEXT: vpor 32(%rdi), %ymm0, %ymm0
+; NOPOSTRA-NEXT: vptest %ymm0, %ymm0
; NOPOSTRA-NEXT: setne %al
+; NOPOSTRA-NEXT: vzeroupper
; NOPOSTRA-NEXT: retq
%rem = and i64 %idx, 511
%sh_prom = zext nneg i64 %rem to i512
diff --git a/llvm/test/CodeGen/X86/ptest.ll b/llvm/test/CodeGen/X86/ptest.ll
index 6e43b897caef1..166b7abc9e053 100644
--- a/llvm/test/CodeGen/X86/ptest.ll
+++ b/llvm/test/CodeGen/X86/ptest.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2,-avx | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1,-avx | FileCheck %s --check-prefixes=CHECK,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2,-avx | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1,-avx | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,-avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
@@ -433,15 +433,23 @@ define i1 @vecmp_load64x4(ptr %p0) {
}
define i1 @vecmp_load128x2(ptr %p0) {
-; CHECK-LABEL: vecmp_load128x2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: movq 8(%rdi), %rcx
-; CHECK-NEXT: orq 24(%rdi), %rcx
-; CHECK-NEXT: orq 16(%rdi), %rax
-; CHECK-NEXT: orq %rcx, %rax
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: retq
+; SSE-LABEL: vecmp_load128x2:
+; SSE: # %bb.0:
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq 8(%rdi), %rcx
+; SSE-NEXT: orq 24(%rdi), %rcx
+; SSE-NEXT: orq 16(%rdi), %rax
+; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: sete %al
+; SSE-NEXT: retq
+;
+; AVX-LABEL: vecmp_load128x2:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqu (%rdi), %xmm0
+; AVX-NEXT: vpor 16(%rdi), %xmm0, %xmm0
+; AVX-NEXT: vptest %xmm0, %xmm0
+; AVX-NEXT: sete %al
+; AVX-NEXT: retq
%p1 = getelementptr i8, ptr %p0, i64 16
%i0 = load i128, ptr %p0, align 1
%i1 = load i128, ptr %p1, align 1
@@ -453,21 +461,42 @@ define i1 @vecmp_load128x2(ptr %p0) {
}
define i1 @vecmp_load128x4(ptr %p0) {
-; CHECK-LABEL: vecmp_load128x4:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: movq 8(%rdi), %rcx
-; CHECK-NEXT: movq 24(%rdi), %rdx
-; CHECK-NEXT: movq 16(%rdi), %rsi
-; CHECK-NEXT: orq 32(%rdi), %rax
-; CHECK-NEXT: orq 40(%rdi), %rcx
-; CHECK-NEXT: orq 48(%rdi), %rsi
-; CHECK-NEXT: orq %rax, %rsi
-; CHECK-NEXT: orq 56(%rdi), %rdx
-; CHECK-NEXT: orq %rcx, %rdx
-; CHECK-NEXT: orq %rsi, %rdx
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: retq
+; SSE-LABEL: vecmp_load128x4:
+; SSE: # %bb.0:
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq 8(%rdi), %rcx
+; SSE-NEXT: movq 24(%rdi), %rdx
+; SSE-NEXT: movq 16(%rdi), %rsi
+; SSE-NEXT: orq 32(%rdi), %rax
+; SSE-NEXT: orq 40(%rdi), %rcx
+; SSE-NEXT: orq 48(%rdi), %rsi
+; SSE-NEXT: orq %rax, %rsi
+; SSE-NEXT: orq 56(%rdi), %rdx
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: sete %al
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vecmp_load128x4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqu (%rdi), %xmm0
+; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
+; AVX1-NEXT: vpor 32(%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vpor 48(%rdi), %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vptest %xmm0, %xmm0
+; AVX1-NEXT: sete %al
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: vecmp_load128x4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqu (%rdi), %xmm0
+; AVX512-NEXT: vmovdqu 16(%rdi), %xmm1
+; AVX512-NEXT: vpor 32(%rdi), %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 | xmm1 | mem
+; AVX512-NEXT: vptest %xmm0, %xmm0
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: retq
%p1 = getelementptr i8, ptr %p0, i64 16
%p2 = getelementptr i8, ptr %p0, i64 32
%p3 = getelementptr i8, ptr %p0, i64 48
@@ -486,21 +515,39 @@ define i1 @vecmp_load128x4(ptr %p0) {
; PR144861
define i1 @vecmp_load256x2(ptr %p0) {
-; CHECK-LABEL: vecmp_load256x2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movq 24(%rdi), %rax
-; CHECK-NEXT: movq (%rdi), %rcx
-; CHECK-NEXT: movq 8(%rdi), %rdx
-; CHECK-NEXT: movq 16(%rdi), %rsi
-; CHECK-NEXT: orq 48(%rdi), %rsi
-; CHECK-NEXT: orq 32(%rdi), %rcx
-; CHECK-NEXT: orq %rsi, %rcx
-; CHECK-NEXT: orq 56(%rdi), %rax
-; CHECK-NEXT: orq 40(%rdi), %rdx
-; CHECK-NEXT: orq %rax, %rdx
-; CHECK-NEXT: orq %rcx, %rdx
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: retq
+; SSE-LABEL: vecmp_load256x2:
+; SSE: # %bb.0:
+; SSE-NEXT: movq 24(%rdi), %rax
+; SSE-NEXT: movq (%rdi), %rcx
+; SSE-NEXT: movq 8(%rdi), %rdx
+; SSE-NEXT: movq 16(%rdi), %rsi
+; SSE-NEXT: orq 48(%rdi), %rsi
+; SSE-NEXT: orq 32(%rdi), %rcx
+; SSE-NEXT: orq %rsi, %rcx
+; SSE-NEXT: orq 56(%rdi), %rax
+; SSE-NEXT: orq 40(%rdi), %rdx
+; SSE-NEXT: orq %rax, %rdx
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: sete %al
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: vecmp_load256x2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovups (%rdi), %ymm0
+; AVX1-NEXT: vorps 32(%rdi), %ymm0, %ymm0
+; AVX1-NEXT: vptest %ymm0, %ymm0
+; AVX1-NEXT: sete %al
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: vecmp_load256x2:
+; AVX512: # %bb.0:
+; A...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/171616