[llvm] [LLVM][CodeGen][AArch64] Don't scalarise v8{f16,bf16} vsetcc operations. (PR #135398)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 11 09:38:10 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Paul Walker (paulwalker-arm)
<details>
<summary>Changes</summary>
I have also removed custom promotion code for the v4{f16,bf16} cases because the same common code can be used.
---
Patch is 115.40 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/135398.diff
7 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+11-25)
- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+3-1)
- (modified) llvm/test/Analysis/CostModel/AArch64/cmp.ll (+1-1)
- (modified) llvm/test/Analysis/CostModel/AArch64/vector-select.ll (+7-7)
- (modified) llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll (+139-762)
- (modified) llvm/test/CodeGen/AArch64/fcmp.ll (+111-471)
- (modified) llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll (+163-786)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e366d7cb54490..92b77fde8bab2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -841,11 +841,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32);
+ setOperationPromotedToType(ISD::SETCC, V4Narrow, MVT::v4f32);
setOperationAction(ISD::FABS, V4Narrow, Legal);
- setOperationAction(ISD::FNEG, V4Narrow, Legal);
+ setOperationAction(ISD::FNEG, V4Narrow, Legal);
setOperationAction(ISD::FMA, V4Narrow, Expand);
- setOperationAction(ISD::SETCC, V4Narrow, Custom);
setOperationAction(ISD::BR_CC, V4Narrow, Expand);
setOperationAction(ISD::SELECT, V4Narrow, Expand);
setOperationAction(ISD::SELECT_CC, V4Narrow, Expand);
@@ -853,6 +853,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSQRT, V4Narrow, Expand);
auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
+ setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
+ setOperationPromotedToType(ISD::SETCC, V8Narrow, MVT::v8f32);
+
setOperationAction(ISD::FABS, V8Narrow, Legal);
setOperationAction(ISD::FADD, V8Narrow, Legal);
setOperationAction(ISD::FCEIL, V8Narrow, Legal);
@@ -862,19 +865,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMA, V8Narrow, Expand);
setOperationAction(ISD::FMUL, V8Narrow, Legal);
setOperationAction(ISD::FNEARBYINT, V8Narrow, Legal);
- setOperationAction(ISD::FNEG, V8Narrow, Legal);
+ setOperationAction(ISD::FNEG, V8Narrow, Legal);
setOperationAction(ISD::FROUND, V8Narrow, Legal);
setOperationAction(ISD::FROUNDEVEN, V8Narrow, Legal);
setOperationAction(ISD::FRINT, V8Narrow, Legal);
setOperationAction(ISD::FSQRT, V8Narrow, Expand);
setOperationAction(ISD::FSUB, V8Narrow, Legal);
setOperationAction(ISD::FTRUNC, V8Narrow, Legal);
- setOperationAction(ISD::SETCC, V8Narrow, Expand);
setOperationAction(ISD::BR_CC, V8Narrow, Expand);
setOperationAction(ISD::SELECT, V8Narrow, Expand);
setOperationAction(ISD::SELECT_CC, V8Narrow, Expand);
setOperationAction(ISD::FP_EXTEND, V8Narrow, Expand);
- setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
};
if (!Subtarget->hasFullFP16()) {
@@ -15905,6 +15906,11 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
if (LHS.getValueType().getVectorElementType().isInteger())
return Op;
+ assert((!Subtarget->hasFullFP16() &&
+ LHS.getValueType().getVectorElementType() != MVT::f16) ||
+ LHS.getValueType().getVectorElementType() != MVT::bf16 ||
+ LHS.getValueType().getVectorElementType() != MVT::f128);
+
// Lower isnan(x) | isnan(never-nan) to x != x.
// Lower !isnan(x) & !isnan(never-nan) to x == x.
if (CC == ISD::SETUO || CC == ISD::SETO) {
@@ -15923,26 +15929,6 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
}
}
- const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
-
- // Make v4f16 (only) fcmp operations utilise vector instructions
- // v8f16 support will be a litle more complicated
- if ((!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) ||
- LHS.getValueType().getVectorElementType() == MVT::bf16) {
- if (LHS.getValueType().getVectorNumElements() == 4) {
- LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
- RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
- SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
- DAG.ReplaceAllUsesWith(Op, NewSetcc);
- CmpVT = MVT::v4i32;
- } else
- return SDValue();
- }
-
- assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
- LHS.getValueType().getVectorElementType() != MVT::bf16 ||
- LHS.getValueType().getVectorElementType() != MVT::f128);
-
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
// clean. Some of them require two branches to implement.
AArch64CC::CondCode CC1, CC2;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index b1d8277182add..7356392c51df7 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4236,9 +4236,11 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {
auto LT = getTypeLegalizationCost(ValTy);
- // Cost v4f16 FCmp without FP16 support via converting to v4f32 and back.
+ // Cost v#f16 FCmp without FP16 support via converting to v#f32 and back.
if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
return LT.first * 4; // fcvtl + fcvtl + fcmp + xtn
+ if (LT.second == MVT::v8f16 && !ST->hasFullFP16())
+ return LT.first * 8; // 2*(fcvtl + fcvtl2 + fcmp) + uzp1 + xtn
}
// Treat the icmp in icmp(and, 0) as free, as we can make use of ands.
diff --git a/llvm/test/Analysis/CostModel/AArch64/cmp.ll b/llvm/test/Analysis/CostModel/AArch64/cmp.ll
index c79005ee0e5d4..ea43057ec3d2d 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cmp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cmp.ll
@@ -16,7 +16,7 @@ define void @cmps() {
; CHECK-NEXT: Cost Model: Found costs of 1 for: %cf16 = fcmp oge half undef, undef
; CHECK-NEXT: Cost Model: Found costs of 1 for: %cf32 = fcmp ogt float undef, undef
; CHECK-NEXT: Cost Model: Found costs of 1 for: %cf64 = fcmp ogt double undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cfv816 = fcmp olt <8 x half> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %cfv816 = fcmp olt <8 x half> undef, undef
; CHECK-NEXT: Cost Model: Found costs of 1 for: %cfv432 = fcmp oge <4 x float> undef, undef
; CHECK-NEXT: Cost Model: Found costs of 1 for: %cfv264 = fcmp oge <2 x double> undef, undef
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
diff --git a/llvm/test/Analysis/CostModel/AArch64/vector-select.ll b/llvm/test/Analysis/CostModel/AArch64/vector-select.ll
index e35eabfb35f58..452a782ed5026 100644
--- a/llvm/test/Analysis/CostModel/AArch64/vector-select.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/vector-select.ll
@@ -119,7 +119,7 @@ define <4 x half> @v4f16_select_ogt(<4 x half> %a, <4 x half> %b, <4 x half> %c)
define <8 x half> @v8f16_select_ogt(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; COST-NOFP16-LABEL: 'v8f16_select_ogt'
-; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp ogt <8 x half> %a, %b
+; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp ogt <8 x half> %a, %b
; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1
;
@@ -184,7 +184,7 @@ define <4 x half> @v4f16_select_oge(<4 x half> %a, <4 x half> %b, <4 x half> %c)
define <8 x half> @v8f16_select_oge(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; COST-NOFP16-LABEL: 'v8f16_select_oge'
-; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp oge <8 x half> %a, %b
+; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp oge <8 x half> %a, %b
; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1
;
@@ -249,7 +249,7 @@ define <4 x half> @v4f16_select_olt(<4 x half> %a, <4 x half> %b, <4 x half> %c)
define <8 x half> @v8f16_select_olt(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; COST-NOFP16-LABEL: 'v8f16_select_olt'
-; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp olt <8 x half> %a, %b
+; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp olt <8 x half> %a, %b
; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1
;
@@ -314,7 +314,7 @@ define <4 x half> @v4f16_select_ole(<4 x half> %a, <4 x half> %b, <4 x half> %c)
define <8 x half> @v8f16_select_ole(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; COST-NOFP16-LABEL: 'v8f16_select_ole'
-; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp ole <8 x half> %a, %b
+; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp ole <8 x half> %a, %b
; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1
;
@@ -379,7 +379,7 @@ define <4 x half> @v4f16_select_oeq(<4 x half> %a, <4 x half> %b, <4 x half> %c)
define <8 x half> @v8f16_select_oeq(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; COST-NOFP16-LABEL: 'v8f16_select_oeq'
-; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp oeq <8 x half> %a, %b
+; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp oeq <8 x half> %a, %b
; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1
;
@@ -444,7 +444,7 @@ define <4 x half> @v4f16_select_one(<4 x half> %a, <4 x half> %b, <4 x half> %c)
define <8 x half> @v8f16_select_one(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; COST-NOFP16-LABEL: 'v8f16_select_one'
-; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp one <8 x half> %a, %b
+; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp one <8 x half> %a, %b
; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1
;
@@ -513,7 +513,7 @@ define <4 x half> @v4f16_select_une(<4 x half> %a, <4 x half> %b, <4 x half> %c)
define <8 x half> @v8f16_select_une(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; COST-NOFP16-LABEL: 'v8f16_select_une'
-; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp une <8 x half> %a, %b
+; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp une <8 x half> %a, %b
; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c
; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1
;
diff --git a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
index 3a55b68f2d1a3..715693dd6ed07 100644
--- a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
@@ -882,61 +882,17 @@ define <8 x i16> @fptoui_i16(<8 x bfloat> %a) #0 {
define <8 x i1> @test_fcmp_une(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-LABEL: test_fcmp_une:
; CHECK: // %bb.0:
-; CHECK-NEXT: dup v2.4h, v1.h[1]
-; CHECK-NEXT: dup v3.4h, v0.h[1]
-; CHECK-NEXT: dup v4.4h, v1.h[2]
-; CHECK-NEXT: dup v5.4h, v0.h[2]
-; CHECK-NEXT: dup v6.4h, v0.h[3]
-; CHECK-NEXT: shll v2.4s, v2.4h, #16
-; CHECK-NEXT: shll v3.4s, v3.4h, #16
-; CHECK-NEXT: fcmp s3, s2
-; CHECK-NEXT: shll v2.4s, v1.4h, #16
-; CHECK-NEXT: shll v3.4s, v0.4h, #16
-; CHECK-NEXT: csetm w8, ne
-; CHECK-NEXT: fcmp s3, s2
-; CHECK-NEXT: shll v3.4s, v4.4h, #16
-; CHECK-NEXT: shll v4.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.4h, v1.h[3]
-; CHECK-NEXT: csetm w9, ne
-; CHECK-NEXT: fmov s2, w9
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[4]
-; CHECK-NEXT: dup v6.8h, v0.h[4]
-; CHECK-NEXT: mov v2.h[1], w8
-; CHECK-NEXT: csetm w8, ne
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[5]
-; CHECK-NEXT: dup v6.8h, v0.h[5]
-; CHECK-NEXT: mov v2.h[2], w8
-; CHECK-NEXT: csetm w8, ne
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[6]
-; CHECK-NEXT: dup v6.8h, v0.h[6]
-; CHECK-NEXT: dup v1.8h, v1.h[7]
-; CHECK-NEXT: dup v0.8h, v0.h[7]
-; CHECK-NEXT: mov v2.h[3], w8
-; CHECK-NEXT: csetm w8, ne
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
+; CHECK-NEXT: shll2 v2.4s, v1.8h, #16
+; CHECK-NEXT: shll2 v3.4s, v0.8h, #16
; CHECK-NEXT: shll v1.4s, v1.4h, #16
; CHECK-NEXT: shll v0.4s, v0.4h, #16
-; CHECK-NEXT: mov v2.h[4], w8
-; CHECK-NEXT: csetm w8, ne
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: mov v2.h[5], w8
-; CHECK-NEXT: csetm w8, ne
-; CHECK-NEXT: fcmp s0, s1
-; CHECK-NEXT: mov v2.h[6], w8
-; CHECK-NEXT: csetm w8, ne
-; CHECK-NEXT: mov v2.h[7], w8
-; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: fcmeq v2.4s, v3.4s, v2.4s
+; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: shl v0.8h, v0.8h, #15
+; CHECK-NEXT: cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
%1 = fcmp une <8 x bfloat> %a, %b
ret <8 x i1> %1
@@ -945,69 +901,21 @@ define <8 x i1> @test_fcmp_une(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
define <8 x i1> @test_fcmp_ueq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-LABEL: test_fcmp_ueq:
; CHECK: // %bb.0:
-; CHECK-NEXT: dup v2.4h, v1.h[1]
-; CHECK-NEXT: dup v3.4h, v0.h[1]
-; CHECK-NEXT: dup v4.4h, v1.h[2]
-; CHECK-NEXT: dup v5.4h, v0.h[2]
-; CHECK-NEXT: dup v6.4h, v0.h[3]
-; CHECK-NEXT: shll v2.4s, v2.4h, #16
-; CHECK-NEXT: shll v3.4s, v3.4h, #16
-; CHECK-NEXT: fcmp s3, s2
-; CHECK-NEXT: shll v2.4s, v1.4h, #16
-; CHECK-NEXT: shll v3.4s, v0.4h, #16
-; CHECK-NEXT: csetm w8, eq
-; CHECK-NEXT: csinv w8, w8, wzr, vc
-; CHECK-NEXT: fcmp s3, s2
-; CHECK-NEXT: shll v3.4s, v4.4h, #16
-; CHECK-NEXT: shll v4.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.4h, v1.h[3]
-; CHECK-NEXT: csetm w9, eq
-; CHECK-NEXT: csinv w9, w9, wzr, vc
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: fmov s2, w9
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[4]
-; CHECK-NEXT: dup v6.8h, v0.h[4]
-; CHECK-NEXT: mov v2.h[1], w8
-; CHECK-NEXT: csetm w8, eq
-; CHECK-NEXT: csinv w8, w8, wzr, vc
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[5]
-; CHECK-NEXT: dup v6.8h, v0.h[5]
-; CHECK-NEXT: mov v2.h[2], w8
-; CHECK-NEXT: csetm w8, eq
-; CHECK-NEXT: csinv w8, w8, wzr, vc
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[6]
-; CHECK-NEXT: dup v6.8h, v0.h[6]
-; CHECK-NEXT: dup v1.8h, v1.h[7]
-; CHECK-NEXT: dup v0.8h, v0.h[7]
-; CHECK-NEXT: mov v2.h[3], w8
-; CHECK-NEXT: csetm w8, eq
-; CHECK-NEXT: csinv w8, w8, wzr, vc
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
+; CHECK-NEXT: shll2 v2.4s, v1.8h, #16
+; CHECK-NEXT: shll2 v3.4s, v0.8h, #16
; CHECK-NEXT: shll v1.4s, v1.4h, #16
; CHECK-NEXT: shll v0.4s, v0.4h, #16
-; CHECK-NEXT: mov v2.h[4], w8
-; CHECK-NEXT: csetm w8, eq
-; CHECK-NEXT: csinv w8, w8, wzr, vc
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: mov v2.h[5], w8
-; CHECK-NEXT: csetm w8, eq
-; CHECK-NEXT: csinv w8, w8, wzr, vc
-; CHECK-NEXT: fcmp s0, s1
-; CHECK-NEXT: mov v2.h[6], w8
-; CHECK-NEXT: csetm w8, eq
-; CHECK-NEXT: csinv w8, w8, wzr, vc
-; CHECK-NEXT: mov v2.h[7], w8
-; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: fcmgt v4.4s, v3.4s, v2.4s
+; CHECK-NEXT: fcmgt v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: fcmgt v3.4s, v0.4s, v1.4s
+; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: orr v1.16b, v2.16b, v4.16b
+; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: shl v0.8h, v0.8h, #15
+; CHECK-NEXT: cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: ret
%1 = fcmp ueq <8 x bfloat> %a, %b
ret <8 x i1> %1
@@ -1016,61 +924,17 @@ define <8 x i1> @test_fcmp_ueq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
define <8 x i1> @test_fcmp_ugt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-LABEL: test_fcmp_ugt:
; CHECK: // %bb.0:
-; CHECK-NEXT: dup v2.4h, v1.h[1]
-; CHECK-NEXT: dup v3.4h, v0.h[1]
-; CHECK-NEXT: dup v4.4h, v1.h[2]
-; CHECK-NEXT: dup v5.4h, v0.h[2]
-; CHECK-NEXT: dup v6.4h, v0.h[3]
-; CHECK-NEXT: shll v2.4s, v2.4h, #16
-; CHECK-NEXT: shll v3.4s, v3.4h, #16
-; CHECK-NEXT: fcmp s3, s2
-; CHECK-NEXT: shll v2.4s, v1.4h, #16
-; CHECK-NEXT: shll v3.4s, v0.4h, #16
-; CHECK-NEXT: csetm w8, hi
-; CHECK-NEXT: fcmp s3, s2
-; CHECK-NEXT: shll v3.4s, v4.4h, #16
-; CHECK-NEXT: shll v4.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.4h, v1.h[3]
-; CHECK-NEXT: csetm w9, hi
-; CHECK-NEXT: fmov s2, w9
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[4]
-; CHECK-NEXT: dup v6.8h, v0.h[4]
-; CHECK-NEXT: mov v2.h[1], w8
-; CHECK-NEXT: csetm w8, hi
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[5]
-; CHECK-NEXT: dup v6.8h, v0.h[5]
-; CHECK-NEXT: mov v2.h[2], w8
-; CHECK-NEXT: csetm w8, hi
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[6]
-; CHECK-NEXT: dup v6.8h, v0.h[6]
-; CHECK-NEXT: dup v1.8h, v1.h[7]
-; CHECK-NEXT: dup v0.8h, v0.h[7]
-; CHECK-NEXT: mov v2.h[3], w8
-; CHECK-NEXT: csetm w8, hi
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: shll2 v2.4s, v0.8h, #16
+; CHECK-NEXT: shll2 v3.4s, v1.8h, #16
; CHECK-NEXT: shll v0.4s, v0.4h, #16
-; CHECK-NEXT: mov v2.h[4], w8
-; CHECK-NEXT: csetm w8, hi
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: mov v2.h[5], w8
-; CHECK-NEXT: csetm w8, hi
-; CHECK-NEXT: fcmp s0, s1
-; CHECK-NEXT: mov v2.h[6], w8
-; CHECK-NEXT: csetm w8, hi
-; CHECK-NEXT: mov v2.h[7], w8
-; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: fcmge v2.4s, v3.4s, v2.4s
+; CHECK-NEXT: fcmge v...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/135398
More information about the llvm-commits mailing list