[llvm] [AArch64] Use pattern to select bf16 fpextend (PR #137212)
John Brawn via llvm-commits
llvm-commits at lists.llvm.org
Fri May 2 03:49:19 PDT 2025
https://github.com/john-brawn-arm updated https://github.com/llvm/llvm-project/pull/137212
>From 1616698c6568802f1d451f89f6e5badd146e0b59 Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn at arm.com>
Date: Tue, 15 Apr 2025 16:51:50 +0100
Subject: [PATCH 1/3] [AArch64] Use pattern to select bf16 fpextend
Currently bf16 fpextend is lowered to a vector shift. Instead, leave it as
fpextend and use an instruction selection pattern that selects it to a shift
later. Doing this means that DAGCombiner patterns for fpextend are applied,
leading to better codegen. It also means that in some situations we now emit a
mov instruction where we previously emitted a dup instruction, but I don't
think this makes any difference.
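For reference, a minimal sketch of the kind of case the new pattern covers
(the function name and CHECK lines below are illustrative, not taken from the
patch): a scalar bf16 to f32 fpextend should now be selected directly to a
shll, as seen in the updated tests.

  define float @example_fpext_bf16(bfloat %a) {
  ; The fpextend is selected to a left shift by 16 of the bf16 bits, e.g.:
  ; CHECK-LABEL: example_fpext_bf16:
  ; CHECK:       shll v0.4s, v0.4h, #16
  ; CHECK:       ret
    %r = fpext bfloat %a to float
    ret float %r
  }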
---
.../Target/AArch64/AArch64ISelLowering.cpp | 38 +-
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 18 +
.../arm64-fast-isel-conversion-fallback.ll | 8 +-
llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll | 8 +-
llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll | 8 +-
.../test/CodeGen/AArch64/bf16-instructions.ll | 18 +-
.../CodeGen/AArch64/bf16-v8-instructions.ll | 628 +++++++++---------
...e-streaming-mode-fixed-length-fcopysign.ll | 28 +-
8 files changed, 361 insertions(+), 393 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 63924dc1b30ea..17181cba0ceb2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -766,13 +766,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(Op, MVT::v8bf16, Expand);
}
- // For bf16, fpextend is custom lowered to be optionally expanded into shifts.
- setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
+ // fpextend from f16 or bf16 to f32 is legal
+ setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f32, Legal);
+ // fpextend from bf16 to f64 needs to be split into two fpextends
setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
- setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Custom);
- setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
- setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f32, Custom);
auto LegalizeNarrowFP = [this](MVT ScalarVT) {
for (auto Op : {
@@ -4548,33 +4549,6 @@ SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
return SDValue();
}
- if (VT.getScalarType() == MVT::f32) {
- // FP16->FP32 extends are legal for v32 and v4f32.
- if (Op0VT.getScalarType() == MVT::f16)
- return Op;
- if (Op0VT.getScalarType() == MVT::bf16) {
- SDLoc DL(Op);
- EVT IVT = VT.changeTypeToInteger();
- if (!Op0VT.isVector()) {
- Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4bf16, Op0);
- IVT = MVT::v4i32;
- }
-
- EVT Op0IVT = Op0.getValueType().changeTypeToInteger();
- SDValue Ext =
- DAG.getNode(ISD::ANY_EXTEND, DL, IVT, DAG.getBitcast(Op0IVT, Op0));
- SDValue Shift =
- DAG.getNode(ISD::SHL, DL, IVT, Ext, DAG.getConstant(16, DL, IVT));
- if (!Op0VT.isVector())
- Shift = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Shift,
- DAG.getConstant(0, DL, MVT::i64));
- Shift = DAG.getBitcast(VT, Shift);
- return IsStrict ? DAG.getMergeValues({Shift, Op.getOperand(0)}, DL)
- : Shift;
- }
- return SDValue();
- }
-
assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
return SDValue();
}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index f7b13092821d6..3562406738c93 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -8513,6 +8513,24 @@ def : InstAlias<"uxtl2 $dst.2d, $src1.4s",
(USHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
}
+// fpextend from bf16 to f32 is just a shift left by 16
+let Predicates = [HasNEON] in {
+def : Pat<(f32 (any_fpextend (bf16 FPR16:$Rn))),
+ (f32 (EXTRACT_SUBREG
+ (v4i32 (SHLLv4i16 (v4i16 (SUBREG_TO_REG (i64 0), (bf16 FPR16:$Rn), hsub)))),
+ ssub))>;
+def : Pat<(v4f32 (any_fpextend (v4bf16 V64:$Rn))),
+ (SHLLv4i16 V64:$Rn)>;
+def : Pat<(v4f32 (any_fpextend (extract_high_v8bf16 (v8bf16 V128:$Rn)))),
+ (SHLLv8i16 V128:$Rn)>;
+}
+// Fallback pattern for when we don't have NEON
+def : Pat<(f32 (any_fpextend (bf16 FPR16:$Rn))),
+ (f32 (COPY_TO_REGCLASS
+ (i32 (UBFMWri (i32 (SUBREG_TO_REG (i32 0), (bf16 FPR16:$Rn), hsub)),
+ (i64 16), (i64 15))),
+ FPR32))>;
+
def abs_f16 :
OutPatFrag<(ops node:$Rn),
(EXTRACT_SUBREG (f32 (COPY_TO_REGCLASS
diff --git a/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll
index 9a1203f18243d..1d33545cb171a 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll
@@ -155,9 +155,7 @@ entry:
define i32 @fptosi_bf(bfloat %a) nounwind ssp {
; CHECK-LABEL: fptosi_bf:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov s1, s0
-; CHECK-NEXT: // implicit-def: $d0
-; CHECK-NEXT: fmov s0, s1
+; CHECK-NEXT: // kill: def $d0 killed $h0
; CHECK-NEXT: shll v0.4s, v0.4h, #16
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
; CHECK-NEXT: fcvtzs w0, s0
@@ -171,9 +169,7 @@ entry:
define i32 @fptoui_sbf(bfloat %a) nounwind ssp {
; CHECK-LABEL: fptoui_sbf:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov s1, s0
-; CHECK-NEXT: // implicit-def: $d0
-; CHECK-NEXT: fmov s0, s1
+; CHECK-NEXT: // kill: def $d0 killed $h0
; CHECK-NEXT: shll v0.4s, v0.4h, #16
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
; CHECK-NEXT: fcvtzu w0, s0
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll
index 9b5e48d2b4217..e3e18a1f91c6d 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll
@@ -641,7 +641,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
; NOLSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4:
; NOLSE: // %bb.0:
; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0
-; NOLSE-NEXT: dup v1.4h, v0.h[1]
+; NOLSE-NEXT: mov h1, v0.h[1]
; NOLSE-NEXT: mov w8, #32767 // =0x7fff
; NOLSE-NEXT: shll v0.4s, v0.4h, #16
; NOLSE-NEXT: shll v1.4s, v1.4h, #16
@@ -649,7 +649,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1
; NOLSE-NEXT: ldaxr w9, [x0]
; NOLSE-NEXT: fmov s2, w9
-; NOLSE-NEXT: dup v3.4h, v2.h[1]
+; NOLSE-NEXT: mov h3, v2.h[1]
; NOLSE-NEXT: shll v2.4s, v2.4h, #16
; NOLSE-NEXT: fmaxnm s2, s2, s0
; NOLSE-NEXT: shll v3.4s, v3.4h, #16
@@ -677,14 +677,14 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
; LSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4:
; LSE: // %bb.0:
; LSE-NEXT: // kill: def $d0 killed $d0 def $q0
-; LSE-NEXT: dup v1.4h, v0.h[1]
+; LSE-NEXT: mov h1, v0.h[1]
; LSE-NEXT: shll v2.4s, v0.4h, #16
; LSE-NEXT: mov w8, #32767 // =0x7fff
; LSE-NEXT: ldr s0, [x0]
; LSE-NEXT: shll v1.4s, v1.4h, #16
; LSE-NEXT: .LBB7_1: // %atomicrmw.start
; LSE-NEXT: // =>This Inner Loop Header: Depth=1
-; LSE-NEXT: dup v3.4h, v0.h[1]
+; LSE-NEXT: mov h3, v0.h[1]
; LSE-NEXT: shll v4.4s, v0.4h, #16
; LSE-NEXT: fmaxnm s4, s4, s2
; LSE-NEXT: shll v3.4s, v3.4h, #16
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll
index f6c542fe7d407..10de6777bd285 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll
@@ -641,7 +641,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
; NOLSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4:
; NOLSE: // %bb.0:
; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0
-; NOLSE-NEXT: dup v1.4h, v0.h[1]
+; NOLSE-NEXT: mov h1, v0.h[1]
; NOLSE-NEXT: mov w8, #32767 // =0x7fff
; NOLSE-NEXT: shll v0.4s, v0.4h, #16
; NOLSE-NEXT: shll v1.4s, v1.4h, #16
@@ -649,7 +649,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1
; NOLSE-NEXT: ldaxr w9, [x0]
; NOLSE-NEXT: fmov s2, w9
-; NOLSE-NEXT: dup v3.4h, v2.h[1]
+; NOLSE-NEXT: mov h3, v2.h[1]
; NOLSE-NEXT: shll v2.4s, v2.4h, #16
; NOLSE-NEXT: fminnm s2, s2, s0
; NOLSE-NEXT: shll v3.4s, v3.4h, #16
@@ -677,14 +677,14 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
; LSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4:
; LSE: // %bb.0:
; LSE-NEXT: // kill: def $d0 killed $d0 def $q0
-; LSE-NEXT: dup v1.4h, v0.h[1]
+; LSE-NEXT: mov h1, v0.h[1]
; LSE-NEXT: shll v2.4s, v0.4h, #16
; LSE-NEXT: mov w8, #32767 // =0x7fff
; LSE-NEXT: ldr s0, [x0]
; LSE-NEXT: shll v1.4s, v1.4h, #16
; LSE-NEXT: .LBB7_1: // %atomicrmw.start
; LSE-NEXT: // =>This Inner Loop Header: Depth=1
-; LSE-NEXT: dup v3.4h, v0.h[1]
+; LSE-NEXT: mov h3, v0.h[1]
; LSE-NEXT: shll v4.4s, v0.4h, #16
; LSE-NEXT: fminnm s4, s4, s2
; LSE-NEXT: shll v3.4s, v3.4h, #16
diff --git a/llvm/test/CodeGen/AArch64/bf16-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-instructions.ll
index 2fc9c53112ab6..1dd883580715e 100644
--- a/llvm/test/CodeGen/AArch64/bf16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-instructions.ll
@@ -1996,13 +1996,11 @@ define bfloat @test_copysign_f64(bfloat %a, double %b) #0 {
define float @test_copysign_extended(bfloat %a, bfloat %b) #0 {
; CHECK-CVT-LABEL: test_copysign_extended:
; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0
-; CHECK-CVT-NEXT: movi v2.4s, #16
; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1
-; CHECK-CVT-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16
-; CHECK-CVT-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0
; CHECK-CVT-NEXT: mvni v2.4s, #128, lsl #24
+; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16
; CHECK-CVT-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-CVT-NEXT: fmov w8, s0
; CHECK-CVT-NEXT: lsr w8, w8, #16
@@ -2013,16 +2011,12 @@ define float @test_copysign_extended(bfloat %a, bfloat %b) #0 {
;
; CHECK-SD-LABEL: test_copysign_extended:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0
-; CHECK-SD-NEXT: movi v2.4s, #16
; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1
-; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16
-; CHECK-SD-NEXT: ushl v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0
; CHECK-SD-NEXT: mvni v2.4s, #128, lsl #24
-; CHECK-SD-NEXT: bif v0.16b, v1.16b, v2.16b
-; CHECK-SD-NEXT: bfcvt h0, s0
+; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16
; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-SD-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
index 3a55b68f2d1a3..f4ab8ff581e23 100644
--- a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
@@ -882,11 +882,11 @@ define <8 x i16> @fptoui_i16(<8 x bfloat> %a) #0 {
define <8 x i1> @test_fcmp_une(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-LABEL: test_fcmp_une:
; CHECK: // %bb.0:
-; CHECK-NEXT: dup v2.4h, v1.h[1]
-; CHECK-NEXT: dup v3.4h, v0.h[1]
-; CHECK-NEXT: dup v4.4h, v1.h[2]
-; CHECK-NEXT: dup v5.4h, v0.h[2]
-; CHECK-NEXT: dup v6.4h, v0.h[3]
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: mov h4, v1.h[2]
+; CHECK-NEXT: mov h5, v0.h[2]
+; CHECK-NEXT: mov h6, v0.h[3]
; CHECK-NEXT: shll v2.4s, v2.4h, #16
; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: fcmp s3, s2
@@ -896,34 +896,34 @@ define <8 x i1> @test_fcmp_une(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-NEXT: fcmp s3, s2
; CHECK-NEXT: shll v3.4s, v4.4h, #16
; CHECK-NEXT: shll v4.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.4h, v1.h[3]
+; CHECK-NEXT: mov h5, v1.h[3]
; CHECK-NEXT: csetm w9, ne
; CHECK-NEXT: fmov s2, w9
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[4]
-; CHECK-NEXT: dup v6.8h, v0.h[4]
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: shll v4.4s, v5.4h, #16
+; CHECK-NEXT: shll v5.4s, v6.4h, #16
+; CHECK-NEXT: mov h6, v0.h[4]
; CHECK-NEXT: mov v2.h[1], w8
; CHECK-NEXT: csetm w8, ne
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
+; CHECK-NEXT: mov h5, v1.h[5]
; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[5]
-; CHECK-NEXT: dup v6.8h, v0.h[5]
+; CHECK-NEXT: mov h6, v0.h[5]
; CHECK-NEXT: mov v2.h[2], w8
; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[6]
-; CHECK-NEXT: dup v6.8h, v0.h[6]
-; CHECK-NEXT: dup v1.8h, v1.h[7]
-; CHECK-NEXT: dup v0.8h, v0.h[7]
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: shll v4.4s, v5.4h, #16
+; CHECK-NEXT: shll v5.4s, v6.4h, #16
+; CHECK-NEXT: mov h6, v0.h[6]
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: mov h0, v0.h[7]
; CHECK-NEXT: mov v2.h[3], w8
; CHECK-NEXT: csetm w8, ne
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: shll v4.4s, v6.4h, #16
; CHECK-NEXT: shll v1.4s, v1.4h, #16
; CHECK-NEXT: shll v0.4s, v0.4h, #16
@@ -945,54 +945,54 @@ define <8 x i1> @test_fcmp_une(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
define <8 x i1> @test_fcmp_ueq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-LABEL: test_fcmp_ueq:
; CHECK: // %bb.0:
-; CHECK-NEXT: dup v2.4h, v1.h[1]
-; CHECK-NEXT: dup v3.4h, v0.h[1]
-; CHECK-NEXT: dup v4.4h, v1.h[2]
-; CHECK-NEXT: dup v5.4h, v0.h[2]
-; CHECK-NEXT: dup v6.4h, v0.h[3]
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: mov h4, v1.h[2]
+; CHECK-NEXT: mov h5, v0.h[2]
+; CHECK-NEXT: mov h6, v0.h[3]
; CHECK-NEXT: shll v2.4s, v2.4h, #16
; CHECK-NEXT: shll v3.4s, v3.4h, #16
+; CHECK-NEXT: shll v6.4s, v6.4h, #16
; CHECK-NEXT: fcmp s3, s2
; CHECK-NEXT: shll v2.4s, v1.4h, #16
; CHECK-NEXT: shll v3.4s, v0.4h, #16
; CHECK-NEXT: csetm w8, eq
; CHECK-NEXT: csinv w8, w8, wzr, vc
; CHECK-NEXT: fcmp s3, s2
-; CHECK-NEXT: shll v3.4s, v4.4h, #16
+; CHECK-NEXT: shll v2.4s, v4.4h, #16
; CHECK-NEXT: shll v4.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.4h, v1.h[3]
+; CHECK-NEXT: mov h3, v1.h[3]
+; CHECK-NEXT: mov h5, v1.h[4]
; CHECK-NEXT: csetm w9, eq
; CHECK-NEXT: csinv w9, w9, wzr, vc
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
+; CHECK-NEXT: fcmp s4, s2
+; CHECK-NEXT: mov h4, v0.h[4]
; CHECK-NEXT: fmov s2, w9
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[4]
-; CHECK-NEXT: dup v6.8h, v0.h[4]
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: mov v2.h[1], w8
; CHECK-NEXT: csetm w8, eq
+; CHECK-NEXT: shll v4.4s, v4.4h, #16
; CHECK-NEXT: csinv w8, w8, wzr, vc
-; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: fcmp s6, s3
; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[5]
-; CHECK-NEXT: dup v6.8h, v0.h[5]
+; CHECK-NEXT: mov h5, v1.h[5]
+; CHECK-NEXT: mov h6, v0.h[5]
; CHECK-NEXT: mov v2.h[2], w8
; CHECK-NEXT: csetm w8, eq
; CHECK-NEXT: csinv w8, w8, wzr, vc
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[6]
-; CHECK-NEXT: dup v6.8h, v0.h[6]
-; CHECK-NEXT: dup v1.8h, v1.h[7]
-; CHECK-NEXT: dup v0.8h, v0.h[7]
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: shll v5.4s, v5.4h, #16
+; CHECK-NEXT: shll v6.4s, v6.4h, #16
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: mov h0, v0.h[7]
; CHECK-NEXT: mov v2.h[3], w8
; CHECK-NEXT: csetm w8, eq
; CHECK-NEXT: csinv w8, w8, wzr, vc
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
+; CHECK-NEXT: shll v4.4s, v4.4h, #16
; CHECK-NEXT: shll v1.4s, v1.4h, #16
; CHECK-NEXT: shll v0.4s, v0.4h, #16
; CHECK-NEXT: mov v2.h[4], w8
@@ -1016,11 +1016,11 @@ define <8 x i1> @test_fcmp_ueq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
define <8 x i1> @test_fcmp_ugt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-LABEL: test_fcmp_ugt:
; CHECK: // %bb.0:
-; CHECK-NEXT: dup v2.4h, v1.h[1]
-; CHECK-NEXT: dup v3.4h, v0.h[1]
-; CHECK-NEXT: dup v4.4h, v1.h[2]
-; CHECK-NEXT: dup v5.4h, v0.h[2]
-; CHECK-NEXT: dup v6.4h, v0.h[3]
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: mov h4, v1.h[2]
+; CHECK-NEXT: mov h5, v0.h[2]
+; CHECK-NEXT: mov h6, v0.h[3]
; CHECK-NEXT: shll v2.4s, v2.4h, #16
; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: fcmp s3, s2
@@ -1030,34 +1030,34 @@ define <8 x i1> @test_fcmp_ugt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-NEXT: fcmp s3, s2
; CHECK-NEXT: shll v3.4s, v4.4h, #16
; CHECK-NEXT: shll v4.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.4h, v1.h[3]
+; CHECK-NEXT: mov h5, v1.h[3]
; CHECK-NEXT: csetm w9, hi
; CHECK-NEXT: fmov s2, w9
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[4]
-; CHECK-NEXT: dup v6.8h, v0.h[4]
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: shll v4.4s, v5.4h, #16
+; CHECK-NEXT: shll v5.4s, v6.4h, #16
+; CHECK-NEXT: mov h6, v0.h[4]
; CHECK-NEXT: mov v2.h[1], w8
; CHECK-NEXT: csetm w8, hi
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
+; CHECK-NEXT: mov h5, v1.h[5]
; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[5]
-; CHECK-NEXT: dup v6.8h, v0.h[5]
+; CHECK-NEXT: mov h6, v0.h[5]
; CHECK-NEXT: mov v2.h[2], w8
; CHECK-NEXT: csetm w8, hi
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[6]
-; CHECK-NEXT: dup v6.8h, v0.h[6]
-; CHECK-NEXT: dup v1.8h, v1.h[7]
-; CHECK-NEXT: dup v0.8h, v0.h[7]
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: shll v4.4s, v5.4h, #16
+; CHECK-NEXT: shll v5.4s, v6.4h, #16
+; CHECK-NEXT: mov h6, v0.h[6]
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: mov h0, v0.h[7]
; CHECK-NEXT: mov v2.h[3], w8
; CHECK-NEXT: csetm w8, hi
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: shll v4.4s, v6.4h, #16
; CHECK-NEXT: shll v1.4s, v1.4h, #16
; CHECK-NEXT: shll v0.4s, v0.4h, #16
@@ -1079,11 +1079,11 @@ define <8 x i1> @test_fcmp_ugt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
define <8 x i1> @test_fcmp_uge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-LABEL: test_fcmp_uge:
; CHECK: // %bb.0:
-; CHECK-NEXT: dup v2.4h, v1.h[1]
-; CHECK-NEXT: dup v3.4h, v0.h[1]
-; CHECK-NEXT: dup v4.4h, v1.h[2]
-; CHECK-NEXT: dup v5.4h, v0.h[2]
-; CHECK-NEXT: dup v6.4h, v0.h[3]
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: mov h4, v1.h[2]
+; CHECK-NEXT: mov h5, v0.h[2]
+; CHECK-NEXT: mov h6, v0.h[3]
; CHECK-NEXT: shll v2.4s, v2.4h, #16
; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: fcmp s3, s2
@@ -1093,34 +1093,34 @@ define <8 x i1> @test_fcmp_uge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-NEXT: fcmp s3, s2
; CHECK-NEXT: shll v3.4s, v4.4h, #16
; CHECK-NEXT: shll v4.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.4h, v1.h[3]
+; CHECK-NEXT: mov h5, v1.h[3]
; CHECK-NEXT: csetm w9, pl
; CHECK-NEXT: fmov s2, w9
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[4]
-; CHECK-NEXT: dup v6.8h, v0.h[4]
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: shll v4.4s, v5.4h, #16
+; CHECK-NEXT: shll v5.4s, v6.4h, #16
+; CHECK-NEXT: mov h6, v0.h[4]
; CHECK-NEXT: mov v2.h[1], w8
; CHECK-NEXT: csetm w8, pl
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
+; CHECK-NEXT: mov h5, v1.h[5]
; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[5]
-; CHECK-NEXT: dup v6.8h, v0.h[5]
+; CHECK-NEXT: mov h6, v0.h[5]
; CHECK-NEXT: mov v2.h[2], w8
; CHECK-NEXT: csetm w8, pl
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[6]
-; CHECK-NEXT: dup v6.8h, v0.h[6]
-; CHECK-NEXT: dup v1.8h, v1.h[7]
-; CHECK-NEXT: dup v0.8h, v0.h[7]
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: shll v4.4s, v5.4h, #16
+; CHECK-NEXT: shll v5.4s, v6.4h, #16
+; CHECK-NEXT: mov h6, v0.h[6]
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: mov h0, v0.h[7]
; CHECK-NEXT: mov v2.h[3], w8
; CHECK-NEXT: csetm w8, pl
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: shll v4.4s, v6.4h, #16
; CHECK-NEXT: shll v1.4s, v1.4h, #16
; CHECK-NEXT: shll v0.4s, v0.4h, #16
@@ -1142,11 +1142,11 @@ define <8 x i1> @test_fcmp_uge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
define <8 x i1> @test_fcmp_ult(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-LABEL: test_fcmp_ult:
; CHECK: // %bb.0:
-; CHECK-NEXT: dup v2.4h, v1.h[1]
-; CHECK-NEXT: dup v3.4h, v0.h[1]
-; CHECK-NEXT: dup v4.4h, v1.h[2]
-; CHECK-NEXT: dup v5.4h, v0.h[2]
-; CHECK-NEXT: dup v6.4h, v0.h[3]
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: mov h4, v1.h[2]
+; CHECK-NEXT: mov h5, v0.h[2]
+; CHECK-NEXT: mov h6, v0.h[3]
; CHECK-NEXT: shll v2.4s, v2.4h, #16
; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: fcmp s3, s2
@@ -1156,34 +1156,34 @@ define <8 x i1> @test_fcmp_ult(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-NEXT: fcmp s3, s2
; CHECK-NEXT: shll v3.4s, v4.4h, #16
; CHECK-NEXT: shll v4.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.4h, v1.h[3]
+; CHECK-NEXT: mov h5, v1.h[3]
; CHECK-NEXT: csetm w9, lt
; CHECK-NEXT: fmov s2, w9
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[4]
-; CHECK-NEXT: dup v6.8h, v0.h[4]
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: shll v4.4s, v5.4h, #16
+; CHECK-NEXT: shll v5.4s, v6.4h, #16
+; CHECK-NEXT: mov h6, v0.h[4]
; CHECK-NEXT: mov v2.h[1], w8
; CHECK-NEXT: csetm w8, lt
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
+; CHECK-NEXT: mov h5, v1.h[5]
; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[5]
-; CHECK-NEXT: dup v6.8h, v0.h[5]
+; CHECK-NEXT: mov h6, v0.h[5]
; CHECK-NEXT: mov v2.h[2], w8
; CHECK-NEXT: csetm w8, lt
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[6]
-; CHECK-NEXT: dup v6.8h, v0.h[6]
-; CHECK-NEXT: dup v1.8h, v1.h[7]
-; CHECK-NEXT: dup v0.8h, v0.h[7]
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: shll v4.4s, v5.4h, #16
+; CHECK-NEXT: shll v5.4s, v6.4h, #16
+; CHECK-NEXT: mov h6, v0.h[6]
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: mov h0, v0.h[7]
; CHECK-NEXT: mov v2.h[3], w8
; CHECK-NEXT: csetm w8, lt
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: shll v4.4s, v6.4h, #16
; CHECK-NEXT: shll v1.4s, v1.4h, #16
; CHECK-NEXT: shll v0.4s, v0.4h, #16
@@ -1205,11 +1205,11 @@ define <8 x i1> @test_fcmp_ult(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
define <8 x i1> @test_fcmp_ule(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-LABEL: test_fcmp_ule:
; CHECK: // %bb.0:
-; CHECK-NEXT: dup v2.4h, v1.h[1]
-; CHECK-NEXT: dup v3.4h, v0.h[1]
-; CHECK-NEXT: dup v4.4h, v1.h[2]
-; CHECK-NEXT: dup v5.4h, v0.h[2]
-; CHECK-NEXT: dup v6.4h, v0.h[3]
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: mov h4, v1.h[2]
+; CHECK-NEXT: mov h5, v0.h[2]
+; CHECK-NEXT: mov h6, v0.h[3]
; CHECK-NEXT: shll v2.4s, v2.4h, #16
; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: fcmp s3, s2
@@ -1219,34 +1219,34 @@ define <8 x i1> @test_fcmp_ule(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-NEXT: fcmp s3, s2
; CHECK-NEXT: shll v3.4s, v4.4h, #16
; CHECK-NEXT: shll v4.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.4h, v1.h[3]
+; CHECK-NEXT: mov h5, v1.h[3]
; CHECK-NEXT: csetm w9, le
; CHECK-NEXT: fmov s2, w9
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[4]
-; CHECK-NEXT: dup v6.8h, v0.h[4]
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: shll v4.4s, v5.4h, #16
+; CHECK-NEXT: shll v5.4s, v6.4h, #16
+; CHECK-NEXT: mov h6, v0.h[4]
; CHECK-NEXT: mov v2.h[1], w8
; CHECK-NEXT: csetm w8, le
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
+; CHECK-NEXT: mov h5, v1.h[5]
; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[5]
-; CHECK-NEXT: dup v6.8h, v0.h[5]
+; CHECK-NEXT: mov h6, v0.h[5]
; CHECK-NEXT: mov v2.h[2], w8
; CHECK-NEXT: csetm w8, le
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[6]
-; CHECK-NEXT: dup v6.8h, v0.h[6]
-; CHECK-NEXT: dup v1.8h, v1.h[7]
-; CHECK-NEXT: dup v0.8h, v0.h[7]
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: shll v4.4s, v5.4h, #16
+; CHECK-NEXT: shll v5.4s, v6.4h, #16
+; CHECK-NEXT: mov h6, v0.h[6]
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: mov h0, v0.h[7]
; CHECK-NEXT: mov v2.h[3], w8
; CHECK-NEXT: csetm w8, le
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: shll v4.4s, v6.4h, #16
; CHECK-NEXT: shll v1.4s, v1.4h, #16
; CHECK-NEXT: shll v0.4s, v0.4h, #16
@@ -1268,11 +1268,11 @@ define <8 x i1> @test_fcmp_ule(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
define <8 x i1> @test_fcmp_uno(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-LABEL: test_fcmp_uno:
; CHECK: // %bb.0:
-; CHECK-NEXT: dup v2.4h, v1.h[1]
-; CHECK-NEXT: dup v3.4h, v0.h[1]
-; CHECK-NEXT: dup v4.4h, v1.h[2]
-; CHECK-NEXT: dup v5.4h, v0.h[2]
-; CHECK-NEXT: dup v6.4h, v0.h[3]
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: mov h4, v1.h[2]
+; CHECK-NEXT: mov h5, v0.h[2]
+; CHECK-NEXT: mov h6, v0.h[3]
; CHECK-NEXT: shll v2.4s, v2.4h, #16
; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: fcmp s3, s2
@@ -1282,34 +1282,34 @@ define <8 x i1> @test_fcmp_uno(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-NEXT: fcmp s3, s2
; CHECK-NEXT: shll v3.4s, v4.4h, #16
; CHECK-NEXT: shll v4.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.4h, v1.h[3]
+; CHECK-NEXT: mov h5, v1.h[3]
; CHECK-NEXT: csetm w9, vs
; CHECK-NEXT: fmov s2, w9
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[4]
-; CHECK-NEXT: dup v6.8h, v0.h[4]
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: shll v4.4s, v5.4h, #16
+; CHECK-NEXT: shll v5.4s, v6.4h, #16
+; CHECK-NEXT: mov h6, v0.h[4]
; CHECK-NEXT: mov v2.h[1], w8
; CHECK-NEXT: csetm w8, vs
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
+; CHECK-NEXT: mov h5, v1.h[5]
; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[5]
-; CHECK-NEXT: dup v6.8h, v0.h[5]
+; CHECK-NEXT: mov h6, v0.h[5]
; CHECK-NEXT: mov v2.h[2], w8
; CHECK-NEXT: csetm w8, vs
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[6]
-; CHECK-NEXT: dup v6.8h, v0.h[6]
-; CHECK-NEXT: dup v1.8h, v1.h[7]
-; CHECK-NEXT: dup v0.8h, v0.h[7]
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: shll v4.4s, v5.4h, #16
+; CHECK-NEXT: shll v5.4s, v6.4h, #16
+; CHECK-NEXT: mov h6, v0.h[6]
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: mov h0, v0.h[7]
; CHECK-NEXT: mov v2.h[3], w8
; CHECK-NEXT: csetm w8, vs
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: shll v4.4s, v6.4h, #16
; CHECK-NEXT: shll v1.4s, v1.4h, #16
; CHECK-NEXT: shll v0.4s, v0.4h, #16
@@ -1331,54 +1331,54 @@ define <8 x i1> @test_fcmp_uno(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
define <8 x i1> @test_fcmp_one(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-LABEL: test_fcmp_one:
; CHECK: // %bb.0:
-; CHECK-NEXT: dup v2.4h, v1.h[1]
-; CHECK-NEXT: dup v3.4h, v0.h[1]
-; CHECK-NEXT: dup v4.4h, v1.h[2]
-; CHECK-NEXT: dup v5.4h, v0.h[2]
-; CHECK-NEXT: dup v6.4h, v0.h[3]
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: mov h4, v1.h[2]
+; CHECK-NEXT: mov h5, v0.h[2]
+; CHECK-NEXT: mov h6, v0.h[3]
; CHECK-NEXT: shll v2.4s, v2.4h, #16
; CHECK-NEXT: shll v3.4s, v3.4h, #16
+; CHECK-NEXT: shll v6.4s, v6.4h, #16
; CHECK-NEXT: fcmp s3, s2
; CHECK-NEXT: shll v2.4s, v1.4h, #16
; CHECK-NEXT: shll v3.4s, v0.4h, #16
; CHECK-NEXT: csetm w8, mi
; CHECK-NEXT: csinv w8, w8, wzr, le
; CHECK-NEXT: fcmp s3, s2
-; CHECK-NEXT: shll v3.4s, v4.4h, #16
+; CHECK-NEXT: shll v2.4s, v4.4h, #16
; CHECK-NEXT: shll v4.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.4h, v1.h[3]
+; CHECK-NEXT: mov h3, v1.h[3]
+; CHECK-NEXT: mov h5, v1.h[4]
; CHECK-NEXT: csetm w9, mi
; CHECK-NEXT: csinv w9, w9, wzr, le
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
+; CHECK-NEXT: fcmp s4, s2
+; CHECK-NEXT: mov h4, v0.h[4]
; CHECK-NEXT: fmov s2, w9
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[4]
-; CHECK-NEXT: dup v6.8h, v0.h[4]
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: mov v2.h[1], w8
; CHECK-NEXT: csetm w8, mi
+; CHECK-NEXT: shll v4.4s, v4.4h, #16
; CHECK-NEXT: csinv w8, w8, wzr, le
-; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: fcmp s6, s3
; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[5]
-; CHECK-NEXT: dup v6.8h, v0.h[5]
+; CHECK-NEXT: mov h5, v1.h[5]
+; CHECK-NEXT: mov h6, v0.h[5]
; CHECK-NEXT: mov v2.h[2], w8
; CHECK-NEXT: csetm w8, mi
; CHECK-NEXT: csinv w8, w8, wzr, le
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[6]
-; CHECK-NEXT: dup v6.8h, v0.h[6]
-; CHECK-NEXT: dup v1.8h, v1.h[7]
-; CHECK-NEXT: dup v0.8h, v0.h[7]
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: shll v5.4s, v5.4h, #16
+; CHECK-NEXT: shll v6.4s, v6.4h, #16
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: mov h0, v0.h[7]
; CHECK-NEXT: mov v2.h[3], w8
; CHECK-NEXT: csetm w8, mi
; CHECK-NEXT: csinv w8, w8, wzr, le
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
+; CHECK-NEXT: shll v4.4s, v4.4h, #16
; CHECK-NEXT: shll v1.4s, v1.4h, #16
; CHECK-NEXT: shll v0.4s, v0.4h, #16
; CHECK-NEXT: mov v2.h[4], w8
@@ -1402,11 +1402,11 @@ define <8 x i1> @test_fcmp_one(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
define <8 x i1> @test_fcmp_oeq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-LABEL: test_fcmp_oeq:
; CHECK: // %bb.0:
-; CHECK-NEXT: dup v2.4h, v1.h[1]
-; CHECK-NEXT: dup v3.4h, v0.h[1]
-; CHECK-NEXT: dup v4.4h, v1.h[2]
-; CHECK-NEXT: dup v5.4h, v0.h[2]
-; CHECK-NEXT: dup v6.4h, v0.h[3]
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: mov h4, v1.h[2]
+; CHECK-NEXT: mov h5, v0.h[2]
+; CHECK-NEXT: mov h6, v0.h[3]
; CHECK-NEXT: shll v2.4s, v2.4h, #16
; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: fcmp s3, s2
@@ -1416,34 +1416,34 @@ define <8 x i1> @test_fcmp_oeq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-NEXT: fcmp s3, s2
; CHECK-NEXT: shll v3.4s, v4.4h, #16
; CHECK-NEXT: shll v4.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.4h, v1.h[3]
+; CHECK-NEXT: mov h5, v1.h[3]
; CHECK-NEXT: csetm w9, eq
; CHECK-NEXT: fmov s2, w9
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[4]
-; CHECK-NEXT: dup v6.8h, v0.h[4]
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: shll v4.4s, v5.4h, #16
+; CHECK-NEXT: shll v5.4s, v6.4h, #16
+; CHECK-NEXT: mov h6, v0.h[4]
; CHECK-NEXT: mov v2.h[1], w8
; CHECK-NEXT: csetm w8, eq
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
+; CHECK-NEXT: mov h5, v1.h[5]
; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[5]
-; CHECK-NEXT: dup v6.8h, v0.h[5]
+; CHECK-NEXT: mov h6, v0.h[5]
; CHECK-NEXT: mov v2.h[2], w8
; CHECK-NEXT: csetm w8, eq
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[6]
-; CHECK-NEXT: dup v6.8h, v0.h[6]
-; CHECK-NEXT: dup v1.8h, v1.h[7]
-; CHECK-NEXT: dup v0.8h, v0.h[7]
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: shll v4.4s, v5.4h, #16
+; CHECK-NEXT: shll v5.4s, v6.4h, #16
+; CHECK-NEXT: mov h6, v0.h[6]
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: mov h0, v0.h[7]
; CHECK-NEXT: mov v2.h[3], w8
; CHECK-NEXT: csetm w8, eq
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: shll v4.4s, v6.4h, #16
; CHECK-NEXT: shll v1.4s, v1.4h, #16
; CHECK-NEXT: shll v0.4s, v0.4h, #16
@@ -1465,11 +1465,11 @@ define <8 x i1> @test_fcmp_oeq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
define <8 x i1> @test_fcmp_ogt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-LABEL: test_fcmp_ogt:
; CHECK: // %bb.0:
-; CHECK-NEXT: dup v2.4h, v1.h[1]
-; CHECK-NEXT: dup v3.4h, v0.h[1]
-; CHECK-NEXT: dup v4.4h, v1.h[2]
-; CHECK-NEXT: dup v5.4h, v0.h[2]
-; CHECK-NEXT: dup v6.4h, v0.h[3]
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: mov h4, v1.h[2]
+; CHECK-NEXT: mov h5, v0.h[2]
+; CHECK-NEXT: mov h6, v0.h[3]
; CHECK-NEXT: shll v2.4s, v2.4h, #16
; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: fcmp s3, s2
@@ -1479,34 +1479,34 @@ define <8 x i1> @test_fcmp_ogt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-NEXT: fcmp s3, s2
; CHECK-NEXT: shll v3.4s, v4.4h, #16
; CHECK-NEXT: shll v4.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.4h, v1.h[3]
+; CHECK-NEXT: mov h5, v1.h[3]
; CHECK-NEXT: csetm w9, gt
; CHECK-NEXT: fmov s2, w9
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[4]
-; CHECK-NEXT: dup v6.8h, v0.h[4]
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: shll v4.4s, v5.4h, #16
+; CHECK-NEXT: shll v5.4s, v6.4h, #16
+; CHECK-NEXT: mov h6, v0.h[4]
; CHECK-NEXT: mov v2.h[1], w8
; CHECK-NEXT: csetm w8, gt
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
+; CHECK-NEXT: mov h5, v1.h[5]
; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[5]
-; CHECK-NEXT: dup v6.8h, v0.h[5]
+; CHECK-NEXT: mov h6, v0.h[5]
; CHECK-NEXT: mov v2.h[2], w8
; CHECK-NEXT: csetm w8, gt
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[6]
-; CHECK-NEXT: dup v6.8h, v0.h[6]
-; CHECK-NEXT: dup v1.8h, v1.h[7]
-; CHECK-NEXT: dup v0.8h, v0.h[7]
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: shll v4.4s, v5.4h, #16
+; CHECK-NEXT: shll v5.4s, v6.4h, #16
+; CHECK-NEXT: mov h6, v0.h[6]
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: mov h0, v0.h[7]
; CHECK-NEXT: mov v2.h[3], w8
; CHECK-NEXT: csetm w8, gt
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: shll v4.4s, v6.4h, #16
; CHECK-NEXT: shll v1.4s, v1.4h, #16
; CHECK-NEXT: shll v0.4s, v0.4h, #16
@@ -1528,11 +1528,11 @@ define <8 x i1> @test_fcmp_ogt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
define <8 x i1> @test_fcmp_oge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-LABEL: test_fcmp_oge:
; CHECK: // %bb.0:
-; CHECK-NEXT: dup v2.4h, v1.h[1]
-; CHECK-NEXT: dup v3.4h, v0.h[1]
-; CHECK-NEXT: dup v4.4h, v1.h[2]
-; CHECK-NEXT: dup v5.4h, v0.h[2]
-; CHECK-NEXT: dup v6.4h, v0.h[3]
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: mov h4, v1.h[2]
+; CHECK-NEXT: mov h5, v0.h[2]
+; CHECK-NEXT: mov h6, v0.h[3]
; CHECK-NEXT: shll v2.4s, v2.4h, #16
; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: fcmp s3, s2
@@ -1542,34 +1542,34 @@ define <8 x i1> @test_fcmp_oge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-NEXT: fcmp s3, s2
; CHECK-NEXT: shll v3.4s, v4.4h, #16
; CHECK-NEXT: shll v4.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.4h, v1.h[3]
+; CHECK-NEXT: mov h5, v1.h[3]
; CHECK-NEXT: csetm w9, ge
; CHECK-NEXT: fmov s2, w9
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[4]
-; CHECK-NEXT: dup v6.8h, v0.h[4]
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: shll v4.4s, v5.4h, #16
+; CHECK-NEXT: shll v5.4s, v6.4h, #16
+; CHECK-NEXT: mov h6, v0.h[4]
; CHECK-NEXT: mov v2.h[1], w8
; CHECK-NEXT: csetm w8, ge
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
+; CHECK-NEXT: mov h5, v1.h[5]
; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[5]
-; CHECK-NEXT: dup v6.8h, v0.h[5]
+; CHECK-NEXT: mov h6, v0.h[5]
; CHECK-NEXT: mov v2.h[2], w8
; CHECK-NEXT: csetm w8, ge
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[6]
-; CHECK-NEXT: dup v6.8h, v0.h[6]
-; CHECK-NEXT: dup v1.8h, v1.h[7]
-; CHECK-NEXT: dup v0.8h, v0.h[7]
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: shll v4.4s, v5.4h, #16
+; CHECK-NEXT: shll v5.4s, v6.4h, #16
+; CHECK-NEXT: mov h6, v0.h[6]
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: mov h0, v0.h[7]
; CHECK-NEXT: mov v2.h[3], w8
; CHECK-NEXT: csetm w8, ge
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: shll v4.4s, v6.4h, #16
; CHECK-NEXT: shll v1.4s, v1.4h, #16
; CHECK-NEXT: shll v0.4s, v0.4h, #16
@@ -1591,11 +1591,11 @@ define <8 x i1> @test_fcmp_oge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
define <8 x i1> @test_fcmp_olt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-LABEL: test_fcmp_olt:
; CHECK: // %bb.0:
-; CHECK-NEXT: dup v2.4h, v1.h[1]
-; CHECK-NEXT: dup v3.4h, v0.h[1]
-; CHECK-NEXT: dup v4.4h, v1.h[2]
-; CHECK-NEXT: dup v5.4h, v0.h[2]
-; CHECK-NEXT: dup v6.4h, v0.h[3]
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: mov h4, v1.h[2]
+; CHECK-NEXT: mov h5, v0.h[2]
+; CHECK-NEXT: mov h6, v0.h[3]
; CHECK-NEXT: shll v2.4s, v2.4h, #16
; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: fcmp s3, s2
@@ -1605,34 +1605,34 @@ define <8 x i1> @test_fcmp_olt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-NEXT: fcmp s3, s2
; CHECK-NEXT: shll v3.4s, v4.4h, #16
; CHECK-NEXT: shll v4.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.4h, v1.h[3]
+; CHECK-NEXT: mov h5, v1.h[3]
; CHECK-NEXT: csetm w9, mi
; CHECK-NEXT: fmov s2, w9
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[4]
-; CHECK-NEXT: dup v6.8h, v0.h[4]
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: shll v4.4s, v5.4h, #16
+; CHECK-NEXT: shll v5.4s, v6.4h, #16
+; CHECK-NEXT: mov h6, v0.h[4]
; CHECK-NEXT: mov v2.h[1], w8
; CHECK-NEXT: csetm w8, mi
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
+; CHECK-NEXT: mov h5, v1.h[5]
; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[5]
-; CHECK-NEXT: dup v6.8h, v0.h[5]
+; CHECK-NEXT: mov h6, v0.h[5]
; CHECK-NEXT: mov v2.h[2], w8
; CHECK-NEXT: csetm w8, mi
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[6]
-; CHECK-NEXT: dup v6.8h, v0.h[6]
-; CHECK-NEXT: dup v1.8h, v1.h[7]
-; CHECK-NEXT: dup v0.8h, v0.h[7]
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: shll v4.4s, v5.4h, #16
+; CHECK-NEXT: shll v5.4s, v6.4h, #16
+; CHECK-NEXT: mov h6, v0.h[6]
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: mov h0, v0.h[7]
; CHECK-NEXT: mov v2.h[3], w8
; CHECK-NEXT: csetm w8, mi
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: shll v4.4s, v6.4h, #16
; CHECK-NEXT: shll v1.4s, v1.4h, #16
; CHECK-NEXT: shll v0.4s, v0.4h, #16
@@ -1654,11 +1654,11 @@ define <8 x i1> @test_fcmp_olt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
define <8 x i1> @test_fcmp_ole(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-LABEL: test_fcmp_ole:
; CHECK: // %bb.0:
-; CHECK-NEXT: dup v2.4h, v1.h[1]
-; CHECK-NEXT: dup v3.4h, v0.h[1]
-; CHECK-NEXT: dup v4.4h, v1.h[2]
-; CHECK-NEXT: dup v5.4h, v0.h[2]
-; CHECK-NEXT: dup v6.4h, v0.h[3]
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: mov h4, v1.h[2]
+; CHECK-NEXT: mov h5, v0.h[2]
+; CHECK-NEXT: mov h6, v0.h[3]
; CHECK-NEXT: shll v2.4s, v2.4h, #16
; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: fcmp s3, s2
@@ -1668,34 +1668,34 @@ define <8 x i1> @test_fcmp_ole(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-NEXT: fcmp s3, s2
; CHECK-NEXT: shll v3.4s, v4.4h, #16
; CHECK-NEXT: shll v4.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.4h, v1.h[3]
+; CHECK-NEXT: mov h5, v1.h[3]
; CHECK-NEXT: csetm w9, ls
; CHECK-NEXT: fmov s2, w9
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[4]
-; CHECK-NEXT: dup v6.8h, v0.h[4]
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: shll v4.4s, v5.4h, #16
+; CHECK-NEXT: shll v5.4s, v6.4h, #16
+; CHECK-NEXT: mov h6, v0.h[4]
; CHECK-NEXT: mov v2.h[1], w8
; CHECK-NEXT: csetm w8, ls
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
+; CHECK-NEXT: mov h5, v1.h[5]
; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[5]
-; CHECK-NEXT: dup v6.8h, v0.h[5]
+; CHECK-NEXT: mov h6, v0.h[5]
; CHECK-NEXT: mov v2.h[2], w8
; CHECK-NEXT: csetm w8, ls
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[6]
-; CHECK-NEXT: dup v6.8h, v0.h[6]
-; CHECK-NEXT: dup v1.8h, v1.h[7]
-; CHECK-NEXT: dup v0.8h, v0.h[7]
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: shll v4.4s, v5.4h, #16
+; CHECK-NEXT: shll v5.4s, v6.4h, #16
+; CHECK-NEXT: mov h6, v0.h[6]
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: mov h0, v0.h[7]
; CHECK-NEXT: mov v2.h[3], w8
; CHECK-NEXT: csetm w8, ls
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: shll v4.4s, v6.4h, #16
; CHECK-NEXT: shll v1.4s, v1.4h, #16
; CHECK-NEXT: shll v0.4s, v0.4h, #16
@@ -1717,11 +1717,11 @@ define <8 x i1> @test_fcmp_ole(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
define <8 x i1> @test_fcmp_ord(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-LABEL: test_fcmp_ord:
; CHECK: // %bb.0:
-; CHECK-NEXT: dup v2.4h, v1.h[1]
-; CHECK-NEXT: dup v3.4h, v0.h[1]
-; CHECK-NEXT: dup v4.4h, v1.h[2]
-; CHECK-NEXT: dup v5.4h, v0.h[2]
-; CHECK-NEXT: dup v6.4h, v0.h[3]
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: mov h4, v1.h[2]
+; CHECK-NEXT: mov h5, v0.h[2]
+; CHECK-NEXT: mov h6, v0.h[3]
; CHECK-NEXT: shll v2.4s, v2.4h, #16
; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: fcmp s3, s2
@@ -1731,34 +1731,34 @@ define <8 x i1> @test_fcmp_ord(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
; CHECK-NEXT: fcmp s3, s2
; CHECK-NEXT: shll v3.4s, v4.4h, #16
; CHECK-NEXT: shll v4.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.4h, v1.h[3]
+; CHECK-NEXT: mov h5, v1.h[3]
; CHECK-NEXT: csetm w9, vc
; CHECK-NEXT: fmov s2, w9
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[4]
-; CHECK-NEXT: dup v6.8h, v0.h[4]
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: shll v4.4s, v5.4h, #16
+; CHECK-NEXT: shll v5.4s, v6.4h, #16
+; CHECK-NEXT: mov h6, v0.h[4]
; CHECK-NEXT: mov v2.h[1], w8
; CHECK-NEXT: csetm w8, vc
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
+; CHECK-NEXT: mov h5, v1.h[5]
; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[5]
-; CHECK-NEXT: dup v6.8h, v0.h[5]
+; CHECK-NEXT: mov h6, v0.h[5]
; CHECK-NEXT: mov v2.h[2], w8
; CHECK-NEXT: csetm w8, vc
; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
-; CHECK-NEXT: shll v4.4s, v6.4h, #16
-; CHECK-NEXT: dup v5.8h, v1.h[6]
-; CHECK-NEXT: dup v6.8h, v0.h[6]
-; CHECK-NEXT: dup v1.8h, v1.h[7]
-; CHECK-NEXT: dup v0.8h, v0.h[7]
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: shll v4.4s, v5.4h, #16
+; CHECK-NEXT: shll v5.4s, v6.4h, #16
+; CHECK-NEXT: mov h6, v0.h[6]
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: mov h0, v0.h[7]
; CHECK-NEXT: mov v2.h[3], w8
; CHECK-NEXT: csetm w8, vc
-; CHECK-NEXT: fcmp s4, s3
-; CHECK-NEXT: shll v3.4s, v5.4h, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: shll v3.4s, v3.4h, #16
; CHECK-NEXT: shll v4.4s, v6.4h, #16
; CHECK-NEXT: shll v1.4s, v1.4h, #16
; CHECK-NEXT: shll v0.4s, v0.4h, #16
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
index 381c67c6d749e..da6b3bb99dbda 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
@@ -74,30 +74,16 @@ define void @test_copysign_bf16(ptr %ap, ptr %bp) {
;
; NONEON-NOSVE-LABEL: test_copysign_bf16:
; NONEON-NOSVE: // %bb.0:
-; NONEON-NOSVE-NEXT: sub sp, sp, #80
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80
+; NONEON-NOSVE-NEXT: sub sp, sp, #16
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
; NONEON-NOSVE-NEXT: ldr h0, [x0]
; NONEON-NOSVE-NEXT: ldr h1, [x1]
-; NONEON-NOSVE-NEXT: str h0, [sp, #40]
-; NONEON-NOSVE-NEXT: ldr d0, [sp, #40]
-; NONEON-NOSVE-NEXT: str h1, [sp, #76]
-; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0
-; NONEON-NOSVE-NEXT: str q0, [sp]
-; NONEON-NOSVE-NEXT: ldr w8, [sp, #12]
-; NONEON-NOSVE-NEXT: lsl w9, w8, #16
-; NONEON-NOSVE-NEXT: ldr w8, [sp, #8]
-; NONEON-NOSVE-NEXT: lsl w8, w8, #16
-; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24]
-; NONEON-NOSVE-NEXT: ldr w8, [sp, #4]
-; NONEON-NOSVE-NEXT: lsl w9, w8, #16
-; NONEON-NOSVE-NEXT: ldr w8, [sp]
+; NONEON-NOSVE-NEXT: fmov w8, s0
+; NONEON-NOSVE-NEXT: str h1, [sp, #12]
; NONEON-NOSVE-NEXT: lsl w8, w8, #16
-; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16]
-; NONEON-NOSVE-NEXT: ldrb w8, [sp, #77]
-; NONEON-NOSVE-NEXT: ldr q0, [sp, #16]
+; NONEON-NOSVE-NEXT: fmov s0, w8
+; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13]
; NONEON-NOSVE-NEXT: tst w8, #0x80
-; NONEON-NOSVE-NEXT: str q0, [sp, #48]
-; NONEON-NOSVE-NEXT: ldr s0, [sp, #48]
; NONEON-NOSVE-NEXT: fabs s0, s0
; NONEON-NOSVE-NEXT: fneg s1, s0
; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne
@@ -105,7 +91,7 @@ define void @test_copysign_bf16(ptr %ap, ptr %bp) {
; NONEON-NOSVE-NEXT: lsr w8, w8, #16
; NONEON-NOSVE-NEXT: fmov s0, w8
; NONEON-NOSVE-NEXT: str h0, [x0]
-; NONEON-NOSVE-NEXT: add sp, sp, #80
+; NONEON-NOSVE-NEXT: add sp, sp, #16
; NONEON-NOSVE-NEXT: ret
%a = load bfloat, ptr %ap
%b = load bfloat, ptr %bp
>From 49c750112f3b30bc269fed7814950e067eb82032 Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn at arm.com>
Date: Mon, 28 Apr 2025 12:32:16 +0100
Subject: [PATCH 2/3] Update tests now that PR#131345 has been merged.
---
.../test/CodeGen/AArch64/bf16-instructions.ll | 9 +-
llvm/test/CodeGen/AArch64/bf16_fast_math.ll | 190 +++++++-----------
2 files changed, 76 insertions(+), 123 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/bf16-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-instructions.ll
index 1dd883580715e..9f002b1e0da55 100644
--- a/llvm/test/CodeGen/AArch64/bf16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-instructions.ll
@@ -202,16 +202,13 @@ define bfloat @test_fmadd(bfloat %a, bfloat %b, bfloat %c) #0 {
;
; CHECK-BF16-LABEL: test_fmadd:
; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $d2
; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1
; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0
-; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $d2
; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16
; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16
-; CHECK-BF16-NEXT: fmul s0, s0, s1
-; CHECK-BF16-NEXT: shll v1.4s, v2.4h, #16
-; CHECK-BF16-NEXT: bfcvt h0, s0
-; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16
-; CHECK-BF16-NEXT: fadd s0, s0, s1
+; CHECK-BF16-NEXT: shll v2.4s, v2.4h, #16
+; CHECK-BF16-NEXT: fmadd s0, s0, s1, s2
; CHECK-BF16-NEXT: bfcvt h0, s0
; CHECK-BF16-NEXT: ret
%mul = fmul fast bfloat %a, %b
diff --git a/llvm/test/CodeGen/AArch64/bf16_fast_math.ll b/llvm/test/CodeGen/AArch64/bf16_fast_math.ll
index 7d7fb67ca2f77..871ca12c9de77 100644
--- a/llvm/test/CodeGen/AArch64/bf16_fast_math.ll
+++ b/llvm/test/CodeGen/AArch64/bf16_fast_math.ll
@@ -4,8 +4,6 @@
; Check that the output instructions have the same fast math flags as the input
; fadd, even when bf16 is legalized to f32.
-; FIXME: Conversion from float to bf16 is done via a vector type for some
-; reason, when we should just be using scalar instructions.
define bfloat @normal_fadd(bfloat %x, bfloat %y) {
; CHECK-NOBF16-LABEL: name: normal_fadd
@@ -14,13 +12,11 @@ define bfloat @normal_fadd(bfloat %x, bfloat %y) {
; CHECK-NOBF16-NEXT: {{ $}}
; CHECK-NOBF16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h1
; CHECK-NOBF16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h0
- ; CHECK-NOBF16-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.hsub
- ; CHECK-NOBF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG]]
+ ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub
+ ; CHECK-NOBF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG]]
; CHECK-NOBF16-NEXT: [[COPY2:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_]].ssub
- ; CHECK-NOBF16-NEXT: [[DEF1:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF1]], [[COPY1]], %subreg.hsub
- ; CHECK-NOBF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG1]]
+ ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub
+ ; CHECK-NOBF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG1]]
; CHECK-NOBF16-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_1]].ssub
; CHECK-NOBF16-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = nofpexcept FADDSrr killed [[COPY3]], killed [[COPY2]], implicit $fpcr
; CHECK-NOBF16-NEXT: [[COPY4:%[0-9]+]]:gpr32 = COPY [[FADDSrr]]
@@ -40,13 +36,11 @@ define bfloat @normal_fadd(bfloat %x, bfloat %y) {
; CHECK-BF16-NEXT: {{ $}}
; CHECK-BF16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h1
; CHECK-BF16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h0
- ; CHECK-BF16-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-BF16-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.hsub
- ; CHECK-BF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG]]
+ ; CHECK-BF16-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub
+ ; CHECK-BF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG]]
; CHECK-BF16-NEXT: [[COPY2:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_]].ssub
- ; CHECK-BF16-NEXT: [[DEF1:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-BF16-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF1]], [[COPY1]], %subreg.hsub
- ; CHECK-BF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG1]]
+ ; CHECK-BF16-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub
+ ; CHECK-BF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG1]]
; CHECK-BF16-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_1]].ssub
; CHECK-BF16-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = nofpexcept FADDSrr killed [[COPY3]], killed [[COPY2]], implicit $fpcr
; CHECK-BF16-NEXT: [[BFCVT:%[0-9]+]]:fpr16 = nofpexcept BFCVT killed [[FADDSrr]], implicit $fpcr
@@ -64,13 +58,11 @@ define bfloat @fast_fadd(bfloat %x, bfloat %y) {
; CHECK-NOBF16-NEXT: {{ $}}
; CHECK-NOBF16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h1
; CHECK-NOBF16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h0
- ; CHECK-NOBF16-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.hsub
- ; CHECK-NOBF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG]]
+ ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub
+ ; CHECK-NOBF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG]]
; CHECK-NOBF16-NEXT: [[COPY2:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_]].ssub
- ; CHECK-NOBF16-NEXT: [[DEF1:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF1]], [[COPY1]], %subreg.hsub
- ; CHECK-NOBF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG1]]
+ ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub
+ ; CHECK-NOBF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG1]]
; CHECK-NOBF16-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_1]].ssub
; CHECK-NOBF16-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept FADDSrr killed [[COPY3]], killed [[COPY2]], implicit $fpcr
; CHECK-NOBF16-NEXT: [[COPY4:%[0-9]+]]:gpr32 = COPY [[FADDSrr]]
@@ -90,13 +82,11 @@ define bfloat @fast_fadd(bfloat %x, bfloat %y) {
; CHECK-BF16-NEXT: {{ $}}
; CHECK-BF16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h1
; CHECK-BF16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h0
- ; CHECK-BF16-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-BF16-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.hsub
- ; CHECK-BF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG]]
+ ; CHECK-BF16-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub
+ ; CHECK-BF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG]]
; CHECK-BF16-NEXT: [[COPY2:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_]].ssub
- ; CHECK-BF16-NEXT: [[DEF1:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-BF16-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF1]], [[COPY1]], %subreg.hsub
- ; CHECK-BF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG1]]
+ ; CHECK-BF16-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub
+ ; CHECK-BF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG1]]
; CHECK-BF16-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_1]].ssub
; CHECK-BF16-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept FADDSrr killed [[COPY3]], killed [[COPY2]], implicit $fpcr
; CHECK-BF16-NEXT: [[BFCVT:%[0-9]+]]:fpr16 = nnan ninf nsz arcp contract afn reassoc nofpexcept BFCVT killed [[FADDSrr]], implicit $fpcr
@@ -114,13 +104,11 @@ define bfloat @ninf_fadd(bfloat %x, bfloat %y) {
; CHECK-NOBF16-NEXT: {{ $}}
; CHECK-NOBF16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h1
; CHECK-NOBF16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h0
- ; CHECK-NOBF16-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.hsub
- ; CHECK-NOBF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG]]
+ ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub
+ ; CHECK-NOBF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG]]
; CHECK-NOBF16-NEXT: [[COPY2:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_]].ssub
- ; CHECK-NOBF16-NEXT: [[DEF1:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF1]], [[COPY1]], %subreg.hsub
- ; CHECK-NOBF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG1]]
+ ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub
+ ; CHECK-NOBF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG1]]
; CHECK-NOBF16-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_1]].ssub
; CHECK-NOBF16-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = ninf nofpexcept FADDSrr killed [[COPY3]], killed [[COPY2]], implicit $fpcr
; CHECK-NOBF16-NEXT: [[COPY4:%[0-9]+]]:gpr32 = COPY [[FADDSrr]]
@@ -140,13 +128,11 @@ define bfloat @ninf_fadd(bfloat %x, bfloat %y) {
; CHECK-BF16-NEXT: {{ $}}
; CHECK-BF16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h1
; CHECK-BF16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h0
- ; CHECK-BF16-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-BF16-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.hsub
- ; CHECK-BF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG]]
+ ; CHECK-BF16-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub
+ ; CHECK-BF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG]]
; CHECK-BF16-NEXT: [[COPY2:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_]].ssub
- ; CHECK-BF16-NEXT: [[DEF1:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-BF16-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF1]], [[COPY1]], %subreg.hsub
- ; CHECK-BF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG1]]
+ ; CHECK-BF16-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub
+ ; CHECK-BF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG1]]
; CHECK-BF16-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_1]].ssub
; CHECK-BF16-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = ninf nofpexcept FADDSrr killed [[COPY3]], killed [[COPY2]], implicit $fpcr
; CHECK-BF16-NEXT: [[BFCVT:%[0-9]+]]:fpr16 = ninf nofpexcept BFCVT killed [[FADDSrr]], implicit $fpcr
@@ -159,8 +145,6 @@ entry:
; Check that when we have the right fast math flags the converts in between the
; two fadds are removed.
-; FIXME: The convert from float to bf16 being done by a shift prevents this from
-; happening.
define bfloat @normal_fadd_sequence(bfloat %x, bfloat %y, bfloat %z) {
; CHECK-NOBF16-LABEL: name: normal_fadd_sequence
@@ -170,13 +154,11 @@ define bfloat @normal_fadd_sequence(bfloat %x, bfloat %y, bfloat %z) {
; CHECK-NOBF16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h2
; CHECK-NOBF16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h1
; CHECK-NOBF16-NEXT: [[COPY2:%[0-9]+]]:fpr16 = COPY $h0
- ; CHECK-NOBF16-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.hsub
- ; CHECK-NOBF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG]]
+ ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub
+ ; CHECK-NOBF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG]]
; CHECK-NOBF16-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_]].ssub
- ; CHECK-NOBF16-NEXT: [[DEF1:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF1]], [[COPY2]], %subreg.hsub
- ; CHECK-NOBF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG1]]
+ ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY2]], %subreg.hsub
+ ; CHECK-NOBF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG1]]
; CHECK-NOBF16-NEXT: [[COPY4:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_1]].ssub
; CHECK-NOBF16-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = nofpexcept FADDSrr killed [[COPY4]], killed [[COPY3]], implicit $fpcr
; CHECK-NOBF16-NEXT: [[COPY5:%[0-9]+]]:gpr32 = COPY [[FADDSrr]]
@@ -187,13 +169,11 @@ define bfloat @normal_fadd_sequence(bfloat %x, bfloat %y, bfloat %z) {
; CHECK-NOBF16-NEXT: [[UBFMWri1:%[0-9]+]]:gpr32 = UBFMWri killed [[ADDWrr1]], 16, 31
; CHECK-NOBF16-NEXT: [[COPY6:%[0-9]+]]:fpr32 = COPY [[UBFMWri1]]
; CHECK-NOBF16-NEXT: [[COPY7:%[0-9]+]]:fpr16 = COPY [[COPY6]].hsub
- ; CHECK-NOBF16-NEXT: [[DEF2:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF2]], killed [[COPY7]], %subreg.hsub
- ; CHECK-NOBF16-NEXT: [[SHLLv4i16_2:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG2]]
+ ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG2:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, killed [[COPY7]], %subreg.hsub
+ ; CHECK-NOBF16-NEXT: [[SHLLv4i16_2:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG2]]
; CHECK-NOBF16-NEXT: [[COPY8:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_2]].ssub
- ; CHECK-NOBF16-NEXT: [[DEF3:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF3]], [[COPY]], %subreg.hsub
- ; CHECK-NOBF16-NEXT: [[SHLLv4i16_3:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG3]]
+ ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG3:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub
+ ; CHECK-NOBF16-NEXT: [[SHLLv4i16_3:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG3]]
; CHECK-NOBF16-NEXT: [[COPY9:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_3]].ssub
; CHECK-NOBF16-NEXT: [[FADDSrr1:%[0-9]+]]:fpr32 = nofpexcept FADDSrr killed [[COPY8]], killed [[COPY9]], implicit $fpcr
; CHECK-NOBF16-NEXT: [[COPY10:%[0-9]+]]:gpr32 = COPY [[FADDSrr1]]
@@ -213,23 +193,19 @@ define bfloat @normal_fadd_sequence(bfloat %x, bfloat %y, bfloat %z) {
; CHECK-BF16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h2
; CHECK-BF16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h1
; CHECK-BF16-NEXT: [[COPY2:%[0-9]+]]:fpr16 = COPY $h0
- ; CHECK-BF16-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-BF16-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.hsub
- ; CHECK-BF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG]]
+ ; CHECK-BF16-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub
+ ; CHECK-BF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG]]
; CHECK-BF16-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_]].ssub
- ; CHECK-BF16-NEXT: [[DEF1:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-BF16-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF1]], [[COPY2]], %subreg.hsub
- ; CHECK-BF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG1]]
+ ; CHECK-BF16-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY2]], %subreg.hsub
+ ; CHECK-BF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG1]]
; CHECK-BF16-NEXT: [[COPY4:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_1]].ssub
; CHECK-BF16-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = nofpexcept FADDSrr killed [[COPY4]], killed [[COPY3]], implicit $fpcr
; CHECK-BF16-NEXT: [[BFCVT:%[0-9]+]]:fpr16 = nofpexcept BFCVT killed [[FADDSrr]], implicit $fpcr
- ; CHECK-BF16-NEXT: [[DEF2:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-BF16-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF2]], killed [[BFCVT]], %subreg.hsub
- ; CHECK-BF16-NEXT: [[SHLLv4i16_2:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG2]]
+ ; CHECK-BF16-NEXT: [[SUBREG_TO_REG2:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, killed [[BFCVT]], %subreg.hsub
+ ; CHECK-BF16-NEXT: [[SHLLv4i16_2:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG2]]
; CHECK-BF16-NEXT: [[COPY5:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_2]].ssub
- ; CHECK-BF16-NEXT: [[DEF3:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-BF16-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF3]], [[COPY]], %subreg.hsub
- ; CHECK-BF16-NEXT: [[SHLLv4i16_3:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG3]]
+ ; CHECK-BF16-NEXT: [[SUBREG_TO_REG3:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub
+ ; CHECK-BF16-NEXT: [[SHLLv4i16_3:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG3]]
; CHECK-BF16-NEXT: [[COPY6:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_3]].ssub
; CHECK-BF16-NEXT: [[FADDSrr1:%[0-9]+]]:fpr32 = nofpexcept FADDSrr killed [[COPY5]], killed [[COPY6]], implicit $fpcr
; CHECK-BF16-NEXT: [[BFCVT1:%[0-9]+]]:fpr16 = nofpexcept BFCVT killed [[FADDSrr1]], implicit $fpcr
@@ -249,13 +225,11 @@ define bfloat @nnan_ninf_contract_fadd_sequence(bfloat %x, bfloat %y, bfloat %z)
; CHECK-NOBF16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h2
; CHECK-NOBF16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h1
; CHECK-NOBF16-NEXT: [[COPY2:%[0-9]+]]:fpr16 = COPY $h0
- ; CHECK-NOBF16-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.hsub
- ; CHECK-NOBF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG]]
+ ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub
+ ; CHECK-NOBF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG]]
; CHECK-NOBF16-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_]].ssub
- ; CHECK-NOBF16-NEXT: [[DEF1:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF1]], [[COPY2]], %subreg.hsub
- ; CHECK-NOBF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG1]]
+ ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY2]], %subreg.hsub
+ ; CHECK-NOBF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG1]]
; CHECK-NOBF16-NEXT: [[COPY4:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_1]].ssub
; CHECK-NOBF16-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = nnan ninf contract nofpexcept FADDSrr killed [[COPY4]], killed [[COPY3]], implicit $fpcr
; CHECK-NOBF16-NEXT: [[COPY5:%[0-9]+]]:gpr32 = COPY [[FADDSrr]]
@@ -266,13 +240,11 @@ define bfloat @nnan_ninf_contract_fadd_sequence(bfloat %x, bfloat %y, bfloat %z)
; CHECK-NOBF16-NEXT: [[UBFMWri1:%[0-9]+]]:gpr32 = UBFMWri killed [[ADDWrr1]], 16, 31
; CHECK-NOBF16-NEXT: [[COPY6:%[0-9]+]]:fpr32 = COPY [[UBFMWri1]]
; CHECK-NOBF16-NEXT: [[COPY7:%[0-9]+]]:fpr16 = COPY [[COPY6]].hsub
- ; CHECK-NOBF16-NEXT: [[DEF2:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF2]], killed [[COPY7]], %subreg.hsub
- ; CHECK-NOBF16-NEXT: [[SHLLv4i16_2:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG2]]
+ ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG2:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, killed [[COPY7]], %subreg.hsub
+ ; CHECK-NOBF16-NEXT: [[SHLLv4i16_2:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG2]]
; CHECK-NOBF16-NEXT: [[COPY8:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_2]].ssub
- ; CHECK-NOBF16-NEXT: [[DEF3:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF3]], [[COPY]], %subreg.hsub
- ; CHECK-NOBF16-NEXT: [[SHLLv4i16_3:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG3]]
+ ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG3:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub
+ ; CHECK-NOBF16-NEXT: [[SHLLv4i16_3:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG3]]
; CHECK-NOBF16-NEXT: [[COPY9:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_3]].ssub
; CHECK-NOBF16-NEXT: [[FADDSrr1:%[0-9]+]]:fpr32 = nnan ninf contract nofpexcept FADDSrr killed [[COPY8]], killed [[COPY9]], implicit $fpcr
; CHECK-NOBF16-NEXT: [[COPY10:%[0-9]+]]:gpr32 = COPY [[FADDSrr1]]
@@ -292,27 +264,19 @@ define bfloat @nnan_ninf_contract_fadd_sequence(bfloat %x, bfloat %y, bfloat %z)
; CHECK-BF16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h2
; CHECK-BF16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h1
; CHECK-BF16-NEXT: [[COPY2:%[0-9]+]]:fpr16 = COPY $h0
- ; CHECK-BF16-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-BF16-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.hsub
- ; CHECK-BF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG]]
+ ; CHECK-BF16-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub
+ ; CHECK-BF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG]]
; CHECK-BF16-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_]].ssub
- ; CHECK-BF16-NEXT: [[DEF1:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-BF16-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF1]], [[COPY2]], %subreg.hsub
- ; CHECK-BF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG1]]
+ ; CHECK-BF16-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY2]], %subreg.hsub
+ ; CHECK-BF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG1]]
; CHECK-BF16-NEXT: [[COPY4:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_1]].ssub
; CHECK-BF16-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = nnan ninf contract nofpexcept FADDSrr killed [[COPY4]], killed [[COPY3]], implicit $fpcr
- ; CHECK-BF16-NEXT: [[BFCVT:%[0-9]+]]:fpr16 = nnan ninf contract nofpexcept BFCVT killed [[FADDSrr]], implicit $fpcr
- ; CHECK-BF16-NEXT: [[DEF2:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-BF16-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF2]], killed [[BFCVT]], %subreg.hsub
- ; CHECK-BF16-NEXT: [[SHLLv4i16_2:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG2]]
+ ; CHECK-BF16-NEXT: [[SUBREG_TO_REG2:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub
+ ; CHECK-BF16-NEXT: [[SHLLv4i16_2:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG2]]
; CHECK-BF16-NEXT: [[COPY5:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_2]].ssub
- ; CHECK-BF16-NEXT: [[DEF3:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-BF16-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF3]], [[COPY]], %subreg.hsub
- ; CHECK-BF16-NEXT: [[SHLLv4i16_3:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG3]]
- ; CHECK-BF16-NEXT: [[COPY6:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_3]].ssub
- ; CHECK-BF16-NEXT: [[FADDSrr1:%[0-9]+]]:fpr32 = nnan ninf contract nofpexcept FADDSrr killed [[COPY5]], killed [[COPY6]], implicit $fpcr
- ; CHECK-BF16-NEXT: [[BFCVT1:%[0-9]+]]:fpr16 = nnan ninf contract nofpexcept BFCVT killed [[FADDSrr1]], implicit $fpcr
- ; CHECK-BF16-NEXT: $h0 = COPY [[BFCVT1]]
+ ; CHECK-BF16-NEXT: [[FADDSrr1:%[0-9]+]]:fpr32 = nnan ninf contract nofpexcept FADDSrr killed [[FADDSrr]], killed [[COPY5]], implicit $fpcr
+ ; CHECK-BF16-NEXT: [[BFCVT:%[0-9]+]]:fpr16 = nnan ninf contract nofpexcept BFCVT killed [[FADDSrr1]], implicit $fpcr
+ ; CHECK-BF16-NEXT: $h0 = COPY [[BFCVT]]
; CHECK-BF16-NEXT: RET_ReallyLR implicit $h0
entry:
%add1 = fadd nnan ninf contract bfloat %x, %y
@@ -328,13 +292,11 @@ define bfloat @ninf_fadd_sequence(bfloat %x, bfloat %y, bfloat %z) {
; CHECK-NOBF16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h2
; CHECK-NOBF16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h1
; CHECK-NOBF16-NEXT: [[COPY2:%[0-9]+]]:fpr16 = COPY $h0
- ; CHECK-NOBF16-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.hsub
- ; CHECK-NOBF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG]]
+ ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub
+ ; CHECK-NOBF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG]]
; CHECK-NOBF16-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_]].ssub
- ; CHECK-NOBF16-NEXT: [[DEF1:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF1]], [[COPY2]], %subreg.hsub
- ; CHECK-NOBF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG1]]
+ ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY2]], %subreg.hsub
+ ; CHECK-NOBF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG1]]
; CHECK-NOBF16-NEXT: [[COPY4:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_1]].ssub
; CHECK-NOBF16-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = ninf nofpexcept FADDSrr killed [[COPY4]], killed [[COPY3]], implicit $fpcr
; CHECK-NOBF16-NEXT: [[COPY5:%[0-9]+]]:gpr32 = COPY [[FADDSrr]]
@@ -345,13 +307,11 @@ define bfloat @ninf_fadd_sequence(bfloat %x, bfloat %y, bfloat %z) {
; CHECK-NOBF16-NEXT: [[UBFMWri1:%[0-9]+]]:gpr32 = UBFMWri killed [[ADDWrr1]], 16, 31
; CHECK-NOBF16-NEXT: [[COPY6:%[0-9]+]]:fpr32 = COPY [[UBFMWri1]]
; CHECK-NOBF16-NEXT: [[COPY7:%[0-9]+]]:fpr16 = COPY [[COPY6]].hsub
- ; CHECK-NOBF16-NEXT: [[DEF2:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF2]], killed [[COPY7]], %subreg.hsub
- ; CHECK-NOBF16-NEXT: [[SHLLv4i16_2:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG2]]
+ ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG2:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, killed [[COPY7]], %subreg.hsub
+ ; CHECK-NOBF16-NEXT: [[SHLLv4i16_2:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG2]]
; CHECK-NOBF16-NEXT: [[COPY8:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_2]].ssub
- ; CHECK-NOBF16-NEXT: [[DEF3:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF3]], [[COPY]], %subreg.hsub
- ; CHECK-NOBF16-NEXT: [[SHLLv4i16_3:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG3]]
+ ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG3:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub
+ ; CHECK-NOBF16-NEXT: [[SHLLv4i16_3:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG3]]
; CHECK-NOBF16-NEXT: [[COPY9:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_3]].ssub
; CHECK-NOBF16-NEXT: [[FADDSrr1:%[0-9]+]]:fpr32 = ninf nofpexcept FADDSrr killed [[COPY8]], killed [[COPY9]], implicit $fpcr
; CHECK-NOBF16-NEXT: [[COPY10:%[0-9]+]]:gpr32 = COPY [[FADDSrr1]]
@@ -371,23 +331,19 @@ define bfloat @ninf_fadd_sequence(bfloat %x, bfloat %y, bfloat %z) {
; CHECK-BF16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h2
; CHECK-BF16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h1
; CHECK-BF16-NEXT: [[COPY2:%[0-9]+]]:fpr16 = COPY $h0
- ; CHECK-BF16-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-BF16-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.hsub
- ; CHECK-BF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG]]
+ ; CHECK-BF16-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub
+ ; CHECK-BF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG]]
; CHECK-BF16-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_]].ssub
- ; CHECK-BF16-NEXT: [[DEF1:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-BF16-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF1]], [[COPY2]], %subreg.hsub
- ; CHECK-BF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG1]]
+ ; CHECK-BF16-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY2]], %subreg.hsub
+ ; CHECK-BF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG1]]
; CHECK-BF16-NEXT: [[COPY4:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_1]].ssub
; CHECK-BF16-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = ninf nofpexcept FADDSrr killed [[COPY4]], killed [[COPY3]], implicit $fpcr
; CHECK-BF16-NEXT: [[BFCVT:%[0-9]+]]:fpr16 = ninf nofpexcept BFCVT killed [[FADDSrr]], implicit $fpcr
- ; CHECK-BF16-NEXT: [[DEF2:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-BF16-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF2]], killed [[BFCVT]], %subreg.hsub
- ; CHECK-BF16-NEXT: [[SHLLv4i16_2:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG2]]
+ ; CHECK-BF16-NEXT: [[SUBREG_TO_REG2:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, killed [[BFCVT]], %subreg.hsub
+ ; CHECK-BF16-NEXT: [[SHLLv4i16_2:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG2]]
; CHECK-BF16-NEXT: [[COPY5:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_2]].ssub
- ; CHECK-BF16-NEXT: [[DEF3:%[0-9]+]]:fpr64 = IMPLICIT_DEF
- ; CHECK-BF16-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF3]], [[COPY]], %subreg.hsub
- ; CHECK-BF16-NEXT: [[SHLLv4i16_3:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG3]]
+ ; CHECK-BF16-NEXT: [[SUBREG_TO_REG3:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub
+ ; CHECK-BF16-NEXT: [[SHLLv4i16_3:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG3]]
; CHECK-BF16-NEXT: [[COPY6:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_3]].ssub
; CHECK-BF16-NEXT: [[FADDSrr1:%[0-9]+]]:fpr32 = ninf nofpexcept FADDSrr killed [[COPY5]], killed [[COPY6]], implicit $fpcr
; CHECK-BF16-NEXT: [[BFCVT1:%[0-9]+]]:fpr16 = ninf nofpexcept BFCVT killed [[FADDSrr1]], implicit $fpcr
>From 302aa53f0425bc07f9c9d6b508127a42b135ae75 Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn at arm.com>
Date: Mon, 28 Apr 2025 14:28:39 +0100
Subject: [PATCH 3/3] Add an explicit COPY_TO_REGCLASS in the no-NEON pattern
---
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 3562406738c93..9e210f6161f09 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -8527,7 +8527,9 @@ def : Pat<(v4f32 (any_fpextend (extract_high_v8bf16 (v8bf16 V128:$Rn)))),
// Fallback pattern for when we don't have NEON
def : Pat<(f32 (any_fpextend (bf16 FPR16:$Rn))),
(f32 (COPY_TO_REGCLASS
- (i32 (UBFMWri (i32 (SUBREG_TO_REG (i32 0), (bf16 FPR16:$Rn), hsub)),
+ (i32 (UBFMWri (COPY_TO_REGCLASS
+ (f32 (SUBREG_TO_REG (i32 0), (bf16 FPR16:$Rn), hsub)),
+ GPR32),
(i64 16), (i64 15))),
FPR32))>;
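
As a rough illustration (a minimal sketch, not part of the patch; the function name
is hypothetical), the kind of input that reaches this no-NEON fallback pattern is a
scalar bf16-to-f32 fpext compiled without NEON, e.g. something like
llc -mtriple=aarch64 -mattr=-neon on:

  ; hypothetical test function exercising the no-NEON fallback path
  define float @bf16_to_f32(bfloat %x) {
    %ext = fpext bfloat %x to float
    ret float %ext
  }

The UBFMWri with immr=16 and imms=15 is the 32-bit LSL #16 that moves the bf16 bits
into the top half of the f32, and the added inner COPY_TO_REGCLASS makes explicit
that the shift operand lives in a GPR32 rather than an FPR32.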