[llvm] [NVPTX] support packed f32 instructions for sm_100+ (PR #126337)
Princeton Ferro via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 14 19:00:59 PDT 2025
https://github.com/Prince781 updated https://github.com/llvm/llvm-project/pull/126337
>From 0b21c4a2a977f525460666a5a2307eee198f5e0f Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Sat, 8 Feb 2025 15:52:25 -0800
Subject: [PATCH 01/25] legalize v2f32 as i64 reg and add test cases
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 1 +
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 12 +-
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 10 +-
llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td | 4 +-
llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 4 +
llvm/test/CodeGen/NVPTX/f32x2-instructions.ll | 390 ++++++++++++++++++
6 files changed, 416 insertions(+), 5 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 4ce8c508c5f2b..77bf8e63f9fe1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1027,6 +1027,7 @@ pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
case MVT::i32:
return Opcode_i32;
case MVT::i64:
+ case MVT::v2f32:
return Opcode_i64;
case MVT::f16:
case MVT::bf16:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index b768725b04256..a57fde442d905 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -290,8 +290,8 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
// TargetLoweringBase::getVectorTypeBreakdown() which is invoked in
// ComputePTXValueVTs() cannot currently break down non-power-of-2 sized
// vectors.
- if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0 &&
- isPowerOf2_32(NumElts)) {
+ if ((Is16bitsType(EltVT.getSimpleVT()) || EltVT == MVT::f32) &&
+ NumElts % 2 == 0 && isPowerOf2_32(NumElts)) {
// Vectors with an even number of f16 elements will be passed to
// us as an array of v2f16/v2bf16 elements. We must match this so we
// stay in sync with Ins/Outs.
@@ -305,6 +305,9 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
case MVT::i16:
EltVT = MVT::v2i16;
break;
+ case MVT::f32:
+ EltVT = MVT::v2f32;
+ break;
default:
llvm_unreachable("Unexpected type");
}
@@ -578,6 +581,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass);
addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass);
addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass);
+ addRegisterClass(MVT::v2f32, &NVPTX::Int64RegsRegClass);
// Conversion to/from FP16/FP16x2 is always legal.
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
@@ -840,6 +844,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
if (getOperationAction(Op, MVT::bf16) == Promote)
AddPromotedToType(Op, MVT::bf16, MVT::f32);
+ if (STI.hasF32x2Instructions())
+ setOperationAction(Op, MVT::v2f32, Legal);
}
// On SM80, we select add/mul/sub as fma to avoid promotion to float
@@ -3315,6 +3321,8 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
// vectors which contain v2f16 or v2bf16 elements. So we must load
// using i32 here and then bitcast back.
LoadVT = MVT::i32;
+ else if (EltVT == MVT::v2f32)
+ LoadVT = MVT::i64;
EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
SDValue VecAddr =
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 3c88551d7b23c..14f1f084f87fe 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -165,6 +165,7 @@ def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">;
def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">;
+def hasF32x2Instructions : Predicate<"Subtarget->hasF32x2Instructions()">;
def True : Predicate<"true">;
def False : Predicate<"false">;
@@ -2631,13 +2632,13 @@ class LastCallArgInstVT<NVPTXRegClass regclass, ValueType vt> :
NVPTXInst<(outs), (ins regclass:$a), "$a",
[(LastCallArg (i32 0), vt:$a)]>;
-def CallArgI64 : CallArgInst<Int64Regs>;
+def CallArgI64 : CallArgInstVT<Int64Regs, i64>;
def CallArgI32 : CallArgInstVT<Int32Regs, i32>;
def CallArgI16 : CallArgInstVT<Int16Regs, i16>;
def CallArgF64 : CallArgInst<Float64Regs>;
def CallArgF32 : CallArgInst<Float32Regs>;
-def LastCallArgI64 : LastCallArgInst<Int64Regs>;
+def LastCallArgI64 : LastCallArgInstVT<Int64Regs, i64>;
def LastCallArgI32 : LastCallArgInstVT<Int32Regs, i32>;
def LastCallArgI16 : LastCallArgInstVT<Int16Regs, i16>;
def LastCallArgF64 : LastCallArgInst<Float64Regs>;
@@ -3154,6 +3155,9 @@ let hasSideEffects = false in {
def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
(ins Float32Regs:$s1, Float32Regs:$s2),
"mov.b64 \t$d, {{$s1, $s2}};", []>;
+ def V2F32toI64 : NVPTXInst<(outs Int64Regs:$d),
+ (ins Float32Regs:$s1, Float32Regs:$s2),
+ "mov.b64 \t$d, {{$s1, $s2}};", []>;
// unpack a larger int register to a set of smaller int registers
def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
@@ -3218,6 +3222,8 @@ def : Pat<(v2bf16 (build_vector bf16:$a, bf16:$b)),
(V2I16toI32 $a, $b)>;
def : Pat<(v2i16 (build_vector i16:$a, i16:$b)),
(V2I16toI32 $a, $b)>;
+def : Pat<(v2f32 (build_vector f32:$a, f32:$b)),
+ (V2F32toI64 $a, $b)>;
def: Pat<(v2i16 (scalar_to_vector i16:$a)),
(CVT_u32_u16 $a, CvtNONE)>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
index 2011f0f7e328f..7630eefe21182 100644
--- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -62,7 +62,9 @@ def Int16Regs : NVPTXRegClass<[i16, f16, bf16], 16, (add (sequence "RS%u", 0, 4)
def Int32Regs : NVPTXRegClass<[i32, v2f16, v2bf16, v2i16, v4i8], 32,
(add (sequence "R%u", 0, 4),
VRFrame32, VRFrameLocal32)>;
-def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 4), VRFrame64, VRFrameLocal64)>;
+def Int64Regs : NVPTXRegClass<[i64, v2f32], 64,
+ (add (sequence "RL%u", 0, 4),
+ VRFrame64, VRFrameLocal64)>;
// 128-bit regs are not defined as general regs in NVPTX. They are used for inlineASM only.
def Int128Regs : NVPTXRegClass<[i128], 128, (add (sequence "RQ%u", 0, 4))>;
def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 0a4fc8d1435be..dd617cbb6ab3c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -112,6 +112,10 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
return HasTcgen05 && PTXVersion >= 86;
}
+ bool hasF32x2Instructions() const {
+ return SmVersion >= 100 && PTXVersion >= 86;
+ }
+
// Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
// terminates a basic block. Instead, it would assume that control flow
// continued to the next instruction. The next instruction could be in the
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
new file mode 100644
index 0000000000000..f449fefe5763e
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
@@ -0,0 +1,390 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; ## Full FP32x2 support enabled by default.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100 \
+; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: | FileCheck --check-prefixes=CHECK-O0 %s
+; RUN: %if ptxas %{ \
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100 \
+; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: | %ptxas-verify -arch=sm_100 \
+; RUN: %}
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100 \
+; RUN: -O3 -verify-machineinstrs \
+; RUN: | FileCheck --check-prefixes=CHECK-O3 %s
+; RUN: %if ptxas %{ \
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100 \
+; RUN: -O3 -verify-machineinstrs \
+; RUN: | %ptxas-verify -arch=sm_100 \
+; RUN: %}
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "nvptx64-nvidia-cuda"
+
+define <2 x float> @test_ret_const() #0 {
+ ret <2 x float> <float 1.0, float 2.0>
+}
+
+define float @test_extract_0(<2 x float> %a) #0 {
+ %e = extractelement <2 x float> %a, i32 0
+ ret float %e
+}
+
+define float @test_extract_1(<2 x float> %a) #0 {
+ %e = extractelement <2 x float> %a, i32 1
+ ret float %e
+}
+
+; NOTE: disabled as -O3 miscompiles this into pointer arithmetic on
+; test_extract_i_param_0 where the symbol's address is not taken first (that
+; is, moved to a temporary)
+; define float @test_extract_i(<2 x float> %a, i64 %idx) #0 {
+; ; CHECK-LABEL: test_extract_i(
+; ; CHECK: {
+; ; CHECK-NEXT: .reg .pred %p<2>;
+; ; CHECK-NEXT: .reg .f32 %f<4>;
+; ; CHECK-NEXT: .reg .b64 %rd<2>;
+; ; CHECK-EMPTY:
+; ; CHECK-NEXT: // %bb.0:
+; ; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_extract_i_param_0];
+; ; CHECK-NEXT: ld.param.u64 %rd1, [test_extract_i_param_1];
+; ; CHECK-NEXT: setp.eq.s64 %p1, %rd1, 0;
+; ; CHECK-NEXT: selp.f32 %f3, %f1, %f2, %p1;
+; ; CHECK-NEXT: st.param.f32 [func_retval0], %f3;
+; ; CHECK-NEXT: ret;
+; %e = extractelement <2 x float> %a, i64 %idx
+; ret float %e
+; }
+
+define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 {
+ %r = fadd <2 x float> %a, %b
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fadd_imm_0(<2 x float> %a) #0 {
+ %r = fadd <2 x float> <float 1.0, float 2.0>, %a
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 {
+ %r = fadd <2 x float> %a, <float 1.0, float 2.0>
+ ret <2 x float> %r
+}
+
+define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 {
+ %r = fadd <4 x float> %a, %b
+ ret <4 x float> %r
+}
+
+define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 {
+ %r = fadd <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %a
+ ret <4 x float> %r
+}
+
+define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 {
+ %r = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
+ ret <4 x float> %r
+}
+
+define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 {
+ %r = fsub <2 x float> %a, %b
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fneg(<2 x float> %a) #0 {
+ %r = fsub <2 x float> <float 0.0, float 0.0>, %a
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fmul(<2 x float> %a, <2 x float> %b) #0 {
+ %r = fmul <2 x float> %a, %b
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
+ %r = call <2 x float> @llvm.fma(<2 x float> %a, <2 x float> %b, <2 x float> %c)
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fdiv(<2 x float> %a, <2 x float> %b) #0 {
+ %r = fdiv <2 x float> %a, %b
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_frem(<2 x float> %a, <2 x float> %b) #0 {
+ %r = frem <2 x float> %a, %b
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fadd_ftz(<2 x float> %a, <2 x float> %b) #2 {
+ %r = fadd <2 x float> %a, %b
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fadd_imm_0_ftz(<2 x float> %a) #2 {
+ %r = fadd <2 x float> <float 1.0, float 2.0>, %a
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fadd_imm_1_ftz(<2 x float> %a) #2 {
+ %r = fadd <2 x float> %a, <float 1.0, float 2.0>
+ ret <2 x float> %r
+}
+
+define <4 x float> @test_fadd_v4_ftz(<4 x float> %a, <4 x float> %b) #2 {
+ %r = fadd <4 x float> %a, %b
+ ret <4 x float> %r
+}
+
+define <4 x float> @test_fadd_imm_0_v4_ftz(<4 x float> %a) #2 {
+ %r = fadd <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %a
+ ret <4 x float> %r
+}
+
+define <4 x float> @test_fadd_imm_1_v4_ftz(<4 x float> %a) #2 {
+ %r = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
+ ret <4 x float> %r
+}
+
+define <2 x float> @test_fsub_ftz(<2 x float> %a, <2 x float> %b) #2 {
+ %r = fsub <2 x float> %a, %b
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fneg_ftz(<2 x float> %a) #2 {
+ %r = fsub <2 x float> <float 0.0, float 0.0>, %a
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fmul_ftz(<2 x float> %a, <2 x float> %b) #2 {
+ %r = fmul <2 x float> %a, %b
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fma_ftz(<2 x float> %a, <2 x float> %b, <2 x float> %c) #2 {
+ %r = call <2 x float> @llvm.fma(<2 x float> %a, <2 x float> %b, <2 x float> %c)
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fdiv_ftz(<2 x float> %a, <2 x float> %b) #2 {
+ %r = fdiv <2 x float> %a, %b
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_frem_ftz(<2 x float> %a, <2 x float> %b) #2 {
+ %r = frem <2 x float> %a, %b
+ ret <2 x float> %r
+}
+
+define void @test_ldst_v2f32(ptr %a, ptr %b) #0 {
+ %t1 = load <2 x float>, ptr %a
+ store <2 x float> %t1, ptr %b, align 32
+ ret void
+}
+
+define void @test_ldst_v3f32(ptr %a, ptr %b) #0 {
+ %t1 = load <3 x float>, ptr %a
+ store <3 x float> %t1, ptr %b, align 32
+ ret void
+}
+
+define void @test_ldst_v4f32(ptr %a, ptr %b) #0 {
+ %t1 = load <4 x float>, ptr %a
+ store <4 x float> %t1, ptr %b, align 32
+ ret void
+}
+
+define void @test_ldst_v8f32(ptr %a, ptr %b) #0 {
+ %t1 = load <8 x float>, ptr %a
+ store <8 x float> %t1, ptr %b, align 32
+ ret void
+}
+
+declare <2 x float> @test_callee(<2 x float> %a, <2 x float> %b) #0
+
+define <2 x float> @test_call(<2 x float> %a, <2 x float> %b) #0 {
+ %r = call <2 x float> @test_callee(<2 x float> %a, <2 x float> %b)
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_call_flipped(<2 x float> %a, <2 x float> %b) #0 {
+ %r = call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a)
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_tailcall_flipped(<2 x float> %a, <2 x float> %b) #0 {
+ %r = tail call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a)
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_select(<2 x float> %a, <2 x float> %b, i1 zeroext %c) #0 {
+ %r = select i1 %c, <2 x float> %a, <2 x float> %b
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_select_cc(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) #0 {
+ %cc = fcmp une <2 x float> %c, %d
+ %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
+ ret <2 x float> %r
+}
+
+define <2 x double> @test_select_cc_f64_f32(<2 x double> %a, <2 x double> %b, <2 x float> %c, <2 x float> %d) #0 {
+ %cc = fcmp une <2 x float> %c, %d
+ %r = select <2 x i1> %cc, <2 x double> %a, <2 x double> %b
+ ret <2 x double> %r
+}
+
+define <2 x float> @test_select_cc_f32_f64(<2 x float> %a, <2 x float> %b, <2 x double> %c, <2 x double> %d) #0 {
+ %cc = fcmp une <2 x double> %c, %d
+ %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
+ ret <2 x float> %r
+}
+
+define <2 x i1> @test_fcmp_une(<2 x float> %a, <2 x float> %b) #0 {
+ %r = fcmp une <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_ueq(<2 x float> %a, <2 x float> %b) #0 {
+ %r = fcmp ueq <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_ugt(<2 x float> %a, <2 x float> %b) #0 {
+ %r = fcmp ugt <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_uge(<2 x float> %a, <2 x float> %b) #0 {
+ %r = fcmp uge <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_ult(<2 x float> %a, <2 x float> %b) #0 {
+ %r = fcmp ult <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_ule(<2 x float> %a, <2 x float> %b) #0 {
+ %r = fcmp ule <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_uno(<2 x float> %a, <2 x float> %b) #0 {
+ %r = fcmp uno <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_one(<2 x float> %a, <2 x float> %b) #0 {
+ %r = fcmp one <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_oeq(<2 x float> %a, <2 x float> %b) #0 {
+ %r = fcmp oeq <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_ogt(<2 x float> %a, <2 x float> %b) #0 {
+ %r = fcmp ogt <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_oge(<2 x float> %a, <2 x float> %b) #0 {
+ %r = fcmp oge <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_olt(<2 x float> %a, <2 x float> %b) #0 {
+ %r = fcmp olt <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_ole(<2 x float> %a, <2 x float> %b) #0 {
+ %r = fcmp ole <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i1> @test_fcmp_ord(<2 x float> %a, <2 x float> %b) #0 {
+ %r = fcmp ord <2 x float> %a, %b
+ ret <2 x i1> %r
+}
+
+define <2 x i32> @test_fptosi_i32(<2 x float> %a) #0 {
+ %r = fptosi <2 x float> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
+define <2 x i64> @test_fptosi_i64(<2 x float> %a) #0 {
+ %r = fptosi <2 x float> %a to <2 x i64>
+ ret <2 x i64> %r
+}
+
+define <2 x i32> @test_fptoui_2xi32(<2 x float> %a) #0 {
+ %r = fptoui <2 x float> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
+define <2 x i64> @test_fptoui_2xi64(<2 x float> %a) #0 {
+ %r = fptoui <2 x float> %a to <2 x i64>
+ ret <2 x i64> %r
+}
+
+define <2 x float> @test_uitofp_2xi32(<2 x i32> %a) #0 {
+ %r = uitofp <2 x i32> %a to <2 x float>
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_uitofp_2xi64(<2 x i64> %a) #0 {
+ %r = uitofp <2 x i64> %a to <2 x float>
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_sitofp_2xi32(<2 x i32> %a) #0 {
+ %r = sitofp <2 x i32> %a to <2 x float>
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_sitofp_2xi64(<2 x i64> %a) #0 {
+ %r = sitofp <2 x i64> %a to <2 x float>
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 {
+ %c = uitofp <2 x i32> %a to <2 x float>
+ %r = fadd <2 x float> %b, %c
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_fptrunc_2xdouble(<2 x double> %a) #0 {
+ %r = fptrunc <2 x double> %a to <2 x float>
+ ret <2 x float> %r
+}
+
+define <2 x double> @test_fpext_2xdouble(<2 x float> %a) #0 {
+ %r = fpext <2 x float> %a to <2 x double>
+ ret <2 x double> %r
+}
+
+define <2 x i32> @test_bitcast_2xfloat_to_2xi32(<2 x float> %a) #0 {
+ %r = bitcast <2 x float> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
+define <2 x float> @test_bitcast_2xi32_to_2xfloat(<2 x i32> %a) #0 {
+ %r = bitcast <2 x i32> %a to <2 x float>
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_bitcast_double_to_2xfloat(double %a) #0 {
+ %r = bitcast double %a to <2 x float>
+ ret <2 x float> %r
+}
+
+define double @test_bitcast_2xfloat_to_double(<2 x float> %a) #0 {
+ %r = bitcast <2 x float> %a to double
+ ret double %r
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { "unsafe-fp-math" = "true" }
+attributes #2 = { "denormal-fp-math"="preserve-sign" }
>From 46e0b2e9e392c93c28ed7dedb84586143c77671c Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Sat, 8 Feb 2025 16:27:33 -0800
Subject: [PATCH 02/25] support fadd, fsub, fmul, fma and load on v2f32
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 12 +++++---
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 33 +++++++++++++++++++++
2 files changed, 41 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 77bf8e63f9fe1..269f53e4efcdc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1099,10 +1099,14 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
// Vector Setting
unsigned VecType = NVPTX::PTXLdStInstCode::Scalar;
if (SimpleVT.isVector()) {
- assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) &&
- "Unexpected vector type");
- // v2f16/v2bf16/v2i16 is loaded using ld.b32
- FromTypeWidth = 32;
+ if (Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8)
+ // v2f16/v2bf16/v2i16 is loaded using ld.b32
+ FromTypeWidth = 32;
+ else if (LoadedVT == MVT::v2f32)
+ // v2f32 is loaded using ld.b64
+ FromTypeWidth = 64;
+ else
+ llvm_unreachable("Unexpected vector type");
}
if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 14f1f084f87fe..8b2fbb7db1b11 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -405,6 +405,18 @@ multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
!strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
[(set f32:$dst, (OpNode f32:$a, fpimm:$b))]>,
Requires<[allowFMA]>;
+ def f32x2rr_ftz :
+ NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, Int64Regs:$b),
+ !strconcat(OpcStr, ".ftz.f32x2 \t$dst, $a, $b;"),
+ [(set v2f32:$dst, (OpNode v2f32:$a, v2f32:$b))]>,
+ Requires<[allowFMA, doF32FTZ]>;
+ def f32x2rr :
+ NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, Int64Regs:$b),
+ !strconcat(OpcStr, ".f32x2 \t$dst, $a, $b;"),
+ [(set v2f32:$dst, (OpNode v2f32:$a, v2f32:$b))]>,
+ Requires<[allowFMA]>;
def f16rr_ftz :
NVPTXInst<(outs Int16Regs:$dst),
@@ -529,6 +541,18 @@ multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
!strconcat(OpcStr, ".rn.bf16x2 \t$dst, $a, $b;"),
[(set v2bf16:$dst, (OpNode v2bf16:$a, v2bf16:$b))]>,
Requires<[hasBF16Math, noFMA]>;
+ def _rnf32x2rr_ftz :
+ NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, Int64Regs:$b),
+ !strconcat(OpcStr, ".rn.ftz.f32x2 \t$dst, $a, $b;"),
+ [(set v2f32:$dst, (OpNode v2f32:$a, v2f32:$b))]>,
+ Requires<[hasF32x2Instructions, noFMA, doF32FTZ]>;
+ def _rnf32x2rr :
+ NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, Int64Regs:$b),
+ !strconcat(OpcStr, ".rn.f32x2 \t$dst, $a, $b;"),
+ [(set v2f32:$dst, (OpNode v2f32:$a, v2f32:$b))]>,
+ Requires<[hasF32x2Instructions, noFMA]>;
}
// Template for operations which take two f32 or f64 operands. Provides three
@@ -1432,6 +1456,13 @@ multiclass FMA_BF16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred
Requires<[hasBF16Math, Pred]>;
}
+class FMA_F32x2<string OpcStr, Predicate Pred>
+ : NVPTXInst<(outs Int64Regs:$res),
+ (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
+ OpcStr # ".f32x2 \t$res, $a, $b, $c;",
+ [(set v2f32:$res, (fma v2f32:$a, v2f32:$b, v2f32:$c))]>,
+ Requires<[hasF32x2Instructions, Pred]>;
+
defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", f16, Int16Regs, doF32FTZ>;
defm FMA16 : FMA_F16<"fma.rn.f16", f16, Int16Regs, True>;
defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", v2f16, Int32Regs, doF32FTZ>;
@@ -1440,6 +1471,8 @@ defm BFMA16 : FMA_BF16<"fma.rn.bf16", bf16, Int16Regs, True>;
defm BFMA16x2 : FMA_BF16<"fma.rn.bf16x2", v2bf16, Int32Regs, True>;
defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, True>;
+def FMA32x2_ftz : FMA_F32x2<"fma.rn.ftz", doF32FTZ>;
+def FMA32x2 : FMA_F32x2<"fma.rn", True>;
defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, True>;
// sin/cos
>From 06975a36a4d85108d1e9c3483cc96464b49ea501 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Sat, 8 Feb 2025 16:49:15 -0800
Subject: [PATCH 03/25] set proxyreg for v2f32 = bitcast i64
---
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 8b2fbb7db1b11..80e8e560c04fa 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2762,6 +2762,8 @@ foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI32 $src)>;
}
+def: Pat<(v2f32 (bitconvert i64:$src)), (ProxyRegI64 $src)>;
+
//
// Load / Store Handling
//
>From 9f13105899392fdf9807c6a7652bdf49ec6275ba Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Sat, 8 Feb 2025 17:21:43 -0800
Subject: [PATCH 04/25] handle fdiv and other instructions where v2f32 is
illegal
Requires us to lower EXTRACT_VECTOR_ELT as well.
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 1 +
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 14 ++++++++++++++
2 files changed, 15 insertions(+)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index a57fde442d905..5e98e8da62181 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -928,6 +928,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
{ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) {
setOperationAction(Op, MVT::f16, Promote);
setOperationAction(Op, MVT::f32, Legal);
+ setOperationAction(Op, MVT::v2f32, Expand);
setOperationAction(Op, MVT::f64, Legal);
setOperationAction(Op, MVT::v2f16, Expand);
setOperationAction(Op, MVT::v2bf16, Expand);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 80e8e560c04fa..851c466077fed 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3228,6 +3228,14 @@ let hasSideEffects = false in {
(ins Int64Regs:$s),
"{{ .reg .b32 tmp; mov.b64 {$low, tmp}, $s; }}",
[]>;
+ def I64toF32H : NVPTXInst<(outs Float32Regs:$high),
+ (ins Int64Regs:$s),
+ "{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}",
+ []>;
+ def I64toF32L : NVPTXInst<(outs Float32Regs:$low),
+ (ins Int64Regs:$s),
+ "{{ .reg .b32 tmp; mov.b64 {$low, tmp}, $s; }}",
+ []>;
}
@@ -3251,6 +3259,12 @@ def : Pat<(extractelt vt:$src, 0),
def : Pat<(extractelt vt:$src, 1),
(I32toI16H $src)>;
}
+
+def : Pat<(extractelt v2f32:$src, 0),
+ (I64toF32L $src)>;
+def : Pat<(extractelt v2f32:$src, 1),
+ (I64toF32H $src)>;
+
def : Pat<(v2f16 (build_vector f16:$a, f16:$b)),
(V2I16toI32 $a, $b)>;
def : Pat<(v2bf16 (build_vector bf16:$a, bf16:$b)),
>From 8fb68629da20f4520b7a7751519a008363f7ac56 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Sat, 8 Feb 2025 17:58:19 -0800
Subject: [PATCH 05/25] ProxyReg v2f32 -> ProxyRegI64
---
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 851c466077fed..83c4335231dc6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2762,6 +2762,8 @@ foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI32 $src)>;
}
+def: Pat<(v2f32 (ProxyReg v2f32:$src)), (ProxyRegI64 $src)>;
+
def: Pat<(v2f32 (bitconvert i64:$src)), (ProxyRegI64 $src)>;
//
>From 6b634ddd75cbf8f810a2f33f097e755f6c8d95c0 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Sat, 8 Feb 2025 18:15:52 -0800
Subject: [PATCH 06/25] support select v2f32
---
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 3 +++
1 file changed, 3 insertions(+)
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 83c4335231dc6..d41697bcf1196 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -887,6 +887,9 @@ def : Pat<(vt (select i1:$p, vt:$a, vt:$b)),
(SELP_b32rr $a, $b, $p)>;
}
+def : Pat<(v2f32 (select i1:$p, v2f32:$a, v2f32:$b)),
+ (SELP_b64rr $a, $b, $p)>;
+
//-----------------------------------
// Test Instructions
//-----------------------------------
>From 677be9ae9c993bbe77289d692119a3d8aaa27159 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Sat, 8 Feb 2025 18:22:15 -0800
Subject: [PATCH 07/25] support v2f32 = bitconvert f64
---
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index d41697bcf1196..0a032d20c0fd3 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2878,6 +2878,8 @@ def: Pat<(vt (bitconvert (f32 Float32Regs:$a))),
def: Pat<(f32 (bitconvert vt:$a)),
(BITCONVERT_32_I2F $a)>;
}
+def: Pat<(v2f32 (bitconvert (f64 Float64Regs:$a))),
+ (BITCONVERT_64_F2I $a)>;
foreach vt = [f16, bf16] in {
def: Pat<(vt (bitconvert i16:$a)),
(vt Int16Regs:$a)>;
>From d264e7340e3f378b788b6a82cda0a61c977cad7e Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Sat, 8 Feb 2025 18:40:20 -0800
Subject: [PATCH 08/25] support extract_vector_elt with dynamic indices
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 5 ++++-
llvm/test/CodeGen/NVPTX/f32x2-instructions.ll | 21 ++++---------------
2 files changed, 8 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 5e98e8da62181..d04468267afcf 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -618,6 +618,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
+
// Custom conversions to/from v2i8.
setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
@@ -2251,7 +2253,8 @@ SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
return Op;
// Extract individual elements and select one of them.
- assert(Isv2x16VT(VectorVT) && "Unexpected vector type.");
+ assert((Isv2x16VT(VectorVT) || VectorVT == MVT::v2f32) &&
+ "Unexpected vector type.");
EVT EltVT = VectorVT.getVectorElementType();
SDLoc dl(Op.getNode());
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
index f449fefe5763e..37833a7b9fca5 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
@@ -37,23 +37,10 @@ define float @test_extract_1(<2 x float> %a) #0 {
; NOTE: disabled as -O3 miscompiles this into pointer arithmetic on
; test_extract_i_param_0 where the symbol's address is not taken first (that
; is, moved to a temporary)
-; define float @test_extract_i(<2 x float> %a, i64 %idx) #0 {
-; ; CHECK-LABEL: test_extract_i(
-; ; CHECK: {
-; ; CHECK-NEXT: .reg .pred %p<2>;
-; ; CHECK-NEXT: .reg .f32 %f<4>;
-; ; CHECK-NEXT: .reg .b64 %rd<2>;
-; ; CHECK-EMPTY:
-; ; CHECK-NEXT: // %bb.0:
-; ; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_extract_i_param_0];
-; ; CHECK-NEXT: ld.param.u64 %rd1, [test_extract_i_param_1];
-; ; CHECK-NEXT: setp.eq.s64 %p1, %rd1, 0;
-; ; CHECK-NEXT: selp.f32 %f3, %f1, %f2, %p1;
-; ; CHECK-NEXT: st.param.f32 [func_retval0], %f3;
-; ; CHECK-NEXT: ret;
-; %e = extractelement <2 x float> %a, i64 %idx
-; ret float %e
-; }
+define float @test_extract_i(<2 x float> %a, i64 %idx) #0 {
+ %e = extractelement <2 x float> %a, i64 %idx
+ ret float %e
+}
define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 {
%r = fadd <2 x float> %a, %b
>From e016449efe6e49c2dd9574387d85dad44a35bda0 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Sat, 8 Feb 2025 20:24:54 -0800
Subject: [PATCH 09/25] promote extract_vector_elt nodes to unpacking mov
Also update the test cases.
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 21 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 4 +-
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 3 +
llvm/test/CodeGen/NVPTX/f32x2-instructions.ll | 2078 ++++++++++++++++-
4 files changed, 2093 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 269f53e4efcdc..2b91aba9c9ec4 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -450,10 +450,14 @@ bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
SDValue Vector = N->getOperand(0);
- // We only care about 16x2 as it's the only real vector type we
- // need to deal with.
+ // We only care about packed vector types: 16x2 and 32x2.
MVT VT = Vector.getSimpleValueType();
- if (!Isv2x16VT(VT))
+ unsigned NewOpcode = 0;
+ if (Isv2x16VT(VT))
+ NewOpcode = NVPTX::I32toV2I16;
+ else if (VT == MVT::v2f32)
+ NewOpcode = NVPTX::I64toV2F32;
+ else
return false;
// Find and record all uses of this vector that extract element 0 or 1.
SmallVector<SDNode *, 4> E0, E1;
@@ -473,16 +477,19 @@ bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
}
}
- // There's no point scattering f16x2 if we only ever access one
+ // There's no point scattering f16x2 or f32x2 if we only ever access one
// element of it.
if (E0.empty() || E1.empty())
return false;
- // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
- // into f16,f16 SplitF16x2(V)
+ // Merge:
+ // (f16 extractelt(V, 0), f16 extractelt(V,1))
+ // -> f16,f16 SplitF16x2(V)
+ // (f32 extractelt(V, 0), f32 extractelt(V,1))
+ // -> f32,f32 SplitF32x2(V)
MVT EltVT = VT.getVectorElementType();
SDNode *ScatterOp =
- CurDAG->getMachineNode(NVPTX::I32toV2I16, SDLoc(N), EltVT, EltVT, Vector);
+ CurDAG->getMachineNode(NewOpcode, SDLoc(N), EltVT, EltVT, Vector);
for (auto *Node : E0)
ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
for (auto *Node : E1)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index d04468267afcf..937f6ab478a34 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5434,10 +5434,10 @@ static SDValue PerformEXTRACTCombine(SDNode *N,
IsPTXVectorType(VectorVT.getSimpleVT()))
return SDValue(); // Native vector loads already combine nicely w/
// extract_vector_elt.
- // Don't mess with singletons or v2*16, v4i8 and v8i8 types, we already
+ // Don't mess with singletons or v2*16, v2f32, v4i8 and v8i8 types, we already
// handle them OK.
if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) ||
- VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8)
+ VectorVT == MVT::v2f32 || VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8)
return SDValue();
// Don't mess with undef values as sra may be simplified to 0, not undef.
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 0a032d20c0fd3..acbc4b369edb7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3212,6 +3212,9 @@ let hasSideEffects = false in {
def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
(ins Int64Regs:$s),
"mov.b64 \t{{$d1, $d2}}, $s;", []>;
+ def I64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
+ (ins Int64Regs:$s),
+ "mov.b64 \t{{$d1, $d2}}, $s;", []>;
def I128toV2I64: NVPTXInst<(outs Int64Regs:$d1, Int64Regs:$d2),
(ins Int128Regs:$s),
"mov.b128 \t{{$d1, $d2}}, $s;", []>;
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
index 37833a7b9fca5..97cde07ed2003 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
@@ -21,15 +21,76 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "nvptx64-nvidia-cuda"
define <2 x float> @test_ret_const() #0 {
+; CHECK-O0-LABEL: test_ret_const(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<2>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: mov.f32 %f1, 0f40000000;
+; CHECK-O0-NEXT: mov.f32 %f2, 0f3F800000;
+; CHECK-O0-NEXT: mov.b64 %rd1, {%f2, %f1};
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_ret_const(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<2>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: mov.f32 %f1, 0f40000000;
+; CHECK-O3-NEXT: mov.f32 %f2, 0f3F800000;
+; CHECK-O3-NEXT: mov.b64 %rd1, {%f2, %f1};
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-O3-NEXT: ret;
ret <2 x float> <float 1.0, float 2.0>
}
define float @test_extract_0(<2 x float> %a) #0 {
+; CHECK-O0-LABEL: test_extract_0(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<2>;
+; CHECK-O0-NEXT: .reg .b64 %rd<2>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_extract_0_param_0];
+; CHECK-O0-NEXT: { .reg .b32 tmp; mov.b64 {%f1, tmp}, %rd1; }
+; CHECK-O0-NEXT: st.param.f32 [func_retval0], %f1;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_extract_0(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<2>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f32 %f1, [test_extract_0_param_0];
+; CHECK-O3-NEXT: st.param.f32 [func_retval0], %f1;
+; CHECK-O3-NEXT: ret;
%e = extractelement <2 x float> %a, i32 0
ret float %e
}
define float @test_extract_1(<2 x float> %a) #0 {
+; CHECK-O0-LABEL: test_extract_1(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<2>;
+; CHECK-O0-NEXT: .reg .b64 %rd<2>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_extract_1_param_0];
+; CHECK-O0-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %f1}, %rd1; }
+; CHECK-O0-NEXT: st.param.f32 [func_retval0], %f1;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_extract_1(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<2>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f32 %f1, [test_extract_1_param_0+4];
+; CHECK-O3-NEXT: st.param.f32 [func_retval0], %f1;
+; CHECK-O3-NEXT: ret;
%e = extractelement <2 x float> %a, i32 1
ret float %e
}
@@ -37,150 +98,936 @@ define float @test_extract_1(<2 x float> %a) #0 {
; NOTE: disabled as -O3 miscompiles this into pointer arithmetic on
; test_extract_i_param_0 where the symbol's address is not taken first (that
; is, moved to a temporary)
-define float @test_extract_i(<2 x float> %a, i64 %idx) #0 {
- %e = extractelement <2 x float> %a, i64 %idx
- ret float %e
-}
+; define float @test_extract_i(<2 x float> %a, i64 %idx) #0 {
+; %e = extractelement <2 x float> %a, i64 %idx
+; ret float %e
+; }
define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-O0-LABEL: test_fadd(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fadd_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fadd_param_0];
+; CHECK-O0-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fadd(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fadd_param_1];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fadd_param_0];
+; CHECK-O3-NEXT: add.rn.f32x2 %rd3, %rd2, %rd1;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O3-NEXT: ret;
%r = fadd <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_fadd_imm_0(<2 x float> %a) #0 {
+; CHECK-O0-LABEL: test_fadd_imm_0(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fadd_imm_0_param_0];
+; CHECK-O0-NEXT: mov.f32 %f1, 0f40000000;
+; CHECK-O0-NEXT: mov.f32 %f2, 0f3F800000;
+; CHECK-O0-NEXT: mov.b64 %rd2, {%f2, %f1};
+; CHECK-O0-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fadd_imm_0(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fadd_imm_0_param_0];
+; CHECK-O3-NEXT: mov.f32 %f1, 0f40000000;
+; CHECK-O3-NEXT: mov.f32 %f2, 0f3F800000;
+; CHECK-O3-NEXT: mov.b64 %rd2, {%f2, %f1};
+; CHECK-O3-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O3-NEXT: ret;
%r = fadd <2 x float> <float 1.0, float 2.0>, %a
ret <2 x float> %r
}
define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 {
+; CHECK-O0-LABEL: test_fadd_imm_1(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fadd_imm_1_param_0];
+; CHECK-O0-NEXT: mov.f32 %f1, 0f40000000;
+; CHECK-O0-NEXT: mov.f32 %f2, 0f3F800000;
+; CHECK-O0-NEXT: mov.b64 %rd2, {%f2, %f1};
+; CHECK-O0-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fadd_imm_1(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fadd_imm_1_param_0];
+; CHECK-O3-NEXT: mov.f32 %f1, 0f40000000;
+; CHECK-O3-NEXT: mov.f32 %f2, 0f3F800000;
+; CHECK-O3-NEXT: mov.b64 %rd2, {%f2, %f1};
+; CHECK-O3-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O3-NEXT: ret;
%r = fadd <2 x float> %a, <float 1.0, float 2.0>
ret <2 x float> %r
}
define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 {
+; CHECK-O0-LABEL: test_fadd_v4(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b64 %rd<11>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.u64 {%rd5, %rd6}, [test_fadd_v4_param_1];
+; CHECK-O0-NEXT: ld.param.v2.u64 {%rd7, %rd8}, [test_fadd_v4_param_0];
+; CHECK-O0-NEXT: add.rn.f32x2 %rd9, %rd8, %rd6;
+; CHECK-O0-NEXT: add.rn.f32x2 %rd10, %rd7, %rd5;
+; CHECK-O0-NEXT: st.param.v2.b64 [func_retval0], {%rd10, %rd9};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fadd_v4(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b64 %rd<11>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_fadd_v4_param_1];
+; CHECK-O3-NEXT: ld.param.v2.u64 {%rd4, %rd5}, [test_fadd_v4_param_0];
+; CHECK-O3-NEXT: add.rn.f32x2 %rd9, %rd5, %rd2;
+; CHECK-O3-NEXT: add.rn.f32x2 %rd10, %rd4, %rd1;
+; CHECK-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd10, %rd9};
+; CHECK-O3-NEXT: ret;
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
}
define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 {
+; CHECK-O0-LABEL: test_fadd_imm_0_v4(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<9>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_fadd_imm_0_v4_param_0];
+; CHECK-O0-NEXT: mov.f32 %f1, 0f40800000;
+; CHECK-O0-NEXT: mov.f32 %f2, 0f40400000;
+; CHECK-O0-NEXT: mov.b64 %rd5, {%f2, %f1};
+; CHECK-O0-NEXT: add.rn.f32x2 %rd6, %rd4, %rd5;
+; CHECK-O0-NEXT: mov.f32 %f3, 0f40000000;
+; CHECK-O0-NEXT: mov.f32 %f4, 0f3F800000;
+; CHECK-O0-NEXT: mov.b64 %rd7, {%f4, %f3};
+; CHECK-O0-NEXT: add.rn.f32x2 %rd8, %rd3, %rd7;
+; CHECK-O0-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fadd_imm_0_v4(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<9>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_fadd_imm_0_v4_param_0];
+; CHECK-O3-NEXT: mov.f32 %f1, 0f40800000;
+; CHECK-O3-NEXT: mov.f32 %f2, 0f40400000;
+; CHECK-O3-NEXT: mov.b64 %rd5, {%f2, %f1};
+; CHECK-O3-NEXT: add.rn.f32x2 %rd6, %rd2, %rd5;
+; CHECK-O3-NEXT: mov.f32 %f3, 0f40000000;
+; CHECK-O3-NEXT: mov.f32 %f4, 0f3F800000;
+; CHECK-O3-NEXT: mov.b64 %rd7, {%f4, %f3};
+; CHECK-O3-NEXT: add.rn.f32x2 %rd8, %rd1, %rd7;
+; CHECK-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
+; CHECK-O3-NEXT: ret;
%r = fadd <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %a
ret <4 x float> %r
}
define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 {
+; CHECK-O0-LABEL: test_fadd_imm_1_v4(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<9>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_fadd_imm_1_v4_param_0];
+; CHECK-O0-NEXT: mov.f32 %f1, 0f40800000;
+; CHECK-O0-NEXT: mov.f32 %f2, 0f40400000;
+; CHECK-O0-NEXT: mov.b64 %rd5, {%f2, %f1};
+; CHECK-O0-NEXT: add.rn.f32x2 %rd6, %rd4, %rd5;
+; CHECK-O0-NEXT: mov.f32 %f3, 0f40000000;
+; CHECK-O0-NEXT: mov.f32 %f4, 0f3F800000;
+; CHECK-O0-NEXT: mov.b64 %rd7, {%f4, %f3};
+; CHECK-O0-NEXT: add.rn.f32x2 %rd8, %rd3, %rd7;
+; CHECK-O0-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fadd_imm_1_v4(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<9>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_fadd_imm_1_v4_param_0];
+; CHECK-O3-NEXT: mov.f32 %f1, 0f40800000;
+; CHECK-O3-NEXT: mov.f32 %f2, 0f40400000;
+; CHECK-O3-NEXT: mov.b64 %rd5, {%f2, %f1};
+; CHECK-O3-NEXT: add.rn.f32x2 %rd6, %rd2, %rd5;
+; CHECK-O3-NEXT: mov.f32 %f3, 0f40000000;
+; CHECK-O3-NEXT: mov.f32 %f4, 0f3F800000;
+; CHECK-O3-NEXT: mov.b64 %rd7, {%f4, %f3};
+; CHECK-O3-NEXT: add.rn.f32x2 %rd8, %rd1, %rd7;
+; CHECK-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
+; CHECK-O3-NEXT: ret;
%r = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
ret <4 x float> %r
}
define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-O0-LABEL: test_fsub(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fsub_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fsub_param_0];
+; CHECK-O0-NEXT: sub.rn.f32x2 %rd3, %rd1, %rd2;
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fsub(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fsub_param_1];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fsub_param_0];
+; CHECK-O3-NEXT: sub.rn.f32x2 %rd3, %rd2, %rd1;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O3-NEXT: ret;
%r = fsub <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_fneg(<2 x float> %a) #0 {
+; CHECK-O0-LABEL: test_fneg(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<2>;
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fneg_param_0];
+; CHECK-O0-NEXT: mov.f32 %f1, 0f00000000;
+; CHECK-O0-NEXT: mov.b64 %rd2, {%f1, %f1};
+; CHECK-O0-NEXT: sub.rn.f32x2 %rd3, %rd2, %rd1;
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fneg(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<2>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fneg_param_0];
+; CHECK-O3-NEXT: mov.f32 %f1, 0f00000000;
+; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f1};
+; CHECK-O3-NEXT: sub.rn.f32x2 %rd3, %rd2, %rd1;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O3-NEXT: ret;
%r = fsub <2 x float> <float 0.0, float 0.0>, %a
ret <2 x float> %r
}
define <2 x float> @test_fmul(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-O0-LABEL: test_fmul(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fmul_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fmul_param_0];
+; CHECK-O0-NEXT: mul.rn.f32x2 %rd3, %rd1, %rd2;
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fmul(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fmul_param_1];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fmul_param_0];
+; CHECK-O3-NEXT: mul.rn.f32x2 %rd3, %rd2, %rd1;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O3-NEXT: ret;
%r = fmul <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
+; CHECK-O0-LABEL: test_fma(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b64 %rd<5>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd3, [test_fma_param_2];
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fma_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fma_param_0];
+; CHECK-O0-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3;
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd4;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fma(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b64 %rd<5>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fma_param_2];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fma_param_1];
+; CHECK-O3-NEXT: ld.param.f64 %rd3, [test_fma_param_0];
+; CHECK-O3-NEXT: fma.rn.f32x2 %rd4, %rd3, %rd2, %rd1;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd4;
+; CHECK-O3-NEXT: ret;
%r = call <2 x float> @llvm.fma(<2 x float> %a, <2 x float> %b, <2 x float> %c)
ret <2 x float> %r
}
define <2 x float> @test_fdiv(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-O0-LABEL: test_fdiv(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<7>;
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fdiv_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fdiv_param_0];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: div.rn.f32 %f5, %f4, %f2;
+; CHECK-O0-NEXT: div.rn.f32 %f6, %f3, %f1;
+; CHECK-O0-NEXT: mov.b64 %rd3, {%f6, %f5};
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fdiv(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<7>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fdiv_param_0];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fdiv_param_1];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: div.rn.f32 %f5, %f4, %f2;
+; CHECK-O3-NEXT: div.rn.f32 %f6, %f3, %f1;
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f6, %f5};
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O3-NEXT: ret;
%r = fdiv <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_frem(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-O0-LABEL: test_frem(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<15>;
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_frem_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_frem_param_0];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: div.rn.f32 %f5, %f4, %f2;
+; CHECK-O0-NEXT: cvt.rzi.f32.f32 %f6, %f5;
+; CHECK-O0-NEXT: mul.f32 %f7, %f6, %f2;
+; CHECK-O0-NEXT: sub.f32 %f8, %f4, %f7;
+; CHECK-O0-NEXT: testp.infinite.f32 %p1, %f2;
+; CHECK-O0-NEXT: selp.f32 %f9, %f4, %f8, %p1;
+; CHECK-O0-NEXT: div.rn.f32 %f10, %f3, %f1;
+; CHECK-O0-NEXT: cvt.rzi.f32.f32 %f11, %f10;
+; CHECK-O0-NEXT: mul.f32 %f12, %f11, %f1;
+; CHECK-O0-NEXT: sub.f32 %f13, %f3, %f12;
+; CHECK-O0-NEXT: testp.infinite.f32 %p2, %f1;
+; CHECK-O0-NEXT: selp.f32 %f14, %f3, %f13, %p2;
+; CHECK-O0-NEXT: mov.b64 %rd3, {%f14, %f9};
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_frem(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<15>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_frem_param_0];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_frem_param_1];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: div.rn.f32 %f5, %f4, %f2;
+; CHECK-O3-NEXT: cvt.rzi.f32.f32 %f6, %f5;
+; CHECK-O3-NEXT: mul.f32 %f7, %f6, %f2;
+; CHECK-O3-NEXT: sub.f32 %f8, %f4, %f7;
+; CHECK-O3-NEXT: testp.infinite.f32 %p1, %f2;
+; CHECK-O3-NEXT: selp.f32 %f9, %f4, %f8, %p1;
+; CHECK-O3-NEXT: div.rn.f32 %f10, %f3, %f1;
+; CHECK-O3-NEXT: cvt.rzi.f32.f32 %f11, %f10;
+; CHECK-O3-NEXT: mul.f32 %f12, %f11, %f1;
+; CHECK-O3-NEXT: sub.f32 %f13, %f3, %f12;
+; CHECK-O3-NEXT: testp.infinite.f32 %p2, %f1;
+; CHECK-O3-NEXT: selp.f32 %f14, %f3, %f13, %p2;
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f14, %f9};
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O3-NEXT: ret;
%r = frem <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_fadd_ftz(<2 x float> %a, <2 x float> %b) #2 {
+; CHECK-O0-LABEL: test_fadd_ftz(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fadd_ftz_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fadd_ftz_param_0];
+; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2;
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fadd_ftz(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fadd_ftz_param_1];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fadd_ftz_param_0];
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd3, %rd2, %rd1;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O3-NEXT: ret;
%r = fadd <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_fadd_imm_0_ftz(<2 x float> %a) #2 {
+; CHECK-O0-LABEL: test_fadd_imm_0_ftz(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fadd_imm_0_ftz_param_0];
+; CHECK-O0-NEXT: mov.f32 %f1, 0f40000000;
+; CHECK-O0-NEXT: mov.f32 %f2, 0f3F800000;
+; CHECK-O0-NEXT: mov.b64 %rd2, {%f2, %f1};
+; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2;
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fadd_imm_0_ftz(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fadd_imm_0_ftz_param_0];
+; CHECK-O3-NEXT: mov.f32 %f1, 0f40000000;
+; CHECK-O3-NEXT: mov.f32 %f2, 0f3F800000;
+; CHECK-O3-NEXT: mov.b64 %rd2, {%f2, %f1};
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O3-NEXT: ret;
%r = fadd <2 x float> <float 1.0, float 2.0>, %a
ret <2 x float> %r
}
define <2 x float> @test_fadd_imm_1_ftz(<2 x float> %a) #2 {
+; CHECK-O0-LABEL: test_fadd_imm_1_ftz(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fadd_imm_1_ftz_param_0];
+; CHECK-O0-NEXT: mov.f32 %f1, 0f40000000;
+; CHECK-O0-NEXT: mov.f32 %f2, 0f3F800000;
+; CHECK-O0-NEXT: mov.b64 %rd2, {%f2, %f1};
+; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2;
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fadd_imm_1_ftz(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fadd_imm_1_ftz_param_0];
+; CHECK-O3-NEXT: mov.f32 %f1, 0f40000000;
+; CHECK-O3-NEXT: mov.f32 %f2, 0f3F800000;
+; CHECK-O3-NEXT: mov.b64 %rd2, {%f2, %f1};
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O3-NEXT: ret;
%r = fadd <2 x float> %a, <float 1.0, float 2.0>
ret <2 x float> %r
}
define <4 x float> @test_fadd_v4_ftz(<4 x float> %a, <4 x float> %b) #2 {
+; CHECK-O0-LABEL: test_fadd_v4_ftz(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b64 %rd<11>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.u64 {%rd5, %rd6}, [test_fadd_v4_ftz_param_1];
+; CHECK-O0-NEXT: ld.param.v2.u64 {%rd7, %rd8}, [test_fadd_v4_ftz_param_0];
+; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd9, %rd8, %rd6;
+; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd10, %rd7, %rd5;
+; CHECK-O0-NEXT: st.param.v2.b64 [func_retval0], {%rd10, %rd9};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fadd_v4_ftz(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b64 %rd<11>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_fadd_v4_ftz_param_1];
+; CHECK-O3-NEXT: ld.param.v2.u64 {%rd4, %rd5}, [test_fadd_v4_ftz_param_0];
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd9, %rd5, %rd2;
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd10, %rd4, %rd1;
+; CHECK-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd10, %rd9};
+; CHECK-O3-NEXT: ret;
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
}
define <4 x float> @test_fadd_imm_0_v4_ftz(<4 x float> %a) #2 {
+; CHECK-O0-LABEL: test_fadd_imm_0_v4_ftz(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<9>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_fadd_imm_0_v4_ftz_param_0];
+; CHECK-O0-NEXT: mov.f32 %f1, 0f40800000;
+; CHECK-O0-NEXT: mov.f32 %f2, 0f40400000;
+; CHECK-O0-NEXT: mov.b64 %rd5, {%f2, %f1};
+; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd6, %rd4, %rd5;
+; CHECK-O0-NEXT: mov.f32 %f3, 0f40000000;
+; CHECK-O0-NEXT: mov.f32 %f4, 0f3F800000;
+; CHECK-O0-NEXT: mov.b64 %rd7, {%f4, %f3};
+; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd8, %rd3, %rd7;
+; CHECK-O0-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fadd_imm_0_v4_ftz(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<9>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_fadd_imm_0_v4_ftz_param_0];
+; CHECK-O3-NEXT: mov.f32 %f1, 0f40800000;
+; CHECK-O3-NEXT: mov.f32 %f2, 0f40400000;
+; CHECK-O3-NEXT: mov.b64 %rd5, {%f2, %f1};
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd6, %rd2, %rd5;
+; CHECK-O3-NEXT: mov.f32 %f3, 0f40000000;
+; CHECK-O3-NEXT: mov.f32 %f4, 0f3F800000;
+; CHECK-O3-NEXT: mov.b64 %rd7, {%f4, %f3};
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd8, %rd1, %rd7;
+; CHECK-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
+; CHECK-O3-NEXT: ret;
%r = fadd <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %a
ret <4 x float> %r
}
define <4 x float> @test_fadd_imm_1_v4_ftz(<4 x float> %a) #2 {
+; CHECK-O0-LABEL: test_fadd_imm_1_v4_ftz(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<9>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_fadd_imm_1_v4_ftz_param_0];
+; CHECK-O0-NEXT: mov.f32 %f1, 0f40800000;
+; CHECK-O0-NEXT: mov.f32 %f2, 0f40400000;
+; CHECK-O0-NEXT: mov.b64 %rd5, {%f2, %f1};
+; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd6, %rd4, %rd5;
+; CHECK-O0-NEXT: mov.f32 %f3, 0f40000000;
+; CHECK-O0-NEXT: mov.f32 %f4, 0f3F800000;
+; CHECK-O0-NEXT: mov.b64 %rd7, {%f4, %f3};
+; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd8, %rd3, %rd7;
+; CHECK-O0-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fadd_imm_1_v4_ftz(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<9>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_fadd_imm_1_v4_ftz_param_0];
+; CHECK-O3-NEXT: mov.f32 %f1, 0f40800000;
+; CHECK-O3-NEXT: mov.f32 %f2, 0f40400000;
+; CHECK-O3-NEXT: mov.b64 %rd5, {%f2, %f1};
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd6, %rd2, %rd5;
+; CHECK-O3-NEXT: mov.f32 %f3, 0f40000000;
+; CHECK-O3-NEXT: mov.f32 %f4, 0f3F800000;
+; CHECK-O3-NEXT: mov.b64 %rd7, {%f4, %f3};
+; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd8, %rd1, %rd7;
+; CHECK-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
+; CHECK-O3-NEXT: ret;
%r = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
ret <4 x float> %r
}
define <2 x float> @test_fsub_ftz(<2 x float> %a, <2 x float> %b) #2 {
+; CHECK-O0-LABEL: test_fsub_ftz(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fsub_ftz_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fsub_ftz_param_0];
+; CHECK-O0-NEXT: sub.rn.ftz.f32x2 %rd3, %rd1, %rd2;
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fsub_ftz(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fsub_ftz_param_1];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fsub_ftz_param_0];
+; CHECK-O3-NEXT: sub.rn.ftz.f32x2 %rd3, %rd2, %rd1;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O3-NEXT: ret;
%r = fsub <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_fneg_ftz(<2 x float> %a) #2 {
+; CHECK-O0-LABEL: test_fneg_ftz(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<2>;
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fneg_ftz_param_0];
+; CHECK-O0-NEXT: mov.f32 %f1, 0f00000000;
+; CHECK-O0-NEXT: mov.b64 %rd2, {%f1, %f1};
+; CHECK-O0-NEXT: sub.rn.ftz.f32x2 %rd3, %rd2, %rd1;
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fneg_ftz(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<2>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fneg_ftz_param_0];
+; CHECK-O3-NEXT: mov.f32 %f1, 0f00000000;
+; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f1};
+; CHECK-O3-NEXT: sub.rn.ftz.f32x2 %rd3, %rd2, %rd1;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O3-NEXT: ret;
%r = fsub <2 x float> <float 0.0, float 0.0>, %a
ret <2 x float> %r
}
define <2 x float> @test_fmul_ftz(<2 x float> %a, <2 x float> %b) #2 {
+; CHECK-O0-LABEL: test_fmul_ftz(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fmul_ftz_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fmul_ftz_param_0];
+; CHECK-O0-NEXT: mul.rn.ftz.f32x2 %rd3, %rd1, %rd2;
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fmul_ftz(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fmul_ftz_param_1];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fmul_ftz_param_0];
+; CHECK-O3-NEXT: mul.rn.ftz.f32x2 %rd3, %rd2, %rd1;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O3-NEXT: ret;
%r = fmul <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_fma_ftz(<2 x float> %a, <2 x float> %b, <2 x float> %c) #2 {
+; CHECK-O0-LABEL: test_fma_ftz(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b64 %rd<5>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd3, [test_fma_ftz_param_2];
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fma_ftz_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fma_ftz_param_0];
+; CHECK-O0-NEXT: fma.rn.ftz.f32x2 %rd4, %rd1, %rd2, %rd3;
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd4;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fma_ftz(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b64 %rd<5>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fma_ftz_param_2];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fma_ftz_param_1];
+; CHECK-O3-NEXT: ld.param.f64 %rd3, [test_fma_ftz_param_0];
+; CHECK-O3-NEXT: fma.rn.ftz.f32x2 %rd4, %rd3, %rd2, %rd1;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd4;
+; CHECK-O3-NEXT: ret;
%r = call <2 x float> @llvm.fma(<2 x float> %a, <2 x float> %b, <2 x float> %c)
ret <2 x float> %r
}
; fdiv has no packed f32x2 form here: the .b64 pair is unpacked with mov.b64
; into scalar .f32 registers, divided lane-by-lane with div.rn.ftz.f32, and
; repacked into a .b64 for the return value.
define <2 x float> @test_fdiv_ftz(<2 x float> %a, <2 x float> %b) #2 {
+; CHECK-O0-LABEL: test_fdiv_ftz(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<7>;
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fdiv_ftz_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fdiv_ftz_param_0];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: div.rn.ftz.f32 %f5, %f4, %f2;
+; CHECK-O0-NEXT: div.rn.ftz.f32 %f6, %f3, %f1;
+; CHECK-O0-NEXT: mov.b64 %rd3, {%f6, %f5};
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fdiv_ftz(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<7>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fdiv_ftz_param_0];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fdiv_ftz_param_1];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: div.rn.ftz.f32 %f5, %f4, %f2;
+; CHECK-O3-NEXT: div.rn.ftz.f32 %f6, %f3, %f1;
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f6, %f5};
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O3-NEXT: ret;
  %r = fdiv <2 x float> %a, %b
  ret <2 x float> %r
}
; frem is expanded per lane as x - trunc(x/y)*y (div.rn + cvt.rzi + mul + sub),
; with a testp.infinite/selp guard per lane that returns the dividend x
; unchanged when the divisor y is infinite.
define <2 x float> @test_frem_ftz(<2 x float> %a, <2 x float> %b) #2 {
+; CHECK-O0-LABEL: test_frem_ftz(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<15>;
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_frem_ftz_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_frem_ftz_param_0];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: div.rn.ftz.f32 %f5, %f4, %f2;
+; CHECK-O0-NEXT: cvt.rzi.ftz.f32.f32 %f6, %f5;
+; CHECK-O0-NEXT: mul.ftz.f32 %f7, %f6, %f2;
+; CHECK-O0-NEXT: sub.ftz.f32 %f8, %f4, %f7;
+; CHECK-O0-NEXT: testp.infinite.f32 %p1, %f2;
+; CHECK-O0-NEXT: selp.f32 %f9, %f4, %f8, %p1;
+; CHECK-O0-NEXT: div.rn.ftz.f32 %f10, %f3, %f1;
+; CHECK-O0-NEXT: cvt.rzi.ftz.f32.f32 %f11, %f10;
+; CHECK-O0-NEXT: mul.ftz.f32 %f12, %f11, %f1;
+; CHECK-O0-NEXT: sub.ftz.f32 %f13, %f3, %f12;
+; CHECK-O0-NEXT: testp.infinite.f32 %p2, %f1;
+; CHECK-O0-NEXT: selp.f32 %f14, %f3, %f13, %p2;
+; CHECK-O0-NEXT: mov.b64 %rd3, {%f14, %f9};
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_frem_ftz(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<15>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_frem_ftz_param_0];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_frem_ftz_param_1];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: div.rn.ftz.f32 %f5, %f4, %f2;
+; CHECK-O3-NEXT: cvt.rzi.ftz.f32.f32 %f6, %f5;
+; CHECK-O3-NEXT: mul.ftz.f32 %f7, %f6, %f2;
+; CHECK-O3-NEXT: sub.ftz.f32 %f8, %f4, %f7;
+; CHECK-O3-NEXT: testp.infinite.f32 %p1, %f2;
+; CHECK-O3-NEXT: selp.f32 %f9, %f4, %f8, %p1;
+; CHECK-O3-NEXT: div.rn.ftz.f32 %f10, %f3, %f1;
+; CHECK-O3-NEXT: cvt.rzi.ftz.f32.f32 %f11, %f10;
+; CHECK-O3-NEXT: mul.ftz.f32 %f12, %f11, %f1;
+; CHECK-O3-NEXT: sub.ftz.f32 %f13, %f3, %f12;
+; CHECK-O3-NEXT: testp.infinite.f32 %p2, %f1;
+; CHECK-O3-NEXT: selp.f32 %f14, %f3, %f13, %p2;
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f14, %f9};
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O3-NEXT: ret;
  %r = frem <2 x float> %a, %b
  ret <2 x float> %r
}
; A <2 x float> load is emitted as one 64-bit load (ld.f64) into a .b64
; register; the store unpacks it and uses a vectorized st.v2.f32.
define void @test_ldst_v2f32(ptr %a, ptr %b) #0 {
+; CHECK-O0-LABEL: test_ldst_v2f32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.u64 %rd2, [test_ldst_v2f32_param_1];
+; CHECK-O0-NEXT: ld.param.u64 %rd1, [test_ldst_v2f32_param_0];
+; CHECK-O0-NEXT: ld.f64 %rd3, [%rd1];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd3;
+; CHECK-O0-NEXT: st.v2.f32 [%rd2], {%f1, %f2};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_ldst_v2f32(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.u64 %rd1, [test_ldst_v2f32_param_0];
+; CHECK-O3-NEXT: ld.f64 %rd2, [%rd1];
+; CHECK-O3-NEXT: ld.param.u64 %rd3, [test_ldst_v2f32_param_1];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O3-NEXT: st.v2.f32 [%rd3], {%f1, %f2};
+; CHECK-O3-NEXT: ret;
  %t1 = load <2 x float>, ptr %a
  store <2 x float> %t1, ptr %b, align 32
  ret void
}
; Odd-width <3 x float> is split: the first two lanes move as one 64-bit
; ld.u64/st.u64, the third lane as a scalar f32 at offset +8.
define void @test_ldst_v3f32(ptr %a, ptr %b) #0 {
+; CHECK-O0-LABEL: test_ldst_v3f32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<2>;
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.u64 %rd2, [test_ldst_v3f32_param_1];
+; CHECK-O0-NEXT: ld.param.u64 %rd1, [test_ldst_v3f32_param_0];
+; CHECK-O0-NEXT: ld.u64 %rd3, [%rd1];
+; CHECK-O0-NEXT: ld.f32 %f1, [%rd1+8];
+; CHECK-O0-NEXT: st.f32 [%rd2+8], %f1;
+; CHECK-O0-NEXT: st.u64 [%rd2], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_ldst_v3f32(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<2>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.u64 %rd1, [test_ldst_v3f32_param_0];
+; CHECK-O3-NEXT: ld.u64 %rd2, [%rd1];
+; CHECK-O3-NEXT: ld.f32 %f1, [%rd1+8];
+; CHECK-O3-NEXT: ld.param.u64 %rd3, [test_ldst_v3f32_param_1];
+; CHECK-O3-NEXT: st.f32 [%rd3+8], %f1;
+; CHECK-O3-NEXT: st.u64 [%rd3], %rd2;
+; CHECK-O3-NEXT: ret;
  %t1 = load <3 x float>, ptr %a
  store <3 x float> %t1, ptr %b, align 32
  ret void
}
; <4 x float> is moved with the 4-wide vector memory ops ld.v4.f32 /
; st.v4.f32 on scalar .f32 registers (no 64-bit packing).
define void @test_ldst_v4f32(ptr %a, ptr %b) #0 {
+; CHECK-O0-LABEL: test_ldst_v4f32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.u64 %rd2, [test_ldst_v4f32_param_1];
+; CHECK-O0-NEXT: ld.param.u64 %rd1, [test_ldst_v4f32_param_0];
+; CHECK-O0-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-O0-NEXT: st.v4.f32 [%rd2], {%f1, %f2, %f3, %f4};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_ldst_v4f32(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.u64 %rd1, [test_ldst_v4f32_param_0];
+; CHECK-O3-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-O3-NEXT: ld.param.u64 %rd2, [test_ldst_v4f32_param_1];
+; CHECK-O3-NEXT: st.v4.f32 [%rd2], {%f1, %f2, %f3, %f4};
+; CHECK-O3-NEXT: ret;
  %t1 = load <4 x float>, ptr %a
  store <4 x float> %t1, ptr %b, align 32
  ret void
}
; <8 x float> is split into two 4-wide ld.v4.f32 / st.v4.f32 pairs (second
; half at offset +16). NOTE(review): the function's closing brace falls past
; the diff hunk boundary below -- it is not visible in this chunk.
define void @test_ldst_v8f32(ptr %a, ptr %b) #0 {
+; CHECK-O0-LABEL: test_ldst_v8f32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<9>;
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.u64 %rd2, [test_ldst_v8f32_param_1];
+; CHECK-O0-NEXT: ld.param.u64 %rd1, [test_ldst_v8f32_param_0];
+; CHECK-O0-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-O0-NEXT: ld.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16];
+; CHECK-O0-NEXT: st.v4.f32 [%rd2+16], {%f5, %f6, %f7, %f8};
+; CHECK-O0-NEXT: st.v4.f32 [%rd2], {%f1, %f2, %f3, %f4};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_ldst_v8f32(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<9>;
+; CHECK-O3-NEXT: .reg .b64 %rd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.u64 %rd1, [test_ldst_v8f32_param_0];
+; CHECK-O3-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-O3-NEXT: ld.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16];
+; CHECK-O3-NEXT: ld.param.u64 %rd2, [test_ldst_v8f32_param_1];
+; CHECK-O3-NEXT: st.v4.f32 [%rd2+16], {%f5, %f6, %f7, %f8};
+; CHECK-O3-NEXT: st.v4.f32 [%rd2], {%f1, %f2, %f3, %f4};
+; CHECK-O3-NEXT: ret;
  %t1 = load <8 x float>, ptr %a
  store <8 x float> %t1, ptr %b, align 32
  ret void
@@ -189,185 +1036,1408 @@ define void @test_ldst_v8f32(ptr %a, ptr %b) #0 {
declare <2 x float> @test_callee(<2 x float> %a, <2 x float> %b) #0
; Calling convention check: a <2 x float> argument travels through the call
; sequence as a single .b64 in an 8-byte, 8-aligned .param slot, and the
; return value comes back the same way.
define <2 x float> @test_call(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-O0-LABEL: test_call(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b64 %rd<5>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_call_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_call_param_0];
+; CHECK-O0-NEXT: { // callseq 0, 0
+; CHECK-O0-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-O0-NEXT: st.param.b64 [param0], %rd1;
+; CHECK-O0-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-O0-NEXT: st.param.b64 [param1], %rd2;
+; CHECK-O0-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-O0-NEXT: call.uni (retval0),
+; CHECK-O0-NEXT: test_callee,
+; CHECK-O0-NEXT: (
+; CHECK-O0-NEXT: param0,
+; CHECK-O0-NEXT: param1
+; CHECK-O0-NEXT: );
+; CHECK-O0-NEXT: ld.param.b64 %rd3, [retval0];
+; CHECK-O0-NEXT: } // callseq 0
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_call(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b64 %rd<5>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_call_param_0];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_call_param_1];
+; CHECK-O3-NEXT: { // callseq 0, 0
+; CHECK-O3-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-O3-NEXT: st.param.b64 [param0], %rd1;
+; CHECK-O3-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-O3-NEXT: st.param.b64 [param1], %rd2;
+; CHECK-O3-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-O3-NEXT: call.uni (retval0),
+; CHECK-O3-NEXT: test_callee,
+; CHECK-O3-NEXT: (
+; CHECK-O3-NEXT: param0,
+; CHECK-O3-NEXT: param1
+; CHECK-O3-NEXT: );
+; CHECK-O3-NEXT: ld.param.b64 %rd3, [retval0];
+; CHECK-O3-NEXT: } // callseq 0
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O3-NEXT: ret;
  %r = call <2 x float> @test_callee(<2 x float> %a, <2 x float> %b)
  ret <2 x float> %r
}
; Same as test_call but with the arguments swapped at the call site, so the
; packed .b64 values are stored to param0/param1 in the opposite order --
; no repacking is required.
define <2 x float> @test_call_flipped(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-O0-LABEL: test_call_flipped(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b64 %rd<5>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_call_flipped_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_call_flipped_param_0];
+; CHECK-O0-NEXT: { // callseq 1, 0
+; CHECK-O0-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-O0-NEXT: st.param.b64 [param0], %rd2;
+; CHECK-O0-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-O0-NEXT: st.param.b64 [param1], %rd1;
+; CHECK-O0-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-O0-NEXT: call.uni (retval0),
+; CHECK-O0-NEXT: test_callee,
+; CHECK-O0-NEXT: (
+; CHECK-O0-NEXT: param0,
+; CHECK-O0-NEXT: param1
+; CHECK-O0-NEXT: );
+; CHECK-O0-NEXT: ld.param.b64 %rd3, [retval0];
+; CHECK-O0-NEXT: } // callseq 1
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_call_flipped(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b64 %rd<5>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_call_flipped_param_1];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_call_flipped_param_0];
+; CHECK-O3-NEXT: { // callseq 1, 0
+; CHECK-O3-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-O3-NEXT: st.param.b64 [param0], %rd1;
+; CHECK-O3-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-O3-NEXT: st.param.b64 [param1], %rd2;
+; CHECK-O3-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-O3-NEXT: call.uni (retval0),
+; CHECK-O3-NEXT: test_callee,
+; CHECK-O3-NEXT: (
+; CHECK-O3-NEXT: param0,
+; CHECK-O3-NEXT: param1
+; CHECK-O3-NEXT: );
+; CHECK-O3-NEXT: ld.param.b64 %rd3, [retval0];
+; CHECK-O3-NEXT: } // callseq 1
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O3-NEXT: ret;
  %r = call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a)
  ret <2 x float> %r
}
; Tail-call variant of the flipped call: expects the same param-slot call
; sequence (the tail call is not specially lowered here).
define <2 x float> @test_tailcall_flipped(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-O0-LABEL: test_tailcall_flipped(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b64 %rd<5>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_tailcall_flipped_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_tailcall_flipped_param_0];
+; CHECK-O0-NEXT: { // callseq 2, 0
+; CHECK-O0-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-O0-NEXT: st.param.b64 [param0], %rd2;
+; CHECK-O0-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-O0-NEXT: st.param.b64 [param1], %rd1;
+; CHECK-O0-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-O0-NEXT: call.uni (retval0),
+; CHECK-O0-NEXT: test_callee,
+; CHECK-O0-NEXT: (
+; CHECK-O0-NEXT: param0,
+; CHECK-O0-NEXT: param1
+; CHECK-O0-NEXT: );
+; CHECK-O0-NEXT: ld.param.b64 %rd3, [retval0];
+; CHECK-O0-NEXT: } // callseq 2
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_tailcall_flipped(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b64 %rd<5>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_tailcall_flipped_param_1];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_tailcall_flipped_param_0];
+; CHECK-O3-NEXT: { // callseq 2, 0
+; CHECK-O3-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-O3-NEXT: st.param.b64 [param0], %rd1;
+; CHECK-O3-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-O3-NEXT: st.param.b64 [param1], %rd2;
+; CHECK-O3-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-O3-NEXT: call.uni (retval0),
+; CHECK-O3-NEXT: test_callee,
+; CHECK-O3-NEXT: (
+; CHECK-O3-NEXT: param0,
+; CHECK-O3-NEXT: param1
+; CHECK-O3-NEXT: );
+; CHECK-O3-NEXT: ld.param.b64 %rd3, [retval0];
+; CHECK-O3-NEXT: } // callseq 2
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O3-NEXT: ret;
  %r = tail call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a)
  ret <2 x float> %r
}
; Scalar-condition select on <2 x float>: the i1 is materialized via
; u8 load / and / setp, and the whole packed pair is chosen with one
; selp.b64 (no per-lane selection needed).
define <2 x float> @test_select(<2 x float> %a, <2 x float> %b, i1 zeroext %c) #0 {
+; CHECK-O0-LABEL: test_select(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<2>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.u8 %rs1, [test_select_param_2];
+; CHECK-O0-NEXT: and.b16 %rs2, %rs1, 1;
+; CHECK-O0-NEXT: setp.eq.b16 %p1, %rs2, 1;
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_select_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_select_param_0];
+; CHECK-O0-NEXT: selp.b64 %rd3, %rd1, %rd2, %p1;
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_select(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<2>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.u8 %rs1, [test_select_param_2];
+; CHECK-O3-NEXT: and.b16 %rs2, %rs1, 1;
+; CHECK-O3-NEXT: setp.eq.b16 %p1, %rs2, 1;
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_select_param_1];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_select_param_0];
+; CHECK-O3-NEXT: selp.b64 %rd3, %rd2, %rd1, %p1;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O3-NEXT: ret;
  %r = select i1 %c, <2 x float> %a, <2 x float> %b
  ret <2 x float> %r
}
; Vector-condition select: the une compare of %c/%d and the selection of
; %a/%b are both done per lane on unpacked .f32 registers (setp.neu.f32 +
; selp.f32), then the result is repacked into a .b64.
define <2 x float> @test_select_cc(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) #0 {
+; CHECK-O0-LABEL: test_select_cc(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<11>;
+; CHECK-O0-NEXT: .reg .b64 %rd<6>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd4, [test_select_cc_param_3];
+; CHECK-O0-NEXT: ld.param.f64 %rd3, [test_select_cc_param_2];
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_select_cc_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_select_cc_param_0];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd4;
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd3;
+; CHECK-O0-NEXT: setp.neu.f32 %p1, %f3, %f1;
+; CHECK-O0-NEXT: setp.neu.f32 %p2, %f4, %f2;
+; CHECK-O0-NEXT: mov.b64 {%f5, %f6}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%f7, %f8}, %rd1;
+; CHECK-O0-NEXT: selp.f32 %f9, %f8, %f6, %p2;
+; CHECK-O0-NEXT: selp.f32 %f10, %f7, %f5, %p1;
+; CHECK-O0-NEXT: mov.b64 %rd5, {%f10, %f9};
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd5;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_select_cc(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<11>;
+; CHECK-O3-NEXT: .reg .b64 %rd<6>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_select_cc_param_0];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_select_cc_param_1];
+; CHECK-O3-NEXT: ld.param.f64 %rd3, [test_select_cc_param_2];
+; CHECK-O3-NEXT: ld.param.f64 %rd4, [test_select_cc_param_3];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd4;
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd3;
+; CHECK-O3-NEXT: setp.neu.f32 %p1, %f3, %f1;
+; CHECK-O3-NEXT: setp.neu.f32 %p2, %f4, %f2;
+; CHECK-O3-NEXT: mov.b64 {%f5, %f6}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%f7, %f8}, %rd1;
+; CHECK-O3-NEXT: selp.f32 %f9, %f8, %f6, %p2;
+; CHECK-O3-NEXT: selp.f32 %f10, %f7, %f5, %p1;
+; CHECK-O3-NEXT: mov.b64 %rd5, {%f10, %f9};
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd5;
+; CHECK-O3-NEXT: ret;
  %cc = fcmp une <2 x float> %c, %d
  %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
  ret <2 x float> %r
}
; Mixed widths: the f32 condition operands are unpacked from .b64 registers
; for the per-lane setp.neu.f32, while the f64 select operands use the
; ordinary v2.f64 parameter path and per-lane selp.f64.
define <2 x double> @test_select_cc_f64_f32(<2 x double> %a, <2 x double> %b, <2 x float> %c, <2 x float> %d) #0 {
+; CHECK-O0-LABEL: test_select_cc_f64_f32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-NEXT: .reg .f64 %fd<7>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f64 {%fd3, %fd4}, [test_select_cc_f64_f32_param_1];
+; CHECK-O0-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_select_cc_f64_f32_param_0];
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_select_cc_f64_f32_param_3];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_select_cc_f64_f32_param_2];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: setp.neu.f32 %p1, %f3, %f1;
+; CHECK-O0-NEXT: setp.neu.f32 %p2, %f4, %f2;
+; CHECK-O0-NEXT: selp.f64 %fd5, %fd2, %fd4, %p2;
+; CHECK-O0-NEXT: selp.f64 %fd6, %fd1, %fd3, %p1;
+; CHECK-O0-NEXT: st.param.v2.f64 [func_retval0], {%fd6, %fd5};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_select_cc_f64_f32(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<3>;
+; CHECK-O3-NEXT: .reg .f64 %fd<7>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_select_cc_f64_f32_param_0];
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_select_cc_f64_f32_param_2];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_select_cc_f64_f32_param_3];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: setp.neu.f32 %p1, %f3, %f1;
+; CHECK-O3-NEXT: setp.neu.f32 %p2, %f4, %f2;
+; CHECK-O3-NEXT: ld.param.v2.f64 {%fd3, %fd4}, [test_select_cc_f64_f32_param_1];
+; CHECK-O3-NEXT: selp.f64 %fd5, %fd2, %fd4, %p2;
+; CHECK-O3-NEXT: selp.f64 %fd6, %fd1, %fd3, %p1;
+; CHECK-O3-NEXT: st.param.v2.f64 [func_retval0], {%fd6, %fd5};
+; CHECK-O3-NEXT: ret;
  %cc = fcmp une <2 x float> %c, %d
  %r = select <2 x i1> %cc, <2 x double> %a, <2 x double> %b
  ret <2 x double> %r
}
; Converse of the f64_f32 test: the condition compares f64 lanes directly
; (setp.neu.f64), while the f32 result lanes are unpacked from .b64, chosen
; with selp.f32, and repacked for the return.
define <2 x float> @test_select_cc_f32_f64(<2 x float> %a, <2 x float> %b, <2 x double> %c, <2 x double> %d) #0 {
+; CHECK-O0-LABEL: test_select_cc_f32_f64(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<7>;
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-NEXT: .reg .f64 %fd<5>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f64 {%fd3, %fd4}, [test_select_cc_f32_f64_param_3];
+; CHECK-O0-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_select_cc_f32_f64_param_2];
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_select_cc_f32_f64_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_select_cc_f32_f64_param_0];
+; CHECK-O0-NEXT: setp.neu.f64 %p1, %fd1, %fd3;
+; CHECK-O0-NEXT: setp.neu.f64 %p2, %fd2, %fd4;
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: selp.f32 %f5, %f4, %f2, %p2;
+; CHECK-O0-NEXT: selp.f32 %f6, %f3, %f1, %p1;
+; CHECK-O0-NEXT: mov.b64 %rd3, {%f6, %f5};
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_select_cc_f32_f64(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<7>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-NEXT: .reg .f64 %fd<5>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_select_cc_f32_f64_param_0];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_select_cc_f32_f64_param_1];
+; CHECK-O3-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_select_cc_f32_f64_param_2];
+; CHECK-O3-NEXT: ld.param.v2.f64 {%fd3, %fd4}, [test_select_cc_f32_f64_param_3];
+; CHECK-O3-NEXT: setp.neu.f64 %p1, %fd1, %fd3;
+; CHECK-O3-NEXT: setp.neu.f64 %p2, %fd2, %fd4;
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: selp.f32 %f5, %f4, %f2, %p2;
+; CHECK-O3-NEXT: selp.f32 %f6, %f3, %f1, %p1;
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f6, %f5};
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O3-NEXT: ret;
  %cc = fcmp une <2 x double> %c, %d
  %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
  ret <2 x float> %r
}
; fcmp une on <2 x float>: operands are unpacked from .b64, compared per
; lane with setp.neu.f32, widened to -1/0 via selp.u16, and the two i1
; results are returned as separate bytes of func_retval0.
define <2 x i1> @test_fcmp_une(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-O0-LABEL: test_fcmp_une(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_une_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_une_param_0];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: setp.neu.f32 %p1, %f4, %f2;
+; CHECK-O0-NEXT: setp.neu.f32 %p2, %f3, %f1;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_une(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_une_param_0];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_une_param_1];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: setp.neu.f32 %p1, %f4, %f2;
+; CHECK-O3-NEXT: setp.neu.f32 %p2, %f3, %f1;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
  %r = fcmp une <2 x float> %a, %b
  ret <2 x i1> %r
}
; fcmp ueq: same unpack/compare/widen pattern as the une test, using the
; unordered-equal predicate setp.equ.f32 per lane.
define <2 x i1> @test_fcmp_ueq(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-O0-LABEL: test_fcmp_ueq(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_ueq_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_ueq_param_0];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: setp.equ.f32 %p1, %f4, %f2;
+; CHECK-O0-NEXT: setp.equ.f32 %p2, %f3, %f1;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_ueq(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_ueq_param_0];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_ueq_param_1];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: setp.equ.f32 %p1, %f4, %f2;
+; CHECK-O3-NEXT: setp.equ.f32 %p2, %f3, %f1;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
  %r = fcmp ueq <2 x float> %a, %b
  ret <2 x i1> %r
}
; fcmp ugt: per-lane setp.gtu.f32 (unordered greater-than), same
; unpack/widen/store-bytes pattern as the other fcmp tests.
define <2 x i1> @test_fcmp_ugt(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-O0-LABEL: test_fcmp_ugt(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_ugt_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_ugt_param_0];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: setp.gtu.f32 %p1, %f4, %f2;
+; CHECK-O0-NEXT: setp.gtu.f32 %p2, %f3, %f1;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_ugt(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_ugt_param_0];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_ugt_param_1];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: setp.gtu.f32 %p1, %f4, %f2;
+; CHECK-O3-NEXT: setp.gtu.f32 %p2, %f3, %f1;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
  %r = fcmp ugt <2 x float> %a, %b
  ret <2 x i1> %r
}
; fcmp uge: per-lane setp.geu.f32 (unordered greater-or-equal), same
; unpack/widen/store-bytes pattern as the other fcmp tests.
define <2 x i1> @test_fcmp_uge(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-O0-LABEL: test_fcmp_uge(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_uge_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_uge_param_0];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: setp.geu.f32 %p1, %f4, %f2;
+; CHECK-O0-NEXT: setp.geu.f32 %p2, %f3, %f1;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_uge(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_uge_param_0];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_uge_param_1];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: setp.geu.f32 %p1, %f4, %f2;
+; CHECK-O3-NEXT: setp.geu.f32 %p2, %f3, %f1;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
  %r = fcmp uge <2 x float> %a, %b
  ret <2 x i1> %r
}
; fcmp ult: per-lane setp.ltu.f32 (unordered less-than), same
; unpack/widen/store-bytes pattern as the other fcmp tests.
define <2 x i1> @test_fcmp_ult(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-O0-LABEL: test_fcmp_ult(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_ult_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_ult_param_0];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: setp.ltu.f32 %p1, %f4, %f2;
+; CHECK-O0-NEXT: setp.ltu.f32 %p2, %f3, %f1;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_ult(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_ult_param_0];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_ult_param_1];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: setp.ltu.f32 %p1, %f4, %f2;
+; CHECK-O3-NEXT: setp.ltu.f32 %p2, %f3, %f1;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
  %r = fcmp ult <2 x float> %a, %b
  ret <2 x i1> %r
}
; fcmp ule: per-lane setp.leu.f32 (unordered less-or-equal), same
; unpack/widen/store-bytes pattern as the other fcmp tests.
define <2 x i1> @test_fcmp_ule(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-O0-LABEL: test_fcmp_ule(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_ule_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_ule_param_0];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: setp.leu.f32 %p1, %f4, %f2;
+; CHECK-O0-NEXT: setp.leu.f32 %p2, %f3, %f1;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_ule(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_ule_param_0];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_ule_param_1];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: setp.leu.f32 %p1, %f4, %f2;
+; CHECK-O3-NEXT: setp.leu.f32 %p2, %f3, %f1;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
  %r = fcmp ule <2 x float> %a, %b
  ret <2 x i1> %r
}
; fcmp uno (unordered): per-lane setp.nan.f32 tests for NaN in either
; operand; same unpack/widen/store-bytes pattern as the other fcmp tests.
define <2 x i1> @test_fcmp_uno(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-O0-LABEL: test_fcmp_uno(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_uno_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_uno_param_0];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: setp.nan.f32 %p1, %f4, %f2;
+; CHECK-O0-NEXT: setp.nan.f32 %p2, %f3, %f1;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_uno(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_uno_param_0];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_uno_param_1];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: setp.nan.f32 %p1, %f4, %f2;
+; CHECK-O3-NEXT: setp.nan.f32 %p2, %f3, %f1;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
  %r = fcmp uno <2 x float> %a, %b
  ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_one(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-O0-LABEL: test_fcmp_one(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_one_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_one_param_0];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: setp.ne.f32 %p1, %f4, %f2;
+; CHECK-O0-NEXT: setp.ne.f32 %p2, %f3, %f1;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_one(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_one_param_0];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_one_param_1];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: setp.ne.f32 %p1, %f4, %f2;
+; CHECK-O3-NEXT: setp.ne.f32 %p2, %f3, %f1;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
%r = fcmp one <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_oeq(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-O0-LABEL: test_fcmp_oeq(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_oeq_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_oeq_param_0];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: setp.eq.f32 %p1, %f4, %f2;
+; CHECK-O0-NEXT: setp.eq.f32 %p2, %f3, %f1;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_oeq(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_oeq_param_0];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_oeq_param_1];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: setp.eq.f32 %p1, %f4, %f2;
+; CHECK-O3-NEXT: setp.eq.f32 %p2, %f3, %f1;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
%r = fcmp oeq <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_ogt(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-O0-LABEL: test_fcmp_ogt(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_ogt_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_ogt_param_0];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: setp.gt.f32 %p1, %f4, %f2;
+; CHECK-O0-NEXT: setp.gt.f32 %p2, %f3, %f1;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_ogt(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_ogt_param_0];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_ogt_param_1];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: setp.gt.f32 %p1, %f4, %f2;
+; CHECK-O3-NEXT: setp.gt.f32 %p2, %f3, %f1;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
%r = fcmp ogt <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_oge(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-O0-LABEL: test_fcmp_oge(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_oge_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_oge_param_0];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: setp.ge.f32 %p1, %f4, %f2;
+; CHECK-O0-NEXT: setp.ge.f32 %p2, %f3, %f1;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_oge(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_oge_param_0];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_oge_param_1];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: setp.ge.f32 %p1, %f4, %f2;
+; CHECK-O3-NEXT: setp.ge.f32 %p2, %f3, %f1;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
%r = fcmp oge <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_olt(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-O0-LABEL: test_fcmp_olt(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_olt_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_olt_param_0];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: setp.lt.f32 %p1, %f4, %f2;
+; CHECK-O0-NEXT: setp.lt.f32 %p2, %f3, %f1;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_olt(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_olt_param_0];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_olt_param_1];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: setp.lt.f32 %p1, %f4, %f2;
+; CHECK-O3-NEXT: setp.lt.f32 %p2, %f3, %f1;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
%r = fcmp olt <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_ole(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-O0-LABEL: test_fcmp_ole(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_ole_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_ole_param_0];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: setp.le.f32 %p1, %f4, %f2;
+; CHECK-O0-NEXT: setp.le.f32 %p2, %f3, %f1;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_ole(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_ole_param_0];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_ole_param_1];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: setp.le.f32 %p1, %f4, %f2;
+; CHECK-O3-NEXT: setp.le.f32 %p2, %f3, %f1;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
%r = fcmp ole <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_ord(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-O0-LABEL: test_fcmp_ord(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .pred %p<3>;
+; CHECK-O0-NEXT: .reg .b16 %rs<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<5>;
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_ord_param_1];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_ord_param_0];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O0-NEXT: setp.num.f32 %p1, %f4, %f2;
+; CHECK-O0-NEXT: setp.num.f32 %p2, %f3, %f1;
+; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fcmp_ord(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .pred %p<3>;
+; CHECK-O3-NEXT: .reg .b16 %rs<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<5>;
+; CHECK-O3-NEXT: .reg .b64 %rd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_ord_param_0];
+; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_ord_param_1];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-O3-NEXT: setp.num.f32 %p1, %f4, %f2;
+; CHECK-O3-NEXT: setp.num.f32 %p2, %f3, %f1;
+; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-O3-NEXT: ret;
%r = fcmp ord <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i32> @test_fptosi_i32(<2 x float> %a) #0 {
+; CHECK-O0-LABEL: test_fptosi_i32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<2>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fptosi_i32_param_0];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd1;
+; CHECK-O0-NEXT: cvt.rzi.s32.f32 %r1, %f2;
+; CHECK-O0-NEXT: cvt.rzi.s32.f32 %r2, %f1;
+; CHECK-O0-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fptosi_i32(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<2>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fptosi_i32_param_0];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd1;
+; CHECK-O3-NEXT: cvt.rzi.s32.f32 %r1, %f2;
+; CHECK-O3-NEXT: cvt.rzi.s32.f32 %r2, %f1;
+; CHECK-O3-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-O3-NEXT: ret;
%r = fptosi <2 x float> %a to <2 x i32>
ret <2 x i32> %r
}
define <2 x i64> @test_fptosi_i64(<2 x float> %a) #0 {
+; CHECK-O0-LABEL: test_fptosi_i64(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fptosi_i64_param_0];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd1;
+; CHECK-O0-NEXT: cvt.rzi.s64.f32 %rd2, %f2;
+; CHECK-O0-NEXT: cvt.rzi.s64.f32 %rd3, %f1;
+; CHECK-O0-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fptosi_i64(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fptosi_i64_param_0];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd1;
+; CHECK-O3-NEXT: cvt.rzi.s64.f32 %rd2, %f2;
+; CHECK-O3-NEXT: cvt.rzi.s64.f32 %rd3, %f1;
+; CHECK-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2};
+; CHECK-O3-NEXT: ret;
%r = fptosi <2 x float> %a to <2 x i64>
ret <2 x i64> %r
}
define <2 x i32> @test_fptoui_2xi32(<2 x float> %a) #0 {
+; CHECK-O0-LABEL: test_fptoui_2xi32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<2>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fptoui_2xi32_param_0];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd1;
+; CHECK-O0-NEXT: cvt.rzi.u32.f32 %r1, %f2;
+; CHECK-O0-NEXT: cvt.rzi.u32.f32 %r2, %f1;
+; CHECK-O0-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fptoui_2xi32(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<2>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fptoui_2xi32_param_0];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd1;
+; CHECK-O3-NEXT: cvt.rzi.u32.f32 %r1, %f2;
+; CHECK-O3-NEXT: cvt.rzi.u32.f32 %r2, %f1;
+; CHECK-O3-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-O3-NEXT: ret;
%r = fptoui <2 x float> %a to <2 x i32>
ret <2 x i32> %r
}
define <2 x i64> @test_fptoui_2xi64(<2 x float> %a) #0 {
+; CHECK-O0-LABEL: test_fptoui_2xi64(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fptoui_2xi64_param_0];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd1;
+; CHECK-O0-NEXT: cvt.rzi.u64.f32 %rd2, %f2;
+; CHECK-O0-NEXT: cvt.rzi.u64.f32 %rd3, %f1;
+; CHECK-O0-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fptoui_2xi64(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fptoui_2xi64_param_0];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd1;
+; CHECK-O3-NEXT: cvt.rzi.u64.f32 %rd2, %f2;
+; CHECK-O3-NEXT: cvt.rzi.u64.f32 %rd3, %f1;
+; CHECK-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2};
+; CHECK-O3-NEXT: ret;
%r = fptoui <2 x float> %a to <2 x i64>
ret <2 x i64> %r
}
define <2 x float> @test_uitofp_2xi32(<2 x i32> %a) #0 {
+; CHECK-O0-LABEL: test_uitofp_2xi32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<2>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_param_0];
+; CHECK-O0-NEXT: cvt.rn.f32.u32 %f1, %r2;
+; CHECK-O0-NEXT: cvt.rn.f32.u32 %f2, %r1;
+; CHECK-O0-NEXT: mov.b64 %rd1, {%f2, %f1};
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_uitofp_2xi32(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<2>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_param_0];
+; CHECK-O3-NEXT: cvt.rn.f32.u32 %f1, %r2;
+; CHECK-O3-NEXT: cvt.rn.f32.u32 %f2, %r1;
+; CHECK-O3-NEXT: mov.b64 %rd1, {%f2, %f1};
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-O3-NEXT: ret;
%r = uitofp <2 x i32> %a to <2 x float>
ret <2 x float> %r
}
define <2 x float> @test_uitofp_2xi64(<2 x i64> %a) #0 {
+; CHECK-O0-LABEL: test_uitofp_2xi64(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_uitofp_2xi64_param_0];
+; CHECK-O0-NEXT: cvt.rn.f32.u64 %f1, %rd2;
+; CHECK-O0-NEXT: cvt.rn.f32.u64 %f2, %rd1;
+; CHECK-O0-NEXT: mov.b64 %rd3, {%f2, %f1};
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_uitofp_2xi64(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_uitofp_2xi64_param_0];
+; CHECK-O3-NEXT: cvt.rn.f32.u64 %f1, %rd2;
+; CHECK-O3-NEXT: cvt.rn.f32.u64 %f2, %rd1;
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f2, %f1};
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O3-NEXT: ret;
%r = uitofp <2 x i64> %a to <2 x float>
ret <2 x float> %r
}
define <2 x float> @test_sitofp_2xi32(<2 x i32> %a) #0 {
+; CHECK-O0-LABEL: test_sitofp_2xi32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<2>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_param_0];
+; CHECK-O0-NEXT: cvt.rn.f32.s32 %f1, %r2;
+; CHECK-O0-NEXT: cvt.rn.f32.s32 %f2, %r1;
+; CHECK-O0-NEXT: mov.b64 %rd1, {%f2, %f1};
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_sitofp_2xi32(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<2>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_param_0];
+; CHECK-O3-NEXT: cvt.rn.f32.s32 %f1, %r2;
+; CHECK-O3-NEXT: cvt.rn.f32.s32 %f2, %r1;
+; CHECK-O3-NEXT: mov.b64 %rd1, {%f2, %f1};
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-O3-NEXT: ret;
%r = sitofp <2 x i32> %a to <2 x float>
ret <2 x float> %r
}
define <2 x float> @test_sitofp_2xi64(<2 x i64> %a) #0 {
+; CHECK-O0-LABEL: test_sitofp_2xi64(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_sitofp_2xi64_param_0];
+; CHECK-O0-NEXT: cvt.rn.f32.s64 %f1, %rd2;
+; CHECK-O0-NEXT: cvt.rn.f32.s64 %f2, %rd1;
+; CHECK-O0-NEXT: mov.b64 %rd3, {%f2, %f1};
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_sitofp_2xi64(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_sitofp_2xi64_param_0];
+; CHECK-O3-NEXT: cvt.rn.f32.s64 %f1, %rd2;
+; CHECK-O3-NEXT: cvt.rn.f32.s64 %f2, %rd1;
+; CHECK-O3-NEXT: mov.b64 %rd3, {%f2, %f1};
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O3-NEXT: ret;
%r = sitofp <2 x i64> %a to <2 x float>
ret <2 x float> %r
}
define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 {
+; CHECK-O0-LABEL: test_uitofp_2xi32_fadd(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<3>;
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<4>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_uitofp_2xi32_fadd_param_1];
+; CHECK-O0-NEXT: cvt.rn.f32.u32 %f1, %r2;
+; CHECK-O0-NEXT: cvt.rn.f32.u32 %f2, %r1;
+; CHECK-O0-NEXT: mov.b64 %rd2, {%f2, %f1};
+; CHECK-O0-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_uitofp_2xi32_fadd(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<4>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_uitofp_2xi32_fadd_param_1];
+; CHECK-O3-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
+; CHECK-O3-NEXT: cvt.rn.f32.u32 %f1, %r2;
+; CHECK-O3-NEXT: cvt.rn.f32.u32 %f2, %r1;
+; CHECK-O3-NEXT: mov.b64 %rd2, {%f2, %f1};
+; CHECK-O3-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-O3-NEXT: ret;
%c = uitofp <2 x i32> %a to <2 x float>
%r = fadd <2 x float> %b, %c
ret <2 x float> %r
}
define <2 x float> @test_fptrunc_2xdouble(<2 x double> %a) #0 {
+; CHECK-O0-LABEL: test_fptrunc_2xdouble(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<2>;
+; CHECK-O0-NEXT: .reg .f64 %fd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_fptrunc_2xdouble_param_0];
+; CHECK-O0-NEXT: cvt.rn.f32.f64 %f1, %fd2;
+; CHECK-O0-NEXT: cvt.rn.f32.f64 %f2, %fd1;
+; CHECK-O0-NEXT: mov.b64 %rd1, {%f2, %f1};
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fptrunc_2xdouble(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<2>;
+; CHECK-O3-NEXT: .reg .f64 %fd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_fptrunc_2xdouble_param_0];
+; CHECK-O3-NEXT: cvt.rn.f32.f64 %f1, %fd2;
+; CHECK-O3-NEXT: cvt.rn.f32.f64 %f2, %fd1;
+; CHECK-O3-NEXT: mov.b64 %rd1, {%f2, %f1};
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-O3-NEXT: ret;
%r = fptrunc <2 x double> %a to <2 x float>
ret <2 x float> %r
}
define <2 x double> @test_fpext_2xdouble(<2 x float> %a) #0 {
+; CHECK-O0-LABEL: test_fpext_2xdouble(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .f32 %f<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<2>;
+; CHECK-O0-NEXT: .reg .f64 %fd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fpext_2xdouble_param_0];
+; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd1;
+; CHECK-O0-NEXT: cvt.f64.f32 %fd1, %f2;
+; CHECK-O0-NEXT: cvt.f64.f32 %fd2, %f1;
+; CHECK-O0-NEXT: st.param.v2.f64 [func_retval0], {%fd2, %fd1};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_fpext_2xdouble(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f32 %f<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<2>;
+; CHECK-O3-NEXT: .reg .f64 %fd<3>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fpext_2xdouble_param_0];
+; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd1;
+; CHECK-O3-NEXT: cvt.f64.f32 %fd1, %f2;
+; CHECK-O3-NEXT: cvt.f64.f32 %fd2, %f1;
+; CHECK-O3-NEXT: st.param.v2.f64 [func_retval0], {%fd2, %fd1};
+; CHECK-O3-NEXT: ret;
%r = fpext <2 x float> %a to <2 x double>
ret <2 x double> %r
}
define <2 x i32> @test_bitcast_2xfloat_to_2xi32(<2 x float> %a) #0 {
+; CHECK-O0-LABEL: test_bitcast_2xfloat_to_2xi32(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.u64 %rd2, [test_bitcast_2xfloat_to_2xi32_param_0];
+; CHECK-O0-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd2; }
+; CHECK-O0-NEXT: cvt.u32.u64 %r2, %rd2;
+; CHECK-O0-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_bitcast_2xfloat_to_2xi32(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b32 %r<3>;
+; CHECK-O3-NEXT: .reg .b64 %rd<2>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.u64 %rd1, [test_bitcast_2xfloat_to_2xi32_param_0];
+; CHECK-O3-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd1; }
+; CHECK-O3-NEXT: cvt.u32.u64 %r2, %rd1;
+; CHECK-O3-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-O3-NEXT: ret;
%r = bitcast <2 x float> %a to <2 x i32>
ret <2 x i32> %r
}
define <2 x float> @test_bitcast_2xi32_to_2xfloat(<2 x i32> %a) #0 {
+; CHECK-O0-LABEL: test_bitcast_2xi32_to_2xfloat(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b32 %r<3>;
+; CHECK-O0-NEXT: .reg .b64 %rd<6>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_bitcast_2xi32_to_2xfloat_param_0];
+; CHECK-O0-NEXT: cvt.u64.u32 %rd1, %r1;
+; CHECK-O0-NEXT: cvt.u64.u32 %rd2, %r2;
+; CHECK-O0-NEXT: shl.b64 %rd3, %rd2, 32;
+; CHECK-O0-NEXT: or.b64 %rd4, %rd1, %rd3;
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd4;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_bitcast_2xi32_to_2xfloat(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b64 %rd<2>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_bitcast_2xi32_to_2xfloat_param_0];
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-O3-NEXT: ret;
%r = bitcast <2 x i32> %a to <2 x float>
ret <2 x float> %r
}
define <2 x float> @test_bitcast_double_to_2xfloat(double %a) #0 {
+; CHECK-O0-LABEL: test_bitcast_double_to_2xfloat(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b64 %rd<2>;
+; CHECK-O0-NEXT: .reg .f64 %fd<2>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.f64 %fd1, [test_bitcast_double_to_2xfloat_param_0];
+; CHECK-O0-NEXT: mov.b64 %rd1, %fd1;
+; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_bitcast_double_to_2xfloat(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .b64 %rd<2>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_bitcast_double_to_2xfloat_param_0];
+; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-O3-NEXT: ret;
%r = bitcast double %a to <2 x float>
ret <2 x float> %r
}
define double @test_bitcast_2xfloat_to_double(<2 x float> %a) #0 {
+; CHECK-O0-LABEL: test_bitcast_2xfloat_to_double(
+; CHECK-O0: {
+; CHECK-O0-NEXT: .reg .b64 %rd<3>;
+; CHECK-O0-NEXT: .reg .f64 %fd<2>;
+; CHECK-O0-EMPTY:
+; CHECK-O0-NEXT: // %bb.0:
+; CHECK-O0-NEXT: ld.param.u64 %rd2, [test_bitcast_2xfloat_to_double_param_0];
+; CHECK-O0-NEXT: mov.b64 %fd1, %rd2;
+; CHECK-O0-NEXT: st.param.f64 [func_retval0], %fd1;
+; CHECK-O0-NEXT: ret;
+;
+; CHECK-O3-LABEL: test_bitcast_2xfloat_to_double(
+; CHECK-O3: {
+; CHECK-O3-NEXT: .reg .f64 %fd<2>;
+; CHECK-O3-EMPTY:
+; CHECK-O3-NEXT: // %bb.0:
+; CHECK-O3-NEXT: ld.param.f64 %fd1, [test_bitcast_2xfloat_to_double_param_0];
+; CHECK-O3-NEXT: st.param.f64 [func_retval0], %fd1;
+; CHECK-O3-NEXT: ret;
%r = bitcast <2 x float> %a to double
ret double %r
}
>From 6ca83cc15682b6df65643a0ffd8495f619c63c02 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Tue, 11 Feb 2025 17:13:13 -0800
Subject: [PATCH 10/25] [NVPTX] add combiner rule for v2[b]f16 = fp_round v2f32
Now that v2f32 is legal, this node will go straight to instruction
selection. Instead, we want to break it up into two nodes, which can be
handled better in instruction selection, since the final instruction
(cvt.[b]f16x2.f32) takes two f32 arguments.
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 38 ++++++++++++++++++++-
1 file changed, 37 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 937f6ab478a34..df518942d47d0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -825,7 +825,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// We have some custom DAG combine patterns for these nodes
setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, ISD::VSELECT,
- ISD::BUILD_VECTOR, ISD::ADDRSPACECAST});
+ ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::FP_ROUND});
// setcc for f16x2 and bf16x2 needs special handling to prevent
// legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -5582,6 +5582,40 @@ static SDValue combineADDRSPACECAST(SDNode *N,
return SDValue();
}
+static SDValue PerformFP_ROUNDCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SDLoc DL(N);
+ SDValue Op = N->getOperand(0);
+ SDValue Trunc = N->getOperand(1);
+ EVT NarrowVT = N->getValueType(0);
+ EVT WideVT = Op.getValueType();
+
+ // v2[b]f16 = fp_round (v2f32 A)
+ // -> v2[b]f16 = (build_vector ([b]f16 = fp_round (extractelt A, 0)),
+ // ([b]f16 = fp_round (extractelt A, 1)))
+ if ((NarrowVT == MVT::v2bf16 || NarrowVT == MVT::v2f16) &&
+ WideVT == MVT::v2f32) {
+ SDValue F32Op0, F32Op1;
+ if (Op.getOpcode() == ISD::BUILD_VECTOR) {
+ F32Op0 = Op.getOperand(0);
+ F32Op1 = Op.getOperand(1);
+ } else {
+ F32Op0 = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op,
+ DCI.DAG.getIntPtrConstant(0, DL));
+ F32Op1 = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op,
+ DCI.DAG.getIntPtrConstant(1, DL));
+ }
+ return DCI.DAG.getBuildVector(
+ NarrowVT, DL,
+ {DCI.DAG.getNode(ISD::FP_ROUND, DL, NarrowVT.getScalarType(), F32Op0,
+ Trunc),
+ DCI.DAG.getNode(ISD::FP_ROUND, DL, NarrowVT.getScalarType(), F32Op1,
+ Trunc)});
+ }
+
+ return SDValue();
+}
+
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -5618,6 +5652,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return PerformBUILD_VECTORCombine(N, DCI);
case ISD::ADDRSPACECAST:
return combineADDRSPACECAST(N, DCI);
+ case ISD::FP_ROUND:
+ return PerformFP_ROUNDCombine(N, DCI);
}
return SDValue();
}
>From 0e3ee1b995f3d5a495baea4b0867524e53941c04 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Tue, 11 Feb 2025 17:28:44 -0800
Subject: [PATCH 11/25] [NVPTX] expand fp_extend v2f32
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index df518942d47d0..c635cd7b8c02d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -893,6 +893,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::FP_ROUND, VT, Custom);
}
}
+ setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
// sm_80 only has conversions between f32 and bf16. Custom lower all other
// bf16 conversions.
>From dfa0918ad8b0822f7acbe207a1204efdde1d1ccf Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Tue, 11 Feb 2025 17:45:48 -0800
Subject: [PATCH 12/25] [NVPTX] expand fexp2 and flog2 for v2f32
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index c635cd7b8c02d..d1456f66ba139 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -986,6 +986,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setFP16OperationAction(ISD::FEXP2, MVT::v2f16, Legal, Expand);
setBF16OperationAction(ISD::FEXP2, MVT::bf16, Legal, Promote);
setBF16OperationAction(ISD::FEXP2, MVT::v2bf16, Legal, Expand);
+ setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
// FLOG2 supports f32 only
// f16/bf16 types aren't supported, but they are promoted/expanded to f32.
@@ -993,7 +994,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::FLOG2, MVT::f32, Legal);
setOperationPromotedToType(ISD::FLOG2, MVT::f16, MVT::f32);
setOperationPromotedToType(ISD::FLOG2, MVT::bf16, MVT::f32);
- setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16}, Expand);
+ setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16, MVT::v2f32},
+ Expand);
}
setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
>From c3753894e7d6f4d7e633246a725b1d71eadd7cf1 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Tue, 11 Feb 2025 17:54:57 -0800
Subject: [PATCH 13/25] [NVPTX] handle v2f32 for LDU/LDG
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 2b91aba9c9ec4..85c3add42a6d3 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1277,7 +1277,8 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
EltVT = EltVT.getVectorElementType();
// vectors of 8/16bits type are loaded/stored as multiples of v4i8/v2x16
// elements.
- if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) ||
+ if ((EltVT == MVT::f32 && OrigType == MVT::v2f32) ||
+ (EltVT == MVT::f16 && OrigType == MVT::v2f16) ||
(EltVT == MVT::bf16 && OrigType == MVT::v2bf16) ||
(EltVT == MVT::i16 && OrigType == MVT::v2i16) ||
(EltVT == MVT::i8 && OrigType == MVT::v4i8)) {
>From 901cb3984f4b80c3361890dbab44ecbe8e068d74 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Tue, 11 Feb 2025 18:08:15 -0800
Subject: [PATCH 14/25] [NVPTX] only legalize fadd, fsub, fmul, fma for v2f32 on
sm_100+
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index d1456f66ba139..e3e93f880d41a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -846,8 +846,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
if (getOperationAction(Op, MVT::bf16) == Promote)
AddPromotedToType(Op, MVT::bf16, MVT::f32);
- if (STI.hasF32x2Instructions())
- setOperationAction(Op, MVT::v2f32, Legal);
+ setOperationAction(Op, MVT::v2f32,
+ STI.hasF32x2Instructions() ? Legal : Expand);
}
// On SM80, we select add/mul/sub as fma to avoid promotion to float
>From f5384e5fc296c16b72877ece5fdcb30fe6c846c1 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Tue, 11 Feb 2025 18:12:32 -0800
Subject: [PATCH 15/25] [NVPTX] lower store v2f32 to st.b64
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 85c3add42a6d3..98d157b9d91e3 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1426,10 +1426,14 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
MVT ScalarVT = SimpleVT.getScalarType();
unsigned ToTypeWidth = ScalarVT.getSizeInBits();
if (SimpleVT.isVector()) {
- assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) &&
- "Unexpected vector type");
- // v2x16 is stored using st.b32
- ToTypeWidth = 32;
+ if (Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8)
+ // v2x16 is stored using st.b32
+ ToTypeWidth = 32;
+ else if (StoreVT == MVT::v2f32)
+ // v2f32 is stored using st.b64
+ ToTypeWidth = 64;
+ else
+ llvm_unreachable("Unexpected vector type");
}
unsigned int ToType = getLdStRegType(ScalarVT);
>From 9a7c82e863ef0e635e1be1ec6fbe5e94c0fd33a1 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Tue, 11 Feb 2025 19:13:29 -0800
Subject: [PATCH 16/25] [NVPTX] expand vector_shuffle, insertelt for v2f32 and
lower i64 bitcast
Fixes test/CodeGen/Generic/vector.ll
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 2 ++
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 3 +++
2 files changed, 5 insertions(+)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index e3e93f880d41a..43fd9b15d275a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -619,6 +619,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Expand);
// Custom conversions to/from v2i8.
setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index acbc4b369edb7..08f106b978134 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2896,6 +2896,9 @@ foreach ta = [v2f16, v2bf16, v2i16, v4i8, i32] in {
}
}
+def: Pat<(i64 (bitconvert v2f32:$a)),
+ (i64 Int64Regs:$a)>;
+
// NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
// we cannot specify floating-point literals in isel patterns. Therefore, we
// use an integer selp to select either 1 (or -1 in case of signed) or 0
>From 0e3e538eae148fd15743cc1d6739e62783a9e7d7 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Tue, 11 Feb 2025 19:52:49 -0800
Subject: [PATCH 17/25] [NVPTX] add combiner rule to peek through bitcast of
BUILD_VECTOR
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 48 ++++++++++++++++++++-
1 file changed, 47 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 43fd9b15d275a..8f306883eddca 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -827,7 +827,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// We have some custom DAG combine patterns for these nodes
setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, ISD::VSELECT,
- ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::FP_ROUND});
+ ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::FP_ROUND,
+ ISD::TRUNCATE});
// setcc for f16x2 and bf16x2 needs special handling to prevent
// legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -5621,6 +5622,49 @@ static SDValue PerformFP_ROUNDCombine(SDNode *N,
return SDValue();
}
+static SDValue PerformTRUNCATECombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SDLoc DL(N);
+ SDValue Op = N->getOperand(0);
+ EVT FromVT = Op.getValueType();
+ EVT ResultVT = N->getValueType(0);
+
+ if (FromVT == MVT::i64 && ResultVT == MVT::i32) {
+ // i32 = truncate (i64 = bitcast (v2f32 = BUILD_VECTOR (f32 A, f32 B)))
+ // -> i32 = bitcast (f32 A)
+ if (Op.getOpcode() == ISD::BITCAST) {
+ SDValue BV = Op.getOperand(0);
+ if (BV.getOpcode() == ISD::BUILD_VECTOR &&
+ BV.getValueType() == MVT::v2f32) {
+ // get lower
+ return DCI.DAG.getNode(ISD::BITCAST, DL, ResultVT, BV.getOperand(0));
+ }
+ }
+
+ // i32 = truncate (i64 = srl
+ // (i64 = bitcast
+ // (v2f32 = BUILD_VECTOR (f32 A, f32 B))), 32)
+ // -> i32 = bitcast (f32 B)
+ if (Op.getOpcode() == ISD::SRL) {
+ if (auto *ShAmt = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ ShAmt && ShAmt->getAsAPIntVal() == 32) {
+ SDValue Cast = Op.getOperand(0);
+ if (Cast.getOpcode() == ISD::BITCAST) {
+ SDValue BV = Cast.getOperand(0);
+ if (BV.getOpcode() == ISD::BUILD_VECTOR &&
+ BV.getValueType() == MVT::v2f32) {
+ // get upper
+ return DCI.DAG.getNode(ISD::BITCAST, DL, ResultVT,
+ BV.getOperand(1));
+ }
+ }
+ }
+ }
+ }
+
+ return SDValue();
+}
+
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -5659,6 +5703,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return combineADDRSPACECAST(N, DCI);
case ISD::FP_ROUND:
return PerformFP_ROUNDCombine(N, DCI);
+ case ISD::TRUNCATE:
+ return PerformTRUNCATECombine(N, DCI);
}
return SDValue();
}
>From 0515649b7e3a45d1b5c97e558001867ca3d6d8e9 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Thu, 13 Feb 2025 13:24:49 -0800
Subject: [PATCH 18/25] [NVPTX] loads, stores of v2f32 are untyped
Ensures ld.b64 and st.b64 for v2f32. Also remove -O3 in
f32x2-instructions.ll test.
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 49 +-
llvm/test/CodeGen/NVPTX/f32x2-instructions.ll | 3088 ++++++-----------
2 files changed, 1037 insertions(+), 2100 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 98d157b9d91e3..4f19b4817fc83 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1060,6 +1060,7 @@ static int getLdStRegType(EVT VT) {
case MVT::bf16:
case MVT::v2f16:
case MVT::v2bf16:
+ case MVT::v2f32:
return NVPTX::PTXLdStInstCode::Untyped;
default:
return NVPTX::PTXLdStInstCode::Float;
@@ -1099,24 +1100,27 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
// Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
MVT SimpleVT = LoadedVT.getSimpleVT();
MVT ScalarVT = SimpleVT.getScalarType();
- // Read at least 8 bits (predicates are stored as 8-bit values)
- unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
- unsigned int FromType;
// Vector Setting
unsigned VecType = NVPTX::PTXLdStInstCode::Scalar;
if (SimpleVT.isVector()) {
- if (Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8)
- // v2f16/v2bf16/v2i16 is loaded using ld.b32
- FromTypeWidth = 32;
- else if (LoadedVT == MVT::v2f32)
- // v2f32 is loaded using ld.b64
- FromTypeWidth = 64;
- else
- llvm_unreachable("Unexpected vector type");
+ switch (LoadedVT.getSimpleVT().SimpleTy) {
+ case MVT::v2f16:
+ case MVT::v2bf16:
+ case MVT::v2i16:
+ case MVT::v4i8:
+ case MVT::v2f32:
+ ScalarVT = LoadedVT.getSimpleVT();
+ break;
+ default:
+ llvm_unreachable("Unsupported vector type for non-vector load");
+ }
}
- if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
+ // Read at least 8 bits (predicates are stored as 8-bit values)
+ unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
+ unsigned int FromType;
+ if (PlainLoad && PlainLoad->getExtensionType() == ISD::SEXTLOAD)
FromType = NVPTX::PTXLdStInstCode::Signed;
else
FromType = getLdStRegType(ScalarVT);
@@ -1424,18 +1428,21 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
// Type Setting: toType + toTypeWidth
// - for integer type, always use 'u'
MVT ScalarVT = SimpleVT.getScalarType();
- unsigned ToTypeWidth = ScalarVT.getSizeInBits();
if (SimpleVT.isVector()) {
- if (Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8)
- // v2x16 is stored using st.b32
- ToTypeWidth = 32;
- else if (StoreVT == MVT::v2f32)
- // v2f32 is stored using st.b64
- ToTypeWidth = 64;
- else
- llvm_unreachable("Unexpected vector type");
+ switch (StoreVT.getSimpleVT().SimpleTy) {
+ case MVT::v2f16:
+ case MVT::v2bf16:
+ case MVT::v2i16:
+ case MVT::v4i8:
+ case MVT::v2f32:
+ ScalarVT = StoreVT.getSimpleVT();
+ break;
+ default:
+ llvm_unreachable("Unsupported vector type for non-vector store");
+ }
}
+ unsigned ToTypeWidth = ScalarVT.getSizeInBits();
unsigned int ToType = getLdStRegType(ScalarVT);
// Create the machine instruction DAG
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
index 97cde07ed2003..8f4fd3c6e6ee3 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
@@ -2,95 +2,57 @@
; ## Full FP32x2 support enabled by default.
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100 \
; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
-; RUN: | FileCheck --check-prefixes=CHECK-O0 %s
+; RUN: | FileCheck --check-prefixes=CHECK %s
; RUN: %if ptxas %{ \
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100 \
; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
; RUN: | %ptxas-verify -arch=sm_100 \
; RUN: %}
-; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100 \
-; RUN: -O3 -verify-machineinstrs \
-; RUN: | FileCheck --check-prefixes=CHECK-O3 %s
-; RUN: %if ptxas %{ \
-; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_100 \
-; RUN: -O3 -verify-machineinstrs \
-; RUN: | %ptxas-verify -arch=sm_100 \
-; RUN: %}
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "nvptx64-nvidia-cuda"
define <2 x float> @test_ret_const() #0 {
-; CHECK-O0-LABEL: test_ret_const(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .f32 %f<3>;
-; CHECK-O0-NEXT: .reg .b64 %rd<2>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: mov.f32 %f1, 0f40000000;
-; CHECK-O0-NEXT: mov.f32 %f2, 0f3F800000;
-; CHECK-O0-NEXT: mov.b64 %rd1, {%f2, %f1};
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd1;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_ret_const(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<3>;
-; CHECK-O3-NEXT: .reg .b64 %rd<2>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: mov.f32 %f1, 0f40000000;
-; CHECK-O3-NEXT: mov.f32 %f2, 0f3F800000;
-; CHECK-O3-NEXT: mov.b64 %rd1, {%f2, %f1};
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd1;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_ret_const(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.f32 %f1, 0f40000000;
+; CHECK-NEXT: mov.f32 %f2, 0f3F800000;
+; CHECK-NEXT: mov.b64 %rd1, {%f2, %f1};
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-NEXT: ret;
ret <2 x float> <float 1.0, float 2.0>
}
define float @test_extract_0(<2 x float> %a) #0 {
-; CHECK-O0-LABEL: test_extract_0(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .f32 %f<2>;
-; CHECK-O0-NEXT: .reg .b64 %rd<2>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_extract_0_param_0];
-; CHECK-O0-NEXT: { .reg .b32 tmp; mov.b64 {%f1, tmp}, %rd1; }
-; CHECK-O0-NEXT: st.param.f32 [func_retval0], %f1;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_extract_0(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<2>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f32 %f1, [test_extract_0_param_0];
-; CHECK-O3-NEXT: st.param.f32 [func_retval0], %f1;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_extract_0(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_0_param_0];
+; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {%f1, tmp}, %rd1; }
+; CHECK-NEXT: st.param.f32 [func_retval0], %f1;
+; CHECK-NEXT: ret;
%e = extractelement <2 x float> %a, i32 0
ret float %e
}
define float @test_extract_1(<2 x float> %a) #0 {
-; CHECK-O0-LABEL: test_extract_1(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .f32 %f<2>;
-; CHECK-O0-NEXT: .reg .b64 %rd<2>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_extract_1_param_0];
-; CHECK-O0-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %f1}, %rd1; }
-; CHECK-O0-NEXT: st.param.f32 [func_retval0], %f1;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_extract_1(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<2>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f32 %f1, [test_extract_1_param_0+4];
-; CHECK-O3-NEXT: st.param.f32 [func_retval0], %f1;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_extract_1(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<2>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_1_param_0];
+; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %f1}, %rd1; }
+; CHECK-NEXT: st.param.f32 [func_retval0], %f1;
+; CHECK-NEXT: ret;
%e = extractelement <2 x float> %a, i32 1
ret float %e
}
@@ -104,930 +66,523 @@ define float @test_extract_1(<2 x float> %a) #0 {
; }
define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-O0-LABEL: test_fadd(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b64 %rd<4>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fadd_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fadd_param_0];
-; CHECK-O0-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fadd(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fadd_param_1];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fadd_param_0];
-; CHECK-O3-NEXT: add.rn.f32x2 %rd3, %rd2, %rd1;
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fadd(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fadd_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fadd_param_0];
+; CHECK-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%r = fadd <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_fadd_imm_0(<2 x float> %a) #0 {
-; CHECK-O0-LABEL: test_fadd_imm_0(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .f32 %f<3>;
-; CHECK-O0-NEXT: .reg .b64 %rd<4>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fadd_imm_0_param_0];
-; CHECK-O0-NEXT: mov.f32 %f1, 0f40000000;
-; CHECK-O0-NEXT: mov.f32 %f2, 0f3F800000;
-; CHECK-O0-NEXT: mov.b64 %rd2, {%f2, %f1};
-; CHECK-O0-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fadd_imm_0(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<3>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fadd_imm_0_param_0];
-; CHECK-O3-NEXT: mov.f32 %f1, 0f40000000;
-; CHECK-O3-NEXT: mov.f32 %f2, 0f3F800000;
-; CHECK-O3-NEXT: mov.b64 %rd2, {%f2, %f1};
-; CHECK-O3-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fadd_imm_0(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fadd_imm_0_param_0];
+; CHECK-NEXT: mov.f32 %f1, 0f40000000;
+; CHECK-NEXT: mov.f32 %f2, 0f3F800000;
+; CHECK-NEXT: mov.b64 %rd2, {%f2, %f1};
+; CHECK-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%r = fadd <2 x float> <float 1.0, float 2.0>, %a
ret <2 x float> %r
}
define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 {
-; CHECK-O0-LABEL: test_fadd_imm_1(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .f32 %f<3>;
-; CHECK-O0-NEXT: .reg .b64 %rd<4>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fadd_imm_1_param_0];
-; CHECK-O0-NEXT: mov.f32 %f1, 0f40000000;
-; CHECK-O0-NEXT: mov.f32 %f2, 0f3F800000;
-; CHECK-O0-NEXT: mov.b64 %rd2, {%f2, %f1};
-; CHECK-O0-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fadd_imm_1(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<3>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fadd_imm_1_param_0];
-; CHECK-O3-NEXT: mov.f32 %f1, 0f40000000;
-; CHECK-O3-NEXT: mov.f32 %f2, 0f3F800000;
-; CHECK-O3-NEXT: mov.b64 %rd2, {%f2, %f1};
-; CHECK-O3-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fadd_imm_1(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fadd_imm_1_param_0];
+; CHECK-NEXT: mov.f32 %f1, 0f40000000;
+; CHECK-NEXT: mov.f32 %f2, 0f3F800000;
+; CHECK-NEXT: mov.b64 %rd2, {%f2, %f1};
+; CHECK-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%r = fadd <2 x float> %a, <float 1.0, float 2.0>
ret <2 x float> %r
}
define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-O0-LABEL: test_fadd_v4(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b64 %rd<11>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v2.u64 {%rd5, %rd6}, [test_fadd_v4_param_1];
-; CHECK-O0-NEXT: ld.param.v2.u64 {%rd7, %rd8}, [test_fadd_v4_param_0];
-; CHECK-O0-NEXT: add.rn.f32x2 %rd9, %rd8, %rd6;
-; CHECK-O0-NEXT: add.rn.f32x2 %rd10, %rd7, %rd5;
-; CHECK-O0-NEXT: st.param.v2.b64 [func_retval0], {%rd10, %rd9};
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fadd_v4(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b64 %rd<11>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_fadd_v4_param_1];
-; CHECK-O3-NEXT: ld.param.v2.u64 {%rd4, %rd5}, [test_fadd_v4_param_0];
-; CHECK-O3-NEXT: add.rn.f32x2 %rd9, %rd5, %rd2;
-; CHECK-O3-NEXT: add.rn.f32x2 %rd10, %rd4, %rd1;
-; CHECK-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd10, %rd9};
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fadd_v4(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<11>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.u64 {%rd5, %rd6}, [test_fadd_v4_param_1];
+; CHECK-NEXT: ld.param.v2.u64 {%rd7, %rd8}, [test_fadd_v4_param_0];
+; CHECK-NEXT: add.rn.f32x2 %rd9, %rd8, %rd6;
+; CHECK-NEXT: add.rn.f32x2 %rd10, %rd7, %rd5;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd10, %rd9};
+; CHECK-NEXT: ret;
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
}
define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 {
-; CHECK-O0-LABEL: test_fadd_imm_0_v4(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .f32 %f<5>;
-; CHECK-O0-NEXT: .reg .b64 %rd<9>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_fadd_imm_0_v4_param_0];
-; CHECK-O0-NEXT: mov.f32 %f1, 0f40800000;
-; CHECK-O0-NEXT: mov.f32 %f2, 0f40400000;
-; CHECK-O0-NEXT: mov.b64 %rd5, {%f2, %f1};
-; CHECK-O0-NEXT: add.rn.f32x2 %rd6, %rd4, %rd5;
-; CHECK-O0-NEXT: mov.f32 %f3, 0f40000000;
-; CHECK-O0-NEXT: mov.f32 %f4, 0f3F800000;
-; CHECK-O0-NEXT: mov.b64 %rd7, {%f4, %f3};
-; CHECK-O0-NEXT: add.rn.f32x2 %rd8, %rd3, %rd7;
-; CHECK-O0-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fadd_imm_0_v4(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<9>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_fadd_imm_0_v4_param_0];
-; CHECK-O3-NEXT: mov.f32 %f1, 0f40800000;
-; CHECK-O3-NEXT: mov.f32 %f2, 0f40400000;
-; CHECK-O3-NEXT: mov.b64 %rd5, {%f2, %f1};
-; CHECK-O3-NEXT: add.rn.f32x2 %rd6, %rd2, %rd5;
-; CHECK-O3-NEXT: mov.f32 %f3, 0f40000000;
-; CHECK-O3-NEXT: mov.f32 %f4, 0f3F800000;
-; CHECK-O3-NEXT: mov.b64 %rd7, {%f4, %f3};
-; CHECK-O3-NEXT: add.rn.f32x2 %rd8, %rd1, %rd7;
-; CHECK-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fadd_imm_0_v4(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_fadd_imm_0_v4_param_0];
+; CHECK-NEXT: mov.f32 %f1, 0f40800000;
+; CHECK-NEXT: mov.f32 %f2, 0f40400000;
+; CHECK-NEXT: mov.b64 %rd5, {%f2, %f1};
+; CHECK-NEXT: add.rn.f32x2 %rd6, %rd4, %rd5;
+; CHECK-NEXT: mov.f32 %f3, 0f40000000;
+; CHECK-NEXT: mov.f32 %f4, 0f3F800000;
+; CHECK-NEXT: mov.b64 %rd7, {%f4, %f3};
+; CHECK-NEXT: add.rn.f32x2 %rd8, %rd3, %rd7;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
+; CHECK-NEXT: ret;
%r = fadd <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %a
ret <4 x float> %r
}
define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 {
-; CHECK-O0-LABEL: test_fadd_imm_1_v4(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .f32 %f<5>;
-; CHECK-O0-NEXT: .reg .b64 %rd<9>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_fadd_imm_1_v4_param_0];
-; CHECK-O0-NEXT: mov.f32 %f1, 0f40800000;
-; CHECK-O0-NEXT: mov.f32 %f2, 0f40400000;
-; CHECK-O0-NEXT: mov.b64 %rd5, {%f2, %f1};
-; CHECK-O0-NEXT: add.rn.f32x2 %rd6, %rd4, %rd5;
-; CHECK-O0-NEXT: mov.f32 %f3, 0f40000000;
-; CHECK-O0-NEXT: mov.f32 %f4, 0f3F800000;
-; CHECK-O0-NEXT: mov.b64 %rd7, {%f4, %f3};
-; CHECK-O0-NEXT: add.rn.f32x2 %rd8, %rd3, %rd7;
-; CHECK-O0-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fadd_imm_1_v4(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<9>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_fadd_imm_1_v4_param_0];
-; CHECK-O3-NEXT: mov.f32 %f1, 0f40800000;
-; CHECK-O3-NEXT: mov.f32 %f2, 0f40400000;
-; CHECK-O3-NEXT: mov.b64 %rd5, {%f2, %f1};
-; CHECK-O3-NEXT: add.rn.f32x2 %rd6, %rd2, %rd5;
-; CHECK-O3-NEXT: mov.f32 %f3, 0f40000000;
-; CHECK-O3-NEXT: mov.f32 %f4, 0f3F800000;
-; CHECK-O3-NEXT: mov.b64 %rd7, {%f4, %f3};
-; CHECK-O3-NEXT: add.rn.f32x2 %rd8, %rd1, %rd7;
-; CHECK-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fadd_imm_1_v4(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_fadd_imm_1_v4_param_0];
+; CHECK-NEXT: mov.f32 %f1, 0f40800000;
+; CHECK-NEXT: mov.f32 %f2, 0f40400000;
+; CHECK-NEXT: mov.b64 %rd5, {%f2, %f1};
+; CHECK-NEXT: add.rn.f32x2 %rd6, %rd4, %rd5;
+; CHECK-NEXT: mov.f32 %f3, 0f40000000;
+; CHECK-NEXT: mov.f32 %f4, 0f3F800000;
+; CHECK-NEXT: mov.b64 %rd7, {%f4, %f3};
+; CHECK-NEXT: add.rn.f32x2 %rd8, %rd3, %rd7;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
+; CHECK-NEXT: ret;
%r = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
ret <4 x float> %r
}
define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-O0-LABEL: test_fsub(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b64 %rd<4>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fsub_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fsub_param_0];
-; CHECK-O0-NEXT: sub.rn.f32x2 %rd3, %rd1, %rd2;
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fsub(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fsub_param_1];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fsub_param_0];
-; CHECK-O3-NEXT: sub.rn.f32x2 %rd3, %rd2, %rd1;
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fsub(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fsub_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fsub_param_0];
+; CHECK-NEXT: sub.rn.f32x2 %rd3, %rd1, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%r = fsub <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_fneg(<2 x float> %a) #0 {
-; CHECK-O0-LABEL: test_fneg(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .f32 %f<2>;
-; CHECK-O0-NEXT: .reg .b64 %rd<4>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fneg_param_0];
-; CHECK-O0-NEXT: mov.f32 %f1, 0f00000000;
-; CHECK-O0-NEXT: mov.b64 %rd2, {%f1, %f1};
-; CHECK-O0-NEXT: sub.rn.f32x2 %rd3, %rd2, %rd1;
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fneg(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<2>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fneg_param_0];
-; CHECK-O3-NEXT: mov.f32 %f1, 0f00000000;
-; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f1};
-; CHECK-O3-NEXT: sub.rn.f32x2 %rd3, %rd2, %rd1;
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fneg(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<2>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fneg_param_0];
+; CHECK-NEXT: mov.f32 %f1, 0f00000000;
+; CHECK-NEXT: mov.b64 %rd2, {%f1, %f1};
+; CHECK-NEXT: sub.rn.f32x2 %rd3, %rd2, %rd1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%r = fsub <2 x float> <float 0.0, float 0.0>, %a
ret <2 x float> %r
}
define <2 x float> @test_fmul(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-O0-LABEL: test_fmul(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b64 %rd<4>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fmul_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fmul_param_0];
-; CHECK-O0-NEXT: mul.rn.f32x2 %rd3, %rd1, %rd2;
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fmul(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fmul_param_1];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fmul_param_0];
-; CHECK-O3-NEXT: mul.rn.f32x2 %rd3, %rd2, %rd1;
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fmul(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fmul_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fmul_param_0];
+; CHECK-NEXT: mul.rn.f32x2 %rd3, %rd1, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%r = fmul <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
-; CHECK-O0-LABEL: test_fma(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b64 %rd<5>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd3, [test_fma_param_2];
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fma_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fma_param_0];
-; CHECK-O0-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3;
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd4;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fma(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b64 %rd<5>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fma_param_2];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fma_param_1];
-; CHECK-O3-NEXT: ld.param.f64 %rd3, [test_fma_param_0];
-; CHECK-O3-NEXT: fma.rn.f32x2 %rd4, %rd3, %rd2, %rd1;
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd4;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fma(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd3, [test_fma_param_2];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fma_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fma_param_0];
+; CHECK-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd4;
+; CHECK-NEXT: ret;
%r = call <2 x float> @llvm.fma(<2 x float> %a, <2 x float> %b, <2 x float> %c)
ret <2 x float> %r
}
define <2 x float> @test_fdiv(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-O0-LABEL: test_fdiv(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .f32 %f<7>;
-; CHECK-O0-NEXT: .reg .b64 %rd<4>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fdiv_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fdiv_param_0];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O0-NEXT: div.rn.f32 %f5, %f4, %f2;
-; CHECK-O0-NEXT: div.rn.f32 %f6, %f3, %f1;
-; CHECK-O0-NEXT: mov.b64 %rd3, {%f6, %f5};
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fdiv(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<7>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fdiv_param_0];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fdiv_param_1];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: div.rn.f32 %f5, %f4, %f2;
-; CHECK-O3-NEXT: div.rn.f32 %f6, %f3, %f1;
-; CHECK-O3-NEXT: mov.b64 %rd3, {%f6, %f5};
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fdiv(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<7>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fdiv_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fdiv_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: div.rn.f32 %f5, %f4, %f2;
+; CHECK-NEXT: div.rn.f32 %f6, %f3, %f1;
+; CHECK-NEXT: mov.b64 %rd3, {%f6, %f5};
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%r = fdiv <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_frem(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-O0-LABEL: test_frem(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .pred %p<3>;
-; CHECK-O0-NEXT: .reg .f32 %f<15>;
-; CHECK-O0-NEXT: .reg .b64 %rd<4>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_frem_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_frem_param_0];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O0-NEXT: div.rn.f32 %f5, %f4, %f2;
-; CHECK-O0-NEXT: cvt.rzi.f32.f32 %f6, %f5;
-; CHECK-O0-NEXT: mul.f32 %f7, %f6, %f2;
-; CHECK-O0-NEXT: sub.f32 %f8, %f4, %f7;
-; CHECK-O0-NEXT: testp.infinite.f32 %p1, %f2;
-; CHECK-O0-NEXT: selp.f32 %f9, %f4, %f8, %p1;
-; CHECK-O0-NEXT: div.rn.f32 %f10, %f3, %f1;
-; CHECK-O0-NEXT: cvt.rzi.f32.f32 %f11, %f10;
-; CHECK-O0-NEXT: mul.f32 %f12, %f11, %f1;
-; CHECK-O0-NEXT: sub.f32 %f13, %f3, %f12;
-; CHECK-O0-NEXT: testp.infinite.f32 %p2, %f1;
-; CHECK-O0-NEXT: selp.f32 %f14, %f3, %f13, %p2;
-; CHECK-O0-NEXT: mov.b64 %rd3, {%f14, %f9};
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_frem(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .pred %p<3>;
-; CHECK-O3-NEXT: .reg .f32 %f<15>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_frem_param_0];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_frem_param_1];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: div.rn.f32 %f5, %f4, %f2;
-; CHECK-O3-NEXT: cvt.rzi.f32.f32 %f6, %f5;
-; CHECK-O3-NEXT: mul.f32 %f7, %f6, %f2;
-; CHECK-O3-NEXT: sub.f32 %f8, %f4, %f7;
-; CHECK-O3-NEXT: testp.infinite.f32 %p1, %f2;
-; CHECK-O3-NEXT: selp.f32 %f9, %f4, %f8, %p1;
-; CHECK-O3-NEXT: div.rn.f32 %f10, %f3, %f1;
-; CHECK-O3-NEXT: cvt.rzi.f32.f32 %f11, %f10;
-; CHECK-O3-NEXT: mul.f32 %f12, %f11, %f1;
-; CHECK-O3-NEXT: sub.f32 %f13, %f3, %f12;
-; CHECK-O3-NEXT: testp.infinite.f32 %p2, %f1;
-; CHECK-O3-NEXT: selp.f32 %f14, %f3, %f13, %p2;
-; CHECK-O3-NEXT: mov.b64 %rd3, {%f14, %f9};
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_frem(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .f32 %f<15>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_frem_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_frem_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: div.rn.f32 %f5, %f4, %f2;
+; CHECK-NEXT: cvt.rzi.f32.f32 %f6, %f5;
+; CHECK-NEXT: mul.f32 %f7, %f6, %f2;
+; CHECK-NEXT: sub.f32 %f8, %f4, %f7;
+; CHECK-NEXT: testp.infinite.f32 %p1, %f2;
+; CHECK-NEXT: selp.f32 %f9, %f4, %f8, %p1;
+; CHECK-NEXT: div.rn.f32 %f10, %f3, %f1;
+; CHECK-NEXT: cvt.rzi.f32.f32 %f11, %f10;
+; CHECK-NEXT: mul.f32 %f12, %f11, %f1;
+; CHECK-NEXT: sub.f32 %f13, %f3, %f12;
+; CHECK-NEXT: testp.infinite.f32 %p2, %f1;
+; CHECK-NEXT: selp.f32 %f14, %f3, %f13, %p2;
+; CHECK-NEXT: mov.b64 %rd3, {%f14, %f9};
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%r = frem <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_fadd_ftz(<2 x float> %a, <2 x float> %b) #2 {
-; CHECK-O0-LABEL: test_fadd_ftz(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b64 %rd<4>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fadd_ftz_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fadd_ftz_param_0];
-; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2;
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fadd_ftz(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fadd_ftz_param_1];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fadd_ftz_param_0];
-; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd3, %rd2, %rd1;
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fadd_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fadd_ftz_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fadd_ftz_param_0];
+; CHECK-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%r = fadd <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_fadd_imm_0_ftz(<2 x float> %a) #2 {
-; CHECK-O0-LABEL: test_fadd_imm_0_ftz(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .f32 %f<3>;
-; CHECK-O0-NEXT: .reg .b64 %rd<4>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fadd_imm_0_ftz_param_0];
-; CHECK-O0-NEXT: mov.f32 %f1, 0f40000000;
-; CHECK-O0-NEXT: mov.f32 %f2, 0f3F800000;
-; CHECK-O0-NEXT: mov.b64 %rd2, {%f2, %f1};
-; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2;
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fadd_imm_0_ftz(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<3>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fadd_imm_0_ftz_param_0];
-; CHECK-O3-NEXT: mov.f32 %f1, 0f40000000;
-; CHECK-O3-NEXT: mov.f32 %f2, 0f3F800000;
-; CHECK-O3-NEXT: mov.b64 %rd2, {%f2, %f1};
-; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2;
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fadd_imm_0_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fadd_imm_0_ftz_param_0];
+; CHECK-NEXT: mov.f32 %f1, 0f40000000;
+; CHECK-NEXT: mov.f32 %f2, 0f3F800000;
+; CHECK-NEXT: mov.b64 %rd2, {%f2, %f1};
+; CHECK-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%r = fadd <2 x float> <float 1.0, float 2.0>, %a
ret <2 x float> %r
}
define <2 x float> @test_fadd_imm_1_ftz(<2 x float> %a) #2 {
-; CHECK-O0-LABEL: test_fadd_imm_1_ftz(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .f32 %f<3>;
-; CHECK-O0-NEXT: .reg .b64 %rd<4>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fadd_imm_1_ftz_param_0];
-; CHECK-O0-NEXT: mov.f32 %f1, 0f40000000;
-; CHECK-O0-NEXT: mov.f32 %f2, 0f3F800000;
-; CHECK-O0-NEXT: mov.b64 %rd2, {%f2, %f1};
-; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2;
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fadd_imm_1_ftz(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<3>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fadd_imm_1_ftz_param_0];
-; CHECK-O3-NEXT: mov.f32 %f1, 0f40000000;
-; CHECK-O3-NEXT: mov.f32 %f2, 0f3F800000;
-; CHECK-O3-NEXT: mov.b64 %rd2, {%f2, %f1};
-; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2;
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fadd_imm_1_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fadd_imm_1_ftz_param_0];
+; CHECK-NEXT: mov.f32 %f1, 0f40000000;
+; CHECK-NEXT: mov.f32 %f2, 0f3F800000;
+; CHECK-NEXT: mov.b64 %rd2, {%f2, %f1};
+; CHECK-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%r = fadd <2 x float> %a, <float 1.0, float 2.0>
ret <2 x float> %r
}
define <4 x float> @test_fadd_v4_ftz(<4 x float> %a, <4 x float> %b) #2 {
-; CHECK-O0-LABEL: test_fadd_v4_ftz(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b64 %rd<11>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v2.u64 {%rd5, %rd6}, [test_fadd_v4_ftz_param_1];
-; CHECK-O0-NEXT: ld.param.v2.u64 {%rd7, %rd8}, [test_fadd_v4_ftz_param_0];
-; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd9, %rd8, %rd6;
-; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd10, %rd7, %rd5;
-; CHECK-O0-NEXT: st.param.v2.b64 [func_retval0], {%rd10, %rd9};
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fadd_v4_ftz(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b64 %rd<11>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_fadd_v4_ftz_param_1];
-; CHECK-O3-NEXT: ld.param.v2.u64 {%rd4, %rd5}, [test_fadd_v4_ftz_param_0];
-; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd9, %rd5, %rd2;
-; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd10, %rd4, %rd1;
-; CHECK-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd10, %rd9};
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fadd_v4_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<11>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.u64 {%rd5, %rd6}, [test_fadd_v4_ftz_param_1];
+; CHECK-NEXT: ld.param.v2.u64 {%rd7, %rd8}, [test_fadd_v4_ftz_param_0];
+; CHECK-NEXT: add.rn.ftz.f32x2 %rd9, %rd8, %rd6;
+; CHECK-NEXT: add.rn.ftz.f32x2 %rd10, %rd7, %rd5;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd10, %rd9};
+; CHECK-NEXT: ret;
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
}
define <4 x float> @test_fadd_imm_0_v4_ftz(<4 x float> %a) #2 {
-; CHECK-O0-LABEL: test_fadd_imm_0_v4_ftz(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .f32 %f<5>;
-; CHECK-O0-NEXT: .reg .b64 %rd<9>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_fadd_imm_0_v4_ftz_param_0];
-; CHECK-O0-NEXT: mov.f32 %f1, 0f40800000;
-; CHECK-O0-NEXT: mov.f32 %f2, 0f40400000;
-; CHECK-O0-NEXT: mov.b64 %rd5, {%f2, %f1};
-; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd6, %rd4, %rd5;
-; CHECK-O0-NEXT: mov.f32 %f3, 0f40000000;
-; CHECK-O0-NEXT: mov.f32 %f4, 0f3F800000;
-; CHECK-O0-NEXT: mov.b64 %rd7, {%f4, %f3};
-; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd8, %rd3, %rd7;
-; CHECK-O0-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fadd_imm_0_v4_ftz(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<9>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_fadd_imm_0_v4_ftz_param_0];
-; CHECK-O3-NEXT: mov.f32 %f1, 0f40800000;
-; CHECK-O3-NEXT: mov.f32 %f2, 0f40400000;
-; CHECK-O3-NEXT: mov.b64 %rd5, {%f2, %f1};
-; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd6, %rd2, %rd5;
-; CHECK-O3-NEXT: mov.f32 %f3, 0f40000000;
-; CHECK-O3-NEXT: mov.f32 %f4, 0f3F800000;
-; CHECK-O3-NEXT: mov.b64 %rd7, {%f4, %f3};
-; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd8, %rd1, %rd7;
-; CHECK-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fadd_imm_0_v4_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_fadd_imm_0_v4_ftz_param_0];
+; CHECK-NEXT: mov.f32 %f1, 0f40800000;
+; CHECK-NEXT: mov.f32 %f2, 0f40400000;
+; CHECK-NEXT: mov.b64 %rd5, {%f2, %f1};
+; CHECK-NEXT: add.rn.ftz.f32x2 %rd6, %rd4, %rd5;
+; CHECK-NEXT: mov.f32 %f3, 0f40000000;
+; CHECK-NEXT: mov.f32 %f4, 0f3F800000;
+; CHECK-NEXT: mov.b64 %rd7, {%f4, %f3};
+; CHECK-NEXT: add.rn.ftz.f32x2 %rd8, %rd3, %rd7;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
+; CHECK-NEXT: ret;
%r = fadd <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %a
ret <4 x float> %r
}
define <4 x float> @test_fadd_imm_1_v4_ftz(<4 x float> %a) #2 {
-; CHECK-O0-LABEL: test_fadd_imm_1_v4_ftz(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .f32 %f<5>;
-; CHECK-O0-NEXT: .reg .b64 %rd<9>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_fadd_imm_1_v4_ftz_param_0];
-; CHECK-O0-NEXT: mov.f32 %f1, 0f40800000;
-; CHECK-O0-NEXT: mov.f32 %f2, 0f40400000;
-; CHECK-O0-NEXT: mov.b64 %rd5, {%f2, %f1};
-; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd6, %rd4, %rd5;
-; CHECK-O0-NEXT: mov.f32 %f3, 0f40000000;
-; CHECK-O0-NEXT: mov.f32 %f4, 0f3F800000;
-; CHECK-O0-NEXT: mov.b64 %rd7, {%f4, %f3};
-; CHECK-O0-NEXT: add.rn.ftz.f32x2 %rd8, %rd3, %rd7;
-; CHECK-O0-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fadd_imm_1_v4_ftz(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<9>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_fadd_imm_1_v4_ftz_param_0];
-; CHECK-O3-NEXT: mov.f32 %f1, 0f40800000;
-; CHECK-O3-NEXT: mov.f32 %f2, 0f40400000;
-; CHECK-O3-NEXT: mov.b64 %rd5, {%f2, %f1};
-; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd6, %rd2, %rd5;
-; CHECK-O3-NEXT: mov.f32 %f3, 0f40000000;
-; CHECK-O3-NEXT: mov.f32 %f4, 0f3F800000;
-; CHECK-O3-NEXT: mov.b64 %rd7, {%f4, %f3};
-; CHECK-O3-NEXT: add.rn.ftz.f32x2 %rd8, %rd1, %rd7;
-; CHECK-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fadd_imm_1_v4_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_fadd_imm_1_v4_ftz_param_0];
+; CHECK-NEXT: mov.f32 %f1, 0f40800000;
+; CHECK-NEXT: mov.f32 %f2, 0f40400000;
+; CHECK-NEXT: mov.b64 %rd5, {%f2, %f1};
+; CHECK-NEXT: add.rn.ftz.f32x2 %rd6, %rd4, %rd5;
+; CHECK-NEXT: mov.f32 %f3, 0f40000000;
+; CHECK-NEXT: mov.f32 %f4, 0f3F800000;
+; CHECK-NEXT: mov.b64 %rd7, {%f4, %f3};
+; CHECK-NEXT: add.rn.ftz.f32x2 %rd8, %rd3, %rd7;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
+; CHECK-NEXT: ret;
%r = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
ret <4 x float> %r
}
define <2 x float> @test_fsub_ftz(<2 x float> %a, <2 x float> %b) #2 {
-; CHECK-O0-LABEL: test_fsub_ftz(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b64 %rd<4>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fsub_ftz_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fsub_ftz_param_0];
-; CHECK-O0-NEXT: sub.rn.ftz.f32x2 %rd3, %rd1, %rd2;
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fsub_ftz(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fsub_ftz_param_1];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fsub_ftz_param_0];
-; CHECK-O3-NEXT: sub.rn.ftz.f32x2 %rd3, %rd2, %rd1;
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fsub_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fsub_ftz_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fsub_ftz_param_0];
+; CHECK-NEXT: sub.rn.ftz.f32x2 %rd3, %rd1, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%r = fsub <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_fneg_ftz(<2 x float> %a) #2 {
-; CHECK-O0-LABEL: test_fneg_ftz(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .f32 %f<2>;
-; CHECK-O0-NEXT: .reg .b64 %rd<4>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fneg_ftz_param_0];
-; CHECK-O0-NEXT: mov.f32 %f1, 0f00000000;
-; CHECK-O0-NEXT: mov.b64 %rd2, {%f1, %f1};
-; CHECK-O0-NEXT: sub.rn.ftz.f32x2 %rd3, %rd2, %rd1;
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fneg_ftz(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<2>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fneg_ftz_param_0];
-; CHECK-O3-NEXT: mov.f32 %f1, 0f00000000;
-; CHECK-O3-NEXT: mov.b64 %rd2, {%f1, %f1};
-; CHECK-O3-NEXT: sub.rn.ftz.f32x2 %rd3, %rd2, %rd1;
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fneg_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<2>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fneg_ftz_param_0];
+; CHECK-NEXT: mov.f32 %f1, 0f00000000;
+; CHECK-NEXT: mov.b64 %rd2, {%f1, %f1};
+; CHECK-NEXT: sub.rn.ftz.f32x2 %rd3, %rd2, %rd1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%r = fsub <2 x float> <float 0.0, float 0.0>, %a
ret <2 x float> %r
}
define <2 x float> @test_fmul_ftz(<2 x float> %a, <2 x float> %b) #2 {
-; CHECK-O0-LABEL: test_fmul_ftz(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b64 %rd<4>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fmul_ftz_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fmul_ftz_param_0];
-; CHECK-O0-NEXT: mul.rn.ftz.f32x2 %rd3, %rd1, %rd2;
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fmul_ftz(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fmul_ftz_param_1];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fmul_ftz_param_0];
-; CHECK-O3-NEXT: mul.rn.ftz.f32x2 %rd3, %rd2, %rd1;
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fmul_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fmul_ftz_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fmul_ftz_param_0];
+; CHECK-NEXT: mul.rn.ftz.f32x2 %rd3, %rd1, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%r = fmul <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_fma_ftz(<2 x float> %a, <2 x float> %b, <2 x float> %c) #2 {
-; CHECK-O0-LABEL: test_fma_ftz(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b64 %rd<5>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd3, [test_fma_ftz_param_2];
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fma_ftz_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fma_ftz_param_0];
-; CHECK-O0-NEXT: fma.rn.ftz.f32x2 %rd4, %rd1, %rd2, %rd3;
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd4;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fma_ftz(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b64 %rd<5>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fma_ftz_param_2];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fma_ftz_param_1];
-; CHECK-O3-NEXT: ld.param.f64 %rd3, [test_fma_ftz_param_0];
-; CHECK-O3-NEXT: fma.rn.ftz.f32x2 %rd4, %rd3, %rd2, %rd1;
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd4;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fma_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd3, [test_fma_ftz_param_2];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fma_ftz_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fma_ftz_param_0];
+; CHECK-NEXT: fma.rn.ftz.f32x2 %rd4, %rd1, %rd2, %rd3;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd4;
+; CHECK-NEXT: ret;
%r = call <2 x float> @llvm.fma(<2 x float> %a, <2 x float> %b, <2 x float> %c)
ret <2 x float> %r
}
define <2 x float> @test_fdiv_ftz(<2 x float> %a, <2 x float> %b) #2 {
-; CHECK-O0-LABEL: test_fdiv_ftz(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .f32 %f<7>;
-; CHECK-O0-NEXT: .reg .b64 %rd<4>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fdiv_ftz_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fdiv_ftz_param_0];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O0-NEXT: div.rn.ftz.f32 %f5, %f4, %f2;
-; CHECK-O0-NEXT: div.rn.ftz.f32 %f6, %f3, %f1;
-; CHECK-O0-NEXT: mov.b64 %rd3, {%f6, %f5};
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fdiv_ftz(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<7>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fdiv_ftz_param_0];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fdiv_ftz_param_1];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: div.rn.ftz.f32 %f5, %f4, %f2;
-; CHECK-O3-NEXT: div.rn.ftz.f32 %f6, %f3, %f1;
-; CHECK-O3-NEXT: mov.b64 %rd3, {%f6, %f5};
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fdiv_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<7>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fdiv_ftz_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fdiv_ftz_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: div.rn.ftz.f32 %f5, %f4, %f2;
+; CHECK-NEXT: div.rn.ftz.f32 %f6, %f3, %f1;
+; CHECK-NEXT: mov.b64 %rd3, {%f6, %f5};
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%r = fdiv <2 x float> %a, %b
ret <2 x float> %r
}
define <2 x float> @test_frem_ftz(<2 x float> %a, <2 x float> %b) #2 {
-; CHECK-O0-LABEL: test_frem_ftz(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .pred %p<3>;
-; CHECK-O0-NEXT: .reg .f32 %f<15>;
-; CHECK-O0-NEXT: .reg .b64 %rd<4>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_frem_ftz_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_frem_ftz_param_0];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O0-NEXT: div.rn.ftz.f32 %f5, %f4, %f2;
-; CHECK-O0-NEXT: cvt.rzi.ftz.f32.f32 %f6, %f5;
-; CHECK-O0-NEXT: mul.ftz.f32 %f7, %f6, %f2;
-; CHECK-O0-NEXT: sub.ftz.f32 %f8, %f4, %f7;
-; CHECK-O0-NEXT: testp.infinite.f32 %p1, %f2;
-; CHECK-O0-NEXT: selp.f32 %f9, %f4, %f8, %p1;
-; CHECK-O0-NEXT: div.rn.ftz.f32 %f10, %f3, %f1;
-; CHECK-O0-NEXT: cvt.rzi.ftz.f32.f32 %f11, %f10;
-; CHECK-O0-NEXT: mul.ftz.f32 %f12, %f11, %f1;
-; CHECK-O0-NEXT: sub.ftz.f32 %f13, %f3, %f12;
-; CHECK-O0-NEXT: testp.infinite.f32 %p2, %f1;
-; CHECK-O0-NEXT: selp.f32 %f14, %f3, %f13, %p2;
-; CHECK-O0-NEXT: mov.b64 %rd3, {%f14, %f9};
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_frem_ftz(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .pred %p<3>;
-; CHECK-O3-NEXT: .reg .f32 %f<15>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_frem_ftz_param_0];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_frem_ftz_param_1];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: div.rn.ftz.f32 %f5, %f4, %f2;
-; CHECK-O3-NEXT: cvt.rzi.ftz.f32.f32 %f6, %f5;
-; CHECK-O3-NEXT: mul.ftz.f32 %f7, %f6, %f2;
-; CHECK-O3-NEXT: sub.ftz.f32 %f8, %f4, %f7;
-; CHECK-O3-NEXT: testp.infinite.f32 %p1, %f2;
-; CHECK-O3-NEXT: selp.f32 %f9, %f4, %f8, %p1;
-; CHECK-O3-NEXT: div.rn.ftz.f32 %f10, %f3, %f1;
-; CHECK-O3-NEXT: cvt.rzi.ftz.f32.f32 %f11, %f10;
-; CHECK-O3-NEXT: mul.ftz.f32 %f12, %f11, %f1;
-; CHECK-O3-NEXT: sub.ftz.f32 %f13, %f3, %f12;
-; CHECK-O3-NEXT: testp.infinite.f32 %p2, %f1;
-; CHECK-O3-NEXT: selp.f32 %f14, %f3, %f13, %p2;
-; CHECK-O3-NEXT: mov.b64 %rd3, {%f14, %f9};
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_frem_ftz(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .f32 %f<15>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_frem_ftz_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_frem_ftz_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: div.rn.ftz.f32 %f5, %f4, %f2;
+; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %f6, %f5;
+; CHECK-NEXT: mul.ftz.f32 %f7, %f6, %f2;
+; CHECK-NEXT: sub.ftz.f32 %f8, %f4, %f7;
+; CHECK-NEXT: testp.infinite.f32 %p1, %f2;
+; CHECK-NEXT: selp.f32 %f9, %f4, %f8, %p1;
+; CHECK-NEXT: div.rn.ftz.f32 %f10, %f3, %f1;
+; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %f11, %f10;
+; CHECK-NEXT: mul.ftz.f32 %f12, %f11, %f1;
+; CHECK-NEXT: sub.ftz.f32 %f13, %f3, %f12;
+; CHECK-NEXT: testp.infinite.f32 %p2, %f1;
+; CHECK-NEXT: selp.f32 %f14, %f3, %f13, %p2;
+; CHECK-NEXT: mov.b64 %rd3, {%f14, %f9};
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%r = frem <2 x float> %a, %b
ret <2 x float> %r
}
define void @test_ldst_v2f32(ptr %a, ptr %b) #0 {
-; CHECK-O0-LABEL: test_ldst_v2f32(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .f32 %f<3>;
-; CHECK-O0-NEXT: .reg .b64 %rd<4>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.u64 %rd2, [test_ldst_v2f32_param_1];
-; CHECK-O0-NEXT: ld.param.u64 %rd1, [test_ldst_v2f32_param_0];
-; CHECK-O0-NEXT: ld.f64 %rd3, [%rd1];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd3;
-; CHECK-O0-NEXT: st.v2.f32 [%rd2], {%f1, %f2};
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_ldst_v2f32(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<3>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.u64 %rd1, [test_ldst_v2f32_param_0];
-; CHECK-O3-NEXT: ld.f64 %rd2, [%rd1];
-; CHECK-O3-NEXT: ld.param.u64 %rd3, [test_ldst_v2f32_param_1];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O3-NEXT: st.v2.f32 [%rd3], {%f1, %f2};
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_ldst_v2f32(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v2f32_param_1];
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v2f32_param_0];
+; CHECK-NEXT: ld.b64 %rd3, [%rd1];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd3;
+; CHECK-NEXT: st.v2.f32 [%rd2], {%f1, %f2};
+; CHECK-NEXT: ret;
%t1 = load <2 x float>, ptr %a
store <2 x float> %t1, ptr %b, align 32
ret void
}
define void @test_ldst_v3f32(ptr %a, ptr %b) #0 {
-; CHECK-O0-LABEL: test_ldst_v3f32(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .f32 %f<2>;
-; CHECK-O0-NEXT: .reg .b64 %rd<4>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.u64 %rd2, [test_ldst_v3f32_param_1];
-; CHECK-O0-NEXT: ld.param.u64 %rd1, [test_ldst_v3f32_param_0];
-; CHECK-O0-NEXT: ld.u64 %rd3, [%rd1];
-; CHECK-O0-NEXT: ld.f32 %f1, [%rd1+8];
-; CHECK-O0-NEXT: st.f32 [%rd2+8], %f1;
-; CHECK-O0-NEXT: st.u64 [%rd2], %rd3;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_ldst_v3f32(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<2>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.u64 %rd1, [test_ldst_v3f32_param_0];
-; CHECK-O3-NEXT: ld.u64 %rd2, [%rd1];
-; CHECK-O3-NEXT: ld.f32 %f1, [%rd1+8];
-; CHECK-O3-NEXT: ld.param.u64 %rd3, [test_ldst_v3f32_param_1];
-; CHECK-O3-NEXT: st.f32 [%rd3+8], %f1;
-; CHECK-O3-NEXT: st.u64 [%rd3], %rd2;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_ldst_v3f32(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<2>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v3f32_param_1];
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v3f32_param_0];
+; CHECK-NEXT: ld.u64 %rd3, [%rd1];
+; CHECK-NEXT: ld.f32 %f1, [%rd1+8];
+; CHECK-NEXT: st.f32 [%rd2+8], %f1;
+; CHECK-NEXT: st.u64 [%rd2], %rd3;
+; CHECK-NEXT: ret;
%t1 = load <3 x float>, ptr %a
store <3 x float> %t1, ptr %b, align 32
ret void
}
define void @test_ldst_v4f32(ptr %a, ptr %b) #0 {
-; CHECK-O0-LABEL: test_ldst_v4f32(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .f32 %f<5>;
-; CHECK-O0-NEXT: .reg .b64 %rd<3>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.u64 %rd2, [test_ldst_v4f32_param_1];
-; CHECK-O0-NEXT: ld.param.u64 %rd1, [test_ldst_v4f32_param_0];
-; CHECK-O0-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
-; CHECK-O0-NEXT: st.v4.f32 [%rd2], {%f1, %f2, %f3, %f4};
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_ldst_v4f32(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<3>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.u64 %rd1, [test_ldst_v4f32_param_0];
-; CHECK-O3-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
-; CHECK-O3-NEXT: ld.param.u64 %rd2, [test_ldst_v4f32_param_1];
-; CHECK-O3-NEXT: st.v4.f32 [%rd2], {%f1, %f2, %f3, %f4};
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_ldst_v4f32(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v4f32_param_1];
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v4f32_param_0];
+; CHECK-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT: st.v4.f32 [%rd2], {%f1, %f2, %f3, %f4};
+; CHECK-NEXT: ret;
%t1 = load <4 x float>, ptr %a
store <4 x float> %t1, ptr %b, align 32
ret void
}
define void @test_ldst_v8f32(ptr %a, ptr %b) #0 {
-; CHECK-O0-LABEL: test_ldst_v8f32(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .f32 %f<9>;
-; CHECK-O0-NEXT: .reg .b64 %rd<3>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.u64 %rd2, [test_ldst_v8f32_param_1];
-; CHECK-O0-NEXT: ld.param.u64 %rd1, [test_ldst_v8f32_param_0];
-; CHECK-O0-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
-; CHECK-O0-NEXT: ld.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16];
-; CHECK-O0-NEXT: st.v4.f32 [%rd2+16], {%f5, %f6, %f7, %f8};
-; CHECK-O0-NEXT: st.v4.f32 [%rd2], {%f1, %f2, %f3, %f4};
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_ldst_v8f32(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<9>;
-; CHECK-O3-NEXT: .reg .b64 %rd<3>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.u64 %rd1, [test_ldst_v8f32_param_0];
-; CHECK-O3-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
-; CHECK-O3-NEXT: ld.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16];
-; CHECK-O3-NEXT: ld.param.u64 %rd2, [test_ldst_v8f32_param_1];
-; CHECK-O3-NEXT: st.v4.f32 [%rd2+16], {%f5, %f6, %f7, %f8};
-; CHECK-O3-NEXT: st.v4.f32 [%rd2], {%f1, %f2, %f3, %f4};
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_ldst_v8f32(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v8f32_param_1];
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v8f32_param_0];
+; CHECK-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT: ld.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16];
+; CHECK-NEXT: st.v4.f32 [%rd2+16], {%f5, %f6, %f7, %f8};
+; CHECK-NEXT: st.v4.f32 [%rd2], {%f1, %f2, %f3, %f4};
+; CHECK-NEXT: ret;
%t1 = load <8 x float>, ptr %a
store <8 x float> %t1, ptr %b, align 32
ret void
@@ -1036,1408 +591,783 @@ define void @test_ldst_v8f32(ptr %a, ptr %b) #0 {
declare <2 x float> @test_callee(<2 x float> %a, <2 x float> %b) #0
define <2 x float> @test_call(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-O0-LABEL: test_call(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b64 %rd<5>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_call_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_call_param_0];
-; CHECK-O0-NEXT: { // callseq 0, 0
-; CHECK-O0-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-O0-NEXT: st.param.b64 [param0], %rd1;
-; CHECK-O0-NEXT: .param .align 8 .b8 param1[8];
-; CHECK-O0-NEXT: st.param.b64 [param1], %rd2;
-; CHECK-O0-NEXT: .param .align 8 .b8 retval0[8];
-; CHECK-O0-NEXT: call.uni (retval0),
-; CHECK-O0-NEXT: test_callee,
-; CHECK-O0-NEXT: (
-; CHECK-O0-NEXT: param0,
-; CHECK-O0-NEXT: param1
-; CHECK-O0-NEXT: );
-; CHECK-O0-NEXT: ld.param.b64 %rd3, [retval0];
-; CHECK-O0-NEXT: } // callseq 0
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_call(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b64 %rd<5>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_call_param_0];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_call_param_1];
-; CHECK-O3-NEXT: { // callseq 0, 0
-; CHECK-O3-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-O3-NEXT: st.param.b64 [param0], %rd1;
-; CHECK-O3-NEXT: .param .align 8 .b8 param1[8];
-; CHECK-O3-NEXT: st.param.b64 [param1], %rd2;
-; CHECK-O3-NEXT: .param .align 8 .b8 retval0[8];
-; CHECK-O3-NEXT: call.uni (retval0),
-; CHECK-O3-NEXT: test_callee,
-; CHECK-O3-NEXT: (
-; CHECK-O3-NEXT: param0,
-; CHECK-O3-NEXT: param1
-; CHECK-O3-NEXT: );
-; CHECK-O3-NEXT: ld.param.b64 %rd3, [retval0];
-; CHECK-O3-NEXT: } // callseq 0
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_call(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_call_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_call_param_0];
+; CHECK-NEXT: { // callseq 0, 0
+; CHECK-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-NEXT: st.param.b64 [param0], %rd1;
+; CHECK-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-NEXT: st.param.b64 [param1], %rd2;
+; CHECK-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0,
+; CHECK-NEXT: param1
+; CHECK-NEXT: );
+; CHECK-NEXT: ld.param.b64 %rd3, [retval0];
+; CHECK-NEXT: } // callseq 0
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%r = call <2 x float> @test_callee(<2 x float> %a, <2 x float> %b)
ret <2 x float> %r
}
define <2 x float> @test_call_flipped(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-O0-LABEL: test_call_flipped(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b64 %rd<5>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_call_flipped_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_call_flipped_param_0];
-; CHECK-O0-NEXT: { // callseq 1, 0
-; CHECK-O0-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-O0-NEXT: st.param.b64 [param0], %rd2;
-; CHECK-O0-NEXT: .param .align 8 .b8 param1[8];
-; CHECK-O0-NEXT: st.param.b64 [param1], %rd1;
-; CHECK-O0-NEXT: .param .align 8 .b8 retval0[8];
-; CHECK-O0-NEXT: call.uni (retval0),
-; CHECK-O0-NEXT: test_callee,
-; CHECK-O0-NEXT: (
-; CHECK-O0-NEXT: param0,
-; CHECK-O0-NEXT: param1
-; CHECK-O0-NEXT: );
-; CHECK-O0-NEXT: ld.param.b64 %rd3, [retval0];
-; CHECK-O0-NEXT: } // callseq 1
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_call_flipped(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b64 %rd<5>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_call_flipped_param_1];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_call_flipped_param_0];
-; CHECK-O3-NEXT: { // callseq 1, 0
-; CHECK-O3-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-O3-NEXT: st.param.b64 [param0], %rd1;
-; CHECK-O3-NEXT: .param .align 8 .b8 param1[8];
-; CHECK-O3-NEXT: st.param.b64 [param1], %rd2;
-; CHECK-O3-NEXT: .param .align 8 .b8 retval0[8];
-; CHECK-O3-NEXT: call.uni (retval0),
-; CHECK-O3-NEXT: test_callee,
-; CHECK-O3-NEXT: (
-; CHECK-O3-NEXT: param0,
-; CHECK-O3-NEXT: param1
-; CHECK-O3-NEXT: );
-; CHECK-O3-NEXT: ld.param.b64 %rd3, [retval0];
-; CHECK-O3-NEXT: } // callseq 1
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_call_flipped(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_call_flipped_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_call_flipped_param_0];
+; CHECK-NEXT: { // callseq 1, 0
+; CHECK-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-NEXT: st.param.b64 [param0], %rd2;
+; CHECK-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-NEXT: st.param.b64 [param1], %rd1;
+; CHECK-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0,
+; CHECK-NEXT: param1
+; CHECK-NEXT: );
+; CHECK-NEXT: ld.param.b64 %rd3, [retval0];
+; CHECK-NEXT: } // callseq 1
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%r = call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a)
ret <2 x float> %r
}
define <2 x float> @test_tailcall_flipped(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-O0-LABEL: test_tailcall_flipped(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b64 %rd<5>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_tailcall_flipped_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_tailcall_flipped_param_0];
-; CHECK-O0-NEXT: { // callseq 2, 0
-; CHECK-O0-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-O0-NEXT: st.param.b64 [param0], %rd2;
-; CHECK-O0-NEXT: .param .align 8 .b8 param1[8];
-; CHECK-O0-NEXT: st.param.b64 [param1], %rd1;
-; CHECK-O0-NEXT: .param .align 8 .b8 retval0[8];
-; CHECK-O0-NEXT: call.uni (retval0),
-; CHECK-O0-NEXT: test_callee,
-; CHECK-O0-NEXT: (
-; CHECK-O0-NEXT: param0,
-; CHECK-O0-NEXT: param1
-; CHECK-O0-NEXT: );
-; CHECK-O0-NEXT: ld.param.b64 %rd3, [retval0];
-; CHECK-O0-NEXT: } // callseq 2
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_tailcall_flipped(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b64 %rd<5>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_tailcall_flipped_param_1];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_tailcall_flipped_param_0];
-; CHECK-O3-NEXT: { // callseq 2, 0
-; CHECK-O3-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-O3-NEXT: st.param.b64 [param0], %rd1;
-; CHECK-O3-NEXT: .param .align 8 .b8 param1[8];
-; CHECK-O3-NEXT: st.param.b64 [param1], %rd2;
-; CHECK-O3-NEXT: .param .align 8 .b8 retval0[8];
-; CHECK-O3-NEXT: call.uni (retval0),
-; CHECK-O3-NEXT: test_callee,
-; CHECK-O3-NEXT: (
-; CHECK-O3-NEXT: param0,
-; CHECK-O3-NEXT: param1
-; CHECK-O3-NEXT: );
-; CHECK-O3-NEXT: ld.param.b64 %rd3, [retval0];
-; CHECK-O3-NEXT: } // callseq 2
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_tailcall_flipped(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_tailcall_flipped_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_tailcall_flipped_param_0];
+; CHECK-NEXT: { // callseq 2, 0
+; CHECK-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-NEXT: st.param.b64 [param0], %rd2;
+; CHECK-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-NEXT: st.param.b64 [param1], %rd1;
+; CHECK-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0,
+; CHECK-NEXT: param1
+; CHECK-NEXT: );
+; CHECK-NEXT: ld.param.b64 %rd3, [retval0];
+; CHECK-NEXT: } // callseq 2
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%r = tail call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a)
ret <2 x float> %r
}
define <2 x float> @test_select(<2 x float> %a, <2 x float> %b, i1 zeroext %c) #0 {
-; CHECK-O0-LABEL: test_select(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .pred %p<2>;
-; CHECK-O0-NEXT: .reg .b16 %rs<3>;
-; CHECK-O0-NEXT: .reg .b64 %rd<4>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.u8 %rs1, [test_select_param_2];
-; CHECK-O0-NEXT: and.b16 %rs2, %rs1, 1;
-; CHECK-O0-NEXT: setp.eq.b16 %p1, %rs2, 1;
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_select_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_select_param_0];
-; CHECK-O0-NEXT: selp.b64 %rd3, %rd1, %rd2, %p1;
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_select(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .pred %p<2>;
-; CHECK-O3-NEXT: .reg .b16 %rs<3>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.u8 %rs1, [test_select_param_2];
-; CHECK-O3-NEXT: and.b16 %rs2, %rs1, 1;
-; CHECK-O3-NEXT: setp.eq.b16 %p1, %rs2, 1;
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_select_param_1];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_select_param_0];
-; CHECK-O3-NEXT: selp.b64 %rd3, %rd2, %rd1, %p1;
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_select(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u8 %rs1, [test_select_param_2];
+; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT: setp.eq.b16 %p1, %rs2, 1;
+; CHECK-NEXT: ld.param.b64 %rd2, [test_select_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_select_param_0];
+; CHECK-NEXT: selp.b64 %rd3, %rd1, %rd2, %p1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%r = select i1 %c, <2 x float> %a, <2 x float> %b
ret <2 x float> %r
}
define <2 x float> @test_select_cc(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) #0 {
-; CHECK-O0-LABEL: test_select_cc(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .pred %p<3>;
-; CHECK-O0-NEXT: .reg .f32 %f<11>;
-; CHECK-O0-NEXT: .reg .b64 %rd<6>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd4, [test_select_cc_param_3];
-; CHECK-O0-NEXT: ld.param.f64 %rd3, [test_select_cc_param_2];
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_select_cc_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_select_cc_param_0];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd4;
-; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd3;
-; CHECK-O0-NEXT: setp.neu.f32 %p1, %f3, %f1;
-; CHECK-O0-NEXT: setp.neu.f32 %p2, %f4, %f2;
-; CHECK-O0-NEXT: mov.b64 {%f5, %f6}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%f7, %f8}, %rd1;
-; CHECK-O0-NEXT: selp.f32 %f9, %f8, %f6, %p2;
-; CHECK-O0-NEXT: selp.f32 %f10, %f7, %f5, %p1;
-; CHECK-O0-NEXT: mov.b64 %rd5, {%f10, %f9};
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd5;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_select_cc(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .pred %p<3>;
-; CHECK-O3-NEXT: .reg .f32 %f<11>;
-; CHECK-O3-NEXT: .reg .b64 %rd<6>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_select_cc_param_0];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_select_cc_param_1];
-; CHECK-O3-NEXT: ld.param.f64 %rd3, [test_select_cc_param_2];
-; CHECK-O3-NEXT: ld.param.f64 %rd4, [test_select_cc_param_3];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd4;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd3;
-; CHECK-O3-NEXT: setp.neu.f32 %p1, %f3, %f1;
-; CHECK-O3-NEXT: setp.neu.f32 %p2, %f4, %f2;
-; CHECK-O3-NEXT: mov.b64 {%f5, %f6}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%f7, %f8}, %rd1;
-; CHECK-O3-NEXT: selp.f32 %f9, %f8, %f6, %p2;
-; CHECK-O3-NEXT: selp.f32 %f10, %f7, %f5, %p1;
-; CHECK-O3-NEXT: mov.b64 %rd5, {%f10, %f9};
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd5;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_select_cc(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .f32 %f<11>;
+; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd4, [test_select_cc_param_3];
+; CHECK-NEXT: ld.param.b64 %rd3, [test_select_cc_param_2];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd4;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd3;
+; CHECK-NEXT: setp.neu.f32 %p1, %f3, %f1;
+; CHECK-NEXT: setp.neu.f32 %p2, %f4, %f2;
+; CHECK-NEXT: mov.b64 {%f5, %f6}, %rd2;
+; CHECK-NEXT: mov.b64 {%f7, %f8}, %rd1;
+; CHECK-NEXT: selp.f32 %f9, %f8, %f6, %p2;
+; CHECK-NEXT: selp.f32 %f10, %f7, %f5, %p1;
+; CHECK-NEXT: mov.b64 %rd5, {%f10, %f9};
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd5;
+; CHECK-NEXT: ret;
%cc = fcmp une <2 x float> %c, %d
%r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
ret <2 x float> %r
}
define <2 x double> @test_select_cc_f64_f32(<2 x double> %a, <2 x double> %b, <2 x float> %c, <2 x float> %d) #0 {
-; CHECK-O0-LABEL: test_select_cc_f64_f32(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .pred %p<3>;
-; CHECK-O0-NEXT: .reg .f32 %f<5>;
-; CHECK-O0-NEXT: .reg .b64 %rd<3>;
-; CHECK-O0-NEXT: .reg .f64 %fd<7>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v2.f64 {%fd3, %fd4}, [test_select_cc_f64_f32_param_1];
-; CHECK-O0-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_select_cc_f64_f32_param_0];
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_select_cc_f64_f32_param_3];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_select_cc_f64_f32_param_2];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O0-NEXT: setp.neu.f32 %p1, %f3, %f1;
-; CHECK-O0-NEXT: setp.neu.f32 %p2, %f4, %f2;
-; CHECK-O0-NEXT: selp.f64 %fd5, %fd2, %fd4, %p2;
-; CHECK-O0-NEXT: selp.f64 %fd6, %fd1, %fd3, %p1;
-; CHECK-O0-NEXT: st.param.v2.f64 [func_retval0], {%fd6, %fd5};
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_select_cc_f64_f32(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .pred %p<3>;
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<3>;
-; CHECK-O3-NEXT: .reg .f64 %fd<7>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_select_cc_f64_f32_param_0];
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_select_cc_f64_f32_param_2];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_select_cc_f64_f32_param_3];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: setp.neu.f32 %p1, %f3, %f1;
-; CHECK-O3-NEXT: setp.neu.f32 %p2, %f4, %f2;
-; CHECK-O3-NEXT: ld.param.v2.f64 {%fd3, %fd4}, [test_select_cc_f64_f32_param_1];
-; CHECK-O3-NEXT: selp.f64 %fd5, %fd2, %fd4, %p2;
-; CHECK-O3-NEXT: selp.f64 %fd6, %fd1, %fd3, %p1;
-; CHECK-O3-NEXT: st.param.v2.f64 [func_retval0], {%fd6, %fd5};
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_select_cc_f64_f32(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-NEXT: .reg .f64 %fd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.f64 {%fd3, %fd4}, [test_select_cc_f64_f32_param_1];
+; CHECK-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_select_cc_f64_f32_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_f64_f32_param_3];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_f64_f32_param_2];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.neu.f32 %p1, %f3, %f1;
+; CHECK-NEXT: setp.neu.f32 %p2, %f4, %f2;
+; CHECK-NEXT: selp.f64 %fd5, %fd2, %fd4, %p2;
+; CHECK-NEXT: selp.f64 %fd6, %fd1, %fd3, %p1;
+; CHECK-NEXT: st.param.v2.f64 [func_retval0], {%fd6, %fd5};
+; CHECK-NEXT: ret;
%cc = fcmp une <2 x float> %c, %d
%r = select <2 x i1> %cc, <2 x double> %a, <2 x double> %b
ret <2 x double> %r
}
define <2 x float> @test_select_cc_f32_f64(<2 x float> %a, <2 x float> %b, <2 x double> %c, <2 x double> %d) #0 {
-; CHECK-O0-LABEL: test_select_cc_f32_f64(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .pred %p<3>;
-; CHECK-O0-NEXT: .reg .f32 %f<7>;
-; CHECK-O0-NEXT: .reg .b64 %rd<4>;
-; CHECK-O0-NEXT: .reg .f64 %fd<5>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v2.f64 {%fd3, %fd4}, [test_select_cc_f32_f64_param_3];
-; CHECK-O0-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_select_cc_f32_f64_param_2];
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_select_cc_f32_f64_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_select_cc_f32_f64_param_0];
-; CHECK-O0-NEXT: setp.neu.f64 %p1, %fd1, %fd3;
-; CHECK-O0-NEXT: setp.neu.f64 %p2, %fd2, %fd4;
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O0-NEXT: selp.f32 %f5, %f4, %f2, %p2;
-; CHECK-O0-NEXT: selp.f32 %f6, %f3, %f1, %p1;
-; CHECK-O0-NEXT: mov.b64 %rd3, {%f6, %f5};
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_select_cc_f32_f64(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .pred %p<3>;
-; CHECK-O3-NEXT: .reg .f32 %f<7>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
-; CHECK-O3-NEXT: .reg .f64 %fd<5>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_select_cc_f32_f64_param_0];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_select_cc_f32_f64_param_1];
-; CHECK-O3-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_select_cc_f32_f64_param_2];
-; CHECK-O3-NEXT: ld.param.v2.f64 {%fd3, %fd4}, [test_select_cc_f32_f64_param_3];
-; CHECK-O3-NEXT: setp.neu.f64 %p1, %fd1, %fd3;
-; CHECK-O3-NEXT: setp.neu.f64 %p2, %fd2, %fd4;
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: selp.f32 %f5, %f4, %f2, %p2;
-; CHECK-O3-NEXT: selp.f32 %f6, %f3, %f1, %p1;
-; CHECK-O3-NEXT: mov.b64 %rd3, {%f6, %f5};
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_select_cc_f32_f64(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .f32 %f<7>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .f64 %fd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.f64 {%fd3, %fd4}, [test_select_cc_f32_f64_param_3];
+; CHECK-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_select_cc_f32_f64_param_2];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_f32_f64_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_f32_f64_param_0];
+; CHECK-NEXT: setp.neu.f64 %p1, %fd1, %fd3;
+; CHECK-NEXT: setp.neu.f64 %p2, %fd2, %fd4;
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: selp.f32 %f5, %f4, %f2, %p2;
+; CHECK-NEXT: selp.f32 %f6, %f3, %f1, %p1;
+; CHECK-NEXT: mov.b64 %rd3, {%f6, %f5};
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%cc = fcmp une <2 x double> %c, %d
%r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
ret <2 x float> %r
}
define <2 x i1> @test_fcmp_une(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-O0-LABEL: test_fcmp_une(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .pred %p<3>;
-; CHECK-O0-NEXT: .reg .b16 %rs<3>;
-; CHECK-O0-NEXT: .reg .f32 %f<5>;
-; CHECK-O0-NEXT: .reg .b64 %rd<3>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_une_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_une_param_0];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O0-NEXT: setp.neu.f32 %p1, %f4, %f2;
-; CHECK-O0-NEXT: setp.neu.f32 %p2, %f3, %f1;
-; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fcmp_une(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .pred %p<3>;
-; CHECK-O3-NEXT: .reg .b16 %rs<3>;
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<3>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_une_param_0];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_une_param_1];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: setp.neu.f32 %p1, %f4, %f2;
-; CHECK-O3-NEXT: setp.neu.f32 %p2, %f3, %f1;
-; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fcmp_une(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_une_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_une_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.neu.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.neu.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
%r = fcmp une <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_ueq(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-O0-LABEL: test_fcmp_ueq(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .pred %p<3>;
-; CHECK-O0-NEXT: .reg .b16 %rs<3>;
-; CHECK-O0-NEXT: .reg .f32 %f<5>;
-; CHECK-O0-NEXT: .reg .b64 %rd<3>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_ueq_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_ueq_param_0];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O0-NEXT: setp.equ.f32 %p1, %f4, %f2;
-; CHECK-O0-NEXT: setp.equ.f32 %p2, %f3, %f1;
-; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fcmp_ueq(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .pred %p<3>;
-; CHECK-O3-NEXT: .reg .b16 %rs<3>;
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<3>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_ueq_param_0];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_ueq_param_1];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: setp.equ.f32 %p1, %f4, %f2;
-; CHECK-O3-NEXT: setp.equ.f32 %p2, %f3, %f1;
-; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fcmp_ueq(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ueq_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ueq_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.equ.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.equ.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
%r = fcmp ueq <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_ugt(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-O0-LABEL: test_fcmp_ugt(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .pred %p<3>;
-; CHECK-O0-NEXT: .reg .b16 %rs<3>;
-; CHECK-O0-NEXT: .reg .f32 %f<5>;
-; CHECK-O0-NEXT: .reg .b64 %rd<3>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_ugt_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_ugt_param_0];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O0-NEXT: setp.gtu.f32 %p1, %f4, %f2;
-; CHECK-O0-NEXT: setp.gtu.f32 %p2, %f3, %f1;
-; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fcmp_ugt(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .pred %p<3>;
-; CHECK-O3-NEXT: .reg .b16 %rs<3>;
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<3>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_ugt_param_0];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_ugt_param_1];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: setp.gtu.f32 %p1, %f4, %f2;
-; CHECK-O3-NEXT: setp.gtu.f32 %p2, %f3, %f1;
-; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fcmp_ugt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ugt_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ugt_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.gtu.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.gtu.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
%r = fcmp ugt <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_uge(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-O0-LABEL: test_fcmp_uge(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .pred %p<3>;
-; CHECK-O0-NEXT: .reg .b16 %rs<3>;
-; CHECK-O0-NEXT: .reg .f32 %f<5>;
-; CHECK-O0-NEXT: .reg .b64 %rd<3>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_uge_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_uge_param_0];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O0-NEXT: setp.geu.f32 %p1, %f4, %f2;
-; CHECK-O0-NEXT: setp.geu.f32 %p2, %f3, %f1;
-; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fcmp_uge(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .pred %p<3>;
-; CHECK-O3-NEXT: .reg .b16 %rs<3>;
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<3>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_uge_param_0];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_uge_param_1];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: setp.geu.f32 %p1, %f4, %f2;
-; CHECK-O3-NEXT: setp.geu.f32 %p2, %f3, %f1;
-; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fcmp_uge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_uge_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_uge_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.geu.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.geu.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
%r = fcmp uge <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_ult(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-O0-LABEL: test_fcmp_ult(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .pred %p<3>;
-; CHECK-O0-NEXT: .reg .b16 %rs<3>;
-; CHECK-O0-NEXT: .reg .f32 %f<5>;
-; CHECK-O0-NEXT: .reg .b64 %rd<3>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_ult_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_ult_param_0];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O0-NEXT: setp.ltu.f32 %p1, %f4, %f2;
-; CHECK-O0-NEXT: setp.ltu.f32 %p2, %f3, %f1;
-; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fcmp_ult(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .pred %p<3>;
-; CHECK-O3-NEXT: .reg .b16 %rs<3>;
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<3>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_ult_param_0];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_ult_param_1];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: setp.ltu.f32 %p1, %f4, %f2;
-; CHECK-O3-NEXT: setp.ltu.f32 %p2, %f3, %f1;
-; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fcmp_ult(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ult_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ult_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.ltu.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.ltu.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
%r = fcmp ult <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_ule(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-O0-LABEL: test_fcmp_ule(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .pred %p<3>;
-; CHECK-O0-NEXT: .reg .b16 %rs<3>;
-; CHECK-O0-NEXT: .reg .f32 %f<5>;
-; CHECK-O0-NEXT: .reg .b64 %rd<3>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_ule_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_ule_param_0];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O0-NEXT: setp.leu.f32 %p1, %f4, %f2;
-; CHECK-O0-NEXT: setp.leu.f32 %p2, %f3, %f1;
-; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fcmp_ule(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .pred %p<3>;
-; CHECK-O3-NEXT: .reg .b16 %rs<3>;
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<3>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_ule_param_0];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_ule_param_1];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: setp.leu.f32 %p1, %f4, %f2;
-; CHECK-O3-NEXT: setp.leu.f32 %p2, %f3, %f1;
-; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fcmp_ule(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ule_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ule_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.leu.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.leu.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
%r = fcmp ule <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_uno(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-O0-LABEL: test_fcmp_uno(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .pred %p<3>;
-; CHECK-O0-NEXT: .reg .b16 %rs<3>;
-; CHECK-O0-NEXT: .reg .f32 %f<5>;
-; CHECK-O0-NEXT: .reg .b64 %rd<3>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_uno_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_uno_param_0];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O0-NEXT: setp.nan.f32 %p1, %f4, %f2;
-; CHECK-O0-NEXT: setp.nan.f32 %p2, %f3, %f1;
-; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fcmp_uno(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .pred %p<3>;
-; CHECK-O3-NEXT: .reg .b16 %rs<3>;
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<3>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_uno_param_0];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_uno_param_1];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: setp.nan.f32 %p1, %f4, %f2;
-; CHECK-O3-NEXT: setp.nan.f32 %p2, %f3, %f1;
-; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fcmp_uno(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_uno_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_uno_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.nan.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.nan.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
%r = fcmp uno <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_one(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-O0-LABEL: test_fcmp_one(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .pred %p<3>;
-; CHECK-O0-NEXT: .reg .b16 %rs<3>;
-; CHECK-O0-NEXT: .reg .f32 %f<5>;
-; CHECK-O0-NEXT: .reg .b64 %rd<3>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_one_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_one_param_0];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O0-NEXT: setp.ne.f32 %p1, %f4, %f2;
-; CHECK-O0-NEXT: setp.ne.f32 %p2, %f3, %f1;
-; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fcmp_one(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .pred %p<3>;
-; CHECK-O3-NEXT: .reg .b16 %rs<3>;
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<3>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_one_param_0];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_one_param_1];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: setp.ne.f32 %p1, %f4, %f2;
-; CHECK-O3-NEXT: setp.ne.f32 %p2, %f3, %f1;
-; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fcmp_one(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_one_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_one_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.ne.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.ne.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
%r = fcmp one <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_oeq(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-O0-LABEL: test_fcmp_oeq(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .pred %p<3>;
-; CHECK-O0-NEXT: .reg .b16 %rs<3>;
-; CHECK-O0-NEXT: .reg .f32 %f<5>;
-; CHECK-O0-NEXT: .reg .b64 %rd<3>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_oeq_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_oeq_param_0];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O0-NEXT: setp.eq.f32 %p1, %f4, %f2;
-; CHECK-O0-NEXT: setp.eq.f32 %p2, %f3, %f1;
-; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fcmp_oeq(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .pred %p<3>;
-; CHECK-O3-NEXT: .reg .b16 %rs<3>;
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<3>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_oeq_param_0];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_oeq_param_1];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: setp.eq.f32 %p1, %f4, %f2;
-; CHECK-O3-NEXT: setp.eq.f32 %p2, %f3, %f1;
-; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fcmp_oeq(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_oeq_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_oeq_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.eq.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.eq.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
%r = fcmp oeq <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_ogt(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-O0-LABEL: test_fcmp_ogt(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .pred %p<3>;
-; CHECK-O0-NEXT: .reg .b16 %rs<3>;
-; CHECK-O0-NEXT: .reg .f32 %f<5>;
-; CHECK-O0-NEXT: .reg .b64 %rd<3>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_ogt_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_ogt_param_0];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O0-NEXT: setp.gt.f32 %p1, %f4, %f2;
-; CHECK-O0-NEXT: setp.gt.f32 %p2, %f3, %f1;
-; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fcmp_ogt(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .pred %p<3>;
-; CHECK-O3-NEXT: .reg .b16 %rs<3>;
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<3>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_ogt_param_0];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_ogt_param_1];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: setp.gt.f32 %p1, %f4, %f2;
-; CHECK-O3-NEXT: setp.gt.f32 %p2, %f3, %f1;
-; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fcmp_ogt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ogt_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ogt_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.gt.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.gt.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
%r = fcmp ogt <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_oge(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-O0-LABEL: test_fcmp_oge(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .pred %p<3>;
-; CHECK-O0-NEXT: .reg .b16 %rs<3>;
-; CHECK-O0-NEXT: .reg .f32 %f<5>;
-; CHECK-O0-NEXT: .reg .b64 %rd<3>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_oge_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_oge_param_0];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O0-NEXT: setp.ge.f32 %p1, %f4, %f2;
-; CHECK-O0-NEXT: setp.ge.f32 %p2, %f3, %f1;
-; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fcmp_oge(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .pred %p<3>;
-; CHECK-O3-NEXT: .reg .b16 %rs<3>;
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<3>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_oge_param_0];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_oge_param_1];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: setp.ge.f32 %p1, %f4, %f2;
-; CHECK-O3-NEXT: setp.ge.f32 %p2, %f3, %f1;
-; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fcmp_oge(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_oge_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_oge_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.ge.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.ge.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
%r = fcmp oge <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_olt(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-O0-LABEL: test_fcmp_olt(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .pred %p<3>;
-; CHECK-O0-NEXT: .reg .b16 %rs<3>;
-; CHECK-O0-NEXT: .reg .f32 %f<5>;
-; CHECK-O0-NEXT: .reg .b64 %rd<3>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_olt_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_olt_param_0];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O0-NEXT: setp.lt.f32 %p1, %f4, %f2;
-; CHECK-O0-NEXT: setp.lt.f32 %p2, %f3, %f1;
-; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fcmp_olt(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .pred %p<3>;
-; CHECK-O3-NEXT: .reg .b16 %rs<3>;
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<3>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_olt_param_0];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_olt_param_1];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: setp.lt.f32 %p1, %f4, %f2;
-; CHECK-O3-NEXT: setp.lt.f32 %p2, %f3, %f1;
-; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fcmp_olt(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_olt_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_olt_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.lt.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.lt.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
%r = fcmp olt <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_ole(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-O0-LABEL: test_fcmp_ole(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .pred %p<3>;
-; CHECK-O0-NEXT: .reg .b16 %rs<3>;
-; CHECK-O0-NEXT: .reg .f32 %f<5>;
-; CHECK-O0-NEXT: .reg .b64 %rd<3>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_ole_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_ole_param_0];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O0-NEXT: setp.le.f32 %p1, %f4, %f2;
-; CHECK-O0-NEXT: setp.le.f32 %p2, %f3, %f1;
-; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fcmp_ole(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .pred %p<3>;
-; CHECK-O3-NEXT: .reg .b16 %rs<3>;
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<3>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_ole_param_0];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_ole_param_1];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: setp.le.f32 %p1, %f4, %f2;
-; CHECK-O3-NEXT: setp.le.f32 %p2, %f3, %f1;
-; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fcmp_ole(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ole_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ole_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.le.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.le.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
%r = fcmp ole <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i1> @test_fcmp_ord(<2 x float> %a, <2 x float> %b) #0 {
-; CHECK-O0-LABEL: test_fcmp_ord(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .pred %p<3>;
-; CHECK-O0-NEXT: .reg .b16 %rs<3>;
-; CHECK-O0-NEXT: .reg .f32 %f<5>;
-; CHECK-O0-NEXT: .reg .b64 %rd<3>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd2, [test_fcmp_ord_param_1];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fcmp_ord_param_0];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O0-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O0-NEXT: setp.num.f32 %p1, %f4, %f2;
-; CHECK-O0-NEXT: setp.num.f32 %p2, %f3, %f1;
-; CHECK-O0-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O0-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O0-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fcmp_ord(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .pred %p<3>;
-; CHECK-O3-NEXT: .reg .b16 %rs<3>;
-; CHECK-O3-NEXT: .reg .f32 %f<5>;
-; CHECK-O3-NEXT: .reg .b64 %rd<3>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fcmp_ord_param_0];
-; CHECK-O3-NEXT: ld.param.f64 %rd2, [test_fcmp_ord_param_1];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd2;
-; CHECK-O3-NEXT: mov.b64 {%f3, %f4}, %rd1;
-; CHECK-O3-NEXT: setp.num.f32 %p1, %f4, %f2;
-; CHECK-O3-NEXT: setp.num.f32 %p2, %f3, %f1;
-; CHECK-O3-NEXT: selp.u16 %rs1, -1, 0, %p2;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0], %rs1;
-; CHECK-O3-NEXT: selp.u16 %rs2, -1, 0, %p1;
-; CHECK-O3-NEXT: st.param.b8 [func_retval0+1], %rs2;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fcmp_ord(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<3>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ord_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ord_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.num.f32 %p1, %f4, %f2;
+; CHECK-NEXT: setp.num.f32 %p2, %f3, %f1;
+; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p2;
+; CHECK-NEXT: st.param.b8 [func_retval0], %rs1;
+; CHECK-NEXT: selp.u16 %rs2, -1, 0, %p1;
+; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2;
+; CHECK-NEXT: ret;
%r = fcmp ord <2 x float> %a, %b
ret <2 x i1> %r
}
define <2 x i32> @test_fptosi_i32(<2 x float> %a) #0 {
-; CHECK-O0-LABEL: test_fptosi_i32(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<3>;
-; CHECK-O0-NEXT: .reg .f32 %f<3>;
-; CHECK-O0-NEXT: .reg .b64 %rd<2>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fptosi_i32_param_0];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd1;
-; CHECK-O0-NEXT: cvt.rzi.s32.f32 %r1, %f2;
-; CHECK-O0-NEXT: cvt.rzi.s32.f32 %r2, %f1;
-; CHECK-O0-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fptosi_i32(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<3>;
-; CHECK-O3-NEXT: .reg .f32 %f<3>;
-; CHECK-O3-NEXT: .reg .b64 %rd<2>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fptosi_i32_param_0];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd1;
-; CHECK-O3-NEXT: cvt.rzi.s32.f32 %r1, %f2;
-; CHECK-O3-NEXT: cvt.rzi.s32.f32 %r2, %f1;
-; CHECK-O3-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fptosi_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fptosi_i32_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd1;
+; CHECK-NEXT: cvt.rzi.s32.f32 %r1, %f2;
+; CHECK-NEXT: cvt.rzi.s32.f32 %r2, %f1;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-NEXT: ret;
%r = fptosi <2 x float> %a to <2 x i32>
ret <2 x i32> %r
}
define <2 x i64> @test_fptosi_i64(<2 x float> %a) #0 {
-; CHECK-O0-LABEL: test_fptosi_i64(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .f32 %f<3>;
-; CHECK-O0-NEXT: .reg .b64 %rd<4>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fptosi_i64_param_0];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd1;
-; CHECK-O0-NEXT: cvt.rzi.s64.f32 %rd2, %f2;
-; CHECK-O0-NEXT: cvt.rzi.s64.f32 %rd3, %f1;
-; CHECK-O0-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2};
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fptosi_i64(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<3>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fptosi_i64_param_0];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd1;
-; CHECK-O3-NEXT: cvt.rzi.s64.f32 %rd2, %f2;
-; CHECK-O3-NEXT: cvt.rzi.s64.f32 %rd3, %f1;
-; CHECK-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2};
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fptosi_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fptosi_i64_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd1;
+; CHECK-NEXT: cvt.rzi.s64.f32 %rd2, %f2;
+; CHECK-NEXT: cvt.rzi.s64.f32 %rd3, %f1;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2};
+; CHECK-NEXT: ret;
%r = fptosi <2 x float> %a to <2 x i64>
ret <2 x i64> %r
}
define <2 x i32> @test_fptoui_2xi32(<2 x float> %a) #0 {
-; CHECK-O0-LABEL: test_fptoui_2xi32(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<3>;
-; CHECK-O0-NEXT: .reg .f32 %f<3>;
-; CHECK-O0-NEXT: .reg .b64 %rd<2>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fptoui_2xi32_param_0];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd1;
-; CHECK-O0-NEXT: cvt.rzi.u32.f32 %r1, %f2;
-; CHECK-O0-NEXT: cvt.rzi.u32.f32 %r2, %f1;
-; CHECK-O0-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fptoui_2xi32(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<3>;
-; CHECK-O3-NEXT: .reg .f32 %f<3>;
-; CHECK-O3-NEXT: .reg .b64 %rd<2>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fptoui_2xi32_param_0];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd1;
-; CHECK-O3-NEXT: cvt.rzi.u32.f32 %r1, %f2;
-; CHECK-O3-NEXT: cvt.rzi.u32.f32 %r2, %f1;
-; CHECK-O3-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fptoui_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fptoui_2xi32_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd1;
+; CHECK-NEXT: cvt.rzi.u32.f32 %r1, %f2;
+; CHECK-NEXT: cvt.rzi.u32.f32 %r2, %f1;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-NEXT: ret;
%r = fptoui <2 x float> %a to <2 x i32>
ret <2 x i32> %r
}
define <2 x i64> @test_fptoui_2xi64(<2 x float> %a) #0 {
-; CHECK-O0-LABEL: test_fptoui_2xi64(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .f32 %f<3>;
-; CHECK-O0-NEXT: .reg .b64 %rd<4>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fptoui_2xi64_param_0];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd1;
-; CHECK-O0-NEXT: cvt.rzi.u64.f32 %rd2, %f2;
-; CHECK-O0-NEXT: cvt.rzi.u64.f32 %rd3, %f1;
-; CHECK-O0-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2};
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fptoui_2xi64(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<3>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fptoui_2xi64_param_0];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd1;
-; CHECK-O3-NEXT: cvt.rzi.u64.f32 %rd2, %f2;
-; CHECK-O3-NEXT: cvt.rzi.u64.f32 %rd3, %f1;
-; CHECK-O3-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2};
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fptoui_2xi64(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fptoui_2xi64_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd1;
+; CHECK-NEXT: cvt.rzi.u64.f32 %rd2, %f2;
+; CHECK-NEXT: cvt.rzi.u64.f32 %rd3, %f1;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2};
+; CHECK-NEXT: ret;
%r = fptoui <2 x float> %a to <2 x i64>
ret <2 x i64> %r
}
define <2 x float> @test_uitofp_2xi32(<2 x i32> %a) #0 {
-; CHECK-O0-LABEL: test_uitofp_2xi32(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<3>;
-; CHECK-O0-NEXT: .reg .f32 %f<3>;
-; CHECK-O0-NEXT: .reg .b64 %rd<2>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_param_0];
-; CHECK-O0-NEXT: cvt.rn.f32.u32 %f1, %r2;
-; CHECK-O0-NEXT: cvt.rn.f32.u32 %f2, %r1;
-; CHECK-O0-NEXT: mov.b64 %rd1, {%f2, %f1};
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd1;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_uitofp_2xi32(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<3>;
-; CHECK-O3-NEXT: .reg .f32 %f<3>;
-; CHECK-O3-NEXT: .reg .b64 %rd<2>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_param_0];
-; CHECK-O3-NEXT: cvt.rn.f32.u32 %f1, %r2;
-; CHECK-O3-NEXT: cvt.rn.f32.u32 %f2, %r1;
-; CHECK-O3-NEXT: mov.b64 %rd1, {%f2, %f1};
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd1;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_uitofp_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_param_0];
+; CHECK-NEXT: cvt.rn.f32.u32 %f1, %r2;
+; CHECK-NEXT: cvt.rn.f32.u32 %f2, %r1;
+; CHECK-NEXT: mov.b64 %rd1, {%f2, %f1};
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-NEXT: ret;
%r = uitofp <2 x i32> %a to <2 x float>
ret <2 x float> %r
}
define <2 x float> @test_uitofp_2xi64(<2 x i64> %a) #0 {
-; CHECK-O0-LABEL: test_uitofp_2xi64(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .f32 %f<3>;
-; CHECK-O0-NEXT: .reg .b64 %rd<4>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_uitofp_2xi64_param_0];
-; CHECK-O0-NEXT: cvt.rn.f32.u64 %f1, %rd2;
-; CHECK-O0-NEXT: cvt.rn.f32.u64 %f2, %rd1;
-; CHECK-O0-NEXT: mov.b64 %rd3, {%f2, %f1};
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_uitofp_2xi64(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<3>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_uitofp_2xi64_param_0];
-; CHECK-O3-NEXT: cvt.rn.f32.u64 %f1, %rd2;
-; CHECK-O3-NEXT: cvt.rn.f32.u64 %f2, %rd1;
-; CHECK-O3-NEXT: mov.b64 %rd3, {%f2, %f1};
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_uitofp_2xi64(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_uitofp_2xi64_param_0];
+; CHECK-NEXT: cvt.rn.f32.u64 %f1, %rd2;
+; CHECK-NEXT: cvt.rn.f32.u64 %f2, %rd1;
+; CHECK-NEXT: mov.b64 %rd3, {%f2, %f1};
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%r = uitofp <2 x i64> %a to <2 x float>
ret <2 x float> %r
}
define <2 x float> @test_sitofp_2xi32(<2 x i32> %a) #0 {
-; CHECK-O0-LABEL: test_sitofp_2xi32(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<3>;
-; CHECK-O0-NEXT: .reg .f32 %f<3>;
-; CHECK-O0-NEXT: .reg .b64 %rd<2>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_param_0];
-; CHECK-O0-NEXT: cvt.rn.f32.s32 %f1, %r2;
-; CHECK-O0-NEXT: cvt.rn.f32.s32 %f2, %r1;
-; CHECK-O0-NEXT: mov.b64 %rd1, {%f2, %f1};
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd1;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_sitofp_2xi32(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<3>;
-; CHECK-O3-NEXT: .reg .f32 %f<3>;
-; CHECK-O3-NEXT: .reg .b64 %rd<2>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_param_0];
-; CHECK-O3-NEXT: cvt.rn.f32.s32 %f1, %r2;
-; CHECK-O3-NEXT: cvt.rn.f32.s32 %f2, %r1;
-; CHECK-O3-NEXT: mov.b64 %rd1, {%f2, %f1};
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd1;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_sitofp_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_param_0];
+; CHECK-NEXT: cvt.rn.f32.s32 %f1, %r2;
+; CHECK-NEXT: cvt.rn.f32.s32 %f2, %r1;
+; CHECK-NEXT: mov.b64 %rd1, {%f2, %f1};
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-NEXT: ret;
%r = sitofp <2 x i32> %a to <2 x float>
ret <2 x float> %r
}
define <2 x float> @test_sitofp_2xi64(<2 x i64> %a) #0 {
-; CHECK-O0-LABEL: test_sitofp_2xi64(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .f32 %f<3>;
-; CHECK-O0-NEXT: .reg .b64 %rd<4>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_sitofp_2xi64_param_0];
-; CHECK-O0-NEXT: cvt.rn.f32.s64 %f1, %rd2;
-; CHECK-O0-NEXT: cvt.rn.f32.s64 %f2, %rd1;
-; CHECK-O0-NEXT: mov.b64 %rd3, {%f2, %f1};
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_sitofp_2xi64(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<3>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_sitofp_2xi64_param_0];
-; CHECK-O3-NEXT: cvt.rn.f32.s64 %f1, %rd2;
-; CHECK-O3-NEXT: cvt.rn.f32.s64 %f2, %rd1;
-; CHECK-O3-NEXT: mov.b64 %rd3, {%f2, %f1};
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_sitofp_2xi64(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_sitofp_2xi64_param_0];
+; CHECK-NEXT: cvt.rn.f32.s64 %f1, %rd2;
+; CHECK-NEXT: cvt.rn.f32.s64 %f2, %rd1;
+; CHECK-NEXT: mov.b64 %rd3, {%f2, %f1};
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%r = sitofp <2 x i64> %a to <2 x float>
ret <2 x float> %r
}
define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 {
-; CHECK-O0-LABEL: test_uitofp_2xi32_fadd(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<3>;
-; CHECK-O0-NEXT: .reg .f32 %f<3>;
-; CHECK-O0-NEXT: .reg .b64 %rd<4>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_uitofp_2xi32_fadd_param_1];
-; CHECK-O0-NEXT: cvt.rn.f32.u32 %f1, %r2;
-; CHECK-O0-NEXT: cvt.rn.f32.u32 %f2, %r1;
-; CHECK-O0-NEXT: mov.b64 %rd2, {%f2, %f1};
-; CHECK-O0-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_uitofp_2xi32_fadd(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<3>;
-; CHECK-O3-NEXT: .reg .f32 %f<3>;
-; CHECK-O3-NEXT: .reg .b64 %rd<4>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_uitofp_2xi32_fadd_param_1];
-; CHECK-O3-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
-; CHECK-O3-NEXT: cvt.rn.f32.u32 %f1, %r2;
-; CHECK-O3-NEXT: cvt.rn.f32.u32 %f2, %r1;
-; CHECK-O3-NEXT: mov.b64 %rd2, {%f2, %f1};
-; CHECK-O3-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_uitofp_2xi32_fadd(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_uitofp_2xi32_fadd_param_1];
+; CHECK-NEXT: cvt.rn.f32.u32 %f1, %r2;
+; CHECK-NEXT: cvt.rn.f32.u32 %f2, %r1;
+; CHECK-NEXT: mov.b64 %rd2, {%f2, %f1};
+; CHECK-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%c = uitofp <2 x i32> %a to <2 x float>
%r = fadd <2 x float> %b, %c
ret <2 x float> %r
}
define <2 x float> @test_fptrunc_2xdouble(<2 x double> %a) #0 {
-; CHECK-O0-LABEL: test_fptrunc_2xdouble(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .f32 %f<3>;
-; CHECK-O0-NEXT: .reg .b64 %rd<2>;
-; CHECK-O0-NEXT: .reg .f64 %fd<3>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_fptrunc_2xdouble_param_0];
-; CHECK-O0-NEXT: cvt.rn.f32.f64 %f1, %fd2;
-; CHECK-O0-NEXT: cvt.rn.f32.f64 %f2, %fd1;
-; CHECK-O0-NEXT: mov.b64 %rd1, {%f2, %f1};
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd1;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fptrunc_2xdouble(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<3>;
-; CHECK-O3-NEXT: .reg .b64 %rd<2>;
-; CHECK-O3-NEXT: .reg .f64 %fd<3>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_fptrunc_2xdouble_param_0];
-; CHECK-O3-NEXT: cvt.rn.f32.f64 %f1, %fd2;
-; CHECK-O3-NEXT: cvt.rn.f32.f64 %f2, %fd1;
-; CHECK-O3-NEXT: mov.b64 %rd1, {%f2, %f1};
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd1;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fptrunc_2xdouble(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_fptrunc_2xdouble_param_0];
+; CHECK-NEXT: cvt.rn.f32.f64 %f1, %fd2;
+; CHECK-NEXT: cvt.rn.f32.f64 %f2, %fd1;
+; CHECK-NEXT: mov.b64 %rd1, {%f2, %f1};
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-NEXT: ret;
%r = fptrunc <2 x double> %a to <2 x float>
ret <2 x float> %r
}
define <2 x double> @test_fpext_2xdouble(<2 x float> %a) #0 {
-; CHECK-O0-LABEL: test_fpext_2xdouble(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .f32 %f<3>;
-; CHECK-O0-NEXT: .reg .b64 %rd<2>;
-; CHECK-O0-NEXT: .reg .f64 %fd<3>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %rd1, [test_fpext_2xdouble_param_0];
-; CHECK-O0-NEXT: mov.b64 {%f1, %f2}, %rd1;
-; CHECK-O0-NEXT: cvt.f64.f32 %fd1, %f2;
-; CHECK-O0-NEXT: cvt.f64.f32 %fd2, %f1;
-; CHECK-O0-NEXT: st.param.v2.f64 [func_retval0], {%fd2, %fd1};
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_fpext_2xdouble(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f32 %f<3>;
-; CHECK-O3-NEXT: .reg .b64 %rd<2>;
-; CHECK-O3-NEXT: .reg .f64 %fd<3>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_fpext_2xdouble_param_0];
-; CHECK-O3-NEXT: mov.b64 {%f1, %f2}, %rd1;
-; CHECK-O3-NEXT: cvt.f64.f32 %fd1, %f2;
-; CHECK-O3-NEXT: cvt.f64.f32 %fd2, %f1;
-; CHECK-O3-NEXT: st.param.v2.f64 [func_retval0], {%fd2, %fd1};
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_fpext_2xdouble(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fpext_2xdouble_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd1;
+; CHECK-NEXT: cvt.f64.f32 %fd1, %f2;
+; CHECK-NEXT: cvt.f64.f32 %fd2, %f1;
+; CHECK-NEXT: st.param.v2.f64 [func_retval0], {%fd2, %fd1};
+; CHECK-NEXT: ret;
%r = fpext <2 x float> %a to <2 x double>
ret <2 x double> %r
}
define <2 x i32> @test_bitcast_2xfloat_to_2xi32(<2 x float> %a) #0 {
-; CHECK-O0-LABEL: test_bitcast_2xfloat_to_2xi32(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<3>;
-; CHECK-O0-NEXT: .reg .b64 %rd<3>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.u64 %rd2, [test_bitcast_2xfloat_to_2xi32_param_0];
-; CHECK-O0-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd2; }
-; CHECK-O0-NEXT: cvt.u32.u64 %r2, %rd2;
-; CHECK-O0-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_bitcast_2xfloat_to_2xi32(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b32 %r<3>;
-; CHECK-O3-NEXT: .reg .b64 %rd<2>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.u64 %rd1, [test_bitcast_2xfloat_to_2xi32_param_0];
-; CHECK-O3-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd1; }
-; CHECK-O3-NEXT: cvt.u32.u64 %r2, %rd1;
-; CHECK-O3-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_bitcast_2xfloat_to_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd2, [test_bitcast_2xfloat_to_2xi32_param_0];
+; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd2; }
+; CHECK-NEXT: cvt.u32.u64 %r2, %rd2;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-NEXT: ret;
%r = bitcast <2 x float> %a to <2 x i32>
ret <2 x i32> %r
}
define <2 x float> @test_bitcast_2xi32_to_2xfloat(<2 x i32> %a) #0 {
-; CHECK-O0-LABEL: test_bitcast_2xi32_to_2xfloat(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b32 %r<3>;
-; CHECK-O0-NEXT: .reg .b64 %rd<6>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_bitcast_2xi32_to_2xfloat_param_0];
-; CHECK-O0-NEXT: cvt.u64.u32 %rd1, %r1;
-; CHECK-O0-NEXT: cvt.u64.u32 %rd2, %r2;
-; CHECK-O0-NEXT: shl.b64 %rd3, %rd2, 32;
-; CHECK-O0-NEXT: or.b64 %rd4, %rd1, %rd3;
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd4;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_bitcast_2xi32_to_2xfloat(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b64 %rd<2>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_bitcast_2xi32_to_2xfloat_param_0];
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd1;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_bitcast_2xi32_to_2xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_bitcast_2xi32_to_2xfloat_param_0];
+; CHECK-NEXT: cvt.u64.u32 %rd1, %r1;
+; CHECK-NEXT: cvt.u64.u32 %rd2, %r2;
+; CHECK-NEXT: shl.b64 %rd3, %rd2, 32;
+; CHECK-NEXT: or.b64 %rd4, %rd1, %rd3;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd4;
+; CHECK-NEXT: ret;
%r = bitcast <2 x i32> %a to <2 x float>
ret <2 x float> %r
}
define <2 x float> @test_bitcast_double_to_2xfloat(double %a) #0 {
-; CHECK-O0-LABEL: test_bitcast_double_to_2xfloat(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b64 %rd<2>;
-; CHECK-O0-NEXT: .reg .f64 %fd<2>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.f64 %fd1, [test_bitcast_double_to_2xfloat_param_0];
-; CHECK-O0-NEXT: mov.b64 %rd1, %fd1;
-; CHECK-O0-NEXT: st.param.b64 [func_retval0], %rd1;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_bitcast_double_to_2xfloat(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .b64 %rd<2>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %rd1, [test_bitcast_double_to_2xfloat_param_0];
-; CHECK-O3-NEXT: st.param.b64 [func_retval0], %rd1;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_bitcast_double_to_2xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.f64 %fd1, [test_bitcast_double_to_2xfloat_param_0];
+; CHECK-NEXT: mov.b64 %rd1, %fd1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-NEXT: ret;
%r = bitcast double %a to <2 x float>
ret <2 x float> %r
}
define double @test_bitcast_2xfloat_to_double(<2 x float> %a) #0 {
-; CHECK-O0-LABEL: test_bitcast_2xfloat_to_double(
-; CHECK-O0: {
-; CHECK-O0-NEXT: .reg .b64 %rd<3>;
-; CHECK-O0-NEXT: .reg .f64 %fd<2>;
-; CHECK-O0-EMPTY:
-; CHECK-O0-NEXT: // %bb.0:
-; CHECK-O0-NEXT: ld.param.u64 %rd2, [test_bitcast_2xfloat_to_double_param_0];
-; CHECK-O0-NEXT: mov.b64 %fd1, %rd2;
-; CHECK-O0-NEXT: st.param.f64 [func_retval0], %fd1;
-; CHECK-O0-NEXT: ret;
-;
-; CHECK-O3-LABEL: test_bitcast_2xfloat_to_double(
-; CHECK-O3: {
-; CHECK-O3-NEXT: .reg .f64 %fd<2>;
-; CHECK-O3-EMPTY:
-; CHECK-O3-NEXT: // %bb.0:
-; CHECK-O3-NEXT: ld.param.f64 %fd1, [test_bitcast_2xfloat_to_double_param_0];
-; CHECK-O3-NEXT: st.param.f64 [func_retval0], %fd1;
-; CHECK-O3-NEXT: ret;
+; CHECK-LABEL: test_bitcast_2xfloat_to_double(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-NEXT: .reg .f64 %fd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd2, [test_bitcast_2xfloat_to_double_param_0];
+; CHECK-NEXT: mov.b64 %fd1, %rd2;
+; CHECK-NEXT: st.param.f64 [func_retval0], %fd1;
+; CHECK-NEXT: ret;
%r = bitcast <2 x float> %a to double
ret double %r
}
>From 59843e1823a01aaa680f4e0dbc654868c4382262 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Tue, 25 Feb 2025 11:01:48 -0800
Subject: [PATCH 19/25] [NVPTX] add combiner rule for expanding StoreRetval
vector parameters
Do this to reduce the number of packing movs.
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 68 ++++++++++++++++---
llvm/test/CodeGen/NVPTX/f32x2-instructions.ll | 56 ++++++---------
2 files changed, 80 insertions(+), 44 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 8f306883eddca..b8597badb8f35 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -4958,26 +4958,78 @@ PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
return SDValue();
}
-static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front,
- std::size_t Back) {
+static SDValue PerformStoreCombineHelper(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ std::size_t Front, std::size_t Back) {
if (all_of(N->ops().drop_front(Front).drop_back(Back),
[](const SDUse &U) { return U.get()->isUndef(); }))
// Operand 0 is the previous value in the chain. Cannot return EntryToken
// as the previous value will become unused and eliminated later.
return N->getOperand(0);
+ auto *MemN = cast<MemSDNode>(N);
+ if (MemN->getMemoryVT() == MVT::v2f32) {
+ // try to fold, and expand:
+ // c: v2f32 = BUILD_VECTOR (a: f32, b: f32)
+ // StoreRetval c
+ // -->
+ // StoreRetvalV2 {a, b}
+ // likewise for V2 -> V4 case
+
+ std::optional<NVPTXISD::NodeType> NewOpcode;
+ switch (N->getOpcode()) {
+ case NVPTXISD::StoreParam:
+ NewOpcode = NVPTXISD::StoreParamV2;
+ break;
+ case NVPTXISD::StoreParamV2:
+ NewOpcode = NVPTXISD::StoreParamV4;
+ break;
+ case NVPTXISD::StoreRetval:
+ NewOpcode = NVPTXISD::StoreRetvalV2;
+ break;
+ case NVPTXISD::StoreRetvalV2:
+ NewOpcode = NVPTXISD::StoreRetvalV4;
+ break;
+ }
+
+ if (NewOpcode) {
+ // copy chain, offset from existing store
+ SmallVector<SDValue> NewOps = {N->getOperand(0), N->getOperand(1)};
+ // gather all operands to expand
+ for (unsigned I = 2, E = N->getNumOperands(); I < E; ++I) {
+ SDValue CurrentOp = N->getOperand(I);
+ if (CurrentOp->getOpcode() == ISD::BUILD_VECTOR) {
+ assert(CurrentOp.getValueType() == MVT::v2f32);
+ NewOps.push_back(CurrentOp.getNode()->getOperand(0));
+ NewOps.push_back(CurrentOp.getNode()->getOperand(1));
+ } else {
+ NewOps.clear();
+ break;
+ }
+ }
+
+ if (!NewOps.empty()) {
+ return DCI.DAG.getMemIntrinsicNode(*NewOpcode, SDLoc(N), N->getVTList(),
+ NewOps, MVT::f32,
+ MemN->getMemOperand());
+ }
+ }
+ }
+
return SDValue();
}
-static SDValue PerformStoreParamCombine(SDNode *N) {
+static SDValue PerformStoreParamCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
// Operands from the 3rd to the 2nd last one are the values to be stored.
// {Chain, ArgID, Offset, Val, Glue}
- return PerformStoreCombineHelper(N, 3, 1);
+ return PerformStoreCombineHelper(N, DCI, 3, 1);
}
-static SDValue PerformStoreRetvalCombine(SDNode *N) {
+static SDValue PerformStoreRetvalCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
// Operands from the 2nd to the last one are the values to be stored
- return PerformStoreCombineHelper(N, 2, 0);
+ return PerformStoreCombineHelper(N, DCI, 2, 0);
}
/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
@@ -5688,11 +5740,11 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
case NVPTXISD::StoreRetval:
case NVPTXISD::StoreRetvalV2:
case NVPTXISD::StoreRetvalV4:
- return PerformStoreRetvalCombine(N);
+ return PerformStoreRetvalCombine(N, DCI);
case NVPTXISD::StoreParam:
case NVPTXISD::StoreParamV2:
case NVPTXISD::StoreParamV4:
- return PerformStoreParamCombine(N);
+ return PerformStoreParamCombine(N, DCI);
case ISD::EXTRACT_VECTOR_ELT:
return PerformEXTRACTCombine(N, DCI);
case ISD::VSELECT:
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
index 8f4fd3c6e6ee3..1f21740ba589e 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
@@ -16,13 +16,11 @@ define <2 x float> @test_ret_const() #0 {
; CHECK-LABEL: test_ret_const(
; CHECK: {
; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: mov.f32 %f1, 0f40000000;
; CHECK-NEXT: mov.f32 %f2, 0f3F800000;
-; CHECK-NEXT: mov.b64 %rd1, {%f2, %f1};
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
; CHECK-NEXT: ret;
ret <2 x float> <float 1.0, float 2.0>
}
@@ -243,7 +241,7 @@ define <2 x float> @test_fdiv(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-LABEL: test_fdiv(
; CHECK: {
; CHECK-NEXT: .reg .f32 %f<7>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd2, [test_fdiv_param_1];
@@ -252,8 +250,7 @@ define <2 x float> @test_fdiv(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
; CHECK-NEXT: div.rn.f32 %f5, %f4, %f2;
; CHECK-NEXT: div.rn.f32 %f6, %f3, %f1;
-; CHECK-NEXT: mov.b64 %rd3, {%f6, %f5};
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
; CHECK-NEXT: ret;
%r = fdiv <2 x float> %a, %b
ret <2 x float> %r
@@ -264,7 +261,7 @@ define <2 x float> @test_frem(<2 x float> %a, <2 x float> %b) #0 {
; CHECK: {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .f32 %f<15>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd2, [test_frem_param_1];
@@ -283,8 +280,7 @@ define <2 x float> @test_frem(<2 x float> %a, <2 x float> %b) #0 {
; CHECK-NEXT: sub.f32 %f13, %f3, %f12;
; CHECK-NEXT: testp.infinite.f32 %p2, %f1;
; CHECK-NEXT: selp.f32 %f14, %f3, %f13, %p2;
-; CHECK-NEXT: mov.b64 %rd3, {%f14, %f9};
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f14, %f9};
; CHECK-NEXT: ret;
%r = frem <2 x float> %a, %b
ret <2 x float> %r
@@ -468,7 +464,7 @@ define <2 x float> @test_fdiv_ftz(<2 x float> %a, <2 x float> %b) #2 {
; CHECK-LABEL: test_fdiv_ftz(
; CHECK: {
; CHECK-NEXT: .reg .f32 %f<7>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd2, [test_fdiv_ftz_param_1];
@@ -477,8 +473,7 @@ define <2 x float> @test_fdiv_ftz(<2 x float> %a, <2 x float> %b) #2 {
; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
; CHECK-NEXT: div.rn.ftz.f32 %f5, %f4, %f2;
; CHECK-NEXT: div.rn.ftz.f32 %f6, %f3, %f1;
-; CHECK-NEXT: mov.b64 %rd3, {%f6, %f5};
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
; CHECK-NEXT: ret;
%r = fdiv <2 x float> %a, %b
ret <2 x float> %r
@@ -489,7 +484,7 @@ define <2 x float> @test_frem_ftz(<2 x float> %a, <2 x float> %b) #2 {
; CHECK: {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .f32 %f<15>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd2, [test_frem_ftz_param_1];
@@ -508,8 +503,7 @@ define <2 x float> @test_frem_ftz(<2 x float> %a, <2 x float> %b) #2 {
; CHECK-NEXT: sub.ftz.f32 %f13, %f3, %f12;
; CHECK-NEXT: testp.infinite.f32 %p2, %f1;
; CHECK-NEXT: selp.f32 %f14, %f3, %f13, %p2;
-; CHECK-NEXT: mov.b64 %rd3, {%f14, %f9};
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f14, %f9};
; CHECK-NEXT: ret;
%r = frem <2 x float> %a, %b
ret <2 x float> %r
@@ -699,7 +693,7 @@ define <2 x float> @test_select_cc(<2 x float> %a, <2 x float> %b, <2 x float> %
; CHECK: {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .f32 %f<11>;
-; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd4, [test_select_cc_param_3];
@@ -714,8 +708,7 @@ define <2 x float> @test_select_cc(<2 x float> %a, <2 x float> %b, <2 x float> %
; CHECK-NEXT: mov.b64 {%f7, %f8}, %rd1;
; CHECK-NEXT: selp.f32 %f9, %f8, %f6, %p2;
; CHECK-NEXT: selp.f32 %f10, %f7, %f5, %p1;
-; CHECK-NEXT: mov.b64 %rd5, {%f10, %f9};
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd5;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f10, %f9};
; CHECK-NEXT: ret;
%cc = fcmp une <2 x float> %c, %d
%r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
@@ -753,7 +746,7 @@ define <2 x float> @test_select_cc_f32_f64(<2 x float> %a, <2 x float> %b, <2 x
; CHECK: {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .f32 %f<7>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-NEXT: .reg .f64 %fd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
@@ -767,8 +760,7 @@ define <2 x float> @test_select_cc_f32_f64(<2 x float> %a, <2 x float> %b, <2 x
; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
; CHECK-NEXT: selp.f32 %f5, %f4, %f2, %p2;
; CHECK-NEXT: selp.f32 %f6, %f3, %f1, %p1;
-; CHECK-NEXT: mov.b64 %rd3, {%f6, %f5};
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
; CHECK-NEXT: ret;
%cc = fcmp une <2 x double> %c, %d
%r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
@@ -1186,14 +1178,12 @@ define <2 x float> @test_uitofp_2xi32(<2 x i32> %a) #0 {
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_param_0];
; CHECK-NEXT: cvt.rn.f32.u32 %f1, %r2;
; CHECK-NEXT: cvt.rn.f32.u32 %f2, %r1;
-; CHECK-NEXT: mov.b64 %rd1, {%f2, %f1};
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
; CHECK-NEXT: ret;
%r = uitofp <2 x i32> %a to <2 x float>
ret <2 x float> %r
@@ -1203,14 +1193,13 @@ define <2 x float> @test_uitofp_2xi64(<2 x i64> %a) #0 {
; CHECK-LABEL: test_uitofp_2xi64(
; CHECK: {
; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_uitofp_2xi64_param_0];
; CHECK-NEXT: cvt.rn.f32.u64 %f1, %rd2;
; CHECK-NEXT: cvt.rn.f32.u64 %f2, %rd1;
-; CHECK-NEXT: mov.b64 %rd3, {%f2, %f1};
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
; CHECK-NEXT: ret;
%r = uitofp <2 x i64> %a to <2 x float>
ret <2 x float> %r
@@ -1221,14 +1210,12 @@ define <2 x float> @test_sitofp_2xi32(<2 x i32> %a) #0 {
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_param_0];
; CHECK-NEXT: cvt.rn.f32.s32 %f1, %r2;
; CHECK-NEXT: cvt.rn.f32.s32 %f2, %r1;
-; CHECK-NEXT: mov.b64 %rd1, {%f2, %f1};
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
; CHECK-NEXT: ret;
%r = sitofp <2 x i32> %a to <2 x float>
ret <2 x float> %r
@@ -1238,14 +1225,13 @@ define <2 x float> @test_sitofp_2xi64(<2 x i64> %a) #0 {
; CHECK-LABEL: test_sitofp_2xi64(
; CHECK: {
; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_sitofp_2xi64_param_0];
; CHECK-NEXT: cvt.rn.f32.s64 %f1, %rd2;
; CHECK-NEXT: cvt.rn.f32.s64 %f2, %rd1;
-; CHECK-NEXT: mov.b64 %rd3, {%f2, %f1};
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
; CHECK-NEXT: ret;
%r = sitofp <2 x i64> %a to <2 x float>
ret <2 x float> %r
@@ -1276,15 +1262,13 @@ define <2 x float> @test_fptrunc_2xdouble(<2 x double> %a) #0 {
; CHECK-LABEL: test_fptrunc_2xdouble(
; CHECK: {
; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-NEXT: .reg .f64 %fd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_fptrunc_2xdouble_param_0];
; CHECK-NEXT: cvt.rn.f32.f64 %f1, %fd2;
; CHECK-NEXT: cvt.rn.f32.f64 %f2, %fd1;
-; CHECK-NEXT: mov.b64 %rd1, {%f2, %f1};
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
; CHECK-NEXT: ret;
%r = fptrunc <2 x double> %a to <2 x float>
ret <2 x float> %r
>From 49c47712c48e319efb38ae6783c08e7d1d73627c Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Fri, 28 Feb 2025 15:07:39 -0800
Subject: [PATCH 20/25] [NVPTX] add combiner rule for expanding LOAD, LoadV2,
LoadParam, LoadParamV2
To reduce the number of unpacking movs when the element type is i64 but
all uses are of unpacked f32s.
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 388 +++++++++++++++-----
1 file changed, 296 insertions(+), 92 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index b8597badb8f35..7f1b43b2095a6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -828,7 +828,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, ISD::VSELECT,
ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::FP_ROUND,
- ISD::TRUNCATE});
+ ISD::TRUNCATE, ISD::LOAD});
// setcc for f16x2 and bf16x2 needs special handling to prevent
// legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -4958,6 +4958,292 @@ PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
return SDValue();
}
+static std::optional<std::pair<SDValue, SDValue>>
+convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector) {
+ EVT ResVT = N->getValueType(0);
+ SDLoc DL(N);
+
+ assert(ResVT.isVector() && "Vector load must have vector type");
+
+ auto NumEltsAndEltVT = getVectorLoweringShape(ResVT);
+ if (!NumEltsAndEltVT)
+ return std::nullopt;
+ auto [NumElts, EltVT] = NumEltsAndEltVT.value();
+
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+
+ Align Alignment = LD->getAlign();
+ auto &TD = DAG.getDataLayout();
+ Align PrefAlign =
+ TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
+ if (Alignment < PrefAlign) {
+ // This load is not sufficiently aligned, so bail out and let this vector
+ // load be scalarized. Note that we may still be able to emit smaller
+ // vector loads. For example, if we are loading a <4 x float> with an
+ // alignment of 8, this check will fail but the legalizer will try again
+ // with 2 x <2 x float>, which will succeed with an alignment of 8.
+ return std::nullopt;
+ }
+
+ // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
+ // Therefore, we must ensure the type is legal. For i1 and i8, we set the
+ // loaded type to i16 and propagate the "real" type as the memory type.
+ bool NeedTrunc = false;
+ if (EltVT.getSizeInBits() < 16) {
+ EltVT = MVT::i16;
+ NeedTrunc = true;
+ }
+
+ unsigned Opcode = 0;
+ SDVTList LdResVTs;
+
+ switch (NumElts) {
+ default:
+ return std::nullopt;
+ case 2:
+ Opcode = NVPTXISD::LoadV2;
+ LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
+ break;
+ case 4: {
+ Opcode = NVPTXISD::LoadV4;
+ EVT ListVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};
+ LdResVTs = DAG.getVTList(ListVTs);
+ break;
+ }
+ }
+
+ // Copy regular operands
+ SmallVector<SDValue, 8> OtherOps(N->ops());
+
+ // The select routine does not have access to the LoadSDNode instance, so
+ // pass along the extension information
+ OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
+
+ SDValue NewLD = DAG.getMemIntrinsicNode(
+ Opcode, DL, LdResVTs, OtherOps, LD->getMemoryVT(), LD->getMemOperand());
+
+ SDValue LoadChain = NewLD.getValue(NumElts);
+
+ if (BuildVector) {
+ SmallVector<SDValue> ScalarRes;
+ assert(NumElts <= ResVT.getVectorNumElements() &&
+ "NumElts should not increase, only decrease or stay the same.");
+ if (NumElts < ResVT.getVectorNumElements()) {
+ // If the number of elements has decreased, getVectorLoweringShape has
+ // upsized the element types
+ assert(EltVT.isVector() && EltVT.getSizeInBits() == 32 &&
+ EltVT.getVectorNumElements() <= 4 && "Unexpected upsized type.");
+ // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
+ // into individual elements.
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue SubVector = NewLD.getValue(i);
+ DAG.ExtractVectorElements(SubVector, ScalarRes);
+ }
+ } else {
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Res = NewLD.getValue(i);
+ if (NeedTrunc)
+ Res =
+ DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
+ ScalarRes.push_back(Res);
+ }
+ }
+
+ SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
+ return {{BuildVec, LoadChain}};
+ }
+
+ return {{NewLD, LoadChain}};
+}
+
+static SDValue PerformLoadCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ auto *MemN = cast<MemSDNode>(N);
+ EVT MemVT = MemN->getMemoryVT();
+
+ // ignore volatile loads
+ if (MemN->isVolatile())
+ return SDValue();
+
+ // only operate on vectors of f32s / i64s
+ if (!MemVT.isVector())
+ return SDValue();
+
+ EVT ElementVT = MemVT.getVectorElementType();
+ if (!(ElementVT == MVT::f32 ||
+ (ElementVT == MVT::i64 && N->getOpcode() != ISD::LOAD)))
+ return SDValue();
+
+ SmallDenseMap<SDNode *, unsigned> ExtractElts;
+ SDNode *ProxyReg = nullptr;
+ SmallVector<std::pair<SDNode *, unsigned /*offset*/>> WorkList{{N, 0}};
+ while (!WorkList.empty()) {
+ auto [V, Offset] = WorkList.pop_back_val();
+
+ // follow users of this to an extractelt, along the way collecting proxy
+ // regs and bitcasts
+ for (SDUse &U : V->uses()) {
+ if (U.getValueType() == MVT::Other || U.getValueType() == MVT::Glue)
+ continue; // we'll process chain/glue later
+
+ SDNode *User = U.getUser();
+ if (User->getOpcode() == NVPTXISD::ProxyReg) {
+ if (ProxyReg)
+ return SDValue(); // bail out if we've seen a proxy reg?
+ ProxyReg = User;
+ } else if (User->getOpcode() == ISD::BITCAST &&
+ User->getValueType(0) == MVT::v2f32 &&
+ U.getValueType() == MVT::i64) {
+ // match v2f32 = bitcast i64
+ Offset = U.getResNo() * 2;
+ } else if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ User->getValueType(0) == MVT::f32) {
+ // match f32 = extractelt v2f32
+ if (auto *CI = dyn_cast<ConstantSDNode>(User->getOperand(1))) {
+ unsigned Index = CI->getZExtValue();
+ ExtractElts[User] = Offset + Index;
+ continue; // don't search
+ }
+ return SDValue(); // could not match
+ } else
+ return SDValue(); // couldn't match
+
+ // enqueue this to visit its uses
+ WorkList.push_back({User, Offset});
+ }
+ }
+
+ // (2) If the load's value is only used as f32 elements, replace all
+ // extractelts with individual elements of the newly-created load. If there's
+ // a ProxyReg, handle that too. After this check, we'll proceed in the
+ // following way:
+ // 1. Determine which type of load to create, which will split the results
+ // of the original load into f32 components.
+ // 2. If there's a ProxyReg, split that too.
+ // 3. Replace all extractelts with references to the new load / proxy reg.
+ // 4. Replace all glue/chain references with references to the new load /
+ // proxy reg.
+ if (ExtractElts.empty())
+ return SDValue();
+
+ // Do we have to tweak the opcode for an NVPTXISD::Load* or do we have to
+ // rewrite an ISD::LOAD?
+ std::optional<NVPTXISD::NodeType> NewOpcode;
+ switch (N->getOpcode()) {
+ case NVPTXISD::LoadV2:
+ NewOpcode = NVPTXISD::LoadV4;
+ break;
+ case NVPTXISD::LoadParam:
+ NewOpcode = NVPTXISD::LoadParamV2;
+ break;
+ case NVPTXISD::LoadParamV2:
+ NewOpcode = NVPTXISD::LoadParamV4;
+ break;
+ }
+
+ SDValue OldChain, OldGlue;
+ for (unsigned I = 0, E = N->getNumValues(); I != E; ++I) {
+ if (N->getValueType(I) == MVT::Other)
+ OldChain = SDValue(N, I);
+ else if (N->getValueType(I) == MVT::Glue)
+ OldGlue = SDValue(N, I);
+ }
+
+ SDValue NewLoad, NewChain, NewGlue /* (optional) */;
+ unsigned NumElts = 0;
+ if (NewOpcode) { // tweak NVPTXISD::Load* opcode
+ SmallVector<EVT> VTs;
+
+ // should always be non-null after this
+ std::optional<unsigned> NewChainIdx;
+ std::optional<unsigned> NewGlueIdx;
+ for (const EVT &V : N->values()) {
+ if (V == MVT::i64 || V == MVT::v2f32) {
+ VTs.append({MVT::f32, MVT::f32});
+ NumElts += 2;
+ } else {
+ assert((V == MVT::Other || V == MVT::Glue) &&
+ "expected i64,...,ch,glue = load or v2f32,ch = load");
+ if (V == MVT::Other)
+ NewChainIdx = VTs.size();
+ else
+ NewGlueIdx = VTs.size();
+ VTs.push_back(V);
+ }
+ }
+
+ NewLoad = DCI.DAG.getMemIntrinsicNode(
+ *NewOpcode, SDLoc(N), DCI.DAG.getVTList(VTs),
+ SmallVector<SDValue>(N->ops()), MVT::f32, MemN->getMemOperand());
+ NewChain = NewLoad.getValue(*NewChainIdx);
+ if (NewGlueIdx)
+ NewGlue = NewLoad.getValue(*NewGlueIdx);
+ } else if (N->getOpcode() == ISD::LOAD) { // rewrite a load
+ if (auto Result = convertVectorLoad(N, DCI.DAG, /*BuildVector=*/false)) {
+ std::tie(NewLoad, NewChain) = *Result;
+ NumElts = MemVT.getVectorNumElements();
+ if (NewLoad->getValueType(NewLoad->getNumValues() - 1) == MVT::Glue)
+ NewGlue = NewLoad.getValue(NewLoad->getNumValues() - 1);
+ }
+ }
+
+ if (!NewLoad)
+ return SDValue(); // could not match pattern
+
+ // (3) begin rewriting uses
+ SmallVector<SDValue> NewOutputsF32;
+
+ if (ProxyReg) {
+ // scalarize proxyreg, but first rewrite all uses of chain and glue from the
+ // old load to the new load
+ DCI.DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
+ DCI.DAG.ReplaceAllUsesOfValueWith(OldGlue, NewGlue);
+
+ // Update the new chain and glue to be old inputs to the proxyreg, if they
+ // came from an intervening instruction between this proxyreg and the
+ // original load (ex: callseq_end). Other than bitcasts and extractelts, we
+ // followed all other nodes by chain and glue accesses.
+ if (SDValue OldInChain = ProxyReg->getOperand(0); OldInChain.getNode() != N)
+ NewChain = OldInChain;
+ if (SDValue OldInGlue = ProxyReg->getOperand(2); OldInGlue.getNode() != N)
+ NewGlue = OldInGlue;
+
+ // update OldChain, OldGlue to the outputs of ProxyReg, which we will
+ // replace later
+ OldChain = SDValue(ProxyReg, 1);
+ OldGlue = SDValue(ProxyReg, 2);
+
+ // generate the scalar proxy regs
+ for (unsigned I = 0, E = NumElts; I != E; ++I) {
+ SDValue ProxyRegElem =
+ DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(ProxyReg),
+ DCI.DAG.getVTList(MVT::f32, MVT::Other, MVT::Glue),
+ {NewChain, NewLoad.getValue(I), NewGlue});
+ NewChain = ProxyRegElem.getValue(1);
+ NewGlue = ProxyRegElem.getValue(2);
+ NewOutputsF32.push_back(ProxyRegElem);
+ }
+ } else {
+ for (unsigned I = 0, E = NumElts; I != E; ++I)
+ if (NewLoad->getValueType(I) == MVT::f32)
+ NewOutputsF32.push_back(NewLoad.getValue(I));
+ }
+
+ // now, for all extractelts, replace them with one of the new outputs
+ for (auto &[Extract, Index] : ExtractElts)
+ DCI.CombineTo(Extract, NewOutputsF32[Index], false);
+
+ // now replace all glue and chain nodes
+ DCI.DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
+ if (OldGlue)
+ DCI.DAG.ReplaceAllUsesOfValueWith(OldGlue, NewGlue);
+
+ // cleanup
+ if (ProxyReg)
+ DCI.recursivelyDeleteUnusedNodes(ProxyReg);
+ return SDValue();
+}
+
static SDValue PerformStoreCombineHelper(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
std::size_t Front, std::size_t Back) {
@@ -5741,6 +6027,11 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
case NVPTXISD::StoreRetvalV2:
case NVPTXISD::StoreRetvalV4:
return PerformStoreRetvalCombine(N, DCI);
+ case ISD::LOAD:
+ case NVPTXISD::LoadV2:
+ case NVPTXISD::LoadParam:
+ case NVPTXISD::LoadParamV2:
+ return PerformLoadCombine(N, DCI);
case NVPTXISD::StoreParam:
case NVPTXISD::StoreParamV2:
case NVPTXISD::StoreParamV4:
@@ -5786,98 +6077,11 @@ static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG,
/// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Results) {
- EVT ResVT = N->getValueType(0);
- SDLoc DL(N);
-
- assert(ResVT.isVector() && "Vector load must have vector type");
-
- auto NumEltsAndEltVT = getVectorLoweringShape(ResVT);
- if (!NumEltsAndEltVT)
- return;
- auto [NumElts, EltVT] = NumEltsAndEltVT.value();
-
- LoadSDNode *LD = cast<LoadSDNode>(N);
-
- Align Alignment = LD->getAlign();
- auto &TD = DAG.getDataLayout();
- Align PrefAlign =
- TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
- if (Alignment < PrefAlign) {
- // This load is not sufficiently aligned, so bail out and let this vector
- // load be scalarized. Note that we may still be able to emit smaller
- // vector loads. For example, if we are loading a <4 x float> with an
- // alignment of 8, this check will fail but the legalizer will try again
- // with 2 x <2 x float>, which will succeed with an alignment of 8.
- return;
+ if (auto Outputs = convertVectorLoad(N, DAG, /*BuildVector=*/true)) {
+ auto [BuildVec, LoadChain] = *Outputs;
+ Results.push_back(BuildVec);
+ Results.push_back(LoadChain);
}
-
- // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
- // Therefore, we must ensure the type is legal. For i1 and i8, we set the
- // loaded type to i16 and propagate the "real" type as the memory type.
- bool NeedTrunc = false;
- if (EltVT.getSizeInBits() < 16) {
- EltVT = MVT::i16;
- NeedTrunc = true;
- }
-
- unsigned Opcode = 0;
- SDVTList LdResVTs;
-
- switch (NumElts) {
- default:
- return;
- case 2:
- Opcode = NVPTXISD::LoadV2;
- LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
- break;
- case 4: {
- Opcode = NVPTXISD::LoadV4;
- EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
- LdResVTs = DAG.getVTList(ListVTs);
- break;
- }
- }
-
- // Copy regular operands
- SmallVector<SDValue, 8> OtherOps(N->ops());
-
- // The select routine does not have access to the LoadSDNode instance, so
- // pass along the extension information
- OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
-
- SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
- LD->getMemoryVT(),
- LD->getMemOperand());
-
- SmallVector<SDValue> ScalarRes;
- assert(NumElts <= ResVT.getVectorNumElements() &&
- "NumElts should not increase, only decrease or stay the same.");
- if (NumElts < ResVT.getVectorNumElements()) {
- // If the number of elements has decreased, getVectorLoweringShape has
- // upsized the element types
- assert(EltVT.isVector() && EltVT.getSizeInBits() == 32 &&
- EltVT.getVectorNumElements() <= 4 && "Unexpected upsized type.");
- // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
- // into individual elements.
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue SubVector = NewLD.getValue(i);
- DAG.ExtractVectorElements(SubVector, ScalarRes);
- }
- } else {
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue Res = NewLD.getValue(i);
- if (NeedTrunc)
- Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
- ScalarRes.push_back(Res);
- }
- }
-
- SDValue LoadChain = NewLD.getValue(NumElts);
-
- SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
-
- Results.push_back(BuildVec);
- Results.push_back(LoadChain);
}
// Lower vector return type of tcgen05.ld intrinsics
>From 81b99cadf3afeffee4eb3f9fd3f5453dfb671113 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Wed, 5 Mar 2025 22:33:15 -0800
Subject: [PATCH 21/25] [NVPTX] update combiner rule for more types of loads
Handle more loads, including ones with multiple proxy registers:
- i64 = LOAD
- i64 = LoadParam
- v2f32,v2f32 = LoadParamV2
Also update the test cases. Because this is an optimization, it is not
triggered for some of these tests that compile with no optimizations.
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 130 +++++++++++-------
llvm/test/CodeGen/NVPTX/aggregate-return.ll | 4 +-
llvm/test/CodeGen/NVPTX/bf16-instructions.ll | 96 ++++++-------
llvm/test/CodeGen/NVPTX/f16x2-instructions.ll | 57 +++++---
llvm/test/CodeGen/NVPTX/f32x2-instructions.ll | 5 +-
llvm/test/CodeGen/NVPTX/vector-loads.ll | 48 +++----
llvm/test/CodeGen/NVPTX/vector-stores.ll | 2 +-
7 files changed, 191 insertions(+), 151 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7f1b43b2095a6..827cea3e1e21a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -4958,9 +4958,13 @@ PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
return SDValue();
}
+/// OverrideVT - allows overriding result and memory type
static std::optional<std::pair<SDValue, SDValue>>
-convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector) {
+convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector,
+ std::optional<EVT> OverrideVT = std::nullopt) {
EVT ResVT = N->getValueType(0);
+ if (OverrideVT)
+ ResVT = *OverrideVT;
SDLoc DL(N);
assert(ResVT.isVector() && "Vector load must have vector type");
@@ -4974,8 +4978,8 @@ convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector) {
Align Alignment = LD->getAlign();
auto &TD = DAG.getDataLayout();
- Align PrefAlign =
- TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
+ Align PrefAlign = TD.getPrefTypeAlign(
+ OverrideVT.value_or(LD->getMemoryVT()).getTypeForEVT(*DAG.getContext()));
if (Alignment < PrefAlign) {
// This load is not sufficiently aligned, so bail out and let this vector
// load be scalarized. Note that we may still be able to emit smaller
@@ -5020,7 +5024,8 @@ convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector) {
OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
SDValue NewLD = DAG.getMemIntrinsicNode(
- Opcode, DL, LdResVTs, OtherOps, LD->getMemoryVT(), LD->getMemOperand());
+ Opcode, DL, LdResVTs, OtherOps, OverrideVT.value_or(LD->getMemoryVT()),
+ LD->getMemOperand());
SDValue LoadChain = NewLD.getValue(NumElts);
@@ -5059,23 +5064,20 @@ convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector) {
static SDValue PerformLoadCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
auto *MemN = cast<MemSDNode>(N);
- EVT MemVT = MemN->getMemoryVT();
-
- // ignore volatile loads
- if (MemN->isVolatile())
- return SDValue();
-
// only operate on vectors of f32s / i64s
- if (!MemVT.isVector())
+ if (EVT MemVT = MemN->getMemoryVT();
+ !(MemVT == MVT::i64 ||
+ (MemVT.isVector() && (MemVT.getVectorElementType() == MVT::f32 ||
+ MemVT.getVectorElementType() == MVT::i64))))
return SDValue();
- EVT ElementVT = MemVT.getVectorElementType();
- if (!(ElementVT == MVT::f32 ||
- (ElementVT == MVT::i64 && N->getOpcode() != ISD::LOAD)))
- return SDValue();
+ const unsigned OrigNumResults =
+ llvm::count_if(N->values(), [](const auto &VT) {
+ return VT == MVT::i64 || VT == MVT::f32 || VT.isVector();
+ });
SmallDenseMap<SDNode *, unsigned> ExtractElts;
- SDNode *ProxyReg = nullptr;
+ SmallVector<SDNode *> ProxyRegs(OrigNumResults, nullptr);
SmallVector<std::pair<SDNode *, unsigned /*offset*/>> WorkList{{N, 0}};
while (!WorkList.empty()) {
auto [V, Offset] = WorkList.pop_back_val();
@@ -5088,8 +5090,14 @@ static SDValue PerformLoadCombine(SDNode *N,
SDNode *User = U.getUser();
if (User->getOpcode() == NVPTXISD::ProxyReg) {
+ Offset = U.getResNo() * 2;
+ SDNode *&ProxyReg = ProxyRegs[Offset / 2];
+
+ // We shouldn't have multiple proxy regs for the same value from the
+ // load, but bail out anyway since we don't handle this.
if (ProxyReg)
- return SDValue(); // bail out if we've seen a proxy reg?
+ return SDValue();
+
ProxyReg = User;
} else if (User->getOpcode() == ISD::BITCAST &&
User->getValueType(0) == MVT::v2f32 &&
@@ -5179,9 +5187,18 @@ static SDValue PerformLoadCombine(SDNode *N,
if (NewGlueIdx)
NewGlue = NewLoad.getValue(*NewGlueIdx);
} else if (N->getOpcode() == ISD::LOAD) { // rewrite a load
- if (auto Result = convertVectorLoad(N, DCI.DAG, /*BuildVector=*/false)) {
+ std::optional<EVT> CastToType;
+ EVT ResVT = N->getValueType(0);
+ if (ResVT == MVT::i64) {
+ // ld.b64 is treated as a vector by subsequent code
+ CastToType = MVT::v2f32;
+ }
+ if (auto Result =
+ convertVectorLoad(N, DCI.DAG, /*BuildVector=*/false, CastToType)) {
std::tie(NewLoad, NewChain) = *Result;
- NumElts = MemVT.getVectorNumElements();
+ NumElts =
+ CastToType.value_or(cast<MemSDNode>(NewLoad.getNode())->getMemoryVT())
+ .getVectorNumElements();
if (NewLoad->getValueType(NewLoad->getNumValues() - 1) == MVT::Glue)
NewGlue = NewLoad.getValue(NewLoad->getNumValues() - 1);
}
@@ -5193,54 +5210,65 @@ static SDValue PerformLoadCombine(SDNode *N,
// (3) begin rewriting uses
SmallVector<SDValue> NewOutputsF32;
- if (ProxyReg) {
- // scalarize proxyreg, but first rewrite all uses of chain and glue from the
- // old load to the new load
+ if (llvm::any_of(ProxyRegs, [](const SDNode *PR) { return PR != nullptr; })) {
+ // scalarize proxy regs, but first rewrite all uses of chain and glue from
+ // the old load to the new load
DCI.DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
DCI.DAG.ReplaceAllUsesOfValueWith(OldGlue, NewGlue);
- // Update the new chain and glue to be old inputs to the proxyreg, if they
- // came from an intervening instruction between this proxyreg and the
- // original load (ex: callseq_end). Other than bitcasts and extractelts, we
- // followed all other nodes by chain and glue accesses.
- if (SDValue OldInChain = ProxyReg->getOperand(0); OldInChain.getNode() != N)
+ for (unsigned ProxyI = 0, ProxyE = ProxyRegs.size(); ProxyI != ProxyE;
+ ++ProxyI) {
+ SDNode *ProxyReg = ProxyRegs[ProxyI];
+
+ // no proxy reg might mean this result is unused
+ if (!ProxyReg)
+ continue;
+
+ // Update the new chain and glue to be old inputs to the proxyreg, if they
+ // came from an intervening instruction between this proxyreg and the
+ // original load (ex: callseq_end). Other than bitcasts and extractelts,
+ // we followed all other nodes by chain and glue accesses.
+ if (SDValue OldInChain = ProxyReg->getOperand(0);
+ OldInChain.getNode() != N)
NewChain = OldInChain;
- if (SDValue OldInGlue = ProxyReg->getOperand(2); OldInGlue.getNode() != N)
+ if (SDValue OldInGlue = ProxyReg->getOperand(2); OldInGlue.getNode() != N)
NewGlue = OldInGlue;
- // update OldChain, OldGlue to the outputs of ProxyReg, which we will
- // replace later
- OldChain = SDValue(ProxyReg, 1);
- OldGlue = SDValue(ProxyReg, 2);
-
- // generate the scalar proxy regs
- for (unsigned I = 0, E = NumElts; I != E; ++I) {
- SDValue ProxyRegElem =
- DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(ProxyReg),
- DCI.DAG.getVTList(MVT::f32, MVT::Other, MVT::Glue),
- {NewChain, NewLoad.getValue(I), NewGlue});
- NewChain = ProxyRegElem.getValue(1);
- NewGlue = ProxyRegElem.getValue(2);
- NewOutputsF32.push_back(ProxyRegElem);
+ // update OldChain, OldGlue to the outputs of ProxyReg, which we will
+ // replace later
+ OldChain = SDValue(ProxyReg, 1);
+ OldGlue = SDValue(ProxyReg, 2);
+
+ // generate the scalar proxy regs
+ for (unsigned I = 0, E = 2; I != E; ++I) {
+ SDValue ProxyRegElem = DCI.DAG.getNode(
+ NVPTXISD::ProxyReg, SDLoc(ProxyReg),
+ DCI.DAG.getVTList(MVT::f32, MVT::Other, MVT::Glue),
+ {NewChain, NewLoad.getValue(ProxyI * 2 + I), NewGlue});
+ NewChain = ProxyRegElem.getValue(1);
+ NewGlue = ProxyRegElem.getValue(2);
+ NewOutputsF32.push_back(ProxyRegElem);
+ }
+
+ // replace all uses of the glue and chain from the old proxy reg
+ DCI.DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
+ DCI.DAG.ReplaceAllUsesOfValueWith(OldGlue, NewGlue);
}
} else {
for (unsigned I = 0, E = NumElts; I != E; ++I)
if (NewLoad->getValueType(I) == MVT::f32)
NewOutputsF32.push_back(NewLoad.getValue(I));
+
+ // replace all glue and chain nodes
+ DCI.DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
+ if (OldGlue)
+ DCI.DAG.ReplaceAllUsesOfValueWith(OldGlue, NewGlue);
}
- // now, for all extractelts, replace them with one of the new outputs
+ // replace all extractelts with the new outputs
for (auto &[Extract, Index] : ExtractElts)
DCI.CombineTo(Extract, NewOutputsF32[Index], false);
- // now replace all glue and chain nodes
- DCI.DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
- if (OldGlue)
- DCI.DAG.ReplaceAllUsesOfValueWith(OldGlue, NewGlue);
-
- // cleanup
- if (ProxyReg)
- DCI.recursivelyDeleteUnusedNodes(ProxyReg);
return SDValue();
}
diff --git a/llvm/test/CodeGen/NVPTX/aggregate-return.ll b/llvm/test/CodeGen/NVPTX/aggregate-return.ll
index cda7d38ccb0b7..4212f18378856 100644
--- a/llvm/test/CodeGen/NVPTX/aggregate-return.ll
+++ b/llvm/test/CodeGen/NVPTX/aggregate-return.ll
@@ -27,9 +27,7 @@ define void @test_v3f32(<3 x float> %input, ptr %output) {
; CHECK-NOT: ld.param.f32 [[E3:%f[0-9]+]], [retval0+12];
store <3 x float> %call, ptr %output, align 8
; CHECK-DAG: st.f32 [{{%rd[0-9]}}+8],
-; -- This is suboptimal. We should do st.v2.f32 instead
-; of combining 2xf32 info i64.
-; CHECK-DAG: st.u64 [{{%rd[0-9]}}],
+; CHECK-DAG: st.v2.f32 [{{%rd[0-9]}}], {[[E0]], [[E1]]}
; CHECK: ret;
ret void
}
diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
index 0c1b1e2166928..1e7eb4c5a780f 100644
--- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -762,32 +762,32 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
; SM70-NEXT: // %bb.0:
; SM70-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
; SM70-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r1;
-; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r3;
-; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r4;
-; SM70-NEXT: cvt.u32.u16 %r5, %rs8;
+; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; SM70-NEXT: cvt.u32.u16 %r5, %rs2;
; SM70-NEXT: shl.b32 %r6, %r5, 16;
; SM70-NEXT: mov.b32 %f1, %r6;
-; SM70-NEXT: cvt.u32.u16 %r7, %rs7;
+; SM70-NEXT: cvt.u32.u16 %r7, %rs1;
; SM70-NEXT: shl.b32 %r8, %r7, 16;
; SM70-NEXT: mov.b32 %f2, %r8;
-; SM70-NEXT: cvt.u32.u16 %r9, %rs6;
+; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r3;
+; SM70-NEXT: cvt.u32.u16 %r9, %rs4;
; SM70-NEXT: shl.b32 %r10, %r9, 16;
; SM70-NEXT: mov.b32 %f3, %r10;
-; SM70-NEXT: cvt.u32.u16 %r11, %rs5;
+; SM70-NEXT: cvt.u32.u16 %r11, %rs3;
; SM70-NEXT: shl.b32 %r12, %r11, 16;
; SM70-NEXT: mov.b32 %f4, %r12;
-; SM70-NEXT: cvt.u32.u16 %r13, %rs4;
+; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r2;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs6;
; SM70-NEXT: shl.b32 %r14, %r13, 16;
; SM70-NEXT: mov.b32 %f5, %r14;
-; SM70-NEXT: cvt.u32.u16 %r15, %rs3;
+; SM70-NEXT: cvt.u32.u16 %r15, %rs5;
; SM70-NEXT: shl.b32 %r16, %r15, 16;
; SM70-NEXT: mov.b32 %f6, %r16;
-; SM70-NEXT: cvt.u32.u16 %r17, %rs2;
+; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r1;
+; SM70-NEXT: cvt.u32.u16 %r17, %rs8;
; SM70-NEXT: shl.b32 %r18, %r17, 16;
; SM70-NEXT: mov.b32 %f7, %r18;
-; SM70-NEXT: cvt.u32.u16 %r19, %rs1;
+; SM70-NEXT: cvt.u32.u16 %r19, %rs7;
; SM70-NEXT: shl.b32 %r20, %r19, 16;
; SM70-NEXT: mov.b32 %f8, %r20;
; SM70-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
@@ -804,18 +804,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
; SM80-NEXT: // %bb.0:
; SM80-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
; SM80-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r1;
-; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r3;
-; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r4;
-; SM80-NEXT: cvt.f32.bf16 %f1, %rs8;
-; SM80-NEXT: cvt.f32.bf16 %f2, %rs7;
-; SM80-NEXT: cvt.f32.bf16 %f3, %rs6;
-; SM80-NEXT: cvt.f32.bf16 %f4, %rs5;
-; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
-; SM80-NEXT: cvt.f32.bf16 %f6, %rs3;
-; SM80-NEXT: cvt.f32.bf16 %f7, %rs2;
-; SM80-NEXT: cvt.f32.bf16 %f8, %rs1;
+; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; SM80-NEXT: cvt.f32.bf16 %f1, %rs2;
+; SM80-NEXT: cvt.f32.bf16 %f2, %rs1;
+; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r3;
+; SM80-NEXT: cvt.f32.bf16 %f3, %rs4;
+; SM80-NEXT: cvt.f32.bf16 %f4, %rs3;
+; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r2;
+; SM80-NEXT: cvt.f32.bf16 %f5, %rs6;
+; SM80-NEXT: cvt.f32.bf16 %f6, %rs5;
+; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r1;
+; SM80-NEXT: cvt.f32.bf16 %f7, %rs8;
+; SM80-NEXT: cvt.f32.bf16 %f8, %rs7;
; SM80-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
; SM80-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
; SM80-NEXT: ret;
@@ -830,18 +830,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
; SM80-FTZ-NEXT: // %bb.0:
; SM80-FTZ-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
; SM80-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r1;
-; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r3;
-; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r4;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs8;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs7;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f3, %rs6;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs5;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f6, %rs3;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f7, %rs2;
-; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f8, %rs1;
+; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2;
+; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs1;
+; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r3;
+; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f3, %rs4;
+; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs3;
+; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r2;
+; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs6;
+; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f6, %rs5;
+; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r1;
+; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f7, %rs8;
+; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f8, %rs7;
; SM80-FTZ-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
; SM80-FTZ-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
; SM80-FTZ-NEXT: ret;
@@ -856,18 +856,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
-; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r1;
-; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r3;
-; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r4;
-; SM90-NEXT: cvt.f32.bf16 %f1, %rs8;
-; SM90-NEXT: cvt.f32.bf16 %f2, %rs7;
-; SM90-NEXT: cvt.f32.bf16 %f3, %rs6;
-; SM90-NEXT: cvt.f32.bf16 %f4, %rs5;
-; SM90-NEXT: cvt.f32.bf16 %f5, %rs4;
-; SM90-NEXT: cvt.f32.bf16 %f6, %rs3;
-; SM90-NEXT: cvt.f32.bf16 %f7, %rs2;
-; SM90-NEXT: cvt.f32.bf16 %f8, %rs1;
+; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; SM90-NEXT: cvt.f32.bf16 %f1, %rs2;
+; SM90-NEXT: cvt.f32.bf16 %f2, %rs1;
+; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r3;
+; SM90-NEXT: cvt.f32.bf16 %f3, %rs4;
+; SM90-NEXT: cvt.f32.bf16 %f4, %rs3;
+; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r2;
+; SM90-NEXT: cvt.f32.bf16 %f5, %rs6;
+; SM90-NEXT: cvt.f32.bf16 %f6, %rs5;
+; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r1;
+; SM90-NEXT: cvt.f32.bf16 %f7, %rs8;
+; SM90-NEXT: cvt.f32.bf16 %f8, %rs7;
; SM90-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
; SM90-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
; SM90-NEXT: ret;
diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
index 1905fec8ab7a8..2a8898a364eea 100644
--- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -624,15 +624,18 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
; CHECK-F16-NEXT: .reg .pred %p<3>;
; CHECK-F16-NEXT: .reg .b32 %r<3>;
; CHECK-F16-NEXT: .reg .f32 %f<7>;
+; CHECK-F16-NEXT: .reg .b64 %rd<3>;
; CHECK-F16-EMPTY:
; CHECK-F16-NEXT: // %bb.0:
-; CHECK-F16-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f32_f16_param_1];
-; CHECK-F16-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_f16_param_0];
; CHECK-F16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3];
; CHECK-F16-NEXT: ld.param.b32 %r1, [test_select_cc_f32_f16_param_2];
+; CHECK-F16-NEXT: ld.param.b64 %rd2, [test_select_cc_f32_f16_param_1];
+; CHECK-F16-NEXT: ld.param.b64 %rd1, [test_select_cc_f32_f16_param_0];
; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r1, %r2;
-; CHECK-F16-NEXT: selp.f32 %f5, %f2, %f4, %p2;
-; CHECK-F16-NEXT: selp.f32 %f6, %f1, %f3, %p1;
+; CHECK-F16-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-F16-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-F16-NEXT: selp.f32 %f5, %f4, %f2, %p2;
+; CHECK-F16-NEXT: selp.f32 %f6, %f3, %f1, %p1;
; CHECK-F16-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
; CHECK-F16-NEXT: ret;
;
@@ -642,22 +645,25 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
; CHECK-NOF16-NEXT: .reg .b16 %rs<5>;
; CHECK-NOF16-NEXT: .reg .b32 %r<3>;
; CHECK-NOF16-NEXT: .reg .f32 %f<11>;
+; CHECK-NOF16-NEXT: .reg .b64 %rd<3>;
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
-; CHECK-NOF16-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f32_f16_param_1];
-; CHECK-NOF16-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_f16_param_0];
; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3];
; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_select_cc_f32_f16_param_2];
+; CHECK-NOF16-NEXT: ld.param.b64 %rd2, [test_select_cc_f32_f16_param_1];
+; CHECK-NOF16-NEXT: ld.param.b64 %rd1, [test_select_cc_f32_f16_param_0];
; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %f5, %rs1;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs1;
; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %f6, %rs3;
-; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %f6, %f5;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %f7, %rs2;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %f8, %rs4;
-; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %f8, %f7;
-; CHECK-NOF16-NEXT: selp.f32 %f9, %f2, %f4, %p2;
-; CHECK-NOF16-NEXT: selp.f32 %f10, %f1, %f3, %p1;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs3;
+; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %f2, %f1;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f3, %rs2;
+; CHECK-NOF16-NEXT: cvt.f32.f16 %f4, %rs4;
+; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %f4, %f3;
+; CHECK-NOF16-NEXT: mov.b64 {%f5, %f6}, %rd2;
+; CHECK-NOF16-NEXT: mov.b64 {%f7, %f8}, %rd1;
+; CHECK-NOF16-NEXT: selp.f32 %f9, %f8, %f6, %p2;
+; CHECK-NOF16-NEXT: selp.f32 %f10, %f7, %f5, %p1;
; CHECK-NOF16-NEXT: st.param.v2.f32 [func_retval0], {%f10, %f9};
; CHECK-NOF16-NEXT: ret;
<2 x half> %c, <2 x half> %d) #0 {
@@ -673,14 +679,17 @@ define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b,
; CHECK-NEXT: .reg .b16 %rs<7>;
; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f16_f32_param_3];
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f16_f32_param_2];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_f16_f32_param_3];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_f16_f32_param_2];
; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_f16_f32_param_1];
; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_f16_f32_param_0];
-; CHECK-NEXT: setp.neu.f32 %p1, %f1, %f3;
-; CHECK-NEXT: setp.neu.f32 %p2, %f2, %f4;
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd2;
+; CHECK-NEXT: mov.b64 {%f3, %f4}, %rd1;
+; CHECK-NEXT: setp.neu.f32 %p1, %f3, %f1;
+; CHECK-NEXT: setp.neu.f32 %p2, %f4, %f2;
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1;
; CHECK-NEXT: selp.b16 %rs5, %rs4, %rs2, %p2;
@@ -1546,9 +1555,11 @@ define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fptrunc_2xfloat_param_0];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_fptrunc_2xfloat_param_0];
+; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd1;
; CHECK-NEXT: cvt.rn.f16.f32 %rs1, %f2;
; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %f1;
; CHECK-NEXT: mov.b32 %r1, {%rs2, %rs1};
@@ -1985,10 +1996,12 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 {
; CHECK-F16-NEXT: .reg .b16 %rs<3>;
; CHECK-F16-NEXT: .reg .b32 %r<6>;
; CHECK-F16-NEXT: .reg .f32 %f<3>;
+; CHECK-F16-NEXT: .reg .b64 %rd<2>;
; CHECK-F16-EMPTY:
; CHECK-F16-NEXT: // %bb.0:
-; CHECK-F16-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_copysign_f32_param_1];
+; CHECK-F16-NEXT: ld.param.b64 %rd1, [test_copysign_f32_param_1];
; CHECK-F16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0];
+; CHECK-F16-NEXT: mov.b64 {%f1, %f2}, %rd1;
; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs1, %f2;
; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs2, %f1;
; CHECK-F16-NEXT: mov.b32 %r2, {%rs2, %rs1};
@@ -2003,10 +2016,12 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 {
; CHECK-NOF16-NEXT: .reg .b16 %rs<9>;
; CHECK-NOF16-NEXT: .reg .b32 %r<7>;
; CHECK-NOF16-NEXT: .reg .f32 %f<3>;
+; CHECK-NOF16-NEXT: .reg .b64 %rd<2>;
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
-; CHECK-NOF16-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_copysign_f32_param_1];
+; CHECK-NOF16-NEXT: ld.param.b64 %rd1, [test_copysign_f32_param_1];
; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0];
+; CHECK-NOF16-NEXT: mov.b64 {%f1, %f2}, %rd1;
; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1;
; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, 32767;
; CHECK-NOF16-NEXT: mov.b32 %r2, %f2;
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
index 1f21740ba589e..e084fc34e92f7 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
@@ -513,13 +513,12 @@ define void @test_ldst_v2f32(ptr %a, ptr %b) #0 {
; CHECK-LABEL: test_ldst_v2f32(
; CHECK: {
; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v2f32_param_1];
; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v2f32_param_0];
-; CHECK-NEXT: ld.b64 %rd3, [%rd1];
-; CHECK-NEXT: mov.b64 {%f1, %f2}, %rd3;
+; CHECK-NEXT: ld.v2.f32 {%f1, %f2}, [%rd1];
; CHECK-NEXT: st.v2.f32 [%rd2], {%f1, %f2};
; CHECK-NEXT: ret;
%t1 = load <2 x float>, ptr %a
diff --git a/llvm/test/CodeGen/NVPTX/vector-loads.ll b/llvm/test/CodeGen/NVPTX/vector-loads.ll
index d731985ae9710..4c65cdbe25ad9 100644
--- a/llvm/test/CodeGen/NVPTX/vector-loads.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-loads.ll
@@ -101,18 +101,18 @@ define void @foo_complex(ptr nocapture readonly align 16 dereferenceable(1342177
define void @extv8f16_global_a16(ptr addrspace(1) noalias readonly align 16 %dst, ptr addrspace(1) noalias readonly align 16 %src) #0 {
; CHECK: ld.global.v4.b32 {%r
%v = load <8 x half>, ptr addrspace(1) %src, align 16
-; CHECK: mov.b32 {%rs
-; CHECK: mov.b32 {%rs
-; CHECK: mov.b32 {%rs
-; CHECK: mov.b32 {%rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
+; CHECK-DAG: mov.b32 {%[[RS0:rs[0-9]+]], %[[RS1:rs[0-9]+]]}
+; CHECK-DAG: mov.b32 {%[[RS2:rs[0-9]+]], %[[RS3:rs[0-9]+]]}
+; CHECK-DAG: mov.b32 {%[[RS4:rs[0-9]+]], %[[RS5:rs[0-9]+]]}
+; CHECK-DAG: mov.b32 {%[[RS6:rs[0-9]+]], %[[RS7:rs[0-9]+]]}
+; CHECK-DAG: cvt.f32.f16 %f{{.*}}, %[[RS0]]
+; CHECK-DAG: cvt.f32.f16 %f{{.*}}, %[[RS1]]
+; CHECK-DAG: cvt.f32.f16 %f{{.*}}, %[[RS2]]
+; CHECK-DAG: cvt.f32.f16 %f{{.*}}, %[[RS3]]
+; CHECK-DAG: cvt.f32.f16 %f{{.*}}, %[[RS4]]
+; CHECK-DAG: cvt.f32.f16 %f{{.*}}, %[[RS5]]
+; CHECK-DAG: cvt.f32.f16 %f{{.*}}, %[[RS6]]
+; CHECK-DAG: cvt.f32.f16 %f{{.*}}, %[[RS7]]
%ext = fpext <8 x half> %v to <8 x float>
; CHECK: st.global.v4.f32
; CHECK: st.global.v4.f32
@@ -151,18 +151,18 @@ define void @extv8f16_global_a4(ptr addrspace(1) noalias readonly align 16 %dst,
define void @extv8f16_generic_a16(ptr noalias readonly align 16 %dst, ptr noalias readonly align 16 %src) #0 {
; CHECK: ld.v4.b32 {%r
%v = load <8 x half>, ptr %src, align 16
-; CHECK: mov.b32 {%rs
-; CHECK: mov.b32 {%rs
-; CHECK: mov.b32 {%rs
-; CHECK: mov.b32 {%rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
-; CHECK: cvt.f32.f16 %f{{.*}}, %rs
+; CHECK-DAG: mov.b32 {%[[RS0:rs[0-9]+]], %[[RS1:rs[0-9]+]]}
+; CHECK-DAG: mov.b32 {%[[RS2:rs[0-9]+]], %[[RS3:rs[0-9]+]]}
+; CHECK-DAG: mov.b32 {%[[RS4:rs[0-9]+]], %[[RS5:rs[0-9]+]]}
+; CHECK-DAG: mov.b32 {%[[RS6:rs[0-9]+]], %[[RS7:rs[0-9]+]]}
+; CHECK-DAG: cvt.f32.f16 %f{{.*}}, %[[RS0]]
+; CHECK-DAG: cvt.f32.f16 %f{{.*}}, %[[RS1]]
+; CHECK-DAG: cvt.f32.f16 %f{{.*}}, %[[RS2]]
+; CHECK-DAG: cvt.f32.f16 %f{{.*}}, %[[RS3]]
+; CHECK-DAG: cvt.f32.f16 %f{{.*}}, %[[RS4]]
+; CHECK-DAG: cvt.f32.f16 %f{{.*}}, %[[RS5]]
+; CHECK-DAG: cvt.f32.f16 %f{{.*}}, %[[RS6]]
+; CHECK-DAG: cvt.f32.f16 %f{{.*}}, %[[RS7]]
%ext = fpext <8 x half> %v to <8 x float>
; CHECK: st.v4.f32
; CHECK: st.v4.f32
diff --git a/llvm/test/CodeGen/NVPTX/vector-stores.ll b/llvm/test/CodeGen/NVPTX/vector-stores.ll
index cbcaf5fc3822e..b1cb23ea8e672 100644
--- a/llvm/test/CodeGen/NVPTX/vector-stores.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-stores.ll
@@ -2,7 +2,7 @@
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
; CHECK-LABEL: .visible .func foo1
-; CHECK: st.v2.f32
+; CHECK: st.u64
define void @foo1(<2 x float> %val, ptr %ptr) {
store <2 x float> %val, ptr %ptr
ret void
>From 418691ad96d60ec98c98e89cbf2c99e50667895f Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Thu, 6 Mar 2025 15:26:37 -0800
Subject: [PATCH 22/25] [NVPTX] support generic LDG/LDU for packed data types
Support ld.global.nc.b64/ldu.global.b64 for v2f32 and
ld.global.nc.b32/ldu.global.b32 for v2f16/v2bf16/v2i16/v4i8
Update test cases.
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 110 +++++++++++-------
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 4 +
llvm/test/CodeGen/NVPTX/ldg-invariant.ll | 2 +-
llvm/test/CodeGen/NVPTX/ldu-ldg.ll | 34 +++++-
.../NVPTX/load-with-non-coherent-cache.ll | 4 +-
.../NVPTX/read-global-variable-constant.ll | 2 +-
6 files changed, 106 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 4f19b4817fc83..862ef42f6fa78 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1276,6 +1276,9 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
EVT OrigType = N->getValueType(0);
EVT EltVT = Mem->getMemoryVT();
unsigned NumElts = 1;
+
+ std::optional<unsigned> Opcode;
+
if (EltVT.isVector()) {
NumElts = EltVT.getVectorNumElements();
EltVT = EltVT.getVectorElementType();
@@ -1288,6 +1291,24 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
(EltVT == MVT::i8 && OrigType == MVT::v4i8)) {
assert(NumElts % OrigType.getVectorNumElements() == 0 &&
"NumElts must be divisible by the number of elts in subvectors");
+ if (N->getOpcode() == ISD::LOAD ||
+ N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
+ switch (OrigType.getSimpleVT().SimpleTy) {
+ case MVT::v2f32:
+ Opcode = N->getOpcode() == ISD::LOAD ? NVPTX::INT_PTX_LDG_GLOBAL_b64
+ : NVPTX::INT_PTX_LDU_GLOBAL_b64;
+ break;
+ case MVT::v2f16:
+ case MVT::v2bf16:
+ case MVT::v2i16:
+ case MVT::v4i8:
+ Opcode = N->getOpcode() == ISD::LOAD ? NVPTX::INT_PTX_LDG_GLOBAL_b32
+ : NVPTX::INT_PTX_LDU_GLOBAL_b32;
+ break;
+ default:
+ llvm_unreachable("Unhandled packed vector type");
+ }
+ }
EltVT = OrigType;
NumElts /= OrigType.getVectorNumElements();
}
@@ -1309,50 +1330,51 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
SelectADDR(Op1, Base, Offset);
SDValue Ops[] = {Base, Offset, Chain};
- std::optional<unsigned> Opcode;
- switch (N->getOpcode()) {
- default:
- return false;
- case ISD::LOAD:
- Opcode = pickOpcodeForVT(
- EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8,
- NVPTX::INT_PTX_LDG_GLOBAL_i16, NVPTX::INT_PTX_LDG_GLOBAL_i32,
- NVPTX::INT_PTX_LDG_GLOBAL_i64, NVPTX::INT_PTX_LDG_GLOBAL_f32,
- NVPTX::INT_PTX_LDG_GLOBAL_f64);
- break;
- case ISD::INTRINSIC_W_CHAIN:
- Opcode = pickOpcodeForVT(
- EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8,
- NVPTX::INT_PTX_LDU_GLOBAL_i16, NVPTX::INT_PTX_LDU_GLOBAL_i32,
- NVPTX::INT_PTX_LDU_GLOBAL_i64, NVPTX::INT_PTX_LDU_GLOBAL_f32,
- NVPTX::INT_PTX_LDU_GLOBAL_f64);
- break;
- case NVPTXISD::LoadV2:
- Opcode = pickOpcodeForVT(
- EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v2i8_ELE,
- NVPTX::INT_PTX_LDG_G_v2i16_ELE, NVPTX::INT_PTX_LDG_G_v2i32_ELE,
- NVPTX::INT_PTX_LDG_G_v2i64_ELE, NVPTX::INT_PTX_LDG_G_v2f32_ELE,
- NVPTX::INT_PTX_LDG_G_v2f64_ELE);
- break;
- case NVPTXISD::LDUV2:
- Opcode = pickOpcodeForVT(
- EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v2i8_ELE,
- NVPTX::INT_PTX_LDU_G_v2i16_ELE, NVPTX::INT_PTX_LDU_G_v2i32_ELE,
- NVPTX::INT_PTX_LDU_G_v2i64_ELE, NVPTX::INT_PTX_LDU_G_v2f32_ELE,
- NVPTX::INT_PTX_LDU_G_v2f64_ELE);
- break;
- case NVPTXISD::LoadV4:
- Opcode = pickOpcodeForVT(
- EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE,
- NVPTX::INT_PTX_LDG_G_v4i16_ELE, NVPTX::INT_PTX_LDG_G_v4i32_ELE,
- std::nullopt, NVPTX::INT_PTX_LDG_G_v4f32_ELE, std::nullopt);
- break;
- case NVPTXISD::LDUV4:
- Opcode = pickOpcodeForVT(
- EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE,
- NVPTX::INT_PTX_LDU_G_v4i16_ELE, NVPTX::INT_PTX_LDU_G_v4i32_ELE,
- std::nullopt, NVPTX::INT_PTX_LDU_G_v4f32_ELE, std::nullopt);
- break;
+ if (!Opcode) {
+ switch (N->getOpcode()) {
+ default:
+ return false;
+ case ISD::LOAD:
+ Opcode = pickOpcodeForVT(
+ EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8,
+ NVPTX::INT_PTX_LDG_GLOBAL_i16, NVPTX::INT_PTX_LDG_GLOBAL_i32,
+ NVPTX::INT_PTX_LDG_GLOBAL_i64, NVPTX::INT_PTX_LDG_GLOBAL_f32,
+ NVPTX::INT_PTX_LDG_GLOBAL_f64);
+ break;
+ case ISD::INTRINSIC_W_CHAIN:
+ Opcode = pickOpcodeForVT(
+ EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8,
+ NVPTX::INT_PTX_LDU_GLOBAL_i16, NVPTX::INT_PTX_LDU_GLOBAL_i32,
+ NVPTX::INT_PTX_LDU_GLOBAL_i64, NVPTX::INT_PTX_LDU_GLOBAL_f32,
+ NVPTX::INT_PTX_LDU_GLOBAL_f64);
+ break;
+ case NVPTXISD::LoadV2:
+ Opcode = pickOpcodeForVT(
+ EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v2i8_ELE,
+ NVPTX::INT_PTX_LDG_G_v2i16_ELE, NVPTX::INT_PTX_LDG_G_v2i32_ELE,
+ NVPTX::INT_PTX_LDG_G_v2i64_ELE, NVPTX::INT_PTX_LDG_G_v2f32_ELE,
+ NVPTX::INT_PTX_LDG_G_v2f64_ELE);
+ break;
+ case NVPTXISD::LDUV2:
+ Opcode = pickOpcodeForVT(
+ EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v2i8_ELE,
+ NVPTX::INT_PTX_LDU_G_v2i16_ELE, NVPTX::INT_PTX_LDU_G_v2i32_ELE,
+ NVPTX::INT_PTX_LDU_G_v2i64_ELE, NVPTX::INT_PTX_LDU_G_v2f32_ELE,
+ NVPTX::INT_PTX_LDU_G_v2f64_ELE);
+ break;
+ case NVPTXISD::LoadV4:
+ Opcode = pickOpcodeForVT(
+ EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE,
+ NVPTX::INT_PTX_LDG_G_v4i16_ELE, NVPTX::INT_PTX_LDG_G_v4i32_ELE,
+ std::nullopt, NVPTX::INT_PTX_LDG_G_v4f32_ELE, std::nullopt);
+ break;
+ case NVPTXISD::LDUV4:
+ Opcode = pickOpcodeForVT(
+ EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE,
+ NVPTX::INT_PTX_LDU_G_v4i16_ELE, NVPTX::INT_PTX_LDU_G_v4i32_ELE,
+ std::nullopt, NVPTX::INT_PTX_LDU_G_v4f32_ELE, std::nullopt);
+ break;
+ }
}
if (!Opcode)
return false;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index f6150ee9db26e..f6a0861a01026 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2702,7 +2702,9 @@ class LDU_G<string TyStr, NVPTXRegClass regclass>
def INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8", Int16Regs>;
def INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16", Int16Regs>;
def INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32", Int32Regs>;
+def INT_PTX_LDU_GLOBAL_b32 : LDU_G<"b32", Int32Regs>;
def INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64", Int64Regs>;
+def INT_PTX_LDU_GLOBAL_b64 : LDU_G<"b64", Int64Regs>;
def INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32", Float32Regs>;
def INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64", Float64Regs>;
@@ -2752,7 +2754,9 @@ class LDG_G<string TyStr, NVPTXRegClass regclass>
def INT_PTX_LDG_GLOBAL_i8 : LDG_G<"u8", Int16Regs>;
def INT_PTX_LDG_GLOBAL_i16 : LDG_G<"u16", Int16Regs>;
def INT_PTX_LDG_GLOBAL_i32 : LDG_G<"u32", Int32Regs>;
+def INT_PTX_LDG_GLOBAL_b32 : LDG_G<"b32", Int32Regs>;
def INT_PTX_LDG_GLOBAL_i64 : LDG_G<"u64", Int64Regs>;
+def INT_PTX_LDG_GLOBAL_b64 : LDG_G<"b64", Int64Regs>;
def INT_PTX_LDG_GLOBAL_f32 : LDG_G<"f32", Float32Regs>;
def INT_PTX_LDG_GLOBAL_f64 : LDG_G<"f64", Float64Regs>;
diff --git a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
index 16a0189e784bd..f68b281f875d9 100644
--- a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
+++ b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
@@ -32,7 +32,7 @@ define half @ld_global_v2f16(ptr addrspace(1) %ptr) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [ld_global_v2f16_param_0];
-; CHECK-NEXT: ld.global.nc.u32 %r1, [%rd1];
+; CHECK-NEXT: ld.global.nc.b32 %r1, [%rd1];
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
; CHECK-NEXT: cvt.f32.f16 %f1, %rs2;
; CHECK-NEXT: cvt.f32.f16 %f2, %rs1;
diff --git a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
index 4c5c44a9bf44d..3f760649746a8 100644
--- a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
+++ b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
@@ -12,6 +12,7 @@ declare float @llvm.nvvm.ldu.global.f.f32.p1(ptr addrspace(1) %ptr, i32 %align)
declare double @llvm.nvvm.ldu.global.f.f64.p1(ptr addrspace(1) %ptr, i32 %align)
declare half @llvm.nvvm.ldu.global.f.f16.p1(ptr addrspace(1) %ptr, i32 %align)
declare <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 %align)
+declare <2 x float> @llvm.nvvm.ldu.global.f.v2f32.p1(ptr addrspace(1) %ptr, i32 %align)
declare i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 %align)
declare i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 %align)
@@ -22,6 +23,7 @@ declare float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) %ptr, i32 %align)
declare double @llvm.nvvm.ldg.global.f.f64.p1(ptr addrspace(1) %ptr, i32 %align)
declare half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 %align)
declare <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 %align)
+declare <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p1(ptr addrspace(1) %ptr, i32 %align)
define i8 @test_ldu_i8(ptr addrspace(1) %ptr) {
; CHECK-LABEL: test_ldu_i8(
@@ -154,13 +156,27 @@ define <2 x half> @test_ldu_v2f16(ptr addrspace(1) %ptr) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [test_ldu_v2f16_param_0];
-; CHECK-NEXT: ldu.global.u32 %r1, [%rd1];
+; CHECK-NEXT: ldu.global.b32 %r1, [%rd1];
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
%val = tail call <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4)
ret <2 x half> %val
}
+define <2 x float> @test_ldu_v2f32(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: test_ldu_v2f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldu_v2f32_param_0];
+; CHECK-NEXT: ldu.global.b64 %rd2, [%rd1];
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
+ %val = tail call <2 x float> @llvm.nvvm.ldu.global.f.v2f32.p1(ptr addrspace(1) %ptr, i32 8)
+ ret <2 x float> %val
+}
+
define i8 @test_ldg_i8(ptr addrspace(1) %ptr) {
; CHECK-LABEL: test_ldg_i8(
; CHECK: {
@@ -291,13 +307,27 @@ define <2 x half> @test_ldg_v2f16(ptr addrspace(1) %ptr) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [test_ldg_v2f16_param_0];
-; CHECK-NEXT: ld.global.nc.u32 %r1, [%rd1];
+; CHECK-NEXT: ld.global.nc.b32 %r1, [%rd1];
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
%val = tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4)
ret <2 x half> %val
}
+define <2 x float> @test_ldg_v2f32(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: test_ldg_v2f32(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [test_ldg_v2f32_param_0];
+; CHECK-NEXT: ld.global.nc.b64 %rd2, [%rd1];
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
+ %val = tail call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p1(ptr addrspace(1) %ptr, i32 8)
+ ret <2 x float> %val
+}
+
@g = addrspace(1) global i32 0
define i32 @test_ldg_asi() {
diff --git a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
index dc1917f3b1507..194c043ef257c 100644
--- a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
+++ b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
@@ -82,7 +82,7 @@ define ptx_kernel void @foo7(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: .visible .entry foo8(
; SM20: ld.global.u32
; SM35-LABEL: .visible .entry foo8(
-; SM35: ld.global.nc.u32
+; SM35: ld.global.nc.b32
define ptx_kernel void @foo8(ptr noalias readonly %from, ptr %to) {
%1 = load <2 x i16>, ptr %from
store <2 x i16> %1, ptr %to
@@ -132,7 +132,7 @@ define ptx_kernel void @foo12(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: .visible .entry foo13(
; SM20: ld.global.u32
; SM35-LABEL: .visible .entry foo13(
-; SM35: ld.global.nc.u32
+; SM35: ld.global.nc.b32
define ptx_kernel void @foo13(ptr noalias readonly %from, ptr %to) {
%1 = load <4 x i8>, ptr %from
store <4 x i8> %1, ptr %to
diff --git a/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll b/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll
index 1d14be9070b07..fd74fc9c76387 100644
--- a/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll
+++ b/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll
@@ -17,7 +17,7 @@ define float @test_gv_float() {
; CHECK-LABEL: test_gv_float2()
define <2 x float> @test_gv_float2() {
-; CHECK: ld.global.nc.v2.f32
+; CHECK: ld.global.nc.b64
%v = load <2 x float>, ptr @gv_float2
ret <2 x float> %v
}
>From 5cda51fb827a79bd34baaab24ad33fdc09e411d2 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Thu, 6 Mar 2025 20:16:03 -0800
Subject: [PATCH 23/25] [NVPTX] fold v2f32 = bitcast (i64,i64,... =
NVPTXISD::Load*)
Fold i64->v2f32 bitcasts on the results of a NVPTXISD::Load* op.
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 12 +--
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 59 +++++++++++++-
llvm/test/CodeGen/NVPTX/f32x2-instructions.ll | 80 +++++++++----------
llvm/test/CodeGen/NVPTX/vec-param-load.ll | 32 ++++----
4 files changed, 121 insertions(+), 62 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 862ef42f6fa78..c923faa7482e6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1165,7 +1165,9 @@ static bool isVectorElementTypeUpsized(EVT EltVT) {
// In order to load/store such vectors efficiently, in Type Legalization
// we split the vector into word-sized chunks (v2x16/v4i8). Now, we will
// lower to PTX as vectors of b32.
- return Isv2x16VT(EltVT) || EltVT == MVT::v4i8;
+ // We also consider v2f32 as an upsized type, which may be used in packed
+ // (f32x2) instructions.
+ return Isv2x16VT(EltVT) || EltVT == MVT::v4i8 || EltVT == MVT::v2f32;
}
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
@@ -1221,9 +1223,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
EVT EltVT = N->getValueType(0);
if (isVectorElementTypeUpsized(EltVT)) {
- EltVT = MVT::i32;
+ FromTypeWidth = EltVT.getSizeInBits();
+ EltVT = MVT::getIntegerVT(FromTypeWidth);
FromType = NVPTX::PTXLdStInstCode::Untyped;
- FromTypeWidth = 32;
}
SDValue Offset, Base;
@@ -1548,9 +1550,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
}
if (isVectorElementTypeUpsized(EltVT)) {
- EltVT = MVT::i32;
+ ToTypeWidth = EltVT.getSizeInBits();
+ EltVT = MVT::getIntegerVT(ToTypeWidth);
ToType = NVPTX::PTXLdStInstCode::Untyped;
- ToTypeWidth = 32;
}
SDValue Offset, Base;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 827cea3e1e21a..b51bac8c22317 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -828,7 +828,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, ISD::VSELECT,
ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::FP_ROUND,
- ISD::TRUNCATE, ISD::LOAD});
+ ISD::TRUNCATE, ISD::LOAD, ISD::BITCAST});
// setcc for f16x2 and bf16x2 needs special handling to prevent
// legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -6031,6 +6031,61 @@ static SDValue PerformTRUNCATECombine(SDNode *N,
return SDValue();
}
+static SDValue PerformBITCASTCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ if (N->getValueType(0) != MVT::v2f32)
+ return SDValue();
+
+ SDValue Operand = N->getOperand(0);
+ if (Operand.getValueType() != MVT::i64)
+ return SDValue();
+
+ // DAGCombiner handles bitcast(ISD::LOAD) already. For these, we'll do the
+ // same thing, by changing their output values from i64 to v2f32. Then the
+ // rule for combining loads (see PerformLoadCombine) may split these loads
+ // further.
+ if (Operand.getOpcode() == NVPTXISD::LoadV2 ||
+ Operand.getOpcode() == NVPTXISD::LoadParam ||
+ Operand.getOpcode() == NVPTXISD::LoadParamV2) {
+ // check for all bitcasts
+ SmallVector<std::pair<SDNode *, unsigned /* resno */>> OldUses;
+ for (SDUse &U : Operand->uses()) {
+ SDNode *User = U.getUser();
+ if (!(User->getOpcode() == ISD::BITCAST &&
+ User->getValueType(0) == MVT::v2f32 &&
+ U.getValueType() == MVT::i64))
+ return SDValue(); // unhandled pattern
+ OldUses.push_back({User, U.getResNo()});
+ }
+
+ auto *MemN = cast<MemSDNode>(Operand);
+ SmallVector<EVT> VTs;
+ for (const auto &VT : Operand->values()) {
+ if (VT == MVT::i64)
+ VTs.push_back(MVT::v2f32);
+ else
+ VTs.push_back(VT);
+ }
+
+ SDValue NewLoad = DCI.DAG.getMemIntrinsicNode(
+ Operand.getOpcode(), SDLoc(Operand), DCI.DAG.getVTList(VTs),
+ SmallVector<SDValue>(Operand->ops()), MemN->getMemoryVT(),
+ MemN->getMemOperand());
+
+ // replace all chain/glue uses of the old load
+ for (unsigned I = 0, E = Operand->getNumValues(); I != E; ++I)
+ if (Operand->getValueType(I) != MVT::i64)
+ DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(MemN, I),
+ NewLoad.getValue(I));
+
+ // replace all bitcasts with values from the new load
+ for (auto &[BC, ResultNum] : OldUses)
+ DCI.CombineTo(BC, NewLoad.getValue(ResultNum), false);
+ }
+
+ return SDValue();
+}
+
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -6076,6 +6131,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return PerformFP_ROUNDCombine(N, DCI);
case ISD::TRUNCATE:
return PerformTRUNCATECombine(N, DCI);
+ case ISD::BITCAST:
+ return PerformBITCASTCombine(N, DCI);
}
return SDValue();
}
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
index e084fc34e92f7..2c885122b71fe 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
@@ -117,14 +117,14 @@ define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 {
define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 {
; CHECK-LABEL: test_fadd_v4(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<11>;
+; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.u64 {%rd5, %rd6}, [test_fadd_v4_param_1];
-; CHECK-NEXT: ld.param.v2.u64 {%rd7, %rd8}, [test_fadd_v4_param_0];
-; CHECK-NEXT: add.rn.f32x2 %rd9, %rd8, %rd6;
-; CHECK-NEXT: add.rn.f32x2 %rd10, %rd7, %rd5;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd10, %rd9};
+; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_param_1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_param_0];
+; CHECK-NEXT: add.rn.f32x2 %rd5, %rd2, %rd4;
+; CHECK-NEXT: add.rn.f32x2 %rd6, %rd1, %rd3;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5};
; CHECK-NEXT: ret;
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
@@ -134,19 +134,19 @@ define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 {
; CHECK-LABEL: test_fadd_imm_0_v4(
; CHECK: {
; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<9>;
+; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_fadd_imm_0_v4_param_0];
+; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_param_0];
; CHECK-NEXT: mov.f32 %f1, 0f40800000;
; CHECK-NEXT: mov.f32 %f2, 0f40400000;
-; CHECK-NEXT: mov.b64 %rd5, {%f2, %f1};
-; CHECK-NEXT: add.rn.f32x2 %rd6, %rd4, %rd5;
+; CHECK-NEXT: mov.b64 %rd3, {%f2, %f1};
+; CHECK-NEXT: add.rn.f32x2 %rd4, %rd2, %rd3;
; CHECK-NEXT: mov.f32 %f3, 0f40000000;
; CHECK-NEXT: mov.f32 %f4, 0f3F800000;
-; CHECK-NEXT: mov.b64 %rd7, {%f4, %f3};
-; CHECK-NEXT: add.rn.f32x2 %rd8, %rd3, %rd7;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
+; CHECK-NEXT: mov.b64 %rd5, {%f4, %f3};
+; CHECK-NEXT: add.rn.f32x2 %rd6, %rd1, %rd5;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4};
; CHECK-NEXT: ret;
%r = fadd <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %a
ret <4 x float> %r
@@ -156,19 +156,19 @@ define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 {
; CHECK-LABEL: test_fadd_imm_1_v4(
; CHECK: {
; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<9>;
+; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_fadd_imm_1_v4_param_0];
+; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_param_0];
; CHECK-NEXT: mov.f32 %f1, 0f40800000;
; CHECK-NEXT: mov.f32 %f2, 0f40400000;
-; CHECK-NEXT: mov.b64 %rd5, {%f2, %f1};
-; CHECK-NEXT: add.rn.f32x2 %rd6, %rd4, %rd5;
+; CHECK-NEXT: mov.b64 %rd3, {%f2, %f1};
+; CHECK-NEXT: add.rn.f32x2 %rd4, %rd2, %rd3;
; CHECK-NEXT: mov.f32 %f3, 0f40000000;
; CHECK-NEXT: mov.f32 %f4, 0f3F800000;
-; CHECK-NEXT: mov.b64 %rd7, {%f4, %f3};
-; CHECK-NEXT: add.rn.f32x2 %rd8, %rd3, %rd7;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
+; CHECK-NEXT: mov.b64 %rd5, {%f4, %f3};
+; CHECK-NEXT: add.rn.f32x2 %rd6, %rd1, %rd5;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4};
; CHECK-NEXT: ret;
%r = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
ret <4 x float> %r
@@ -340,14 +340,14 @@ define <2 x float> @test_fadd_imm_1_ftz(<2 x float> %a) #2 {
define <4 x float> @test_fadd_v4_ftz(<4 x float> %a, <4 x float> %b) #2 {
; CHECK-LABEL: test_fadd_v4_ftz(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<11>;
+; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.u64 {%rd5, %rd6}, [test_fadd_v4_ftz_param_1];
-; CHECK-NEXT: ld.param.v2.u64 {%rd7, %rd8}, [test_fadd_v4_ftz_param_0];
-; CHECK-NEXT: add.rn.ftz.f32x2 %rd9, %rd8, %rd6;
-; CHECK-NEXT: add.rn.ftz.f32x2 %rd10, %rd7, %rd5;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd10, %rd9};
+; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_ftz_param_1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_ftz_param_0];
+; CHECK-NEXT: add.rn.ftz.f32x2 %rd5, %rd2, %rd4;
+; CHECK-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd3;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5};
; CHECK-NEXT: ret;
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
@@ -357,19 +357,19 @@ define <4 x float> @test_fadd_imm_0_v4_ftz(<4 x float> %a) #2 {
; CHECK-LABEL: test_fadd_imm_0_v4_ftz(
; CHECK: {
; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<9>;
+; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_fadd_imm_0_v4_ftz_param_0];
+; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_ftz_param_0];
; CHECK-NEXT: mov.f32 %f1, 0f40800000;
; CHECK-NEXT: mov.f32 %f2, 0f40400000;
-; CHECK-NEXT: mov.b64 %rd5, {%f2, %f1};
-; CHECK-NEXT: add.rn.ftz.f32x2 %rd6, %rd4, %rd5;
+; CHECK-NEXT: mov.b64 %rd3, {%f2, %f1};
+; CHECK-NEXT: add.rn.ftz.f32x2 %rd4, %rd2, %rd3;
; CHECK-NEXT: mov.f32 %f3, 0f40000000;
; CHECK-NEXT: mov.f32 %f4, 0f3F800000;
-; CHECK-NEXT: mov.b64 %rd7, {%f4, %f3};
-; CHECK-NEXT: add.rn.ftz.f32x2 %rd8, %rd3, %rd7;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
+; CHECK-NEXT: mov.b64 %rd5, {%f4, %f3};
+; CHECK-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd5;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4};
; CHECK-NEXT: ret;
%r = fadd <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %a
ret <4 x float> %r
@@ -379,19 +379,19 @@ define <4 x float> @test_fadd_imm_1_v4_ftz(<4 x float> %a) #2 {
; CHECK-LABEL: test_fadd_imm_1_v4_ftz(
; CHECK: {
; CHECK-NEXT: .reg .f32 %f<5>;
-; CHECK-NEXT: .reg .b64 %rd<9>;
+; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_fadd_imm_1_v4_ftz_param_0];
+; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_ftz_param_0];
; CHECK-NEXT: mov.f32 %f1, 0f40800000;
; CHECK-NEXT: mov.f32 %f2, 0f40400000;
-; CHECK-NEXT: mov.b64 %rd5, {%f2, %f1};
-; CHECK-NEXT: add.rn.ftz.f32x2 %rd6, %rd4, %rd5;
+; CHECK-NEXT: mov.b64 %rd3, {%f2, %f1};
+; CHECK-NEXT: add.rn.ftz.f32x2 %rd4, %rd2, %rd3;
; CHECK-NEXT: mov.f32 %f3, 0f40000000;
; CHECK-NEXT: mov.f32 %f4, 0f3F800000;
-; CHECK-NEXT: mov.b64 %rd7, {%f4, %f3};
-; CHECK-NEXT: add.rn.ftz.f32x2 %rd8, %rd3, %rd7;
-; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd6};
+; CHECK-NEXT: mov.b64 %rd5, {%f4, %f3};
+; CHECK-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd5;
+; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4};
; CHECK-NEXT: ret;
%r = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
ret <4 x float> %r
diff --git a/llvm/test/CodeGen/NVPTX/vec-param-load.ll b/llvm/test/CodeGen/NVPTX/vec-param-load.ll
index 5dea424c7dcc9..d50d0828faf65 100644
--- a/llvm/test/CodeGen/NVPTX/vec-param-load.ll
+++ b/llvm/test/CodeGen/NVPTX/vec-param-load.ll
@@ -5,40 +5,40 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
define <16 x float> @test_v16f32(<16 x float> %a) {
; CHECK-LABEL: test_v16f32(
-; CHECK-DAG: ld.param.v4.f32 {[[V_12_15:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+48];
-; CHECK-DAG: ld.param.v4.f32 {[[V_8_11:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+32];
-; CHECK-DAG: ld.param.v4.f32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+16];
-; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0];
-; CHECK-DAG: st.param.v4.f32 [func_retval0], {[[V_0_3]]}
-; CHECK-DAG: st.param.v4.f32 [func_retval0+16], {[[V_4_7]]}
-; CHECK-DAG: st.param.v4.f32 [func_retval0+32], {[[V_8_11]]}
-; CHECK-DAG: st.param.v4.f32 [func_retval0+48], {[[V_12_15]]}
+; CHECK-DAG: ld.param.v2.b64 {[[V_12_15:(%rd[0-9]+[, ]*){2}]]}, [test_v16f32_param_0+48];
+; CHECK-DAG: ld.param.v2.b64 {[[V_8_11:(%rd[0-9]+[, ]*){2}]]}, [test_v16f32_param_0+32];
+; CHECK-DAG: ld.param.v2.b64 {[[V_4_7:(%rd[0-9]+[, ]*){2}]]}, [test_v16f32_param_0+16];
+; CHECK-DAG: ld.param.v2.b64 {[[V_0_3:(%rd[0-9]+[, ]*){2}]]}, [test_v16f32_param_0];
+; CHECK-DAG: st.param.v2.b64 [func_retval0], {[[V_0_3]]}
+; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[V_4_7]]}
+; CHECK-DAG: st.param.v2.b64 [func_retval0+32], {[[V_8_11]]}
+; CHECK-DAG: st.param.v2.b64 [func_retval0+48], {[[V_12_15]]}
; CHECK: ret;
ret <16 x float> %a
}
define <8 x float> @test_v8f32(<8 x float> %a) {
; CHECK-LABEL: test_v8f32(
-; CHECK-DAG: ld.param.v4.f32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0+16];
-; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0];
-; CHECK-DAG: st.param.v4.f32 [func_retval0], {[[V_0_3]]}
-; CHECK-DAG: st.param.v4.f32 [func_retval0+16], {[[V_4_7]]}
+; CHECK-DAG: ld.param.v2.b64 {[[V_4_7:(%rd[0-9]+[, ]*){2}]]}, [test_v8f32_param_0+16];
+; CHECK-DAG: ld.param.v2.b64 {[[V_0_3:(%rd[0-9]+[, ]*){2}]]}, [test_v8f32_param_0];
+; CHECK-DAG: st.param.v2.b64 [func_retval0], {[[V_0_3]]}
+; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[V_4_7]]}
; CHECK: ret;
ret <8 x float> %a
}
define <4 x float> @test_v4f32(<4 x float> %a) {
; CHECK-LABEL: test_v4f32(
-; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v4f32_param_0];
-; CHECK-DAG: st.param.v4.f32 [func_retval0], {[[V_0_3]]}
+; CHECK-DAG: ld.param.v2.b64 {[[V_0_3:(%rd[0-9]+[, ]*){2}]]}, [test_v4f32_param_0];
+; CHECK-DAG: st.param.v2.b64 [func_retval0], {[[V_0_3]]}
; CHECK: ret;
ret <4 x float> %a
}
define <2 x float> @test_v2f32(<2 x float> %a) {
; CHECK-LABEL: test_v2f32(
-; CHECK-DAG: ld.param.v2.f32 {[[V_0_3:(%f[0-9]+[, ]*){2}]]}, [test_v2f32_param_0];
-; CHECK-DAG: st.param.v2.f32 [func_retval0], {[[V_0_3]]}
+; CHECK-DAG: ld.param.b64 [[V_0_3:%rd[0-9]+]], [test_v2f32_param_0];
+; CHECK-DAG: st.param.b64 [func_retval0], [[V_0_3]]
; CHECK: ret;
ret <2 x float> %a
}
>From 4c67342546d8b25027ed543d5c63715d7e75060d Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Wed, 12 Mar 2025 16:54:40 -0700
Subject: [PATCH 24/25] [NVPTX] handle more cases for loads and stores
Split unaligned stores and loads of v2f32.
Add DAGCombiner rules for:
- target-independent stores that store a v2f32 BUILD_VECTOR. We
scalarize the value and rewrite the store
Fix test cases.
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 56 ++++++++++++++-----
llvm/test/CodeGen/NVPTX/aggregate-return.ll | 4 +-
llvm/test/CodeGen/NVPTX/f32x2-instructions.ll | 7 +--
.../NVPTX/load-with-non-coherent-cache.ll | 4 +-
.../CodeGen/NVPTX/misaligned-vector-ldst.ll | 6 +-
5 files changed, 52 insertions(+), 25 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index b51bac8c22317..effbe532a9601 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -828,7 +828,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, ISD::VSELECT,
ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::FP_ROUND,
- ISD::TRUNCATE, ISD::LOAD, ISD::BITCAST});
+ ISD::TRUNCATE, ISD::LOAD, ISD::STORE, ISD::BITCAST});
// setcc for f16x2 and bf16x2 needs special handling to prevent
// legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -2992,10 +2992,10 @@ SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (Op.getValueType() == MVT::i1)
return LowerLOADi1(Op, DAG);
- // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle
- // unaligned loads and have to handle it here.
+ // v2f16/v2bf16/v2i16/v4i8/v2f32 are legal, so we can't rely on legalizer to
+ // handle unaligned loads and have to handle it here.
EVT VT = Op.getValueType();
- if (Isv2x16VT(VT) || VT == MVT::v4i8) {
+ if (Isv2x16VT(VT) || VT == MVT::v4i8 || VT == MVT::v2f32) {
LoadSDNode *Load = cast<LoadSDNode>(Op);
EVT MemVT = Load->getMemoryVT();
if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
@@ -3039,15 +3039,15 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
if (VT == MVT::i1)
return LowerSTOREi1(Op, DAG);
- // v2f16 is legal, so we can't rely on legalizer to handle unaligned
- // stores and have to handle it here.
- if ((Isv2x16VT(VT) || VT == MVT::v4i8) &&
+ // v2f16/v2bf16/v2i16/v4i8/v2f32 are legal, so we can't rely on legalizer to
+ // handle unaligned stores and have to handle it here.
+ if ((Isv2x16VT(VT) || VT == MVT::v4i8 || VT == MVT::v2f32) &&
!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
VT, *Store->getMemOperand()))
return expandUnalignedStore(Store, DAG);
- // v2f16, v2bf16 and v2i16 don't need special handling.
- if (Isv2x16VT(VT) || VT == MVT::v4i8)
+ // v2f16/v2bf16/v2i16/v4i8/v2f32 don't need special handling.
+ if (Isv2x16VT(VT) || VT == MVT::v4i8 || VT == MVT::v2f32)
return SDValue();
if (VT.isVector())
@@ -3056,8 +3056,8 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
-SDValue
-NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
+static SDValue convertVectorStore(SDValue Op, SelectionDAG &DAG,
+ const SmallVectorImpl<SDValue> &Elements) {
SDNode *N = Op.getNode();
SDValue Val = N->getOperand(1);
SDLoc DL(N);
@@ -3124,6 +3124,8 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
SDValue SubVector = DAG.getBuildVector(EltVT, DL, SubVectorElts);
Ops.push_back(SubVector);
}
+ } else if (!Elements.empty()) {
+ Ops.insert(Ops.end(), Elements.begin(), Elements.end());
} else {
for (unsigned i = 0; i < NumElts; ++i) {
SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
@@ -3141,10 +3143,19 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
MemSD->getMemoryVT(), MemSD->getMemOperand());
- // return DCI.CombineTo(N, NewSt, true);
return NewSt;
}
+// Default variant where we don't pass in elements.
+static SDValue convertVectorStore(SDValue Op, SelectionDAG &DAG) {
+ return convertVectorStore(Op, DAG, SmallVector<SDValue>{});
+}
+
+SDValue NVPTXTargetLowering::LowerSTOREVector(SDValue Op,
+ SelectionDAG &DAG) const {
+ return convertVectorStore(Op, DAG);
+}
+
// st i1 v, addr
// =>
// v1 = zxt v to i16
@@ -5289,6 +5300,9 @@ static SDValue PerformStoreCombineHelper(SDNode *N,
// -->
// StoreRetvalV2 {a, b}
// likewise for V2 -> V4 case
+ //
+ // We also handle target-independent stores, which require us to first
+ // convert to StoreV2.
std::optional<NVPTXISD::NodeType> NewOpcode;
switch (N->getOpcode()) {
@@ -5314,8 +5328,8 @@ static SDValue PerformStoreCombineHelper(SDNode *N,
SDValue CurrentOp = N->getOperand(I);
if (CurrentOp->getOpcode() == ISD::BUILD_VECTOR) {
assert(CurrentOp.getValueType() == MVT::v2f32);
- NewOps.push_back(CurrentOp.getNode()->getOperand(0));
- NewOps.push_back(CurrentOp.getNode()->getOperand(1));
+ NewOps.push_back(CurrentOp.getOperand(0));
+ NewOps.push_back(CurrentOp.getOperand(1));
} else {
NewOps.clear();
break;
@@ -6086,6 +6100,18 @@ static SDValue PerformBITCASTCombine(SDNode *N,
return SDValue();
}
+static SDValue PerformStoreCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // check if the stored value can be scalarized
+ SDValue StoredVal = N->getOperand(1);
+ if (StoredVal.getValueType() == MVT::v2f32 &&
+ StoredVal.getOpcode() == ISD::BUILD_VECTOR) {
+ SmallVector<SDValue> Elements(StoredVal->op_values());
+ return convertVectorStore(SDValue(N, 0), DCI.DAG, Elements);
+ }
+ return SDValue();
+}
+
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -6115,6 +6141,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
case NVPTXISD::LoadParam:
case NVPTXISD::LoadParamV2:
return PerformLoadCombine(N, DCI);
+ case ISD::STORE:
+ return PerformStoreCombine(N, DCI);
case NVPTXISD::StoreParam:
case NVPTXISD::StoreParamV2:
case NVPTXISD::StoreParamV4:
diff --git a/llvm/test/CodeGen/NVPTX/aggregate-return.ll b/llvm/test/CodeGen/NVPTX/aggregate-return.ll
index 4212f18378856..1101abcdc3278 100644
--- a/llvm/test/CodeGen/NVPTX/aggregate-return.ll
+++ b/llvm/test/CodeGen/NVPTX/aggregate-return.ll
@@ -10,9 +10,9 @@ define void @test_v2f32(<2 x float> %input, ptr %output) {
; CHECK-LABEL: @test_v2f32
%call = tail call <2 x float> @barv(<2 x float> %input)
; CHECK: .param .align 8 .b8 retval0[8];
-; CHECK: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0];
+; CHECK: ld.param.b64 [[E0_1:%rd[0-9]+]], [retval0];
store <2 x float> %call, ptr %output, align 8
-; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[E0]], [[E1]]}
+; CHECK: st.b64 [{{%rd[0-9]+}}], [[E0_1]]
ret void
}
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
index 2c885122b71fe..ae38f1f07f026 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
@@ -512,14 +512,13 @@ define <2 x float> @test_frem_ftz(<2 x float> %a, <2 x float> %b) #2 {
define void @test_ldst_v2f32(ptr %a, ptr %b) #0 {
; CHECK-LABEL: test_ldst_v2f32(
; CHECK: {
-; CHECK-NEXT: .reg .f32 %f<3>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v2f32_param_1];
; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v2f32_param_0];
-; CHECK-NEXT: ld.v2.f32 {%f1, %f2}, [%rd1];
-; CHECK-NEXT: st.v2.f32 [%rd2], {%f1, %f2};
+; CHECK-NEXT: ld.b64 %rd3, [%rd1];
+; CHECK-NEXT: st.b64 [%rd2], %rd3;
; CHECK-NEXT: ret;
%t1 = load <2 x float>, ptr %a
store <2 x float> %t1, ptr %b, align 32
diff --git a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
index 194c043ef257c..1b6a52035397c 100644
--- a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
+++ b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll
@@ -110,9 +110,9 @@ define ptx_kernel void @foo10(ptr noalias readonly %from, ptr %to) {
}
; SM20-LABEL: .visible .entry foo11(
-; SM20: ld.global.v2.f32
+; SM20: ld.global.b64
; SM35-LABEL: .visible .entry foo11(
-; SM35: ld.global.nc.v2.f32
+; SM35: ld.global.nc.b64
define ptx_kernel void @foo11(ptr noalias readonly %from, ptr %to) {
%1 = load <2 x float>, ptr %from
store <2 x float> %1, ptr %to
diff --git a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
index b3abcc1a21d2c..b200d8b23fe62 100644
--- a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
+++ b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
@@ -18,7 +18,7 @@ define <4 x float> @t1(ptr %p1) {
define <4 x float> @t2(ptr %p1) {
; CHECK-NOT: ld.v4
; CHECK-NOT: ld.v2
-; CHECK: ld.f32
+; CHECK: ld.u32
%r = load <4 x float>, ptr %p1, align 4
ret <4 x float> %r
}
@@ -26,7 +26,7 @@ define <4 x float> @t2(ptr %p1) {
; CHECK-LABEL: t3
define <4 x float> @t3(ptr %p1) {
; CHECK-NOT: ld.v4
-; CHECK: ld.v2
+; CHECK: ld.b64
%r = load <4 x float>, ptr %p1, align 8
ret <4 x float> %r
}
@@ -111,7 +111,7 @@ define void @s1(ptr %p1, <4 x float> %v) {
define void @s2(ptr %p1, <4 x float> %v) {
; CHECK-NOT: st.v4
; CHECK-NOT: st.v2
-; CHECK: st.f32
+; CHECK: st.u32
store <4 x float> %v, ptr %p1, align 4
ret void
}
>From 520da7984feb27c816cafc5e756b084a2cae4ea4 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Fri, 14 Mar 2025 18:56:29 -0700
Subject: [PATCH 25/25] [NVPTX] add coverage for v2f32 in ldg-invariant and
fp-contract
for fp-contract:
- test folding of fma.f32x2
- bump SM version to 100
for ldg-invariant:
- test proper splitting of loads on vectors of f32
---
llvm/test/CodeGen/NVPTX/fp-contract.ll | 30 ++++++++--
llvm/test/CodeGen/NVPTX/ldg-invariant.ll | 70 ++++++++++++++++++++++++
2 files changed, 96 insertions(+), 4 deletions(-)
diff --git a/llvm/test/CodeGen/NVPTX/fp-contract.ll b/llvm/test/CodeGen/NVPTX/fp-contract.ll
index 9da9a8691098b..03835954514ef 100644
--- a/llvm/test/CodeGen/NVPTX/fp-contract.ll
+++ b/llvm/test/CodeGen/NVPTX/fp-contract.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s --check-prefix=FAST
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefix=DEFAULT
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -fp-contract=fast | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 | %ptxas-verify %}
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 | FileCheck %s --check-prefix=DEFAULT
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | %ptxas-verify -arch=sm_100 %}
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 | %ptxas-verify -arch=sm_100 %}
target triple = "nvptx64-unknown-cuda"
@@ -33,3 +33,25 @@ define float @t1(float %a, float %b) {
%v1 = fadd float %a, %b
ret float %v1
}
+
+;; FAST-LABEL: @t0_v2
+;; DEFAULT-LABEL: @t0_v2
+define <2 x float> @t0_v2(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+;; FAST: fma.rn.f32x2
+;; DEFAULT: mul.rn.f32x2
+;; DEFAULT: add.rn.f32x2
+ %v0 = fmul <2 x float> %a, %b
+ %v1 = fadd <2 x float> %v0, %c
+ ret <2 x float> %v1
+}
+
+;; FAST-LABEL: @t1_v2
+;; DEFAULT-LABEL: @t1_v2
+define <2 x float> @t1_v2(<2 x float> %a, <2 x float> %b) {
+;; We cannot form an fma here, but make sure we explicitly emit add.rn.f32
+;; to prevent ptxas from fusing this with anything else.
+;; FAST: add.f32
+;; DEFAULT: add.rn.f32
+ %v1 = fadd <2 x float> %a, %b
+ ret <2 x float> %v1
+}
diff --git a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
index f68b281f875d9..5ec92212756d9 100644
--- a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
+++ b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
@@ -127,6 +127,76 @@ define half @ld_global_v8f16(ptr addrspace(1) %ptr) {
ret half %sum
}
+define float @ld_global_v2f32(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: ld_global_v2f32(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<4>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [ld_global_v2f32_param_0];
+; CHECK-NEXT: ld.global.nc.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f3, %f1, %f2;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f3;
+; CHECK-NEXT: ret;
+ %a = load <2 x float>, ptr addrspace(1) %ptr, !invariant.load !0
+ %v1 = extractelement <2 x float> %a, i32 0
+ %v2 = extractelement <2 x float> %a, i32 1
+ %sum = fadd float %v1, %v2
+ ret float %sum
+}
+
+define float @ld_global_v4f32(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: ld_global_v4f32(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<8>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [ld_global_v4f32_param_0];
+; CHECK-NEXT: ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f5, %f1, %f2;
+; CHECK-NEXT: add.rn.f32 %f6, %f3, %f4;
+; CHECK-NEXT: add.rn.f32 %f7, %f5, %f6;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f7;
+; CHECK-NEXT: ret;
+ %a = load <4 x float>, ptr addrspace(1) %ptr, !invariant.load !0
+ %v1 = extractelement <4 x float> %a, i32 0
+ %v2 = extractelement <4 x float> %a, i32 1
+ %v3 = extractelement <4 x float> %a, i32 2
+ %v4 = extractelement <4 x float> %a, i32 3
+ %sum1 = fadd float %v1, %v2
+ %sum2 = fadd float %v3, %v4
+ %sum = fadd float %sum1, %sum2
+ ret float %sum
+}
+
+define float @ld_global_v8f32(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: ld_global_v8f32(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<12>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [ld_global_v8f32_param_0];
+; CHECK-NEXT: ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1+16];
+; CHECK-NEXT: ld.global.nc.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f9, %f5, %f7;
+; CHECK-NEXT: add.rn.f32 %f10, %f1, %f3;
+; CHECK-NEXT: add.rn.f32 %f11, %f9, %f10;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f11;
+; CHECK-NEXT: ret;
+ %a = load <8 x float>, ptr addrspace(1) %ptr, !invariant.load !0
+ %v1 = extractelement <8 x float> %a, i32 0
+ %v2 = extractelement <8 x float> %a, i32 2
+ %v3 = extractelement <8 x float> %a, i32 4
+ %v4 = extractelement <8 x float> %a, i32 6
+ %sum1 = fadd float %v1, %v2
+ %sum2 = fadd float %v3, %v4
+ %sum = fadd float %sum1, %sum2
+ ret float %sum
+}
+
define i8 @ld_global_v8i8(ptr addrspace(1) %ptr) {
; CHECK-LABEL: ld_global_v8i8(
; CHECK: {
More information about the llvm-commits
mailing list