[llvm] [AMDGPU] Auto-generated some lit test patterns (NFC). (PR #94310)
Christudasan Devadasan via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 3 22:02:33 PDT 2024
https://github.com/cdevadas created https://github.com/llvm/llvm-project/pull/94310
Also, the R600 RUN lines have been split out into standalone tests (for example, xor-r600.ll below).
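For reference, the check lines in these files come from the UTC script named in each test header. A typical regeneration command looks like the following (a sketch only; it assumes a local build with llc in build/bin, and the paths are illustrative):

    # Regenerate the CHECK lines for one test using the freshly built llc.
    llvm/utils/update_llc_test_checks.py --llc-binary=build/bin/llc \
        llvm/test/CodeGen/AMDGPU/xor.ll llvm/test/CodeGen/AMDGPU/xor-r600.ll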
>From 9f63645d21bed98f36639a7ca716d7232e523520 Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Mon, 3 Jun 2024 12:26:06 +0530
Subject: [PATCH 1/7] Fixed test CodeGen/AMDGPU/xor.ll.
---
llvm/test/CodeGen/AMDGPU/xor-r600.ll | 478 +++++++++++++++++
llvm/test/CodeGen/AMDGPU/xor.ll | 766 +++++++++++++++++++++++----
2 files changed, 1144 insertions(+), 100 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/xor-r600.ll
diff --git a/llvm/test/CodeGen/AMDGPU/xor-r600.ll b/llvm/test/CodeGen/AMDGPU/xor-r600.ll
new file mode 100644
index 0000000000000..3fb11f4484bd2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/xor-r600.ll
@@ -0,0 +1,478 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefixes=R600 %s
+
+define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+; R600-LABEL: xor_v2i32:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
+; R600-NEXT: TEX 1 @6
+; R600-NEXT: ALU 3, @12, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: Fetch clause starting at 6:
+; R600-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1
+; R600-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
+; R600-NEXT: ALU clause starting at 10:
+; R600-NEXT: MOV T0.X, KC0[2].Z,
+; R600-NEXT: MOV * T1.X, KC0[2].W,
+; R600-NEXT: ALU clause starting at 12:
+; R600-NEXT: XOR_INT * T0.Y, T0.Y, T1.Y,
+; R600-NEXT: XOR_INT T0.X, T0.X, T1.X,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %a = load <2 x i32>, ptr addrspace(1) %in0
+ %b = load <2 x i32>, ptr addrspace(1) %in1
+ %result = xor <2 x i32> %a, %b
+ store <2 x i32> %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+; R600-LABEL: xor_v4i32:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
+; R600-NEXT: TEX 1 @6
+; R600-NEXT: ALU 5, @12, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: Fetch clause starting at 6:
+; R600-NEXT: VTX_READ_128 T1.XYZW, T1.X, 0, #1
+; R600-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; R600-NEXT: ALU clause starting at 10:
+; R600-NEXT: MOV T0.X, KC0[2].Z,
+; R600-NEXT: MOV * T1.X, KC0[2].W,
+; R600-NEXT: ALU clause starting at 12:
+; R600-NEXT: XOR_INT * T0.W, T0.W, T1.W,
+; R600-NEXT: XOR_INT * T0.Z, T0.Z, T1.Z,
+; R600-NEXT: XOR_INT * T0.Y, T0.Y, T1.Y,
+; R600-NEXT: XOR_INT T0.X, T0.X, T1.X,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %a = load <4 x i32>, ptr addrspace(1) %in0
+ %b = load <4 x i32>, ptr addrspace(1) %in1
+ %result = xor <4 x i32> %a, %b
+ store <4 x i32> %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+; R600-LABEL: xor_i1:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
+; R600-NEXT: TEX 0 @8
+; R600-NEXT: ALU 0, @13, KC0[CB0:0-32], KC1[]
+; R600-NEXT: TEX 0 @10
+; R600-NEXT: ALU 5, @14, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: Fetch clause starting at 8:
+; R600-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; R600-NEXT: Fetch clause starting at 10:
+; R600-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
+; R600-NEXT: ALU clause starting at 12:
+; R600-NEXT: MOV * T0.X, KC0[2].W,
+; R600-NEXT: ALU clause starting at 13:
+; R600-NEXT: MOV * T1.X, KC0[2].Z,
+; R600-NEXT: ALU clause starting at 14:
+; R600-NEXT: SETGE_DX10 T0.W, T0.X, 1.0,
+; R600-NEXT: SETGE_DX10 * T1.W, T1.X, 0.0,
+; R600-NEXT: XOR_INT * T0.W, PS, PV.W,
+; R600-NEXT: CNDE_INT T0.X, PV.W, T0.X, T1.X,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %a = load float, ptr addrspace(1) %in0
+ %b = load float, ptr addrspace(1) %in1
+ %acmp = fcmp oge float %a, 0.000000e+00
+ %bcmp = fcmp oge float %b, 1.000000e+00
+ %xor = xor i1 %acmp, %bcmp
+ %result = select i1 %xor, float %a, float %b
+ store float %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+; R600-LABEL: v_xor_i1:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
+; R600-NEXT: TEX 0 @8
+; R600-NEXT: ALU 0, @13, KC0[CB0:0-32], KC1[]
+; R600-NEXT: TEX 0 @10
+; R600-NEXT: ALU 12, @14, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: Fetch clause starting at 8:
+; R600-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
+; R600-NEXT: Fetch clause starting at 10:
+; R600-NEXT: VTX_READ_8 T1.X, T1.X, 0, #1
+; R600-NEXT: ALU clause starting at 12:
+; R600-NEXT: MOV * T0.X, KC0[2].Z,
+; R600-NEXT: ALU clause starting at 13:
+; R600-NEXT: MOV * T1.X, KC0[2].W,
+; R600-NEXT: ALU clause starting at 14:
+; R600-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
+; R600-NEXT: XOR_INT * T1.W, T0.X, T1.X,
+; R600-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; R600-NEXT: AND_INT T1.W, PS, 1,
+; R600-NEXT: LSHL * T0.W, PV.W, literal.x,
+; R600-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; R600-NEXT: LSHL T0.X, PV.W, PS,
+; R600-NEXT: LSHL * T0.W, literal.x, PS,
+; R600-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; R600-NEXT: MOV T0.Y, 0.0,
+; R600-NEXT: MOV * T0.Z, 0.0,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %a = load volatile i1, ptr addrspace(1) %in0
+ %b = load volatile i1, ptr addrspace(1) %in1
+ %xor = xor i1 %a, %b
+ store i1 %xor, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+; R600-LABEL: vector_xor_i32:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
+; R600-NEXT: TEX 1 @6
+; R600-NEXT: ALU 2, @12, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: Fetch clause starting at 6:
+; R600-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
+; R600-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; R600-NEXT: ALU clause starting at 10:
+; R600-NEXT: MOV T0.X, KC0[2].Z,
+; R600-NEXT: MOV * T1.X, KC0[2].W,
+; R600-NEXT: ALU clause starting at 12:
+; R600-NEXT: XOR_INT T0.X, T0.X, T1.X,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %a = load i32, ptr addrspace(1) %in0
+ %b = load i32, ptr addrspace(1) %in1
+ %result = xor i32 %a, %b
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
+; R600-LABEL: scalar_xor_i32:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; R600-NEXT: NOT_INT * T1.X, KC0[2].Z,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %result = xor i32 %a, -1
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+; R600-LABEL: vector_not_i32:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; R600-NEXT: TEX 0 @6
+; R600-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: Fetch clause starting at 6:
+; R600-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; R600-NEXT: ALU clause starting at 8:
+; R600-NEXT: MOV * T0.X, KC0[2].Z,
+; R600-NEXT: ALU clause starting at 9:
+; R600-NEXT: NOT_INT T0.X, T0.X,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %a = load i32, ptr addrspace(1) %in0
+ %b = load i32, ptr addrspace(1) %in1
+ %result = xor i32 %a, -1
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+; R600-LABEL: vector_xor_i64:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
+; R600-NEXT: TEX 1 @6
+; R600-NEXT: ALU 3, @12, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: Fetch clause starting at 6:
+; R600-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1
+; R600-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
+; R600-NEXT: ALU clause starting at 10:
+; R600-NEXT: MOV T0.X, KC0[2].Z,
+; R600-NEXT: MOV * T1.X, KC0[2].W,
+; R600-NEXT: ALU clause starting at 12:
+; R600-NEXT: XOR_INT * T0.Y, T0.Y, T1.Y,
+; R600-NEXT: XOR_INT T0.X, T0.X, T1.X,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %a = load i64, ptr addrspace(1) %in0
+ %b = load i64, ptr addrspace(1) %in1
+ %result = xor i64 %a, %b
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
+; R600-LABEL: scalar_xor_i64:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: XOR_INT * T0.Y, KC0[3].X, KC0[3].Z,
+; R600-NEXT: XOR_INT * T0.X, KC0[2].W, KC0[3].Y,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %result = xor i64 %a, %b
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) {
+; R600-LABEL: scalar_not_i64:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: NOT_INT * T0.Y, KC0[3].X,
+; R600-NEXT: NOT_INT T0.X, KC0[2].W,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %result = xor i64 %a, -1
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+; R600-LABEL: vector_not_i64:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; R600-NEXT: TEX 0 @6
+; R600-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: Fetch clause starting at 6:
+; R600-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
+; R600-NEXT: ALU clause starting at 8:
+; R600-NEXT: MOV * T0.X, KC0[2].Z,
+; R600-NEXT: ALU clause starting at 9:
+; R600-NEXT: NOT_INT * T0.Y, T0.Y,
+; R600-NEXT: NOT_INT T0.X, T0.X,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %a = load i64, ptr addrspace(1) %in0
+ %b = load i64, ptr addrspace(1) %in1
+ %result = xor i64 %a, -1
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b) {
+; R600-LABEL: xor_cf:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU_PUSH_BEFORE 4, @14, KC0[CB0:0-32], KC1[]
+; R600-NEXT: JUMP @5 POP:1
+; R600-NEXT: ALU 0, @19, KC0[CB0:0-32], KC1[]
+; R600-NEXT: TEX 0 @12
+; R600-NEXT: ALU_POP_AFTER 1, @20, KC0[], KC1[]
+; R600-NEXT: ALU_PUSH_BEFORE 2, @22, KC0[CB0:0-32], KC1[]
+; R600-NEXT: JUMP @8 POP:1
+; R600-NEXT: ALU_POP_AFTER 5, @25, KC0[CB0:0-32], KC1[]
+; R600-NEXT: ALU 1, @31, KC0[], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: Fetch clause starting at 12:
+; R600-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
+; R600-NEXT: ALU clause starting at 14:
+; R600-NEXT: OR_INT T0.W, KC0[2].W, KC0[3].X,
+; R600-NEXT: MOV * T1.W, literal.x,
+; R600-NEXT: 1(1.401298e-45), 0(0.000000e+00)
+; R600-NEXT: SETNE_INT * T0.W, PV.W, 0.0,
+; R600-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
+; R600-NEXT: ALU clause starting at 19:
+; R600-NEXT: MOV * T0.X, KC0[2].Z,
+; R600-NEXT: ALU clause starting at 20:
+; R600-NEXT: MOV * T1.W, literal.x,
+; R600-NEXT: 0(0.000000e+00), 0(0.000000e+00)
+; R600-NEXT: ALU clause starting at 22:
+; R600-NEXT: MOV T0.W, KC0[2].Y,
+; R600-NEXT: SETE_INT * T1.W, T1.W, 0.0,
+; R600-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
+; R600-NEXT: ALU clause starting at 25:
+; R600-NEXT: MOV T1.W, KC0[2].W,
+; R600-NEXT: MOV * T2.W, KC0[3].Y,
+; R600-NEXT: XOR_INT T0.X, PV.W, PS,
+; R600-NEXT: MOV T1.W, KC0[3].X,
+; R600-NEXT: MOV * T2.W, KC0[3].Z,
+; R600-NEXT: XOR_INT * T0.Y, PV.W, PS,
+; R600-NEXT: ALU clause starting at 31:
+; R600-NEXT: LSHR * T1.X, T0.W, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+entry:
+ %0 = icmp eq i64 %a, 0
+ br i1 %0, label %if, label %else
+
+if:
+ %1 = xor i64 %a, %b
+ br label %endif
+
+else:
+ %2 = load i64, ptr addrspace(1) %in
+ br label %endif
+
+endif:
+ %3 = phi i64 [%1, %if], [%2, %else]
+ store i64 %3, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
+; R600-LABEL: scalar_xor_literal_i64:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: XOR_INT * T0.Y, KC0[5].X, literal.x,
+; R600-NEXT: 992123(1.390260e-39), 0(0.000000e+00)
+; R600-NEXT: XOR_INT T0.X, KC0[4].W, literal.x,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; R600-NEXT: 12345(1.729903e-41), 2(2.802597e-45)
+ %or = xor i64 %a, 4261135838621753
+ store i64 %or, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, i64 %b) {
+; R600-LABEL: scalar_xor_literal_multi_use_i64:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 12, @6, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XY, T4.X, 0
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 6:
+; R600-NEXT: ADDC_UINT * T0.W, KC0[5].Y, literal.x,
+; R600-NEXT: 12345(1.729903e-41), 0(0.000000e+00)
+; R600-NEXT: ADD_INT T0.X, KC0[5].Y, literal.x,
+; R600-NEXT: ADD_INT * T0.W, KC0[5].Z, PV.W,
+; R600-NEXT: 12345(1.729903e-41), 0(0.000000e+00)
+; R600-NEXT: ADD_INT T1.X, PV.W, literal.x,
+; R600-NEXT: MOV * T2.X, literal.y,
+; R600-NEXT: 992123(1.390260e-39), 0(0.000000e+00)
+; R600-NEXT: XOR_INT * T3.Y, KC0[5].X, literal.x,
+; R600-NEXT: 992123(1.390260e-39), 0(0.000000e+00)
+; R600-NEXT: XOR_INT T3.X, KC0[4].W, literal.x,
+; R600-NEXT: LSHR * T4.X, KC0[2].Y, literal.y,
+; R600-NEXT: 12345(1.729903e-41), 2(2.802597e-45)
+ %or = xor i64 %a, 4261135838621753
+ store i64 %or, ptr addrspace(1) %out
+
+ %foo = add i64 %b, 4261135838621753
+ store volatile i64 %foo, ptr addrspace(1) undef
+ ret void
+}
+
+define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
+; R600-LABEL: scalar_xor_inline_imm_i64:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MOV * T0.Y, KC0[5].X,
+; R600-NEXT: XOR_INT T0.X, KC0[4].W, literal.x,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; R600-NEXT: 63(8.828180e-44), 2(2.802597e-45)
+ %or = xor i64 %a, 63
+ store i64 %or, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
+; R600-LABEL: scalar_xor_neg_inline_imm_i64:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: NOT_INT * T0.Y, KC0[5].X,
+; R600-NEXT: XOR_INT T0.X, KC0[4].W, literal.x,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; R600-NEXT: -8(nan), 2(2.802597e-45)
+ %or = xor i64 %a, -8
+ store i64 %or, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; R600-LABEL: vector_xor_i64_neg_inline_imm:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; R600-NEXT: TEX 0 @6
+; R600-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: Fetch clause starting at 6:
+; R600-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
+; R600-NEXT: ALU clause starting at 8:
+; R600-NEXT: MOV * T0.X, KC0[2].Z,
+; R600-NEXT: ALU clause starting at 9:
+; R600-NEXT: NOT_INT * T0.Y, T0.Y,
+; R600-NEXT: XOR_INT T0.X, T0.X, literal.x,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; R600-NEXT: -8(nan), 2(2.802597e-45)
+ %loada = load i64, ptr addrspace(1) %a, align 8
+ %or = xor i64 %loada, -8
+ store i64 %or, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; R600-LABEL: vector_xor_literal_i64:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; R600-NEXT: TEX 0 @6
+; R600-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: Fetch clause starting at 6:
+; R600-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
+; R600-NEXT: ALU clause starting at 8:
+; R600-NEXT: MOV * T0.X, KC0[2].Z,
+; R600-NEXT: ALU clause starting at 9:
+; R600-NEXT: XOR_INT * T0.Y, T0.Y, literal.x,
+; R600-NEXT: 5231(7.330192e-42), 0(0.000000e+00)
+; R600-NEXT: XOR_INT T0.X, T0.X, literal.x,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; R600-NEXT: -545810305(-1.784115e+19), 2(2.802597e-45)
+ %loada = load i64, ptr addrspace(1) %a, align 8
+ %or = xor i64 %loada, 22470723082367
+ store i64 %or, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll
index 1315c0b52af43..6fd5f40f6eba5 100644
--- a/llvm/test/CodeGen/AMDGPU/xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor.ll
@@ -1,16 +1,49 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
-
-
-; FUNC-LABEL: {{^}}xor_v2i32:
-; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN %s
define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+; SI-LABEL: xor_v2i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: s_mov_b32 s14, s2
+; SI-NEXT: s_mov_b32 s15, s3
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_xor_b32_e32 v1, v3, v1
+; SI-NEXT: v_xor_b32_e32 v0, v2, v0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: xor_v2i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_xor_b32_e32 v1, v1, v3
+; VI-NEXT: v_xor_b32_e32 v0, v0, v2
+; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
+; VI-NEXT: s_endpgm
%a = load <2 x i32>, ptr addrspace(1) %in0
%b = load <2 x i32>, ptr addrspace(1) %in1
%result = xor <2 x i32> %a, %b
@@ -18,18 +51,52 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in
ret void
}
-; FUNC-LABEL: {{^}}xor_v4i32:
-; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-
define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+; SI-LABEL: xor_v4i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: s_mov_b32 s14, s2
+; SI-NEXT: s_mov_b32 s15, s3
+; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_xor_b32_e32 v3, v7, v3
+; SI-NEXT: v_xor_b32_e32 v2, v6, v2
+; SI-NEXT: v_xor_b32_e32 v1, v5, v1
+; SI-NEXT: v_xor_b32_e32 v0, v4, v0
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: xor_v4i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
+; VI-NEXT: v_mov_b32_e32 v8, s4
+; VI-NEXT: v_mov_b32_e32 v9, s5
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_xor_b32_e32 v3, v3, v7
+; VI-NEXT: v_xor_b32_e32 v2, v2, v6
+; VI-NEXT: v_xor_b32_e32 v1, v1, v5
+; VI-NEXT: v_xor_b32_e32 v0, v0, v4
+; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; VI-NEXT: s_endpgm
%a = load <4 x i32>, ptr addrspace(1) %in0
%b = load <4 x i32>, ptr addrspace(1) %in1
%result = xor <4 x i32> %a, %b
@@ -37,16 +104,54 @@ define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in
ret void
}
-; FUNC-LABEL: {{^}}xor_i1:
-; EG: XOR_INT {{\** *}}{{T[0-9]+\.[XYZW]}}, {{PS|PV\.[XYZW]}}, {{PS|PV\.[XYZW]}}
-
-; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 1.0, {{v[0-9]+}}
-; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 0, {{v[0-9]+}}
-; SI: s_xor_b64 [[XOR:vcc]], [[CMP1]], [[CMP0]]
-; SI: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
-; SI: buffer_store_dword [[RESULT]]
-; SI: s_endpgm
define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+; SI-LABEL: xor_i1:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0
+; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1
+; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: xor_i1:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: flat_load_dword v2, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v4
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cmp_le_f32_e64 s[0:1], 1.0, v2
+; VI-NEXT: s_xor_b64 vcc, vcc, s[0:1]
+; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
%a = load float, ptr addrspace(1) %in0
%b = load float, ptr addrspace(1) %in1
%acmp = fcmp oge float %a, 0.000000e+00
@@ -57,13 +162,50 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0,
ret void
}
-; FUNC-LABEL: {{^}}v_xor_i1:
-; SI: buffer_load_ubyte [[B:v[0-9]+]]
-; SI: buffer_load_ubyte [[A:v[0-9]+]]
-; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[B]], [[A]]
-; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]]
-; SI: buffer_store_byte [[RESULT]]
define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+; SI-LABEL: v_xor_i1:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s14, s2
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: s_mov_b32 s15, s3
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: v_xor_b32_e32 v0, v0, v1
+; SI-NEXT: v_and_b32_e32 v0, 1, v0
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: v_xor_i1:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: flat_load_ubyte v4, v[0:1] glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: flat_load_ubyte v2, v[2:3] glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_xor_b32_e32 v2, v4, v2
+; VI-NEXT: v_and_b32_e32 v2, 1, v2
+; VI-NEXT: flat_store_byte v[0:1], v2
+; VI-NEXT: s_endpgm
%a = load volatile i1, ptr addrspace(1) %in0
%b = load volatile i1, ptr addrspace(1) %in1
%xor = xor i1 %a, %b
@@ -71,9 +213,46 @@ define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0
ret void
}
-; FUNC-LABEL: {{^}}vector_xor_i32:
-; SI: v_xor_b32_e32
define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+; SI-LABEL: vector_xor_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s14, s2
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: s_mov_b32 s15, s3
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_xor_b32_e32 v0, v0, v1
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: vector_xor_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: flat_load_dword v2, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_xor_b32_e32 v2, v4, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
%a = load i32, ptr addrspace(1) %in0
%b = load i32, ptr addrspace(1) %in1
%result = xor i32 %a, %b
@@ -81,25 +260,96 @@ define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1
ret void
}
-; FUNC-LABEL: {{^}}scalar_xor_i32:
-; SI: s_xor_b32
define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
+; SI-LABEL: scalar_xor_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_xor_b32 s0, s2, s3
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: scalar_xor_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_xor_b32 s2, s2, s3
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
%result = xor i32 %a, %b
store i32 %result, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}scalar_not_i32:
-; SI: s_not_b32
define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) {
+; SI-LABEL: scalar_not_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_not_b32 s4, s4
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: scalar_not_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_not_b32 s2, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
%result = xor i32 %a, -1
store i32 %result, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}vector_not_i32:
-; SI: v_not_b32
define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+; SI-LABEL: vector_not_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s2
+; SI-NEXT: s_mov_b32 s9, s3
+; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_not_b32_e32 v0, v0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: vector_not_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_not_b32_e32 v2, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
%a = load i32, ptr addrspace(1) %in0
%b = load i32, ptr addrspace(1) %in1
%result = xor i32 %a, -1
@@ -107,11 +357,48 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1
ret void
}
-; FUNC-LABEL: {{^}}vector_xor_i64:
-; SI: v_xor_b32_e32
-; SI: v_xor_b32_e32
-; SI: s_endpgm
define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+; SI-LABEL: vector_xor_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: s_mov_b32 s14, s2
+; SI-NEXT: s_mov_b32 s15, s3
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_xor_b32_e32 v0, v2, v0
+; SI-NEXT: v_xor_b32_e32 v1, v3, v1
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: vector_xor_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_xor_b32_e32 v0, v0, v2
+; VI-NEXT: v_xor_b32_e32 v1, v1, v3
+; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
+; VI-NEXT: s_endpgm
%a = load i64, ptr addrspace(1) %in0
%b = load i64, ptr addrspace(1) %in1
%result = xor i64 %a, %b
@@ -119,27 +406,104 @@ define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1
ret void
}
-; FUNC-LABEL: {{^}}scalar_xor_i64:
-; SI: s_xor_b64
-; SI: s_endpgm
define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
+; SI-LABEL: scalar_xor_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_xor_b64 s[4:5], s[6:7], s[8:9]
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: scalar_xor_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: s_xor_b64 s[0:1], s[6:7], s[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_endpgm
%result = xor i64 %a, %b
store i64 %result, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}scalar_not_i64:
-; SI: s_not_b64
define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) {
+; SI-LABEL: scalar_not_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_not_b64 s[0:1], s[2:3]
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: scalar_not_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_not_b64 s[0:1], s[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_endpgm
%result = xor i64 %a, -1
store i64 %result, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}vector_not_i64:
-; SI: v_not_b32
-; SI: v_not_b32
define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+; SI-LABEL: vector_not_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s2
+; SI-NEXT: s_mov_b32 s9, s3
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_not_b32_e32 v0, v0
+; SI-NEXT: v_not_b32_e32 v1, v1
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: vector_not_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_not_b32_e32 v0, v0
+; VI-NEXT: v_not_b32_e32 v1, v1
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
%a = load i64, ptr addrspace(1) %in0
%b = load i64, ptr addrspace(1) %in1
%result = xor i64 %a, -1
@@ -147,13 +511,65 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1
ret void
}
-; Test that we have a pattern to match xor inside a branch.
-; Note that in the future the backend may be smart enough to
-; use an SALU instruction for this.
-
-; FUNC-LABEL: {{^}}xor_cf:
-; SI: s_xor_b64
define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b) {
+; SI-LABEL: xor_cf:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b64 s[8:9], 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0
+; SI-NEXT: s_and_b64 vcc, exec, s[10:11]
+; SI-NEXT: s_cbranch_vccz .LBB12_4
+; SI-NEXT: ; %bb.1: ; %else
+; SI-NEXT: s_mov_b32 s15, 0xf000
+; SI-NEXT: s_mov_b32 s14, -1
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
+; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9]
+; SI-NEXT: s_cbranch_vccnz .LBB12_3
+; SI-NEXT: .LBB12_2: ; %if
+; SI-NEXT: s_xor_b64 s[2:3], s[4:5], s[6:7]
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: .LBB12_3: ; %endif
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+; SI-NEXT: .LBB12_4:
+; SI-NEXT: ; implicit-def: $vgpr0_vgpr1
+; SI-NEXT: s_branch .LBB12_2
+;
+; VI-LABEL: xor_cf:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[8:9], 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u64 s[4:5], 0
+; VI-NEXT: s_cbranch_scc0 .LBB12_4
+; VI-NEXT: ; %bb.1: ; %else
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: s_andn2_b64 vcc, exec, s[8:9]
+; VI-NEXT: s_cbranch_vccnz .LBB12_3
+; VI-NEXT: .LBB12_2: ; %if
+; VI-NEXT: s_xor_b64 s[2:3], s[4:5], s[6:7]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: .LBB12_3: ; %endif
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+; VI-NEXT: .LBB12_4:
+; VI-NEXT: ; implicit-def: $vgpr0_vgpr1
+; VI-NEXT: s_branch .LBB12_2
entry:
%0 = icmp eq i64 %a, 0
br i1 %0, label %if, label %else
@@ -172,27 +588,82 @@ endif:
ret void
}
-; FUNC-LABEL: {{^}}scalar_xor_literal_i64:
-; SI: s_load_dwordx2 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}}
-; SI-DAG: s_xor_b32 s[[RES_HI:[0-9]+]], s{{[0-9]+}}, 0xf237b
-; SI-DAG: s_xor_b32 s[[RES_LO:[0-9]+]], s{{[0-9]+}}, 0x3039
-; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]]
-; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]]
define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
+; SI-LABEL: scalar_xor_literal_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_xor_b32 s5, s5, 0xf237b
+; SI-NEXT: s_xor_b32 s4, s4, 0x3039
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: scalar_xor_literal_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_xor_b32 s3, s3, 0xf237b
+; VI-NEXT: s_xor_b32 s2, s2, 0x3039
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
%or = xor i64 %a, 4261135838621753
store i64 %or, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}scalar_xor_literal_multi_use_i64:
-; SI: s_load_dwordx4 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
-; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xf237b
-; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039
-; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s[[[K_LO]]:[[K_HI]]]
-
-; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3039
-; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0xf237b
define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, i64 %b) {
+; SI-LABEL: scalar_xor_literal_multi_use_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x13
+; SI-NEXT: s_movk_i32 s8, 0x3039
+; SI-NEXT: s_mov_b32 s9, 0xf237b
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9]
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: s_add_u32 s0, s2, 0x3039
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_addc_u32 s1, s3, 0xf237b
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: scalar_xor_literal_multi_use_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_movk_i32 s2, 0x3039
+; VI-NEXT: s_mov_b32 s3, 0xf237b
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_add_u32 s0, s6, 0x3039
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_addc_u32 s1, s7, 0xf237b
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_endpgm
%or = xor i64 %a, 4261135838621753
store i64 %or, ptr addrspace(1) %out
@@ -201,51 +672,146 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %ou
ret void
}
-; FUNC-LABEL: {{^}}scalar_xor_inline_imm_i64:
-; SI: s_load_dwordx2 s[[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
-; SI-NOT: xor_b32
-; SI: s_xor_b32 s[[VAL_LO]], s{{[0-9]+}}, 63
-; SI-NOT: xor_b32
-; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], s{{[0-9]+}}
-; SI-NOT: xor_b32
-; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s{{[0-9]+}}
-; SI-NOT: xor_b32
-; SI: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]]
define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
+; SI-LABEL: scalar_xor_inline_imm_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_xor_b32 s4, s4, 63
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: scalar_xor_inline_imm_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_xor_b32 s2, s2, 63
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
%or = xor i64 %a, 63
store i64 %or, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}scalar_xor_neg_inline_imm_i64:
-; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
-; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -8
define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
+; SI-LABEL: scalar_xor_neg_inline_imm_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_xor_b64 s[4:5], s[4:5], -8
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: scalar_xor_neg_inline_imm_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_xor_b64 s[2:3], s[2:3], -8
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_endpgm
%or = xor i64 %a, -8
store i64 %or, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}vector_xor_i64_neg_inline_imm:
-; SI: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]],
-; SI: v_xor_b32_e32 {{v[0-9]+}}, -8, v[[LO_VREG]]
-; SI: v_xor_b32_e32 {{v[0-9]+}}, -1, {{.*}}
-; SI: s_endpgm
define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; SI-LABEL: vector_xor_i64_neg_inline_imm:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s2
+; SI-NEXT: s_mov_b32 s9, s3
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_xor_b32_e32 v0, -8, v0
+; SI-NEXT: v_xor_b32_e32 v1, -1, v1
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: vector_xor_i64_neg_inline_imm:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_xor_b32_e32 v0, -8, v0
+; VI-NEXT: v_xor_b32_e32 v1, -1, v1
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
%loada = load i64, ptr addrspace(1) %a, align 8
%or = xor i64 %loada, -8
store i64 %or, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}vector_xor_literal_i64:
-; SI-DAG: buffer_load_dwordx2 v[[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]],
-; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0xdf77987f, v[[LO_VREG]]
-; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0x146f, v[[HI_VREG]]
-; SI: s_endpgm
define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+; SI-LABEL: vector_xor_literal_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s2
+; SI-NEXT: s_mov_b32 s9, s3
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_xor_b32_e32 v1, 0x146f, v1
+; SI-NEXT: v_xor_b32_e32 v0, 0xdf77987f, v0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: vector_xor_literal_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_xor_b32_e32 v1, 0x146f, v1
+; VI-NEXT: v_xor_b32_e32 v0, 0xdf77987f, v0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
%loada = load i64, ptr addrspace(1) %a, align 8
%or = xor i64 %loada, 22470723082367
store i64 %or, ptr addrspace(1) %out
ret void
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
>From cd55fe4e99f4de835905c6d95ec86ccc258cbc1e Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Tue, 4 Jun 2024 10:24:02 +0530
Subject: [PATCH 2/7] Fixed test CodeGen/AMDGPU/packed-op-sel.ll.
---
llvm/test/CodeGen/AMDGPU/packed-op-sel.ll | 628 +++++++++++++---------
1 file changed, 372 insertions(+), 256 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
index 4d6adc7cc9417..5417445fdae43 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
@@ -1,17 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; GCN-LABEL: {{^}}fma_vector_vector_scalar_lo:
-; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
-; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
-
-; GCN-NOT: pack
-; GCN-NOT: and
-; GCN-NOT: shl
-; GCN-NOT: or
-
-; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0]{{$}}
define amdgpu_kernel void @fma_vector_vector_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
+; GCN-LABEL: fma_vector_vector_scalar_lo:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: ds_read_b32 v2, v0
+; GCN-NEXT: ds_read_b32 v0, v0 offset:4
+; GCN-NEXT: ds_read_u16 v1, v1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0]
+; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -28,18 +32,21 @@ bb:
}
; Apply fneg to broadcasted vector
-; GCN-LABEL: {{^}}fma_vector_vector_neg_broadcast_scalar_lo:
-; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
-; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
-
-; GCN-NOT: pack
-; GCN-NOT: and
-; GCN-NOT: shl
-; GCN-NOT: or
-
-; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
+; GCN-LABEL: fma_vector_vector_neg_broadcast_scalar_lo:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: ds_read_b32 v2, v0
+; GCN-NEXT: ds_read_b32 v0, v0 offset:4
+; GCN-NEXT: ds_read_u16 v1, v1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -57,18 +64,21 @@ bb:
}
; Apply fneg before broadcast
-; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo:
-; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
-; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
-
-; GCN-NOT: pack
-; GCN-NOT: and
-; GCN-NOT: shl
-; GCN-NOT: or
-
-; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
+; GCN-LABEL: fma_vector_vector_neg_scalar_lo:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: ds_read_b32 v2, v0
+; GCN-NEXT: ds_read_b32 v0, v0 offset:4
+; GCN-NEXT: ds_read_u16 v1, v1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -86,18 +96,21 @@ bb:
}
; Apply fneg before and after broadcast, and should cancel out.
-; GCN-LABEL: {{^}}fma_vector_vector_neg_broadcast_neg_scalar_lo:
-; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
-; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
-
-; GCN-NOT: pack
-; GCN-NOT: and
-; GCN-NOT: shl
-; GCN-NOT: or
-
-; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0]{{$}}
define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
+; GCN-LABEL: fma_vector_vector_neg_broadcast_neg_scalar_lo:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: ds_read_b32 v2, v0
+; GCN-NEXT: ds_read_b32 v0, v0 offset:4
+; GCN-NEXT: ds_read_u16 v1, v1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0]
+; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -116,18 +129,21 @@ bb:
}
; Add scalar, but negate low component
-; GCN-LABEL: {{^}}fma_vector_vector_scalar_neg_lo:
-; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
-; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
-
-; GCN-NOT: pack
-; GCN-NOT: and
-; GCN-NOT: shl
-; GCN-NOT: or
-
-; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
+; GCN-LABEL: fma_vector_vector_scalar_neg_lo:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: ds_read_b32 v2, v0
+; GCN-NEXT: ds_read_b32 v0, v0 offset:4
+; GCN-NEXT: ds_read_u16 v1, v1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1]
+; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -144,18 +160,21 @@ bb:
}
; Add scalar, but negate high component
-; GCN-LABEL: {{^}}fma_vector_vector_scalar_neg_hi:
-; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
-; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
-
-; GCN-NOT: pack
-; GCN-NOT: and
-; GCN-NOT: shl
-; GCN-NOT: or
-
-; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
+; GCN-LABEL: fma_vector_vector_scalar_neg_hi:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: ds_read_b32 v2, v0
+; GCN-NEXT: ds_read_b32 v0, v0 offset:4
+; GCN-NEXT: ds_read_u16 v1, v1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -172,17 +191,20 @@ bb:
}
; Apply fneg before broadcast with bitcast
-; GCN-LABEL: {{^}}add_vector_neg_bitcast_scalar_lo:
-; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
-; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
-
-; GCN-NOT: pack
-; GCN-NOT: and
-; GCN-NOT: shl
-; GCN-NOT: or
-
-; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[SCALAR0]] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
+; GCN-LABEL: add_vector_neg_bitcast_scalar_lo:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: ds_read_b32 v0, v0
+; GCN-NEXT: ds_read_u16 v1, v1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_add_u16 v0, v0, v1 op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
+; GCN-NEXT: global_store_dword v2, v0, s[0:1]
+; GCN-NEXT: s_endpgm
bb:
%vec0 = load volatile <2 x i16>, ptr addrspace(3) %lds, align 4
%scalar0 = load volatile half, ptr addrspace(3) %arg2, align 2
@@ -197,19 +219,26 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}fma_vector_vector_scalar_lo_neg_scalar_hi:
-; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
-; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
-; GCN: ds_read_u16 [[SCALAR1:v[0-9]+]]
-
-; FIXME: Remove and
-; GCN-DAG: v_and_b32_e32 [[SCALAR0]], 0xffff, [[SCALAR0]]
-; GCN-DAG: v_xor_b32_e32 [[SCALAR1]], 0x8000, [[SCALAR1]]
-; GCN: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[SCALAR1]], 16, [[SCALAR0]]
-
-; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]]{{$}}
define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
+; GCN-LABEL: fma_vector_vector_scalar_lo_neg_scalar_hi:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: ds_read_b32 v2, v0
+; GCN-NEXT: ds_read_b32 v0, v0 offset:4
+; GCN-NEXT: ds_read_u16 v3, v1
+; GCN-NEXT: ds_read_u16 v1, v1 offset:4
+; GCN-NEXT: s_waitcnt lgkmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GCN-NEXT: v_lshl_or_b32 v1, v1, 16, v3
+; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1
+; GCN-NEXT: global_store_dword v4, v0, s[0:1]
+; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
%arg2.gep = getelementptr inbounds half, ptr addrspace(3) %arg2, i32 2
@@ -229,15 +258,23 @@ bb:
}
; FIXME: Can we avoid waitcnt between the two halves?
-; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo_scalar_hi:
-; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
-; GCN: ds_read_u16 [[PACKED:v[0-9]+]]
-; GCN: s_waitcnt
-; GCN: ds_read_u16_d16_hi [[PACKED]]
-
-; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 {
+; GCN-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: ds_read_b32 v2, v0
+; GCN-NEXT: ds_read_b32 v0, v0 offset:4
+; GCN-NEXT: ds_read_u16 v3, v1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ds_read_u16_d16_hi v3, v1 offset:4
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v3 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
%arg2.gep = getelementptr inbounds half, ptr addrspace(3) %arg2, i32 2
@@ -257,18 +294,21 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}fma_vector_vector_neg_vector_hi:
-; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
-
-; GCN-NOT: pack
-; GCN-NOT: and
-; GCN-NOT: shl
-; GCN-NOT: or
-
-; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_neg_vector_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: fma_vector_vector_neg_vector_hi:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: ds_read_b32 v1, v0
+; GCN-NEXT: ds_read_b32 v2, v0 offset:4
+; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
%lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2
@@ -285,18 +325,21 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}fma_vector_vector_vector_neg_hi:
-; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
-
-; GCN-NOT: pack
-; GCN-NOT: and
-; GCN-NOT: shl
-; GCN-NOT: or
-
-; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_vector_neg_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: fma_vector_vector_vector_neg_hi:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: ds_read_b32 v1, v0
+; GCN-NEXT: ds_read_b32 v2, v0 offset:4
+; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_hi:[0,0,1]
+; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
%lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2
@@ -314,17 +357,20 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}add_vector_scalar_hi:
-; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
-
-; GCN-NOT: pack
-; GCN-NOT: and
-; GCN-NOT: shl
-; GCN-NOT: or
-
-; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[VEC1]] op_sel:[0,1]{{$}}
define amdgpu_kernel void @add_vector_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: add_vector_scalar_hi:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: ds_read_b32 v1, v0
+; GCN-NEXT: ds_read_b32 v0, v0 offset:4
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_add_u16 v0, v1, v0 op_sel:[0,1]
+; GCN-NEXT: global_store_dword v2, v0, s[0:1]
+; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x i16>, ptr addrspace(3) %lds, i32 1
@@ -338,18 +384,21 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}fma_vector_vector_scalar_hi:
-; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
-
-; GCN-NOT: pack
-; GCN-NOT: and
-; GCN-NOT: shl
-; GCN-NOT: or
-
-; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: fma_vector_vector_scalar_hi:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: ds_read_b32 v1, v0
+; GCN-NEXT: ds_read_b32 v2, v0 offset:4
+; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1]
+; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
%lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2
@@ -366,18 +415,21 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}fma_vector_vector_neg_vector_lo_neg_hi:
-; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
-
-; GCN-NOT: pack
-; GCN-NOT: and
-; GCN-NOT: shl
-; GCN-NOT: or
-
-; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]]{{$}}
define amdgpu_kernel void @fma_vector_vector_neg_vector_lo_neg_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: fma_vector_vector_neg_vector_lo_neg_hi:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: ds_read_b32 v1, v0
+; GCN-NEXT: ds_read_b32 v2, v0 offset:4
+; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0
+; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
%lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2
@@ -396,18 +448,21 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}fma_vector_vector_swap_vector:
-; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
-
-; GCN-NOT: pack
-; GCN-NOT: and
-; GCN-NOT: shl
-; GCN-NOT: or
-
-; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
define amdgpu_kernel void @fma_vector_vector_swap_vector(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: fma_vector_vector_swap_vector:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: ds_read_b32 v1, v0
+; GCN-NEXT: ds_read_b32 v2, v0 offset:4
+; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0]
+; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
%lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2
@@ -423,19 +478,21 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}fma_vector_vector_swap_neg_vector:
-; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
-
-; GCN-NOT: pack
-; GCN-NOT: and
-; GCN-NOT: shl
-; GCN-NOT: or
-; GCN-NOT: xor
-
-; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_swap_neg_vector(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: fma_vector_vector_swap_neg_vector:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: ds_read_b32 v1, v0
+; GCN-NEXT: ds_read_b32 v2, v0 offset:4
+; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
%lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2
@@ -452,19 +509,21 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_0:
-; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
-
-; GCN-NOT: pack
-; GCN-NOT: and
-; GCN-NOT: shl
-; GCN-NOT: or
-; GCN-NOT: xor
-
-; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_0(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: fma_vector_vector_blend_vector_neg_vector_0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: ds_read_b32 v1, v0
+; GCN-NEXT: ds_read_b32 v2, v0 offset:4
+; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1]
+; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
%lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2
@@ -480,19 +539,21 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_1:
-; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
-
-; GCN-NOT: pack
-; GCN-NOT: and
-; GCN-NOT: shl
-; GCN-NOT: or
-; GCN-NOT: xor
-
-; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_lo:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_1(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: fma_vector_vector_blend_vector_neg_vector_1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: ds_read_b32 v1, v0
+; GCN-NEXT: ds_read_b32 v2, v0 offset:4
+; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_lo:[0,0,1]
+; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
%lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2
@@ -508,19 +569,21 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_2:
-; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
-
-; GCN-NOT: pack
-; GCN-NOT: and
-; GCN-NOT: shl
-; GCN-NOT: or
-; GCN-NOT: xor
-
-; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_2(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: fma_vector_vector_blend_vector_neg_vector_2:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: ds_read_b32 v1, v0
+; GCN-NEXT: ds_read_b32 v2, v0 offset:4
+; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_hi:[0,0,1]
+; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
%lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2
@@ -536,19 +599,21 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_3:
-; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
-
-; GCN-NOT: pack
-; GCN-NOT: and
-; GCN-NOT: shl
-; GCN-NOT: or
-; GCN-NOT: xor
-
-; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_3(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: fma_vector_vector_blend_vector_neg_vector_3:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: ds_read_b32 v1, v0
+; GCN-NEXT: ds_read_b32 v2, v0 offset:4
+; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] neg_lo:[0,0,1]
+; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
%lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2
@@ -564,9 +629,22 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}bitcast_fneg_f32:
-; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
define amdgpu_kernel void @bitcast_fneg_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: bitcast_fneg_f32:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: ds_read_b32 v0, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ds_read_b32 v1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT: v_pk_add_f16 v0, v0, v1
+; GCN-NEXT: global_store_dword v2, v0, s[0:1]
+; GCN-NEXT: s_endpgm
bb:
%vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
%f32 = load volatile float, ptr addrspace(3) undef, align 4
@@ -578,9 +656,22 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}shuffle_bitcast_fneg_f32:
-; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} op_sel:[0,1] op_sel_hi:[1,0]{{$}}
define amdgpu_kernel void @shuffle_bitcast_fneg_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: shuffle_bitcast_fneg_f32:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: ds_read_b32 v0, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ds_read_b32 v1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT: v_pk_add_f16 v0, v0, v1 op_sel:[0,1] op_sel_hi:[1,0]
+; GCN-NEXT: global_store_dword v2, v0, s[0:1]
+; GCN-NEXT: s_endpgm
bb:
%vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
@@ -593,10 +684,24 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}extract_from_i64:
-; GCN: v_lshl_or_b32
-; GCN: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}}
define amdgpu_kernel void @extract_from_i64(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: extract_from_i64:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: v_mov_b32_e32 v3, 0xffff
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: ds_read_b32 v2, v0
+; GCN-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: v_and_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GCN-NEXT: v_lshl_or_b32 v0, v0, 16, v3
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_add_u16 v0, v2, v0
+; GCN-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-NEXT: s_endpgm
bb:
%vec0 = load volatile <2 x i16>, ptr addrspace(3) %lds, align 4
%i64 = load volatile i64, ptr addrspace(1) undef
@@ -612,21 +717,24 @@ bb:
ret void
}
-
-; Bitcast is final obstacle to identifying same source register
-; GCN-LABEL: {{^}}bitcast_lo_elt_op_sel:
-; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
-
-; GCN-NOT: pack
-; GCN-NOT: and
-; GCN-NOT: shl
-; GCN-NOT: _or
-
-; GCN: v_pk_add_f16 [[FADD:v[0-9]+]]
-; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
define amdgpu_kernel void @bitcast_lo_elt_op_sel(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: bitcast_lo_elt_op_sel:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: ds_read_b32 v1, v0
+; GCN-NEXT: ds_read_b32 v2, v0 offset:4
+; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_load_ushort v3, v[0:1], off glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0]
+; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
%lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2
@@ -647,21 +755,29 @@ bb:
ret void
}
-
-; Bitcast is final obstacle to identifying same source register
-; GCN-LABEL: {{^}}mix_elt_types_op_sel:
-; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
-; GCN: ds_read_b32 [[VEC2:v[0-9]+]]
-
-; GCN-NOT: pack
-; GCN-NOT: and
-; GCN-NOT: shl
-; GCN-NOT: _or
-
-; GCN: v_pk_add_f16 [[FADD:v[0-9]+]]
-; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}}
define amdgpu_kernel void @mix_elt_types_op_sel(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
+; GCN-LABEL: mix_elt_types_op_sel:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: ds_read_b32 v1, v0
+; GCN-NEXT: ds_read_b32 v2, v0 offset:4
+; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: ; kill: killed $vgpr0_vgpr1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_load_ushort v3, v[0:1], off glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_load_ushort v3, v[0:1], off glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; kill: killed $vgpr0_vgpr1
+; GCN-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0]
+; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
%lds.gep2 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 2
>From 12be1de22099757fc1fb06cc4f7e105c27a5ba29 Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Mon, 3 Jun 2024 15:00:02 +0530
Subject: [PATCH 3/7] Fixed test CodeGen/AMDGPU/llvm.read.local.size.ll.
---
.../AMDGPU/llvm.r600.read.local.size.ll | 189 ++++++-----
.../CodeGen/AMDGPU/llvm.read.local.size.ll | 321 ++++++++++++++++++
2 files changed, 424 insertions(+), 86 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.read.local.size.ll
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
index ab035b9de04b9..43a052ae20f22 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
@@ -1,63 +1,68 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=SI,GCN,SI-NOHSA,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=VI,VI-NOHSA,GCN,FUNC %s
-; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=EG,FUNC %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600 %s
-
-; FUNC-LABEL: {{^}}local_size_x:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV * [[VAL]], KC0[1].Z
-
-; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
-; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
-; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x1
-; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x4
-
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_x(ptr addrspace(1) %out) {
+; R600-LABEL: local_size_x:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; R600-NEXT: MOV * T1.X, KC0[1].Z,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%0 = call i32 @llvm.r600.read.local.size.x() #0
store i32 %0, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}local_size_y:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV * [[VAL]], KC0[1].W
-
-; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
-; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_y(ptr addrspace(1) %out) {
+; R600-LABEL: local_size_y:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; R600-NEXT: MOV * T1.X, KC0[1].W,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%0 = call i32 @llvm.r600.read.local.size.y() #0
store i32 %0, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}local_size_z:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV * [[VAL]], KC0[2].X
-
-; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
-; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_z(ptr addrspace(1) %out) {
+; R600-LABEL: local_size_z:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; R600-NEXT: MOV * T1.X, KC0[2].X,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%0 = call i32 @llvm.r600.read.local.size.z() #0
store i32 %0, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}local_size_xy:
-; SI-NOHSA-DAG: s_load_dwordx2 s[[[X:[0-9]+]]:[[Y:[0-9+]]]], s[0:1], 0x6
-; VI-NOHSA-DAG: s_load_dwordx2 s[[[X:[0-9]+]]:[[Y:[0-9+]]]], s[0:1], 0x18
-; GCN: s_mul_i32 [[VAL:s[0-9]+]], s[[X]], s[[Y]]
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_xy(ptr addrspace(1) %out) {
+; R600-LABEL: local_size_xy:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; R600-NEXT: MULLO_INT * T1.X, KC0[1].Z, KC0[1].W,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%x = call i32 @llvm.r600.read.local.size.x() #0
%y = call i32 @llvm.r600.read.local.size.y() #0
@@ -66,17 +71,17 @@ entry:
ret void
}
-; FUNC-LABEL: {{^}}local_size_xz:
-
-; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6
-; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8
-; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18
-; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20
-; HSA-DAG: s_and_b32 [[X:s[0-9]+]], [[XY]], 0xffff
-; GCN: s_mul_i32 [[VAL:s[0-9]+]], [[X]], [[Z]]
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_xz(ptr addrspace(1) %out) {
+; R600-LABEL: local_size_xz:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; R600-NEXT: MULLO_INT * T1.X, KC0[1].Z, KC0[2].X,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%x = call i32 @llvm.r600.read.local.size.x() #0
%z = call i32 @llvm.r600.read.local.size.z() #0
@@ -85,16 +90,17 @@ entry:
ret void
}
-; FUNC-LABEL: {{^}}local_size_yz:
-; HSA: enable_sgpr_private_segment_buffer = 1
-; HSA: enable_sgpr_dispatch_ptr = 1
-
-; SI-NOHSA-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x7
-; VI-NOHSA-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x1c
-; GCN: s_mul_i32 [[VAL:s[0-9]+]], s[[#LOAD + 0]], s[[#LOAD + 1]]
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_yz(ptr addrspace(1) %out) {
+; R600-LABEL: local_size_yz:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; R600-NEXT: MULLO_INT * T1.X, KC0[1].W, KC0[2].X,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%y = call i32 @llvm.r600.read.local.size.y() #0
%z = call i32 @llvm.r600.read.local.size.z() #0
@@ -103,19 +109,18 @@ entry:
ret void
}
-; FUNC-LABEL: {{^}}local_size_xyz:
-; HSA: enable_sgpr_private_segment_buffer = 1
-; HSA: enable_sgpr_dispatch_ptr = 1
-
-; SI-NOHSA-DAG: s_load_dwordx2 s[[[X:[0-9]+]]:[[Y:[0-9]+]]], s[0:1], 0x6
-; SI-NOHSA-DAG: s_load_dword s[[Z:[0-9]+]], s[0:1], 0x8
-; VI-NOHSA-DAG: s_load_dwordx2 s[[[X:[0-9]+]]:[[Y:[0-9]+]]], s[0:1], 0x18
-; VI-NOHSA-DAG: s_load_dword s[[Z:[0-9]+]], s[0:1], 0x20
-; GCN: s_mul_i32 [[M:s[0-9]+]], s[[X]], s[[Y]]
-; GCN: s_add_i32 [[VAL:s[0-9]+]], [[M]], s[[Z]]
-; GCN-DAG: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_xyz(ptr addrspace(1) %out) {
+; R600-LABEL: local_size_xyz:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MULLO_INT * T0.X, KC0[1].Z, KC0[1].W,
+; R600-NEXT: ADD_INT T0.X, PS, KC0[2].X,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%x = call i32 @llvm.r600.read.local.size.x() #0
%y = call i32 @llvm.r600.read.local.size.y() #0
@@ -126,13 +131,17 @@ entry:
ret void
}
-; FUNC-LABEL: {{^}}local_size_x_known_bits:
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
-; GCN-NOT: 0xffff
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN-NEXT: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_x_known_bits(ptr addrspace(1) %out) {
+; R600-LABEL: local_size_x_known_bits:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; R600-NEXT: AND_INT * T1.X, KC0[1].Z, literal.y,
+; R600-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
entry:
%size = call i32 @llvm.r600.read.local.size.x() #0
%shl = shl i32 %size, 16
@@ -141,13 +150,17 @@ entry:
ret void
}
-; FUNC-LABEL: {{^}}local_size_y_known_bits:
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
-; GCN-NOT: 0xffff
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN-NEXT: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_y_known_bits(ptr addrspace(1) %out) {
+; R600-LABEL: local_size_y_known_bits:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; R600-NEXT: AND_INT * T1.X, KC0[1].W, literal.y,
+; R600-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
entry:
%size = call i32 @llvm.r600.read.local.size.y() #0
%shl = shl i32 %size, 16
@@ -156,13 +169,17 @@ entry:
ret void
}
-; FUNC-LABEL: {{^}}local_size_z_known_bits:
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
-; GCN-NOT: 0xffff
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN-NEXT: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_z_known_bits(ptr addrspace(1) %out) {
+; R600-LABEL: local_size_z_known_bits:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; R600-NEXT: AND_INT * T1.X, KC0[2].X, literal.y,
+; R600-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
entry:
%size = call i32 @llvm.r600.read.local.size.z() #0
%shl = shl i32 %size, 16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.read.local.size.ll b/llvm/test/CodeGen/AMDGPU/llvm.read.local.size.ll
new file mode 100644
index 0000000000000..cdee68d150813
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.read.local.size.ll
@@ -0,0 +1,321 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=SI,GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=VI,GCN %s
+
+define amdgpu_kernel void @local_size_x(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_x:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dword s4, s[0:1], 0x6
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_x:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dword s0, s[0:1], 0x18
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+entry:
+ %0 = call i32 @llvm.r600.read.local.size.x() #0
+ store i32 %0, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @local_size_y(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_y:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dword s4, s[0:1], 0x7
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_y:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dword s0, s[0:1], 0x1c
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+entry:
+ %0 = call i32 @llvm.r600.read.local.size.y() #0
+ store i32 %0, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @local_size_z(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_z:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dword s4, s[0:1], 0x8
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_z:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dword s0, s[0:1], 0x20
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+entry:
+ %0 = call i32 @llvm.r600.read.local.size.z() #0
+ store i32 %0, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @local_size_xy(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_xy:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x6
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mul_i32 s4, s4, s5
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_xy:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x18
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mul_i32 s2, s2, s3
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+entry:
+ %x = call i32 @llvm.r600.read.local.size.x() #0
+ %y = call i32 @llvm.r600.read.local.size.y() #0
+ %val = mul i32 %x, %y
+ store i32 %val, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @local_size_xz(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_xz:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dword s2, s[0:1], 0x6
+; SI-NEXT: s_load_dword s4, s[0:1], 0x8
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mul_i32 s4, s2, s4
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_xz:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s2, s[0:1], 0x18
+; VI-NEXT: s_load_dword s3, s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mul_i32 s2, s2, s3
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+entry:
+ %x = call i32 @llvm.r600.read.local.size.x() #0
+ %z = call i32 @llvm.r600.read.local.size.z() #0
+ %val = mul i32 %x, %z
+ store i32 %val, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @local_size_yz(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_yz:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x7
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mul_i32 s0, s0, s1
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s2
+; SI-NEXT: s_mov_b32 s5, s3
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_yz:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x1c
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mul_i32 s0, s0, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+entry:
+ %y = call i32 @llvm.r600.read.local.size.y() #0
+ %z = call i32 @llvm.r600.read.local.size.z() #0
+ %val = mul i32 %y, %z
+ store i32 %val, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @local_size_xyz(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_xyz:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x6
+; SI-NEXT: s_load_dword s2, s[0:1], 0x8
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mul_i32 s4, s4, s5
+; SI-NEXT: s_add_i32 s4, s4, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_xyz:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x18
+; VI-NEXT: s_load_dword s4, s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mul_i32 s2, s2, s3
+; VI-NEXT: s_add_i32 s2, s2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+entry:
+ %x = call i32 @llvm.r600.read.local.size.x() #0
+ %y = call i32 @llvm.r600.read.local.size.y() #0
+ %z = call i32 @llvm.r600.read.local.size.z() #0
+ %xy = mul i32 %x, %y
+ %xyz = add i32 %xy, %z
+ store i32 %xyz, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @local_size_x_known_bits(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_x_known_bits:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dword s4, s[0:1], 0x6
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_x_known_bits:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dword s0, s[0:1], 0x18
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+entry:
+ %size = call i32 @llvm.r600.read.local.size.x() #0
+ %shl = shl i32 %size, 16
+ %shr = lshr i32 %shl, 16
+ store i32 %shr, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @local_size_y_known_bits(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_y_known_bits:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dword s4, s[0:1], 0x7
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_y_known_bits:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dword s0, s[0:1], 0x1c
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+entry:
+ %size = call i32 @llvm.r600.read.local.size.y() #0
+ %shl = shl i32 %size, 16
+ %shr = lshr i32 %shl, 16
+ store i32 %shr, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @local_size_z_known_bits(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_z_known_bits:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dword s4, s[0:1], 0x8
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_z_known_bits:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dword s0, s[0:1], 0x20
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+entry:
+ %size = call i32 @llvm.r600.read.local.size.z() #0
+ %shl = shl i32 %size, 16
+ %shr = lshr i32 %shl, 16
+ store i32 %shr, ptr addrspace(1) %out
+ ret void
+}
+
+declare i32 @llvm.r600.read.local.size.x() #0
+declare i32 @llvm.r600.read.local.size.y() #0
+declare i32 @llvm.r600.read.local.size.z() #0
+
+attributes #0 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
>From d6150709e6706f2e3c608e94c48e2ffe87629a73 Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Tue, 4 Jun 2024 09:05:53 +0530
Subject: [PATCH 4/7] Fixed test CodeGen/AMDGPU/fneg.ll.
---
llvm/test/CodeGen/AMDGPU/fneg.ll | 781 +++++++++++++++++++++++++------
1 file changed, 633 insertions(+), 148 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
index 03ca780c90322..f88ab5458adcb 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -1,89 +1,279 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=FUNC,GCN,SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=FUNC,GCN,VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=FUNC,GCN,GFX11 %s
-; RUN: not llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefixes=FUNC,R600 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s
+; RUN: not llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=R600 %s
-; FUNC-LABEL: {{^}}s_fneg_f32:
-; R600: -PV
-
-; GCN: s_load_{{dword|b32}} [[VAL:s[0-9]+]]
-; GCN: s_xor_b32 [[NEG_VAL:s[0-9]+]], [[VAL]], 0x80000000
-; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[NEG_VAL]]
define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) {
+; SI-LABEL: s_fneg_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_xor_b32 s2, s2, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fneg = fsub float -0.000000e+00, %in
store float %fneg, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}s_fneg_v2f32:
-; R600: -PV
-; R600: -PV
-
-; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
-; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x float> %in) {
+; SI-LABEL: s_fneg_v2f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_xor_b32 s0, s3, 0x80000000
+; SI-NEXT: s_xor_b32 s1, s2, 0x80000000
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: v_mov_b32_e32 v1, s0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_v2f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_xor_b32 s3, s3, 0x80000000
+; VI-NEXT: s_xor_b32 s2, s2, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_v2f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fneg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in
store <2 x float> %fneg, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}s_fneg_v4f32:
-; R600: -PV
-; R600: -T
-; R600: -PV
-; R600: -PV
-
-; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
-; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
-; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
-; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x float> %in) {
+; SI-LABEL: s_fneg_v4f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_xor_b32 s7, s7, 0x80000000
+; SI-NEXT: s_xor_b32 s6, s6, 0x80000000
+; SI-NEXT: s_xor_b32 s5, s5, 0x80000000
+; SI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: v_mov_b32_e32 v2, s6
+; SI-NEXT: v_mov_b32_e32 v3, s7
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_v4f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_xor_b32 s2, s7, 0x80000000
+; VI-NEXT: s_xor_b32 s3, s6, 0x80000000
+; VI-NEXT: s_xor_b32 s5, s5, 0x80000000
+; VI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v3, s2
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_v4f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s2, s7, 0x80000000
+; GFX11-NEXT: s_xor_b32 s3, s6, 0x80000000
+; GFX11-NEXT: s_xor_b32 s4, s4, 0x80000000
+; GFX11-NEXT: s_xor_b32 s5, s5, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s2
+; GFX11-NEXT: v_mov_b32_e32 v2, s3
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fneg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in
store <4 x float> %fneg, ptr addrspace(1) %out
ret void
}
-; DAGCombiner will transform:
-; (fneg (f32 bitcast (i32 a))) => (f32 bitcast (xor (i32 a), 0x80000000))
-; unless the target returns true for isNegFree()
-
-; FUNC-LABEL: {{^}}fsub0_f32:
-
-; GCN: v_sub_f32_e64 v{{[0-9]}}, 0, s{{[0-9]+$}}
-
-; R600-NOT: XOR
-; R600: -KC0[2].Z
define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: fsub0_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_sub_f32_e64 v0, 0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fsub0_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_sub_f32_e64 v2, 0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fsub0_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_sub_f32_e64 v1, 0, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%bc = bitcast i32 %in to float
%fsub = fsub float 0.0, %bc
store float %fsub, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}fneg_free_f32:
-; SI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
-; VI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
-; GFX11: s_load_b32 [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
-
-; GCN: s_xor_b32 [[RES:s[0-9]+]], [[NEG_VALUE]], 0x80000000
-; GCN: v_mov_b32_e32 [[V_RES:v[0-9]+]], [[RES]]
-; GCN: buffer_store_{{dword|b32}} [[V_RES]]
-; R600-NOT: XOR
-; R600: -PV.W
define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: fneg_free_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fneg_free_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_xor_b32 s2, s2, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fneg_free_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%bc = bitcast i32 %in to float
%fsub = fsub float -0.0, %bc
store float %fsub, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}fneg_fold_f32:
-; SI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
-; VI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
-; GFX11: s_load_{{dword|b32}} [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
-; GCN-NOT: xor
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]]
define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) {
+; SI-LABEL: fneg_fold_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mul_f32_e64 v0, -s4, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fneg_fold_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mul_f32_e64 v2, -s2, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fneg_fold_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mul_f32_e64 v1, -s2, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fsub = fsub float -0.0, %in
%fmul = fmul float %fsub, %in
store float %fmul, ptr addrspace(1) %out
@@ -91,9 +281,41 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) {
}
; Make sure we turn some integer operations back into fabs
-; FUNC-LABEL: {{^}}bitpreserve_fneg_f32:
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -4.0
define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in) {
+; SI-LABEL: bitpreserve_fneg_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mul_f32_e64 v0, s4, -4.0
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: bitpreserve_fneg_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mul_f32_e64 v2, s2, -4.0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitpreserve_fneg_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mul_f32_e64 v1, s2, -4.0
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%in.bc = bitcast float %in to i32
%int.abs = xor i32 %in.bc, 2147483648
%bc = bitcast i32 %int.abs to float
@@ -102,29 +324,94 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in
ret void
}
-; FUNC-LABEL: {{^}}s_fneg_i32:
-; GCN: s_load_{{dword|b32}} [[IN:s[0-9]+]]
-; GCN: s_xor_b32 [[FNEG:s[0-9]+]], [[IN]], 0x80000000
-; GCN: v_mov_b32_e32 [[V_FNEG:v[0-9]+]], [[FNEG]]
define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: s_fneg_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_xor_b32 s2, s2, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fneg = xor i32 %in, -2147483648
store i32 %fneg, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}v_fneg_i32:
-; GCN: s_waitcnt
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GCN-NEXT: s_setpc_b64
define i32 @v_fneg_i32(i32 %in) {
+; GCN-LABEL: v_fneg_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
%fneg = xor i32 %in, -2147483648
ret i32 %fneg
}
-; FUNC-LABEL: {{^}}s_fneg_i32_fp_use:
-; GCN: s_load_{{dword|b32}} [[IN:s[0-9]+]]
-; GCN: v_sub_f32_e64 v{{[0-9]+}}, 2.0, [[IN]]
define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: s_fneg_i32_fp_use:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_sub_f32_e64 v0, 2.0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_i32_fp_use:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_sub_f32_e64 v2, 2.0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_i32_fp_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_sub_f32_e64 v1, 2.0, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fneg = xor i32 %in, -2147483648
%bitcast = bitcast i32 %fneg to float
%fadd = fadd float %bitcast, 2.0
@@ -132,37 +419,105 @@ define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) {
ret void
}
-; FUNC-LABEL: {{^}}v_fneg_i32_fp_use:
-; GCN: s_waitcnt
-; GCN-NEXT: v_sub_f32_e32 v0, 2.0, v0
-; GCN-NEXT: s_setpc_b64
define float @v_fneg_i32_fp_use(i32 %in) {
+; GCN-LABEL: v_fneg_i32_fp_use:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sub_f32_e32 v0, 2.0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
%fneg = xor i32 %in, -2147483648
%bitcast = bitcast i32 %fneg to float
%fadd = fadd float %bitcast, 2.0
ret float %fadd
}
-; FUNC-LABEL: {{^}}s_fneg_i64:
-; GCN: s_xor_b32 s[[NEG_HI:[0-9]+]], s{{[0-9]+}}, 0x80000000
define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: s_fneg_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_xor_b32 s0, s3, 0x80000000
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_mov_b32_e32 v1, s0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_xor_b32 s0, s3, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fneg = xor i64 %in, -9223372036854775808
store i64 %fneg, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}v_fneg_i64:
-; GCN: s_waitcnt
-; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GCN-NEXT: s_setpc_b64
define i64 @v_fneg_i64(i64 %in) {
+; GCN-LABEL: v_fneg_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
%fneg = xor i64 %in, -9223372036854775808
ret i64 %fneg
}
-; FUNC-LABEL: {{^}}s_fneg_i64_fp_use:
-; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, -s{{\[[0-9]+:[0-9]+\]}}, 2.0
define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: s_fneg_i64_fp_use:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_i64_fp_use:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_i64_fp_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fneg = xor i64 %in, -9223372036854775808
%bitcast = bitcast i64 %fneg to double
%fadd = fadd double %bitcast, 2.0
@@ -170,34 +525,65 @@ define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) {
ret void
}
-; FUNC-LABEL: {{^}}v_fneg_i64_fp_use:
-; GCN: s_waitcnt
-; GCN-NEXT: v_add_f64 v[0:1], -v[0:1], 2.0
-; GCN-NEXT: s_setpc_b64
define double @v_fneg_i64_fp_use(i64 %in) {
+; GCN-LABEL: v_fneg_i64_fp_use:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_add_f64 v[0:1], -v[0:1], 2.0
+; GCN-NEXT: s_setpc_b64 s[30:31]
%fneg = xor i64 %in, -9223372036854775808
%bitcast = bitcast i64 %fneg to double
%fadd = fadd double %bitcast, 2.0
ret double %fadd
}
-; FUNC-LABEL: {{^}}v_fneg_i16:
-; GCN: s_waitcnt
-; GCN-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
-; GCN-NEXT: s_setpc_b64
define i16 @v_fneg_i16(i16 %in) {
+; GCN-LABEL: v_fneg_i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
%fneg = xor i16 %in, -32768
ret i16 %fneg
}
-; FUNC-LABEL: {{^}}s_fneg_i16_fp_use:
-; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], s{{[0-9]+}}
-; SI: v_sub_f32_e32 [[ADD:v[0-9]+]], 2.0, [[CVT0]]
-; SI: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], [[ADD]]
-
-; VI: s_load_dword [[IN:s[0-9]+]]
-; VI: v_sub_f16_e64 v{{[0-9]+}}, 2.0, [[IN]]
define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) {
+; SI-LABEL: s_fneg_i16_fp_use:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_i16_fp_use:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_sub_f16_e64 v2, 2.0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_short v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_i16_fp_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_sub_f16_e64 v1, 2.0, s2
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fneg = xor i16 %in, -32768
%bitcast = bitcast i16 %fneg to half
%fadd = fadd half %bitcast, 2.0
@@ -205,69 +591,157 @@ define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) {
ret void
}
-; FUNC-LABEL: {{^}}v_fneg_i16_fp_use:
-; SI: s_waitcnt
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0
-; SI-NEXT: s_setpc_b64
-
-; VI: s_waitcnt
-; VI-NEXT: v_sub_f16_e32 v0, 2.0, v0
-; VI-NEXT: s_setpc_b64
define half @v_fneg_i16_fp_use(i16 %in) {
+; SI-LABEL: v_fneg_i16_fp_use:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_fneg_i16_fp_use:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_sub_f16_e32 v0, 2.0, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fneg_i16_fp_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sub_f16_e32 v0, 2.0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%fneg = xor i16 %in, -32768
%bitcast = bitcast i16 %fneg to half
%fadd = fadd half %bitcast, 2.0
ret half %fadd
}
-; FUNC-LABEL: {{^}}s_fneg_v2i16:
-; SI: s_xor_b32 s4, s4, 0x80008000
-
-; VI: s_lshr_b32 s5, s4, 16
-; VI: s_xor_b32 s4, s4, 0x8000
-; VI: s_xor_b32 s5, s5, 0x8000
-; VI: s_and_b32 s4, s4, 0xffff
-; VI: s_lshl_b32 s5, s5, 16
-; VI: s_or_b32 s4, s4, s5
define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) {
+; SI-LABEL: s_fneg_v2i16:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_xor_b32 s4, s4, 0x80008000
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_v2i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshr_b32 s3, s2, 16
+; VI-NEXT: s_xor_b32 s2, s2, 0x8000
+; VI-NEXT: s_xor_b32 s3, s3, 0x8000
+; VI-NEXT: s_and_b32 s2, s2, 0xffff
+; VI-NEXT: s_lshl_b32 s3, s3, 16
+; VI-NEXT: s_or_b32 s2, s2, s3
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_v2i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%in = bitcast i32 %arg to <2 x i16>
%fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
store <2 x i16> %fneg, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}v_fneg_v2i16:
-; SI: v_xor_b32_e32 v1, 0x8000, v1
-; SI: v_xor_b32_e32 v0, 0x8000, v0
-; SI: v_lshlrev_b32_e32 v2, 16, v1
-; SI: v_and_b32_e32 v0, 0xffff, v0
-; SI: v_or_b32_e32 v0, v0, v2
-; SI: v_and_b32_e32 v1, 0xffff, v1
-
-; VI: s_waitcnt
-; VI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; VI-NEXT: s_setpc_b64
define <2 x i16> @v_fneg_v2i16(<2 x i16> %in) {
+; SI-LABEL: v_fneg_v2i16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_fneg_v2i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fneg_v2i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
ret <2 x i16> %fneg
}
-; FUNC-LABEL: {{^}}s_fneg_v2i16_fp_use:
-; SI: s_lshr_b32 s3, s2, 16
-; SI: v_cvt_f32_f16_e32 v0, s3
-; SI: v_cvt_f32_f16_e32 v1, s2
-; SI: v_sub_f32_e32 v0, 2.0, v0
-; SI: v_sub_f32_e32 v1, 2.0, v1
-
-; VI: s_lshr_b32 s5, s4, 16
-; VI: s_xor_b32 s5, s5, 0x8000
-; VI: s_xor_b32 s4, s4, 0x8000
-; VI: v_mov_b32_e32 v0, s5
-; VI: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI: v_add_f16_e64 v1, s4, 2.0
-; VI: v_or_b32_e32 v0, v1, v0
define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg) {
+; SI-LABEL: s_fneg_v2i16_fp_use:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_lshr_b32 s3, s2, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_sub_f32_e32 v1, 2.0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_v2i16_fp_use:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: v_mov_b32_e32 v0, 0x4000
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshr_b32 s3, s2, 16
+; VI-NEXT: s_xor_b32 s3, s3, 0x8000
+; VI-NEXT: s_xor_b32 s2, s2, 0x8000
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_add_f16_e64 v1, s2, 2.0
+; VI-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v1, v0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_v2i16_fp_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v1, s2, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%in = bitcast i32 %arg to <2 x i16>
%fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
%bitcast = bitcast <2 x i16> %fneg to <2 x half>
@@ -276,20 +750,31 @@ define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg)
ret void
}
-; FUNC-LABEL: {{^}}v_fneg_v2i16_fp_use:
-; SI: v_lshrrev_b32_e32 v1, 16, v0
-; SI: v_cvt_f32_f16_e32 v0, v0
-; SI: v_cvt_f32_f16_e32 v1, v1
-; SI: v_sub_f32_e32 v0, 2.0, v0
-; SI: v_sub_f32_e32 v1, 2.0, v1
-
-; VI: s_waitcnt
-; VI: v_mov_b32_e32 v1, 0x4000
-; VI: v_sub_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI: v_sub_f16_e32 v0, 2.0, v0
-; VI: v_or_b32_e32 v0, v0, v1
-; VI: s_setpc_b64
define <2 x half> @v_fneg_v2i16_fp_use(i32 %arg) {
+; SI-LABEL: v_fneg_v2i16_fp_use:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0
+; SI-NEXT: v_sub_f32_e32 v1, 2.0, v1
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_fneg_v2i16_fp_use:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, 0x4000
+; VI-NEXT: v_sub_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_sub_f16_e32 v0, 2.0, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fneg_v2i16_fp_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%in = bitcast i32 %arg to <2 x i16>
%fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
%bitcast = bitcast <2 x i16> %fneg to <2 x half>
From bfdf74aeee004de3c3c8eb9f5ef3a4632a0e40ec Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Tue, 4 Jun 2024 09:51:31 +0530
Subject: [PATCH 5/7] Fixed test CodeGen/AMDGPU/fneg-fabs.ll.
---
llvm/test/CodeGen/AMDGPU/fneg-fabs-r600.ll | 180 ++++++++++++++
llvm/test/CodeGen/AMDGPU/fneg-fabs.ll | 258 +++++++++++++++++----
2 files changed, 393 insertions(+), 45 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/fneg-fabs-r600.ll
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs-r600.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs-r600.ll
new file mode 100644
index 0000000000000..4f5271ed23252
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs-r600.ll
@@ -0,0 +1,180 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 %s
+
+define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, float %y) {
+; R600-LABEL: fneg_fabsf_fadd_f32:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; R600-NEXT: ADD * T1.X, KC0[2].W, -|KC0[2].Z|,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %fabs = call float @llvm.fabs.f32(float %x)
+ %fsub = fsub float -0.000000e+00, %fabs
+ %fadd = fadd float %y, %fsub
+ store float %fadd, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, float %y) {
+; R600-LABEL: fneg_fabsf_fmul_f32:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; R600-NEXT: MUL_IEEE * T1.X, KC0[2].W, -|KC0[2].Z|,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %fabs = call float @llvm.fabs.f32(float %x)
+ %fsub = fsub float -0.000000e+00, %fabs
+ %fmul = fmul float %y, %fsub
+ store float %fmul, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) {
+; R600-LABEL: fneg_fabsf_free_f32:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MOV * T0.W, KC0[2].Z,
+; R600-NEXT: MOV * T0.W, |PV.W|,
+; R600-NEXT: MOV T0.X, -PV.W,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %bc = bitcast i32 %in to float
+ %fabs = call float @llvm.fabs.f32(float %bc)
+ %fsub = fsub float -0.000000e+00, %fabs
+ store float %fsub, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @fneg_fabsf_fn_free_f32(ptr addrspace(1) %out, i32 %in) {
+; R600-LABEL: fneg_fabsf_fn_free_f32:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MOV * T0.W, KC0[2].Z,
+; R600-NEXT: MOV * T0.W, |PV.W|,
+; R600-NEXT: MOV T0.X, -PV.W,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %bc = bitcast i32 %in to float
+ %fabs = call float @fabsf(float %bc)
+ %fsub = fsub float -0.000000e+00, %fabs
+ store float %fsub, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) {
+; R600-LABEL: fneg_fabsf_f32:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MOV * T0.W, KC0[2].Z,
+; R600-NEXT: MOV * T0.W, |PV.W|,
+; R600-NEXT: MOV T0.X, -PV.W,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %fabs = call float @llvm.fabs.f32(float %in)
+ %fsub = fsub float -0.000000e+00, %fabs
+ store float %fsub, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; R600-LABEL: v_fneg_fabsf_f32:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; R600-NEXT: TEX 0 @6
+; R600-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: Fetch clause starting at 6:
+; R600-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; R600-NEXT: ALU clause starting at 8:
+; R600-NEXT: MOV * T0.X, KC0[2].Z,
+; R600-NEXT: ALU clause starting at 9:
+; R600-NEXT: MOV * T0.W, |T0.X|,
+; R600-NEXT: MOV T0.X, -PV.W,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %val = load float, ptr addrspace(1) %in, align 4
+ %fabs = call float @llvm.fabs.f32(float %val)
+ %fsub = fsub float -0.000000e+00, %fabs
+ store float %fsub, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
+; R600-LABEL: fneg_fabsf_v2f32:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MOV T0.W, KC0[3].X,
+; R600-NEXT: MOV * T1.W, KC0[2].W,
+; R600-NEXT: MOV * T0.W, |PV.W|,
+; R600-NEXT: MOV T0.Y, -PV.W,
+; R600-NEXT: MOV * T0.W, |T1.W|,
+; R600-NEXT: MOV T0.X, -PV.W,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
+ %fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs
+ store <2 x float> %fsub, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
+; R600-LABEL: fneg_fabsf_v4f32:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MOV * T0.W, KC0[4].X,
+; R600-NEXT: MOV T0.W, |PV.W|,
+; R600-NEXT: MOV * T1.W, KC0[3].W,
+; R600-NEXT: MOV T0.Z, KC0[3].Z,
+; R600-NEXT: MOV T1.W, |PS|,
+; R600-NEXT: MOV * T2.W, -PV.W,
+; R600-NEXT: MOV T2.Z, -PV.W,
+; R600-NEXT: MOV T0.W, KC0[3].Y,
+; R600-NEXT: MOV * T1.W, |PV.Z|,
+; R600-NEXT: MOV T2.Y, -PS,
+; R600-NEXT: MOV * T0.W, |PV.W|,
+; R600-NEXT: MOV T2.X, -PV.W,
+; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
+ %fsub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %fabs
+ store <4 x float> %fsub, ptr addrspace(1) %out
+ ret void
+}
+
+declare float @fabsf(float) readnone
+declare float @llvm.fabs.f32(float) readnone
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
index b0c17828cb13b..63b44a18c1d25 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -1,11 +1,31 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=SI,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=VI,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600,FUNC %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=SI,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=VI,FUNC %s
-; FUNC-LABEL: {{^}}fneg_fabsf_fadd_f32:
-; SI-NOT: and
-; SI: v_sub_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}|
define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, float %y) {
+; SI-LABEL: fneg_fabsf_fadd_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_sub_f32_e64 v0, s3, |v0|
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fneg_fabsf_fadd_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_sub_f32_e64 v2, s3, |v0|
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
%fabs = call float @llvm.fabs.f32(float %x)
%fsub = fsub float -0.000000e+00, %fabs
%fadd = fadd float %y, %fsub
@@ -13,11 +33,30 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x,
ret void
}
-; FUNC-LABEL: {{^}}fneg_fabsf_fmul_f32:
-; SI-NOT: and
-; SI: v_mul_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, -|{{v[0-9]+}}|
-; SI-NOT: and
define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, float %y) {
+; SI-LABEL: fneg_fabsf_fmul_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_mul_f32_e64 v0, s3, -|v0|
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fneg_fabsf_fmul_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mul_f32_e64 v2, s3, -|v0|
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
%fabs = call float @llvm.fabs.f32(float %x)
%fsub = fsub float -0.000000e+00, %fabs
%fmul = fmul float %y, %fsub
@@ -25,18 +64,30 @@ define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x,
ret void
}
-; DAGCombiner will transform:
-; (fabsf (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF))
-; unless isFabsFree returns true
-
-; FUNC-LABEL: {{^}}fneg_fabsf_free_f32:
-; R600-NOT: AND
-; R600: |PV.{{[XYZW]}}|
-; R600: -PV
-
-; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; VI: s_bitset1_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: fneg_fabsf_free_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_or_b32 s4, s2, 0x80000000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fneg_fabsf_free_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_bitset1_b32 s2, 31
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
%bc = bitcast i32 %in to float
%fabs = call float @llvm.fabs.f32(float %bc)
%fsub = fsub float -0.000000e+00, %fabs
@@ -44,13 +95,30 @@ define amdgpu_kernel void @fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) {
ret void
}
-; FUNC-LABEL: {{^}}fneg_fabsf_fn_free_f32:
-; R600-NOT: AND
-; R600: |PV.{{[XYZW]}}|
-; R600: -PV
-
-; SI: s_load_dwordx2 s[0:1], s[2:3], 0x9
define amdgpu_kernel void @fneg_fabsf_fn_free_f32(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: fneg_fabsf_fn_free_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[2:3], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bitset1_b32 s4, 31
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fneg_fabsf_fn_free_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s4, s[2:3], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_or_b32 s2, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
%bc = bitcast i32 %in to float
%fabs = call float @fabsf(float %bc)
%fsub = fsub float -0.000000e+00, %fabs
@@ -58,18 +126,68 @@ define amdgpu_kernel void @fneg_fabsf_fn_free_f32(ptr addrspace(1) %out, i32 %in
ret void
}
-; FUNC-LABEL: {{^}}fneg_fabsf_f32:
-; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) {
+; SI-LABEL: fneg_fabsf_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_or_b32 s4, s2, 0x80000000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fneg_fabsf_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_bitset1_b32 s2, 31
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
%fabs = call float @llvm.fabs.f32(float %in)
%fsub = fsub float -0.000000e+00, %fabs
store float %fsub, ptr addrspace(1) %out, align 4
ret void
}
-; FUNC-LABEL: {{^}}v_fneg_fabsf_f32:
-; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; SI-LABEL: v_fneg_fabsf_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s2
+; SI-NEXT: s_mov_b32 s9, s3
+; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_or_b32_e32 v0, 0x80000000, v0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: v_fneg_fabsf_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_load_dword v2, v[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_e32 v2, 0x80000000, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
%val = load float, ptr addrspace(1) %in, align 4
%fabs = call float @llvm.fabs.f32(float %val)
%fsub = fsub float -0.000000e+00, %fabs
@@ -77,28 +195,76 @@ define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace
ret void
}
-; FUNC-LABEL: {{^}}fneg_fabsf_v2f32:
-; R600: |{{(PV|T[0-9])\.[XYZW]}}|
-; R600: -PV
-; R600: |{{(PV|T[0-9])\.[XYZW]}}|
-; R600: -PV
-
-; FIXME: In this case two uses of the constant should be folded
-; SI: s_bitset1_b32 s{{[0-9]+}}, 31
-; SI: s_bitset1_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
+; SI-LABEL: fneg_fabsf_v2f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bitset1_b32 s3, 31
+; SI-NEXT: s_bitset1_b32 s2, 31
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fneg_fabsf_v2f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_bitset1_b32 s3, 31
+; VI-NEXT: s_bitset1_b32 s2, 31
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
%fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
%fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs
store <2 x float> %fsub, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}fneg_fabsf_v4f32:
-; SI: s_bitset1_b32 s{{[0-9]+}}, 31
-; SI: s_bitset1_b32 s{{[0-9]+}}, 31
-; SI: s_bitset1_b32 s{{[0-9]+}}, 31
-; SI: s_bitset1_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
+; SI-LABEL: fneg_fabsf_v4f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bitset1_b32 s7, 31
+; SI-NEXT: s_bitset1_b32 s6, 31
+; SI-NEXT: s_bitset1_b32 s5, 31
+; SI-NEXT: s_bitset1_b32 s4, 31
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: v_mov_b32_e32 v2, s6
+; SI-NEXT: v_mov_b32_e32 v3, s7
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fneg_fabsf_v4f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_or_b32 s2, s7, 0x80000000
+; VI-NEXT: s_or_b32 s3, s6, 0x80000000
+; VI-NEXT: s_bitset1_b32 s5, 31
+; VI-NEXT: s_bitset1_b32 s4, 31
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v3, s2
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
%fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
%fsub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %fabs
store <4 x float> %fsub, ptr addrspace(1) %out
@@ -112,3 +278,5 @@ declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FUNC: {{.*}}
From 11780c3434615e54ddc7627bac7af67e6f6289f2 Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Tue, 4 Jun 2024 09:54:51 +0530
Subject: [PATCH 6/7] Fixed test CodeGen/AMDGPU/fneg-fabs.f64.ll.
---
llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll | 267 +++++++++++++++++++---
1 file changed, 237 insertions(+), 30 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
index 5f1d232daabe5..76fdcf94e0497 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
@@ -1,12 +1,35 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,GCN %s
-; FIXME: Check something here. Currently it seems fabs + fneg aren't
-; into 2 modifiers, although theoretically that should work.
-
-; GCN-LABEL: {{^}}fneg_fabs_fadd_f64:
-; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}|
define amdgpu_kernel void @fneg_fabs_fadd_f64(ptr addrspace(1) %out, double %x, double %y) {
+; SI-LABEL: fneg_fabs_fadd_f64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: v_add_f64 v[0:1], s[8:9], -|v[0:1]|
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fneg_fabs_fadd_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_f64 v[0:1], s[0:1], -|v[0:1]|
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
%fabs = call double @llvm.fabs.f64(double %x)
%fsub = fsub double -0.000000e+00, %fabs
%fadd = fadd double %y, %fsub
@@ -15,6 +38,29 @@ define amdgpu_kernel void @fneg_fabs_fadd_f64(ptr addrspace(1) %out, double %x,
}
define amdgpu_kernel void @v_fneg_fabs_fadd_f64(ptr addrspace(1) %out, ptr addrspace(1) %xptr, ptr addrspace(1) %yptr) {
+; SI-LABEL: v_fneg_fabs_fadd_f64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_add_f64 v[0:1], s[4:5], -|s[4:5]|
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: v_fneg_fabs_fadd_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_add_f64 v[0:1], s[2:3], -|s[2:3]|
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
%x = load double, ptr addrspace(1) %xptr, align 8
%y = load double, ptr addrspace(1) %xptr, align 8
%fabs = call double @llvm.fabs.f64(double %x)
@@ -24,9 +70,34 @@ define amdgpu_kernel void @v_fneg_fabs_fadd_f64(ptr addrspace(1) %out, ptr addrs
ret void
}
-; GCN-LABEL: {{^}}fneg_fabs_fmul_f64:
-; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}|
define amdgpu_kernel void @fneg_fabs_fmul_f64(ptr addrspace(1) %out, double %x, double %y) {
+; SI-LABEL: fneg_fabs_fmul_f64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s0, s4
+; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: v_mul_f64 v[0:1], s[8:9], -|v[0:1]|
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fneg_fabs_fmul_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mul_f64 v[0:1], s[0:1], -|v[0:1]|
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
%fabs = call double @llvm.fabs.f64(double %x)
%fsub = fsub double -0.000000e+00, %fabs
%fmul = fmul double %y, %fsub
@@ -34,8 +105,32 @@ define amdgpu_kernel void @fneg_fabs_fmul_f64(ptr addrspace(1) %out, double %x,
ret void
}
-; GCN-LABEL: {{^}}fneg_fabs_free_f64:
define amdgpu_kernel void @fneg_fabs_free_f64(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: fneg_fabs_free_f64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bitset1_b32 s3, 31
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fneg_fabs_free_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_or_b32 s0, s3, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_endpgm
%bc = bitcast i64 %in to double
%fabs = call double @llvm.fabs.f64(double %bc)
%fsub = fsub double -0.000000e+00, %fabs
@@ -43,10 +138,32 @@ define amdgpu_kernel void @fneg_fabs_free_f64(ptr addrspace(1) %out, i64 %in) {
ret void
}
-; GCN-LABEL: {{^}}fneg_fabs_fn_free_f64:
-; SI: s_bitset1_b32
-; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
define amdgpu_kernel void @fneg_fabs_fn_free_f64(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: fneg_fabs_fn_free_f64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bitset1_b32 s3, 31
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fneg_fabs_fn_free_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_or_b32 s0, s3, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; VI-NEXT: s_endpgm
%bc = bitcast i64 %in to double
%fabs = call double @fabs(double %bc)
%fsub = fsub double -0.000000e+00, %fabs
@@ -54,38 +171,126 @@ define amdgpu_kernel void @fneg_fabs_fn_free_f64(ptr addrspace(1) %out, i64 %in)
ret void
}
-; GCN-LABEL: {{^}}fneg_fabs_f64:
-; SI-DAG: s_load_dwordx2 s[[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]], s[{{[0-9]+:[0-9]+}}], 0x13
-; VI-DAG: s_load_dwordx2 s[[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]], s[{{[0-9]+:[0-9]+}}], 0x4c
-; GCN-DAG: s_bitset1_b32 s[[HI_X]], 31
-; GCN-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
-; GCN-DAG: v_mov_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]]
-; GCN: buffer_store_dwordx2 v[[[LO_V]]:[[HI_V]]]
define amdgpu_kernel void @fneg_fabs_f64(ptr addrspace(1) %out, [8 x i32], double %in) {
+; SI-LABEL: fneg_fabs_f64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x13
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bitset1_b32 s5, 31
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fneg_fabs_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_bitset1_b32 s3, 31
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
%fabs = call double @llvm.fabs.f64(double %in)
%fsub = fsub double -0.000000e+00, %fabs
store double %fsub, ptr addrspace(1) %out, align 8
ret void
}
-; GCN-LABEL: {{^}}fneg_fabs_v2f64:
-; GCN-NOT: 0x80000000
-; GCN: s_bitset1_b32 s{{[0-9]+}}, 31
-; GCN: s_bitset1_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @fneg_fabs_v2f64(ptr addrspace(1) %out, <2 x double> %in) {
+; SI-LABEL: fneg_fabs_v2f64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bitset1_b32 s7, 31
+; SI-NEXT: s_bitset1_b32 s5, 31
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v2, s6
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: v_mov_b32_e32 v3, s7
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fneg_fabs_v2f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_or_b32 s2, s7, 0x80000000
+; VI-NEXT: s_or_b32 s3, s5, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v3, s2
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
%fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
%fsub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %fabs
store <2 x double> %fsub, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}fneg_fabs_v4f64:
-; GCN-NOT: 0x80000000
-; GCN: s_bitset1_b32 s{{[0-9]+}}, 31
-; GCN: s_bitset1_b32 s{{[0-9]+}}, 31
-; GCN: s_bitset1_b32 s{{[0-9]+}}, 31
-; GCN: s_bitset1_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @fneg_fabs_v4f64(ptr addrspace(1) %out, <4 x double> %in) {
+; SI-LABEL: fneg_fabs_v4f64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bitset1_b32 s7, 31
+; SI-NEXT: s_bitset1_b32 s11, 31
+; SI-NEXT: s_bitset1_b32 s9, 31
+; SI-NEXT: s_bitset1_b32 s5, 31
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: v_mov_b32_e32 v2, s10
+; SI-NEXT: v_mov_b32_e32 v4, s4
+; SI-NEXT: v_mov_b32_e32 v6, s6
+; SI-NEXT: v_mov_b32_e32 v1, s9
+; SI-NEXT: v_mov_b32_e32 v3, s11
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; SI-NEXT: v_mov_b32_e32 v5, s5
+; SI-NEXT: v_mov_b32_e32 v7, s7
+; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fneg_fabs_v4f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_bitset1_b32 s7, 31
+; VI-NEXT: s_bitset1_b32 s5, 31
+; VI-NEXT: s_or_b32 s2, s11, 0x80000000
+; VI-NEXT: s_or_b32 s3, s9, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v3, s2
+; VI-NEXT: s_add_u32 s2, s0, 16
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
%fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
%fsub = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %fabs
store <4 x double> %fsub, ptr addrspace(1) %out
@@ -96,3 +301,5 @@ declare double @fabs(double) readnone
declare double @llvm.fabs.f64(double) readnone
declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone
declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
From eff7b4a3fc3af092bb4963d1d6d4d60827892505 Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Tue, 4 Jun 2024 10:00:59 +0530
Subject: [PATCH 7/7] Fixed test CodeGen/AMDGPU/fabs.ll.
---
llvm/test/CodeGen/AMDGPU/fabs-r600.ll | 159 ++++++++++++++++
llvm/test/CodeGen/AMDGPU/fabs.ll | 252 +++++++++++++++++++++-----
2 files changed, 362 insertions(+), 49 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/fabs-r600.ll
diff --git a/llvm/test/CodeGen/AMDGPU/fabs-r600.ll b/llvm/test/CodeGen/AMDGPU/fabs-r600.ll
new file mode 100644
index 0000000000000..7e1aa99c3ec40
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fabs-r600.ll
@@ -0,0 +1,159 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s
+
+
+; DAGCombiner will transform:
+; (fabsf (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF))
+; unless isFabsFree returns true
+define amdgpu_kernel void @s_fabsf_fn_free(ptr addrspace(1) %out, i32 %in) {
+; R600-LABEL: s_fabsf_fn_free:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MOV * T0.W, KC0[2].Z,
+; R600-NEXT: MOV T0.X, |PV.W|,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %bc= bitcast i32 %in to float
+ %fabs = call float @fabsf(float %bc)
+ store float %fabs, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) {
+; R600-LABEL: s_fabsf_free:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MOV * T0.W, KC0[2].Z,
+; R600-NEXT: MOV T0.X, |PV.W|,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %bc= bitcast i32 %in to float
+ %fabs = call float @llvm.fabs.f32(float %bc)
+ store float %fabs, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @s_fabsf_f32(ptr addrspace(1) %out, float %in) {
+; R600-LABEL: s_fabsf_f32:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MOV * T0.W, KC0[2].Z,
+; R600-NEXT: MOV T0.X, |PV.W|,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %fabs = call float @llvm.fabs.f32(float %in)
+ store float %fabs, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
+; R600-LABEL: fabs_v2f32:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MOV * T0.W, KC0[3].X,
+; R600-NEXT: MOV T0.Y, |PV.W|,
+; R600-NEXT: MOV * T0.W, KC0[2].W,
+; R600-NEXT: MOV T0.X, |PV.W|,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
+ store <2 x float> %fabs, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
+; R600-LABEL: fabsf_v4f32:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MOV T0.W, KC0[4].X,
+; R600-NEXT: MOV * T1.W, KC0[3].W,
+; R600-NEXT: MOV * T0.W, |PV.W|,
+; R600-NEXT: MOV T0.Z, |T1.W|,
+; R600-NEXT: MOV * T1.W, KC0[3].Z,
+; R600-NEXT: MOV T0.Y, |PV.W|,
+; R600-NEXT: MOV * T1.W, KC0[3].Y,
+; R600-NEXT: MOV T0.X, |PV.W|,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
+ store <4 x float> %fabs, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @fabsf_fn_fold(ptr addrspace(1) %out, float %in0, float %in1) {
+; R600-LABEL: fabsf_fn_fold:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; R600-NEXT: MUL_IEEE * T1.X, |KC0[2].Z|, KC0[2].W,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %fabs = call float @fabsf(float %in0)
+ %fmul = fmul float %fabs, %in1
+ store float %fmul, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %in1) {
+; R600-LABEL: fabs_fold:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; R600-NEXT: MUL_IEEE * T1.X, |KC0[2].Z|, KC0[2].W,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %fabs = call float @llvm.fabs.f32(float %in0)
+ %fmul = fmul float %fabs, %in1
+ store float %fmul, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @bitpreserve_fabsf_f32(ptr addrspace(1) %out, float %in) {
+; R600-LABEL: bitpreserve_fabsf_f32:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; R600-NEXT: ADD * T1.X, |KC0[2].Z|, 1.0,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %in.bc = bitcast float %in to i32
+ %int.abs = and i32 %in.bc, 2147483647
+ %bc = bitcast i32 %int.abs to float
+ %fadd = fadd float %bc, 1.0
+ store float %fadd, ptr addrspace(1) %out
+ ret void
+}
+
+declare float @fabsf(float) readnone
+declare float @llvm.fabs.f32(float) readnone
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll
index e18c76f89b6c7..c064886e4c22b 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.ll
@@ -1,104 +1,256 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-misched=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global,-xnack -enable-misched=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -march=amdgcn -verify-machineinstrs -enable-misched=0 < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -enable-misched=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
; DAGCombiner will transform:
; (fabsf (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF))
; unless isFabsFree returns true
-
-; FUNC-LABEL: {{^}}s_fabsf_fn_free:
-; R600-NOT: AND
-; R600: |PV.{{[XYZW]}}|
-
-; GCN: s_bitset0_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @s_fabsf_fn_free(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: s_fabsf_fn_free:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-NEXT: s_load_dword s4, s[2:3], 0xb
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bitset0_b32 s4, 31
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fabsf_fn_free:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-NEXT: s_load_dword s2, s[2:3], 0x2c
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_bitset0_b32 s2, 31
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
%bc= bitcast i32 %in to float
%fabs = call float @fabsf(float %bc)
store float %fabs, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}s_fabsf_free:
-; R600-NOT: AND
-; R600: |PV.{{[XYZW]}}|
-
-; GCN: s_bitset0_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @s_fabsf_free(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: s_fabsf_free:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dword s0, s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bitset0_b32 s0, 31
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fabsf_free:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_bitset0_b32 s0, 31
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
%bc= bitcast i32 %in to float
%fabs = call float @llvm.fabs.f32(float %bc)
store float %fabs, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}s_fabsf_f32:
-; R600: |{{(PV|T[0-9])\.[XYZW]}}|
-
-; GCN: s_bitset0_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @s_fabsf_f32(ptr addrspace(1) %out, float %in) {
+; SI-LABEL: s_fabsf_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dword s0, s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bitset0_b32 s0, 31
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fabsf_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_bitset0_b32 s0, 31
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
%fabs = call float @llvm.fabs.f32(float %in)
store float %fabs, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}fabs_v2f32:
-; R600: |{{(PV|T[0-9])\.[XYZW]}}|
-; R600: |{{(PV|T[0-9])\.[XYZW]}}|
-
-; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
-; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
+; SI-LABEL: fabs_v2f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_and_b32 s0, s3, 0x7fffffff
+; SI-NEXT: s_and_b32 s1, s2, 0x7fffffff
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: v_mov_b32_e32 v1, s0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fabs_v2f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_bitset0_b32 s3, 31
+; VI-NEXT: s_bitset0_b32 s2, 31
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
%fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
store <2 x float> %fabs, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}fabsf_v4f32:
-; R600: |{{(PV|T[0-9])\.[XYZW]}}|
-; R600: |{{(PV|T[0-9])\.[XYZW]}}|
-; R600: |{{(PV|T[0-9])\.[XYZW]}}|
-; R600: |{{(PV|T[0-9])\.[XYZW]}}|
-
-; GCN: s_bitset0_b32
-; GCN: s_bitset0_b32
-; GCN: s_bitset0_b32
-; GCN: s_bitset0_b32
define amdgpu_kernel void @fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
+; SI-LABEL: fabsf_v4f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bitset0_b32 s3, 31
+; SI-NEXT: s_bitset0_b32 s2, 31
+; SI-NEXT: s_bitset0_b32 s1, 31
+; SI-NEXT: s_bitset0_b32 s0, 31
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_mov_b32_e32 v2, s2
+; SI-NEXT: v_mov_b32_e32 v3, s3
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fabsf_v4f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: s_bitset0_b32 s3, 31
+; VI-NEXT: s_bitset0_b32 s2, 31
+; VI-NEXT: s_bitset0_b32 s1, 31
+; VI-NEXT: s_bitset0_b32 s0, 31
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
%fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
store <4 x float> %fabs, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}fabsf_fn_fold:
-; SI: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[{{[0-9]+:[0-9]+}}], 0x9
-; VI: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[{{[0-9]+:[0-9]+}}], 0x24
-; GCN-NOT: and
-; GCN: v_mov_b32_e32 [[V_MUL_VI:v[0-9]+]], s[[#LOAD + 3]]
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, |s[[#LOAD + 2]]|, [[V_MUL_VI]]
define amdgpu_kernel void @fabsf_fn_fold(ptr addrspace(1) %out, float %in0, float %in1) {
+; SI-LABEL: fabsf_fn_fold:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s3
+; SI-NEXT: v_mul_f32_e64 v0, |s2|, v0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fabsf_fn_fold:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_mul_f32_e64 v2, |s2|, v0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
%fabs = call float @fabsf(float %in0)
%fmul = fmul float %fabs, %in1
store float %fmul, ptr addrspace(1) %out
ret void
}
-; FUNC-LABEL: {{^}}fabs_fold:
-; SI: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[{{[0-9]+:[0-9]+}}], 0x9
-; VI: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[{{[0-9]+:[0-9]+}}], 0x24
-; GCN-NOT: and
-; GCN: v_mov_b32_e32 [[V_MUL_VI:v[0-9]+]], s[[#LOAD + 3]]
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, |s[[#LOAD + 2]]|, [[V_MUL_VI]]
define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %in1) {
+; SI-LABEL: fabs_fold:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s3
+; SI-NEXT: v_mul_f32_e64 v0, |s2|, v0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fabs_fold:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_mul_f32_e64 v2, |s2|, v0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
%fabs = call float @llvm.fabs.f32(float %in0)
%fmul = fmul float %fabs, %in1
store float %fmul, ptr addrspace(1) %out
ret void
}
-; Make sure we turn some integer operations back into fabsf
-; FUNC-LABEL: {{^}}bitpreserve_fabsf_f32:
-; GCN: v_add_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|, 1.0
define amdgpu_kernel void @bitpreserve_fabsf_f32(ptr addrspace(1) %out, float %in) {
+; SI-LABEL: bitpreserve_fabsf_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dword s0, s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_add_f32_e64 v0, |s0|, 1.0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: bitpreserve_fabsf_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_add_f32_e64 v2, |s0|, 1.0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
%in.bc = bitcast float %in to i32
%int.abs = and i32 %in.bc, 2147483647
%bc = bitcast i32 %int.abs to float
@@ -111,3 +263,5 @@ declare float @fabsf(float) readnone
declare float @llvm.fabs.f32(float) readnone
declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone
declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
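For readers skimming the new checks, here is a minimal hand-written sketch (not taken from the patch; the function and value names are made up) of the fold the test comments describe. When the target's isFabsFree hook returns false, DAGCombiner rewrites fabs of a bitcast integer into an integer AND that clears the sign bit, which is why the SI/VI checks above look for s_bitset0_b32 and s_and_b32 with 0x7fffffff rather than a floating-point instruction:

  ; Illustrative IR only -- the integer equivalent of
  ; (fabs (bitcast i32 %a to float)) on a target where fabs is not free.
  define float @fabs_via_integer_and(i32 %a) {
    %abs.bits = and i32 %a, 2147483647      ; 0x7FFFFFFF clears bit 31 (the sign bit)
    %abs = bitcast i32 %abs.bits to float
    ret float %abs
  }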