[llvm] f24de95 - AMDGPU: Add baseline tests for known bits handling of med3
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Sat Jun 10 07:59:05 PDT 2023
Author: Matt Arsenault
Date: 2023-06-10T10:58:39-04:00
New Revision: f24de950e51be3dddd33504c1c34c6c9ccd22b2a
URL: https://github.com/llvm/llvm-project/commit/f24de950e51be3dddd33504c1c34c6c9ccd22b2a
DIFF: https://github.com/llvm/llvm-project/commit/f24de950e51be3dddd33504c1c34c6c9ccd22b2a.diff
LOG: AMDGPU: Add baseline tests for known bits handling of med3
Added:
llvm/test/CodeGen/AMDGPU/GlobalISel/compute-num-sign-bits-med3.mir
llvm/test/CodeGen/AMDGPU/med3-knownbits.ll
Modified:
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/compute-num-sign-bits-med3.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/compute-num-sign-bits-med3.mir
new file mode 100644
index 0000000000000..7daf638ec1078
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/compute-num-sign-bits-med3.mir
@@ -0,0 +1,170 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: known_sign_bits_smed3_0
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: known_sign_bits_smed3_0
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %val:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: %val0:_(s32) = G_SEXT_INREG %val, 8
+ ; CHECK-NEXT: %val1:_(s32) = G_CONSTANT i32 -255
+ ; CHECK-NEXT: %val2:_(s32) = G_CONSTANT i32 255
+ ; CHECK-NEXT: %smed3:_(s32) = G_AMDGPU_SMED3 %val0, %val1, %val2
+ ; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %smed3, 9
+ ; CHECK-NEXT: $vgpr0 = COPY %inreg(s32)
+ %val:_(s32) = COPY $vgpr0
+ %val0:_(s32) = G_SEXT_INREG %val, 8
+ %val1:_(s32) = G_CONSTANT i32 -255
+ %val2:_(s32) = G_CONSTANT i32 255
+ %smed3:_(s32) = G_AMDGPU_SMED3 %val0, %val1, %val2
+ %inreg:_(s32) = G_SEXT_INREG %smed3, 9
+ $vgpr0 = COPY %inreg
+
+...
+
+---
+name: known_sign_bits_smed3_1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: known_sign_bits_smed3_1
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %val:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: %val0:_(s32) = G_SEXT_INREG %val, 8
+ ; CHECK-NEXT: %val1:_(s32) = G_CONSTANT i32 -255
+ ; CHECK-NEXT: %val2:_(s32) = G_CONSTANT i32 255
+ ; CHECK-NEXT: %smed3:_(s32) = G_AMDGPU_SMED3 %val1, %val0, %val2
+ ; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %smed3, 9
+ ; CHECK-NEXT: $vgpr0 = COPY %inreg(s32)
+ %val:_(s32) = COPY $vgpr0
+ %val0:_(s32) = G_SEXT_INREG %val, 8
+ %val1:_(s32) = G_CONSTANT i32 -255
+ %val2:_(s32) = G_CONSTANT i32 255
+ %smed3:_(s32) = G_AMDGPU_SMED3 %val1, %val0, %val2
+ %inreg:_(s32) = G_SEXT_INREG %smed3, 9
+ $vgpr0 = COPY %inreg
+
+...
+
+---
+name: known_sign_bits_smed3_2
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: known_sign_bits_smed3_2
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %val:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: %val0:_(s32) = G_SEXT_INREG %val, 8
+ ; CHECK-NEXT: %val1:_(s32) = G_CONSTANT i32 -256
+ ; CHECK-NEXT: %val2:_(s32) = G_CONSTANT i32 128
+ ; CHECK-NEXT: %smed3:_(s32) = G_AMDGPU_SMED3 %val1, %val2, %val0
+ ; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %smed3, 9
+ ; CHECK-NEXT: $vgpr0 = COPY %inreg(s32)
+ %val:_(s32) = COPY $vgpr0
+ %val0:_(s32) = G_SEXT_INREG %val, 8
+ %val1:_(s32) = G_CONSTANT i32 -256
+ %val2:_(s32) = G_CONSTANT i32 128
+ %smed3:_(s32) = G_AMDGPU_SMED3 %val1, %val2, %val0
+ %inreg:_(s32) = G_SEXT_INREG %smed3, 9
+ $vgpr0 = COPY %inreg
+
+...
+
+---
+name: not_enough_sign_bits_smed3_0
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: not_enough_sign_bits_smed3_0
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %val:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: %val0:_(s32) = G_SEXT_INREG %val, 8
+ ; CHECK-NEXT: %val1:_(s32) = G_SEXT_INREG %val, 9
+ ; CHECK-NEXT: %val2:_(s32) = G_SEXT_INREG %val, 9
+ ; CHECK-NEXT: %smed3:_(s32) = G_AMDGPU_SMED3 %val0, %val1, %val2
+ ; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %smed3, 8
+ ; CHECK-NEXT: $vgpr0 = COPY %inreg(s32)
+ %val:_(s32) = COPY $vgpr0
+ %val0:_(s32) = G_SEXT_INREG %val, 8
+ %val1:_(s32) = G_SEXT_INREG %val, 9
+ %val2:_(s32) = G_SEXT_INREG %val, 9
+ %smed3:_(s32) = G_AMDGPU_SMED3 %val0, %val1, %val2
+ %inreg:_(s32) = G_SEXT_INREG %smed3, 8
+ $vgpr0 = COPY %inreg
+
+...
+
+---
+name: not_enough_sign_bits_smed3_1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: not_enough_sign_bits_smed3_1
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %val:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: %val0:_(s32) = G_SEXT_INREG %val, 9
+ ; CHECK-NEXT: %val1:_(s32) = G_SEXT_INREG %val, 8
+ ; CHECK-NEXT: %val2:_(s32) = G_SEXT_INREG %val, 9
+ ; CHECK-NEXT: %smed3:_(s32) = G_AMDGPU_SMED3 %val0, %val1, %val2
+ ; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %smed3, 8
+ ; CHECK-NEXT: $vgpr0 = COPY %inreg(s32)
+ %val:_(s32) = COPY $vgpr0
+ %val0:_(s32) = G_SEXT_INREG %val, 9
+ %val1:_(s32) = G_SEXT_INREG %val, 8
+ %val2:_(s32) = G_SEXT_INREG %val, 9
+ %smed3:_(s32) = G_AMDGPU_SMED3 %val0, %val1, %val2
+ %inreg:_(s32) = G_SEXT_INREG %smed3, 8
+ $vgpr0 = COPY %inreg
+
+...
+
+---
+name: not_enough_sign_bits_smed3_2
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: not_enough_sign_bits_smed3_2
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %val:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: %val0:_(s32) = G_SEXT_INREG %val, 8
+ ; CHECK-NEXT: %val1:_(s32) = G_SEXT_INREG %val, 8
+ ; CHECK-NEXT: %val2:_(s32) = G_SEXT_INREG %val, 9
+ ; CHECK-NEXT: %smed3:_(s32) = G_AMDGPU_SMED3 %val0, %val1, %val2
+ ; CHECK-NEXT: %inreg:_(s32) = G_SEXT_INREG %smed3, 8
+ ; CHECK-NEXT: $vgpr0 = COPY %inreg(s32)
+ %val:_(s32) = COPY $vgpr0
+ %val0:_(s32) = G_SEXT_INREG %val, 8
+ %val1:_(s32) = G_SEXT_INREG %val, 8
+ %val2:_(s32) = G_SEXT_INREG %val, 9
+ %smed3:_(s32) = G_AMDGPU_SMED3 %val0, %val1, %val2
+ %inreg:_(s32) = G_SEXT_INREG %smed3, 8
+ $vgpr0 = COPY %inreg
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/med3-knownbits.ll b/llvm/test/CodeGen/AMDGPU/med3-knownbits.ll
new file mode 100644
index 0000000000000..d6343ce7bca63
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/med3-knownbits.ll
@@ -0,0 +1,140 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=tahiti -amdgpu-codegenprepare-mul24=0 -amdgpu-codegenprepare-disable-idiv-expansion < %s | FileCheck -check-prefixes=SI,SI-SDAG %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=tahiti -amdgpu-codegenprepare-mul24=0 -amdgpu-codegenprepare-disable-idiv-expansion < %s | FileCheck -check-prefixes=SI,SI-GISEL %s
+
+declare i32 @llvm.smin.i32(i32, i32)
+declare i32 @llvm.smax.i32(i32, i32)
+declare i32 @llvm.umin.i32(i32, i32)
+declare i32 @llvm.umax.i32(i32, i32)
+
+; Test computeKnownBits for the umed3 node. The base address is known to
+; have a cleared sign bit only after umed3 is formed. The DS instruction
+; offset can only be folded on SI when the base address is positive.
+define i32 @v_known_bits_umed3(i8 %a) {
+; SI-SDAG-LABEL: v_known_bits_umed3:
+; SI-SDAG: ; %bb.0:
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
+; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x80
+; SI-SDAG-NEXT: v_med3_u32 v0, v0, 32, v1
+; SI-SDAG-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0
+; SI-SDAG-NEXT: s_mov_b32 m0, -1
+; SI-SDAG-NEXT: ds_read_u8 v0, v0
+; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_known_bits_umed3:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xff, v0
+; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x80
+; SI-GISEL-NEXT: v_med3_u32 v0, v0, 32, v1
+; SI-GISEL-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0
+; SI-GISEL-NEXT: ds_read_u8 v0, v0
+; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %ext.a = zext i8 %a to i32
+ %max.a = call i32 @llvm.umax.i32(i32 %ext.a, i32 32)
+ %umed3 = call i32 @llvm.umin.i32(i32 %max.a, i32 128)
+ %cast.umed3 = inttoptr i32 %umed3 to ptr addrspace(3)
+ %gep = getelementptr i8, ptr addrspace(3) %cast.umed3, i32 128
+ %load = load i8, ptr addrspace(3) %gep
+ %result = zext i8 %load to i32
+ ret i32 %result
+}
+
+; The IR expansion of division is disabled, so the division is legalized
+; late, after the formation of smed3. computeNumSignBits must be able to
+; see through the smed3 in order to use the 24-bit-as-float sdiv
+; legalization.
+define i32 @v_known_signbits_smed3(i16 %a, i16 %b) {
+; SI-SDAG-LABEL: v_known_signbits_smed3:
+; SI-SDAG: ; %bb.0:
+; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 16
+; SI-SDAG-NEXT: s_movk_i32 s4, 0xffc0
+; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0x80
+; SI-SDAG-NEXT: v_med3_i32 v1, v1, s4, v2
+; SI-SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; SI-SDAG-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; SI-SDAG-NEXT: v_xor_b32_e32 v1, v1, v2
+; SI-SDAG-NEXT: v_cvt_f32_u32_e32 v3, v1
+; SI-SDAG-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
+; SI-SDAG-NEXT: s_movk_i32 s4, 0xffe0
+; SI-SDAG-NEXT: v_rcp_iflag_f32_e32 v3, v3
+; SI-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
+; SI-SDAG-NEXT: v_med3_i32 v0, v0, s4, 64
+; SI-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; SI-SDAG-NEXT: v_cvt_u32_f32_e32 v3, v3
+; SI-SDAG-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; SI-SDAG-NEXT: v_xor_b32_e32 v0, v0, v5
+; SI-SDAG-NEXT: v_mul_lo_u32 v4, v4, v3
+; SI-SDAG-NEXT: v_mul_hi_u32 v4, v3, v4
+; SI-SDAG-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; SI-SDAG-NEXT: v_mul_hi_u32 v3, v0, v3
+; SI-SDAG-NEXT: v_mul_lo_u32 v4, v3, v1
+; SI-SDAG-NEXT: v_add_i32_e32 v6, vcc, 1, v3
+; SI-SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; SI-SDAG-NEXT: v_sub_i32_e32 v4, vcc, v0, v1
+; SI-SDAG-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; SI-SDAG-NEXT: v_add_i32_e32 v4, vcc, 1, v3
+; SI-SDAG-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
+; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; SI-SDAG-NEXT: v_xor_b32_e32 v1, v5, v2
+; SI-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1
+; SI-SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
+; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-GISEL-LABEL: v_known_signbits_smed3:
+; SI-GISEL: ; %bb.0:
+; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 16
+; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0xffffffc0
+; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x80
+; SI-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3
+; SI-GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; SI-GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; SI-GISEL-NEXT: v_xor_b32_e32 v1, v1, v2
+; SI-GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1
+; SI-GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v1
+; SI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffffe0
+; SI-GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3
+; SI-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16
+; SI-GISEL-NEXT: v_med3_i32 v0, v0, v4, 64
+; SI-GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; SI-GISEL-NEXT: v_cvt_u32_f32_e32 v3, v3
+; SI-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; SI-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
+; SI-GISEL-NEXT: v_mul_lo_u32 v5, v5, v3
+; SI-GISEL-NEXT: v_mul_hi_u32 v5, v3, v5
+; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; SI-GISEL-NEXT: v_mul_hi_u32 v3, v0, v3
+; SI-GISEL-NEXT: v_mul_lo_u32 v5, v3, v1
+; SI-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v3
+; SI-GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
+; SI-GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
+; SI-GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; SI-GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v3
+; SI-GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
+; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc
+; SI-GISEL-NEXT: v_xor_b32_e32 v1, v4, v2
+; SI-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
+; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %ext.a = sext i16 %a to i32
+ %max.a = call i32 @llvm.smax.i32(i32 %ext.a, i32 -32)
+ %smed3.a = call i32 @llvm.smin.i32(i32 %max.a, i32 64)
+ %ext.b = sext i16 %b to i32
+ %max.b = call i32 @llvm.smax.i32(i32 %ext.b, i32 -64)
+ %smed3.b = call i32 @llvm.smin.i32(i32 %max.b, i32 128)
+ %mul = sdiv i32 %smed3.a, %smed3.b
+ ret i32 %mul
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SI: {{.*}}
More information about the llvm-commits
mailing list