[llvm] db777ea - AMDGPU/GlobalISel: Fix asserts on non-s32 sitofp/uitofp sources
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 23 07:00:42 PDT 2020
Author: Matt Arsenault
Date: 2020-06-23T10:00:35-04:00
New Revision: db777eaea3a7bfd6c02a3f8d5e5a42bd90943690
URL: https://github.com/llvm/llvm-project/commit/db777eaea3a7bfd6c02a3f8d5e5a42bd90943690
DIFF: https://github.com/llvm/llvm-project/commit/db777eaea3a7bfd6c02a3f8d5e5a42bd90943690.diff
LOG: AMDGPU/GlobalISel: Fix asserts on non-s32 sitofp/uitofp sources
The combine to form cvt_f32_ubyte0 assumed the source type was
always 32-bit, but it needs to tolerate any legal source type.
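For reference, a minimal sketch (not part of the commit; the helper name is
hypothetical) of the generalized mask that matchUCharToFloat now checks:
APInt::getHighBitsSet(SrcSize, SrcSize - 8) sets every bit above the low byte
for whichever legal source width the known-bits query sees.

    #include "llvm/ADT/APInt.h"

    // Sketch only; mirrors the mask computation in the diff below.
    // APInt::getHighBitsSet(BitWidth, HiBits) sets the top HiBits bits, so
    // passing SrcSize - 8 leaves exactly the low 8 bits unmasked:
    //   s16 -> 0xff00, s32 -> 0xffffff00, s64 -> 0xffffffffffffff00
    static llvm::APInt highBitsAboveLowByte(unsigned SrcSize) {
      return llvm::APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    }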
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index 29203eb7df22..098b0e993886 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -137,9 +137,11 @@ static bool matchUCharToFloat(MachineInstr &MI, MachineRegisterInfo &MRI,
// about in practice.
LLT Ty = MRI.getType(DstReg);
if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
- const APInt Mask = APInt::getHighBitsSet(32, 24);
- return Helper.getKnownBits()->maskedValueIsZero(MI.getOperand(1).getReg(),
- Mask);
+ Register SrcReg = MI.getOperand(1).getReg();
+ unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
+ assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
+ const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
+ return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
}
return false;
@@ -151,14 +153,18 @@ static void applyUCharToFloat(MachineInstr &MI) {
const LLT S32 = LLT::scalar(32);
Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
LLT Ty = B.getMRI()->getType(DstReg);
+ LLT SrcTy = B.getMRI()->getType(SrcReg);
+ if (SrcTy != S32)
+ SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);
if (Ty == S32) {
B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
- {MI.getOperand(1)}, MI.getFlags());
+ {SrcReg}, MI.getFlags());
} else {
auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
- {MI.getOperand(1)}, MI.getFlags());
+ {SrcReg}, MI.getFlags());
B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir
index 2021107b8448..ad21521c389f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir
@@ -173,3 +173,95 @@ body: |
%4:_(s32) = G_ANYEXT %3
$vgpr0 = COPY %4
...
+
+---
+name: uitofp_s64_char_to_f32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+
+ ; CHECK-LABEL: name: uitofp_s64_char_to_f32
+ ; CHECK: liveins: $vgpr0_vgpr1
+ ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255
+ ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
+ ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64)
+ ; CHECK: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[TRUNC]]
+ ; CHECK: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32)
+ %0:_(s64) = COPY $vgpr0_vgpr1
+ %1:_(s64) = G_CONSTANT i64 255
+ %2:_(s64) = G_AND %0, %1
+ %3:_(s32) = G_UITOFP %2
+ $vgpr0 = COPY %3
+...
+
+---
+name: sitofp_s64_char_to_f32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+
+ ; CHECK-LABEL: name: sitofp_s64_char_to_f32
+ ; CHECK: liveins: $vgpr0_vgpr1
+ ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255
+ ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
+ ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64)
+ ; CHECK: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[TRUNC]]
+ ; CHECK: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32)
+ %0:_(s64) = COPY $vgpr0_vgpr1
+ %1:_(s64) = G_CONSTANT i64 255
+ %2:_(s64) = G_AND %0, %1
+ %3:_(s32) = G_SITOFP %2
+ $vgpr0 = COPY %3
+...
+
+---
+name: uitofp_s16_char_to_f32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: uitofp_s16_char_to_f32
+ ; CHECK: liveins: $vgpr0
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; CHECK: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+ ; CHECK: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C]]
+ ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[AND]](s16)
+ ; CHECK: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[ANYEXT]]
+ ; CHECK: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s16) = G_TRUNC %0
+ %2:_(s16) = G_CONSTANT i16 255
+ %3:_(s16) = G_AND %1, %2
+ %4:_(s32) = G_UITOFP %3
+ $vgpr0 = COPY %4
+...
+
+---
+name: sitofp_s16_char_to_f32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: sitofp_s16_char_to_f32
+ ; CHECK: liveins: $vgpr0
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; CHECK: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+ ; CHECK: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C]]
+ ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[AND]](s16)
+ ; CHECK: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[ANYEXT]]
+ ; CHECK: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s16) = G_TRUNC %0
+ %2:_(s16) = G_CONSTANT i16 255
+ %3:_(s16) = G_AND %1, %2
+ %4:_(s32) = G_SITOFP %3
+ $vgpr0 = COPY %4
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index 35d2df4894e7..493f6fff7358 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -1142,3 +1142,187 @@ bb:
store float %add, float addrspace(1)* %out
ret void
}
+
+define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
+; SI-LABEL: v_test_sitofp_i64_byte_to_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_movk_i32 s6, 0xff
+; SI-NEXT: v_and_b32_e32 v2, s6, v0
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0, v2
+; SI-NEXT: v_ffbh_u32_e32 v4, v2
+; SI-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
+; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v4
+; SI-NEXT: v_ffbh_u32_e32 v5, v3
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; SI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; SI-NEXT: v_mov_b32_e32 v5, 0xbe
+; SI-NEXT: v_sub_i32_e32 v6, vcc, v5, v4
+; SI-NEXT: v_lshl_b64 v[4:5], v[2:3], v4
+; SI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v5
+; SI-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc
+; SI-NEXT: v_and_b32_e32 v5, s6, v3
+; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v3
+; SI-NEXT: v_lshlrev_b32_e32 v2, 23, v2
+; SI-NEXT: s_mov_b32 s4, 0
+; SI-NEXT: s_movk_i32 s5, 0x80
+; SI-NEXT: v_or_b32_e32 v2, v2, v3
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
+; SI-NEXT: v_and_b32_e32 v3, 1, v2
+; SI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[4:5]
+; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: v_cndmask_b32_e64 v3, v3, 1, vcc
+; SI-NEXT: v_mov_b32_e32 v1, v0
+; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; SI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SI-NEXT: v_cndmask_b32_e64 v0, v2, -v2, vcc
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_test_sitofp_i64_byte_to_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: s_movk_i32 s6, 0xff
+; VI-NEXT: v_and_b32_e32 v2, s6, v0
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0, v2
+; VI-NEXT: v_ffbh_u32_e32 v4, v2
+; VI-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v4
+; VI-NEXT: v_ffbh_u32_e32 v5, v3
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; VI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; VI-NEXT: v_mov_b32_e32 v5, 0xbe
+; VI-NEXT: v_sub_u32_e32 v6, vcc, v5, v4
+; VI-NEXT: v_lshlrev_b64 v[4:5], v4, v[2:3]
+; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; VI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v5
+; VI-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc
+; VI-NEXT: v_and_b32_e32 v5, s6, v3
+; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v3
+; VI-NEXT: v_lshlrev_b32_e32 v2, 23, v2
+; VI-NEXT: s_mov_b32 s4, 0
+; VI-NEXT: s_movk_i32 s5, 0x80
+; VI-NEXT: v_or_b32_e32 v2, v2, v3
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
+; VI-NEXT: v_and_b32_e32 v3, 1, v2
+; VI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: v_cndmask_b32_e64 v3, v3, 1, vcc
+; VI-NEXT: v_mov_b32_e32 v1, v0
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v3
+; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; VI-NEXT: v_cndmask_b32_e64 v0, v2, -v2, vcc
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %masked = and i64 %arg0, 255
+ %itofp = sitofp i64 %masked to float
+ ret float %itofp
+}
+
+define float @v_test_uitofp_i64_byte_to_f32(i64 %arg0) {
+; SI-LABEL: v_test_uitofp_i64_byte_to_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_movk_i32 s4, 0xff
+; SI-NEXT: v_and_b32_e32 v0, s4, v0
+; SI-NEXT: v_ffbh_u32_e32 v2, v0
+; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v2
+; SI-NEXT: v_ffbh_u32_e32 v3, 0
+; SI-NEXT: v_cmp_eq_u32_e64 vcc, 0, 0
+; SI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; SI-NEXT: v_mov_b32_e32 v3, 0xbe
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: v_sub_i32_e32 v4, vcc, v3, v2
+; SI-NEXT: v_lshl_b64 v[2:3], v[0:1], v2
+; SI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v3
+; SI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
+; SI-NEXT: v_and_b32_e32 v3, s4, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
+; SI-NEXT: s_mov_b32 s4, 0
+; SI-NEXT: s_movk_i32 s5, 0x80
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
+; SI-NEXT: v_and_b32_e32 v1, 1, v0
+; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
+; SI-NEXT: v_cndmask_b32_e64 v1, v1, 1, vcc
+; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_test_uitofp_i64_byte_to_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: s_movk_i32 s4, 0xff
+; VI-NEXT: v_and_b32_e32 v0, s4, v0
+; VI-NEXT: v_ffbh_u32_e32 v2, v0
+; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2
+; VI-NEXT: v_ffbh_u32_e32 v3, 0
+; VI-NEXT: v_cmp_eq_u32_e64 vcc, 0, 0
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; VI-NEXT: v_mov_b32_e32 v3, 0xbe
+; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: v_sub_u32_e32 v4, vcc, v3, v2
+; VI-NEXT: v_lshlrev_b64 v[2:3], v2, v[0:1]
+; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v3
+; VI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
+; VI-NEXT: v_and_b32_e32 v3, s4, v1
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
+; VI-NEXT: s_mov_b32 s4, 0
+; VI-NEXT: s_movk_i32 s5, 0x80
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
+; VI-NEXT: v_and_b32_e32 v1, 1, v0
+; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
+; VI-NEXT: v_cndmask_b32_e64 v1, v1, 1, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %masked = and i64 %arg0, 255
+ %itofp = uitofp i64 %masked to float
+ ret float %itofp
+}
+
+define float @v_test_sitofp_i16_byte_to_f32(i16 %arg0) {
+; SI-LABEL: v_test_sitofp_i16_byte_to_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_test_sitofp_i16_byte_to_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
+; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %masked = and i16 %arg0, 255
+ %itofp = sitofp i16 %masked to float
+ ret float %itofp
+}
+
+define float @v_test_uitofp_i16_byte_to_f32(i16 %arg0) {
+; SI-LABEL: v_test_uitofp_i16_byte_to_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT: v_bfe_u32 v0, v0, 0, 16
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_test_uitofp_i16_byte_to_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
+; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %masked = and i16 %arg0, 255
+ %itofp = uitofp i16 %masked to float
+ ret float %itofp
+}