[llvm] 0605e98 - [SDISel][Builder] Fix the instantiation of <1 x bfloat|half> (#94591)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 7 09:47:40 PDT 2024
Author: Quentin Colombet
Date: 2024-06-07T18:47:37+02:00
New Revision: 0605e984fab700a6ef4affc3fdb66aaba3417baa
URL: https://github.com/llvm/llvm-project/commit/0605e984fab700a6ef4affc3fdb66aaba3417baa
DIFF: https://github.com/llvm/llvm-project/commit/0605e984fab700a6ef4affc3fdb66aaba3417baa.diff
LOG: [SDISel][Builder] Fix the instantiation of <1 x bfloat|half> (#94591)
Prior to this change, `SelectionDAGBuilder` was producing `SDNode`s of
the form: `f32 = extract_vector_elt <1 x bfloat|half>, i32 0` when
lowering phis of `<1 x bfloat|half>` and running on a target that
promotes this type to `f32` (like some x86 or AMDGPU targets.)
This construct is invalid since this type of node only allows type
extensions for integer types.
It went unnoticed because the `extract_vector_elt` node is later broken
down in `bitcast` followed by `bf16_to_fp|fp_extend`. However, when the
argument of the phi is a constant we were crashing because the existing
code would try to constant fold this `extract_vector_elt` into a
any_ext.
This patch fixes this by using a proper decomposition for `<1 x
bfloat|half>`:
```
bfloat|half = bitcast <1 x bfloat|half>
float = fp_extend bfloat|half
```
This change should be NFC for the non-constant-folding cases and fix the
SDISel crashes (reported in
https://github.com/llvm/llvm-project/issues/94449) for the folding
cases.
Note: The change on the arm test is a missing fp16 to f32 constant folding
exposed by this patch. I'll push a separate improvement for that.
Added:
llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll
llvm/test/CodeGen/X86/select-phi-s16-fp.ll
Modified:
llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
llvm/test/CodeGen/ARM/arm-half-promote.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 2f3626f1c820a..be5e0f6ef058b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -729,8 +729,17 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
// prevents it from being picked up by the earlier bitcast case.
if (ValueVT.getVectorElementCount().isScalar() &&
(!ValueVT.isFloatingPoint() || !PartVT.isInteger())) {
- Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val,
- DAG.getVectorIdxConstant(0, DL));
+ // If we reach this condition and PartVT is FP, this means that
+ // ValueVT is also FP and both have a different size, otherwise we
+ // would have bitcasted them. Producing an EXTRACT_VECTOR_ELT here
+ // would be invalid since that would mean the smaller FP type has to
+ // be extended to the larger one.
+ if (PartVT.isFloatingPoint()) {
+ Val = DAG.getBitcast(ValueVT.getScalarType(), Val);
+ Val = DAG.getNode(ISD::FP_EXTEND, DL, PartVT, Val);
+ } else
+ Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val,
+ DAG.getVectorIdxConstant(0, DL));
} else {
uint64_t ValueSize = ValueVT.getFixedSizeInBits();
assert(PartVT.getFixedSizeInBits() > ValueSize &&
diff --git a/llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll b/llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll
new file mode 100644
index 0000000000000..0d6e987165d87
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=hawaii %s -o - | FileCheck %s
+
+; For all these tests we disable optimizations through function attributes
+; because the code we are exercising here needs phis and we want to keep the
+; IR small.
+
+; This code used to crash in SDISel because f16 was promoted to f32 through
+; a `f32 = vector_extract_elt <1 x f16>, i32 0`, which is illegal.
+; The invalid SDNode and thus, the crash was only exposed by the constant
+; folding.
+define void @phi_vec1half_to_f32_with_const_folding(ptr addrspace(1) %dst) #0 {
+; CHECK-LABEL: phi_vec1half_to_f32_with_const_folding:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: v_cvt_f32_f16_e64 v2, s4
+; CHECK-NEXT: ; %bb.1: ; %bb
+; CHECK-NEXT: v_cvt_f16_f32_e64 v2, v2
+; CHECK-NEXT: s_mov_b32 s7, 0xf000
+; CHECK-NEXT: s_mov_b32 s6, 0
+; CHECK-NEXT: s_mov_b32 s4, s6
+; CHECK-NEXT: s_mov_b32 s5, s6
+; CHECK-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 offset:2
+; CHECK-NEXT: v_cvt_f16_f32_e64 v2, s4
+; CHECK-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ br label %bb
+
+bb:
+ %phi = phi <1 x half> [ zeroinitializer, %entry ]
+ %res = shufflevector <1 x half> poison, <1 x half> %phi, <2 x i32> <i32 0, i32 1>
+ store <2 x half> %res, ptr addrspace(1) %dst
+ ret void
+}
+
+; Same as phi_vec1half_to_f32_with_const_folding but without the folding.
+; This test exercises the same invalid SDNode, but it happened to work by
+; accident before. Here we make sure the fix also works as expected in the
+; non-constant folding case.
+define void @phi_vec1half_to_f32(ptr addrspace(1) %src, ptr addrspace(1) %dst) #0 {
+; CHECK-LABEL: phi_vec1half_to_f32:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s7, 0xf000
+; CHECK-NEXT: s_mov_b32 s6, 0
+; CHECK-NEXT: s_mov_b32 s4, s6
+; CHECK-NEXT: s_mov_b32 s5, s6
+; CHECK-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cvt_f32_f16_e64 v0, v0
+; CHECK-NEXT: ; %bb.1: ; %bb
+; CHECK-NEXT: v_cvt_f16_f32_e64 v0, v0
+; CHECK-NEXT: s_mov_b32 s7, 0xf000
+; CHECK-NEXT: s_mov_b32 s6, 0
+; CHECK-NEXT: s_mov_b32 s4, s6
+; CHECK-NEXT: s_mov_b32 s5, s6
+; CHECK-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 offset:2
+; CHECK-NEXT: v_cvt_f16_f32_e64 v0, s4
+; CHECK-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %input = load <1 x half>, ptr addrspace(1) %src
+ br label %bb
+
+bb:
+ %phi = phi <1 x half> [ %input, %entry ]
+ %res = shufflevector <1 x half> poison, <1 x half> %phi, <2 x i32> <i32 0, i32 1>
+ store <2 x half> %res, ptr addrspace(1) %dst
+ ret void
+}
+
+; Same as phi_vec1half_to_f32 but with bfloat instead of half.
+define void @phi_vec1bf16_to_f32(ptr addrspace(1) %src, ptr addrspace(1) %dst) #0 {
+; CHECK-LABEL: phi_vec1bf16_to_f32:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s7, 0xf000
+; CHECK-NEXT: s_mov_b32 s6, 0
+; CHECK-NEXT: s_mov_b32 s4, s6
+; CHECK-NEXT: s_mov_b32 s5, s6
+; CHECK-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; CHECK-NEXT: s_mov_b32 s4, 16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_lshlrev_b32_e64 v0, s4, v0
+; CHECK-NEXT: ; %bb.1: ; %bb
+; CHECK-NEXT: v_mul_f32_e64 v0, 1.0, v0
+; CHECK-NEXT: s_mov_b32 s4, 16
+; CHECK-NEXT: v_lshrrev_b32_e64 v0, s4, v0
+; CHECK-NEXT: s_mov_b32 s7, 0xf000
+; CHECK-NEXT: s_mov_b32 s6, 0
+; CHECK-NEXT: s_mov_b32 s4, s6
+; CHECK-NEXT: s_mov_b32 s5, s6
+; CHECK-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %input = load <1 x bfloat>, ptr addrspace(1) %src
+ br label %bb
+
+bb:
+ %phi = phi <1 x bfloat> [ %input, %entry ]
+ %res = shufflevector <1 x bfloat> poison, <1 x bfloat> %phi, <2 x i32> <i32 0, i32 1>
+ store <2 x bfloat> %res, ptr addrspace(1) %dst
+ ret void
+}
+
+; Same as phi_vec1half_to_f32_with_const_folding but with bfloat instead of half.
+define void @phi_vec1bf16_to_f32_with_const_folding(ptr addrspace(1) %dst) #0 {
+; CHECK-LABEL: phi_vec1bf16_to_f32_with_const_folding:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: ; %bb.1: ; %bb
+; CHECK-NEXT: v_mul_f32_e64 v2, 1.0, s4
+; CHECK-NEXT: s_mov_b32 s4, 16
+; CHECK-NEXT: v_lshrrev_b32_e32 v2, s4, v2
+; CHECK-NEXT: s_mov_b32 s7, 0xf000
+; CHECK-NEXT: s_mov_b32 s6, 0
+; CHECK-NEXT: s_mov_b32 s4, s6
+; CHECK-NEXT: s_mov_b32 s5, s6
+; CHECK-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ br label %bb
+
+bb:
+ %phi = phi <1 x bfloat> [ zeroinitializer, %entry ]
+ %res = shufflevector <1 x bfloat> poison, <1 x bfloat> %phi, <2 x i32> <i32 0, i32 1>
+ store <2 x bfloat> %res, ptr addrspace(1) %dst
+ ret void
+}
+
+attributes #0 = { noinline optnone }
diff --git a/llvm/test/CodeGen/ARM/arm-half-promote.ll b/llvm/test/CodeGen/ARM/arm-half-promote.ll
index e1ab75b2ac7f1..a5fafd4238616 100644
--- a/llvm/test/CodeGen/ARM/arm-half-promote.ll
+++ b/llvm/test/CodeGen/ARM/arm-half-promote.ll
@@ -116,7 +116,9 @@ define fastcc { <8 x half>, <8 x half> } @f3() {
define void @extract_insert(ptr %dst) optnone noinline {
; CHECK-LABEL: extract_insert:
-; CHECK: vmov.i32 d0, #0x0
+; CHECK: movs r1, #0
+; CHECK: vmov s0, r1
+; CHECK: vcvtb.f32.f16 s0, s0
; CHECK: vcvtb.f16.f32 s0, s0
; CHECK: vmov r1, s0
; CHECK: strh r1, [r0]
diff --git a/llvm/test/CodeGen/X86/select-phi-s16-fp.ll b/llvm/test/CodeGen/X86/select-phi-s16-fp.ll
new file mode 100644
index 0000000000000..083f418b6752e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/select-phi-s16-fp.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mcpu=generic -mtriple=x86_64-apple-darwin %s -o - | FileCheck %s
+
+; For all these tests we disable optimizations through function attributes
+; because the code we are exercising here needs phis and we want to keep the
+; IR small.
+
+; This code used to crash in SDISel because bf16 was promoted to f32 through
+; a `f32 = vector_extract_elt <1 x bf16>, i32 0`, which is illegal.
+; The invalid SDNode and thus, the crash was only exposed by the constant
+; folding.
+define void @phi_vec1bf16_to_f32_with_const_folding(ptr %dst) #0 {
+; CHECK-LABEL: phi_vec1bf16_to_f32_with_const_folding:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbx, -16
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: jmp LBB0_1
+; CHECK-NEXT: LBB0_1: ## %bb
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: callq ___truncsfbf2
+; CHECK-NEXT: pextrw $0, %xmm0, %eax
+; CHECK-NEXT: movw %ax, 2(%rbx)
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
+entry:
+ br label %bb
+
+bb:
+ %phi = phi <1 x bfloat> [ zeroinitializer, %entry ]
+ %res = shufflevector <1 x bfloat> poison, <1 x bfloat> %phi, <2 x i32> <i32 0, i32 1>
+ store <2 x bfloat> %res, ptr %dst
+ ret void
+}
+; Same as phi_vec1bf16_to_f32_with_const_folding but without the constant
+; folding.
+; This test exercises the same invalid SDNode, but it happened to work by
+; accident before. Here we make sure the fix also works as expected in the
+; non-constant folding case.
+define void @phi_vec1bf16_to_f32(ptr %src, ptr %dst) #0 {
+; CHECK-LABEL: phi_vec1bf16_to_f32:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbx, -16
+; CHECK-NEXT: movq %rsi, %rbx
+; CHECK-NEXT: movzwl (%rdi), %eax
+; CHECK-NEXT: shll $16, %eax
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: jmp LBB1_1
+; CHECK-NEXT: LBB1_1: ## %bb
+; CHECK-NEXT: callq ___truncsfbf2
+; CHECK-NEXT: pextrw $0, %xmm0, %eax
+; CHECK-NEXT: movw %ax, 2(%rbx)
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
+entry:
+ %input = load <1 x bfloat>, ptr %src
+ br label %bb
+
+bb:
+ %phi = phi <1 x bfloat> [ %input, %entry ]
+ %res = shufflevector <1 x bfloat> poison, <1 x bfloat> %phi, <2 x i32> <i32 0, i32 1>
+ store <2 x bfloat> %res, ptr %dst
+ ret void
+}
+
+
+; Half type is legal on x86 so nothing special here, but it
+; doesn't hurt to be thorough.
+define void @phi_vec1half_with_const_folding(ptr %dst) #0 {
+; CHECK-LABEL: phi_vec1half_with_const_folding:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: jmp LBB2_1
+; CHECK-NEXT: LBB2_1: ## %bb
+; CHECK-NEXT: pextrw $0, %xmm0, %eax
+; CHECK-NEXT: movw %ax, 2(%rdi)
+; CHECK-NEXT: retq
+entry:
+ br label %bb
+
+bb:
+ %phi = phi <1 x half> [ zeroinitializer, %entry ]
+ %res = shufflevector <1 x half> poison, <1 x half> %phi, <2 x i32> <i32 0, i32 1>
+ store <2 x half> %res, ptr %dst
+ ret void
+}
+
+; Half type is legal on x86 so nothing special here, but it
+; doesn't hurt to be thorough.
+; Same as phi_vec1half_with_const_folding but without the constant folding.
+define void @phi_vec1half(ptr %src, ptr %dst) #0 {
+; CHECK-LABEL: phi_vec1half:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: pinsrw $0, (%rdi), %xmm0
+; CHECK-NEXT: jmp LBB3_1
+; CHECK-NEXT: LBB3_1: ## %bb
+; CHECK-NEXT: pextrw $0, %xmm0, %eax
+; CHECK-NEXT: movw %ax, 2(%rsi)
+; CHECK-NEXT: retq
+entry:
+ %input = load <1 x half>, ptr %src
+ br label %bb
+
+bb:
+ %phi = phi <1 x half> [ %input, %entry ]
+ %res = shufflevector <1 x half> poison, <1 x half> %phi, <2 x i32> <i32 0, i32 1>
+ store <2 x half> %res, ptr %dst
+ ret void
+}
+
+attributes #0 = { noinline optnone }
More information about the llvm-commits
mailing list