[llvm] [AArch64] Switch to soft promoting half types. (PR #80576)

Sat Feb 3 18:47:48 PST 2024

llvmbot wrote:




@llvm/pr-subscribers-backend-aarch64

Author: Harald van Dijk (hvdijk)

<details>
<summary>Changes</summary>

The traditional promotion of half types is known to generate wrong code.

This is like #80440 for ARM, except that far less is affected: on AArch64, hardware floating-point support always includes FP16 support and is unaffected by these changes. The change only affects `-mgeneral-regs-only` (Clang) / `-mattr=-fp-armv8` (LLVM).

---
Full diff: https://github.com/llvm/llvm-project/pull/80576.diff


2 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.h (+4) 
- (modified) llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll (+24-116) 


``````````diff

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 436b21fd13463..160eafab8c364 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1308,6 +1308,10 @@ class AArch64TargetLowering : public TargetLowering {
   bool preferScalarizeSplat(SDNode *N) const override;
 
   unsigned getMinimumJumpTableEntries() const override;
+
+  bool softPromoteHalfType() const override { return true; }
+
+  bool useFPRegsForHalfType() const override { return true; }
 };
 
 namespace AArch64 {
diff --git a/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll
index 37186cf22ccc7..a34f7abcc22a3 100644
--- a/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll
+++ b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll
@@ -70,22 +70,20 @@ define void @v3f16_arg(<3 x half> %arg, ptr %ptr) #0 {
 ; NOFP16-NEXT:    .cfi_offset w22, -32
 ; NOFP16-NEXT:    .cfi_offset w30, -48
 ; NOFP16-NEXT:    mov w21, w0
-; NOFP16-NEXT:    and w0, w2, #0xffff
+; NOFP16-NEXT:    and w0, w1, #0xffff
 ; NOFP16-NEXT:    mov x19, x3
-; NOFP16-NEXT:    mov w20, w1
+; NOFP16-NEXT:    mov w20, w2
 ; NOFP16-NEXT:    bl __gnu_h2f_ieee
 ; NOFP16-NEXT:    mov w22, w0
 ; NOFP16-NEXT:    and w0, w21, #0xffff
 ; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    mov w21, w0
+; NOFP16-NEXT:    mov w8, w0
 ; NOFP16-NEXT:    and w0, w20, #0xffff
+; NOFP16-NEXT:    orr x21, x8, x22, lsl #32
 ; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    mov w8, w21
-; NOFP16-NEXT:    // kill: def $w0 killed $w0 def $x0
-; NOFP16-NEXT:    str w22, [x19, #8]
-; NOFP16-NEXT:    orr x8, x8, x0, lsl #32
+; NOFP16-NEXT:    str x21, [x19]
 ; NOFP16-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; NOFP16-NEXT:    str x8, [x19]
+; NOFP16-NEXT:    str w0, [x19, #8]
 ; NOFP16-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; NOFP16-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
 ; NOFP16-NEXT:    ret
@@ -182,46 +180,17 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 {
 define void @outgoing_v4f16_return(ptr %ptr) #0 {
 ; NOFP16-LABEL: outgoing_v4f16_return:
 ; NOFP16:       // %bb.0:
-; NOFP16-NEXT:    stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
-; NOFP16-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; NOFP16-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; NOFP16-NEXT:    .cfi_def_cfa_offset 48
+; NOFP16-NEXT:    stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
+; NOFP16-NEXT:    .cfi_def_cfa_offset 16
 ; NOFP16-NEXT:    .cfi_offset w19, -8
-; NOFP16-NEXT:    .cfi_offset w20, -16
-; NOFP16-NEXT:    .cfi_offset w21, -24
-; NOFP16-NEXT:    .cfi_offset w22, -32
-; NOFP16-NEXT:    .cfi_offset w23, -40
-; NOFP16-NEXT:    .cfi_offset w30, -48
+; NOFP16-NEXT:    .cfi_offset w30, -16
 ; NOFP16-NEXT:    mov x19, x0
 ; NOFP16-NEXT:    bl v4f16_result
-; NOFP16-NEXT:    and w0, w0, #0xffff
-; NOFP16-NEXT:    mov w20, w1
-; NOFP16-NEXT:    mov w21, w2
-; NOFP16-NEXT:    mov w22, w3
-; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    mov w23, w0
-; NOFP16-NEXT:    and w0, w20, #0xffff
-; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    mov w20, w0
-; NOFP16-NEXT:    and w0, w21, #0xffff
-; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    mov w21, w0
-; NOFP16-NEXT:    and w0, w22, #0xffff
-; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    bl __gnu_f2h_ieee
-; NOFP16-NEXT:    strh w0, [x19, #6]
-; NOFP16-NEXT:    mov w0, w21
-; NOFP16-NEXT:    bl __gnu_f2h_ieee
-; NOFP16-NEXT:    strh w0, [x19, #4]
-; NOFP16-NEXT:    mov w0, w20
-; NOFP16-NEXT:    bl __gnu_f2h_ieee
-; NOFP16-NEXT:    strh w0, [x19, #2]
-; NOFP16-NEXT:    mov w0, w23
-; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    strh w2, [x19, #4]
+; NOFP16-NEXT:    strh w3, [x19, #6]
+; NOFP16-NEXT:    strh w1, [x19, #2]
 ; NOFP16-NEXT:    strh w0, [x19]
-; NOFP16-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; NOFP16-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; NOFP16-NEXT:    ldp x30, x23, [sp], #48 // 16-byte Folded Reload
+; NOFP16-NEXT:    ldp x30, x19, [sp], #16 // 16-byte Folded Reload
 ; NOFP16-NEXT:    ret
   %val = call <4 x half> @v4f16_result()
   store <4 x half> %val, ptr %ptr
@@ -231,82 +200,21 @@ define void @outgoing_v4f16_return(ptr %ptr) #0 {
 define void @outgoing_v8f16_return(ptr %ptr) #0 {
 ; NOFP16-LABEL: outgoing_v8f16_return:
 ; NOFP16:       // %bb.0:
-; NOFP16-NEXT:    stp x30, x27, [sp, #-80]! // 16-byte Folded Spill
-; NOFP16-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
-; NOFP16-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
-; NOFP16-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; NOFP16-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; NOFP16-NEXT:    .cfi_def_cfa_offset 80
+; NOFP16-NEXT:    stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
+; NOFP16-NEXT:    .cfi_def_cfa_offset 16
 ; NOFP16-NEXT:    .cfi_offset w19, -8
-; NOFP16-NEXT:    .cfi_offset w20, -16
-; NOFP16-NEXT:    .cfi_offset w21, -24
-; NOFP16-NEXT:    .cfi_offset w22, -32
-; NOFP16-NEXT:    .cfi_offset w23, -40
-; NOFP16-NEXT:    .cfi_offset w24, -48
-; NOFP16-NEXT:    .cfi_offset w25, -56
-; NOFP16-NEXT:    .cfi_offset w26, -64
-; NOFP16-NEXT:    .cfi_offset w27, -72
-; NOFP16-NEXT:    .cfi_offset w30, -80
+; NOFP16-NEXT:    .cfi_offset w30, -16
 ; NOFP16-NEXT:    mov x19, x0
 ; NOFP16-NEXT:    bl v8f16_result
-; NOFP16-NEXT:    and w0, w0, #0xffff
-; NOFP16-NEXT:    mov w21, w1
-; NOFP16-NEXT:    mov w22, w2
-; NOFP16-NEXT:    mov w23, w3
-; NOFP16-NEXT:    mov w24, w4
-; NOFP16-NEXT:    mov w25, w5
-; NOFP16-NEXT:    mov w26, w6
-; NOFP16-NEXT:    mov w27, w7
-; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    mov w20, w0
-; NOFP16-NEXT:    and w0, w21, #0xffff
-; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    mov w21, w0
-; NOFP16-NEXT:    and w0, w22, #0xffff
-; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    mov w22, w0
-; NOFP16-NEXT:    and w0, w23, #0xffff
-; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    mov w23, w0
-; NOFP16-NEXT:    and w0, w24, #0xffff
-; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    mov w24, w0
-; NOFP16-NEXT:    and w0, w25, #0xffff
-; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    mov w25, w0
-; NOFP16-NEXT:    and w0, w26, #0xffff
-; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    mov w26, w0
-; NOFP16-NEXT:    and w0, w27, #0xffff
-; NOFP16-NEXT:    bl __gnu_h2f_ieee
-; NOFP16-NEXT:    bl __gnu_f2h_ieee
-; NOFP16-NEXT:    strh w0, [x19, #14]
-; NOFP16-NEXT:    mov w0, w26
-; NOFP16-NEXT:    bl __gnu_f2h_ieee
-; NOFP16-NEXT:    strh w0, [x19, #12]
-; NOFP16-NEXT:    mov w0, w25
-; NOFP16-NEXT:    bl __gnu_f2h_ieee
-; NOFP16-NEXT:    strh w0, [x19, #10]
-; NOFP16-NEXT:    mov w0, w24
-; NOFP16-NEXT:    bl __gnu_f2h_ieee
-; NOFP16-NEXT:    strh w0, [x19, #8]
-; NOFP16-NEXT:    mov w0, w23
-; NOFP16-NEXT:    bl __gnu_f2h_ieee
-; NOFP16-NEXT:    strh w0, [x19, #6]
-; NOFP16-NEXT:    mov w0, w22
-; NOFP16-NEXT:    bl __gnu_f2h_ieee
-; NOFP16-NEXT:    strh w0, [x19, #4]
-; NOFP16-NEXT:    mov w0, w21
-; NOFP16-NEXT:    bl __gnu_f2h_ieee
-; NOFP16-NEXT:    strh w0, [x19, #2]
-; NOFP16-NEXT:    mov w0, w20
-; NOFP16-NEXT:    bl __gnu_f2h_ieee
+; NOFP16-NEXT:    strh w5, [x19, #10]
+; NOFP16-NEXT:    strh w7, [x19, #14]
+; NOFP16-NEXT:    strh w6, [x19, #12]
+; NOFP16-NEXT:    strh w4, [x19, #8]
+; NOFP16-NEXT:    strh w3, [x19, #6]
+; NOFP16-NEXT:    strh w2, [x19, #4]
+; NOFP16-NEXT:    strh w1, [x19, #2]
 ; NOFP16-NEXT:    strh w0, [x19]
-; NOFP16-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; NOFP16-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; NOFP16-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
-; NOFP16-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
-; NOFP16-NEXT:    ldp x30, x27, [sp], #80 // 16-byte Folded Reload
+; NOFP16-NEXT:    ldp x30, x19, [sp], #16 // 16-byte Folded Reload
 ; NOFP16-NEXT:    ret
   %val = call <8 x half> @v8f16_result()
   store <8 x half> %val, ptr %ptr

``````````

</details>


https://github.com/llvm/llvm-project/pull/80576