[llvm] [DAG] Reduce instruction count by improving legalization of i128 data types (PR #99913)
Julius Alexandre via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 22 11:28:05 PDT 2024
https://github.com/medievalghoul created https://github.com/llvm/llvm-project/pull/99913
**Issue:** https://github.com/rust-lang/rust/issues/124790
**Previous PR:** https://github.com/llvm/llvm-project/pull/99614
**Godbolt:** https://rust.godbolt.org/z/T7eKP3Tvo
**AArch64:** https://alive2.llvm.org/ce/z/dqr2Kg
**x86:** https://alive2.llvm.org/ce/z/ze88Hw
cc: @RKSimon @topperc
>From b3078d1bfc6aa1c92c0c85d9352c6fa2896b1adf Mon Sep 17 00:00:00 2001
From: medievalghoul <61852278+medievalghoul at users.noreply.github.com>
Date: Mon, 22 Jul 2024 14:10:42 -0400
Subject: [PATCH 1/3] Uploaded test without optimization
---
llvm/test/CodeGen/AArch64/avg-i128.ll | 136 +++++++++++++++++++
llvm/test/CodeGen/X86/avg-i128.ll | 184 ++++++++++++++++++++++++++
2 files changed, 320 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/avg-i128.ll
create mode 100644 llvm/test/CodeGen/X86/avg-i128.ll
diff --git a/llvm/test/CodeGen/AArch64/avg-i128.ll b/llvm/test/CodeGen/AArch64/avg-i128.ll
new file mode 100644
index 0000000000000..d1d2cd7dd0c89
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/avg-i128.ll
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+define i128 @avgflooru_i128(i128 %x, i128 %y) {
+; CHECK-LABEL: avgflooru_i128:
+; CHECK: // %bb.0: // %start
+; CHECK-NEXT: eor x8, x3, x1
+; CHECK-NEXT: eor x9, x2, x0
+; CHECK-NEXT: and x10, x2, x0
+; CHECK-NEXT: extr x9, x8, x9, #1
+; CHECK-NEXT: lsr x8, x8, #1
+; CHECK-NEXT: and x11, x3, x1
+; CHECK-NEXT: adds x0, x10, x9
+; CHECK-NEXT: adc x1, x11, x8
+; CHECK-NEXT: ret
+start:
+ %xor = xor i128 %y, %x
+ %lshr = lshr i128 %xor, 1
+ %and = and i128 %y, %x
+ %add = add i128 %lshr, %and
+ ret i128 %add
+}
+
+declare void @use(i8)
+
+define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) {
+; CHECK-LABEL: avgflooru_i128_multi_use:
+; CHECK: // %bb.0: // %start
+; CHECK-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill
+; CHECK-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w21, -24
+; CHECK-NEXT: .cfi_offset w22, -32
+; CHECK-NEXT: .cfi_offset w23, -40
+; CHECK-NEXT: .cfi_offset w24, -48
+; CHECK-NEXT: .cfi_offset w30, -64
+; CHECK-NEXT: eor x23, x3, x1
+; CHECK-NEXT: eor x24, x2, x0
+; CHECK-NEXT: mov x21, x1
+; CHECK-NEXT: mov x22, x0
+; CHECK-NEXT: mov x0, x24
+; CHECK-NEXT: mov x1, x23
+; CHECK-NEXT: mov x19, x3
+; CHECK-NEXT: mov x20, x2
+; CHECK-NEXT: bl use
+; CHECK-NEXT: extr x24, x23, x24, #1
+; CHECK-NEXT: lsr x23, x23, #1
+; CHECK-NEXT: mov x0, x24
+; CHECK-NEXT: mov x1, x23
+; CHECK-NEXT: bl use
+; CHECK-NEXT: and x8, x20, x22
+; CHECK-NEXT: and x9, x19, x21
+; CHECK-NEXT: adds x0, x8, x24
+; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: adc x1, x9, x23
+; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+start:
+ %xor = xor i128 %y, %x
+ call void @use(i128 %xor)
+ %lshr = lshr i128 %xor, 1
+ call void @use(i128 %lshr)
+ %and = and i128 %y, %x
+ %add = add i128 %lshr, %and
+ ret i128 %add
+}
+
+define i128 @avgflooru_i128_negative(i128 %x, i128 %y) {
+; CHECK-LABEL: avgflooru_i128_negative:
+; CHECK: // %bb.0: // %start
+; CHECK-NEXT: mvn x8, x0
+; CHECK-NEXT: and x9, x2, x0
+; CHECK-NEXT: mvn x10, x1
+; CHECK-NEXT: and x11, x3, x1
+; CHECK-NEXT: adds x0, x8, x9
+; CHECK-NEXT: adc x1, x10, x11
+; CHECK-NEXT: ret
+start:
+ %xor = xor i128 %x, -1
+ %and = and i128 %y, %x
+ %add = add i128 %xor, %and
+ ret i128 %add
+}
+
+define i32 @avgflooru_i128_negative2(i32 %x, i32 %y) {
+; CHECK-LABEL: avgflooru_i128_negative2:
+; CHECK: // %bb.0: // %start
+; CHECK-NEXT: mov w8, w1
+; CHECK-NEXT: add x8, x8, w0, uxtw
+; CHECK-NEXT: lsr x0, x8, #1
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+start:
+ %xor = xor i32 %y, %x
+ %lshr = lshr i32 %xor, 1
+ %and = and i32 %y, %x
+ %add = add i32 %lshr, %and
+ ret i32 %add
+}
+
+define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) {
+; CHECK-LABEL: avgflooru_i128_vec:
+; CHECK: // %bb.0: // %start
+; CHECK-NEXT: eor x8, x4, x0
+; CHECK-NEXT: eor x9, x5, x1
+; CHECK-NEXT: eor x11, x6, x2
+; CHECK-NEXT: extr x8, x9, x8, #1
+; CHECK-NEXT: eor x12, x7, x3
+; CHECK-NEXT: and x13, x4, x0
+; CHECK-NEXT: lsr x9, x9, #1
+; CHECK-NEXT: extr x11, x12, x11, #1
+; CHECK-NEXT: and x10, x5, x1
+; CHECK-NEXT: adds x0, x13, x8
+; CHECK-NEXT: lsr x8, x12, #1
+; CHECK-NEXT: and x12, x6, x2
+; CHECK-NEXT: adc x1, x10, x9
+; CHECK-NEXT: adds x10, x12, x11
+; CHECK-NEXT: and x9, x7, x3
+; CHECK-NEXT: fmov d0, x10
+; CHECK-NEXT: adc x3, x9, x8
+; CHECK-NEXT: mov v0.d[1], x3
+; CHECK-NEXT: fmov x2, d0
+; CHECK-NEXT: ret
+start:
+ %xor = xor <2 x i128> %y, %x
+ %lshr = lshr <2 x i128> %xor, <i128 1, i128 1>
+ %and = and <2 x i128> %y, %x
+ %add = add <2 x i128> %lshr, %and
+ ret <2 x i128> %add
+}
diff --git a/llvm/test/CodeGen/X86/avg-i128.ll b/llvm/test/CodeGen/X86/avg-i128.ll
new file mode 100644
index 0000000000000..82827720865dc
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avg-i128.ll
@@ -0,0 +1,184 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=x86_64 < %s | FileCheck %s
+
+define i128 @avgflooru_i128(i128 %x, i128 %y) {
+; CHECK-LABEL: avgflooru_i128:
+; CHECK: # %bb.0: # %start
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: xorq %rdi, %rax
+; CHECK-NEXT: movq %rcx, %r8
+; CHECK-NEXT: xorq %rsi, %r8
+; CHECK-NEXT: shrdq $1, %r8, %rax
+; CHECK-NEXT: andq %rsi, %rcx
+; CHECK-NEXT: shrq %r8
+; CHECK-NEXT: andq %rdi, %rdx
+; CHECK-NEXT: addq %rdx, %rax
+; CHECK-NEXT: adcq %rcx, %r8
+; CHECK-NEXT: movq %r8, %rdx
+; CHECK-NEXT: retq
+start:
+ %xor = xor i128 %y, %x
+ %lshr = lshr i128 %xor, 1
+ %and = and i128 %y, %x
+ %add = add i128 %lshr, %and
+ ret i128 %add
+}
+
+declare void @use(i8)
+
+define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) {
+; CHECK-LABEL: avgflooru_i128_multi_use:
+; CHECK: # %bb.0: # %start
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rcx, %rbx
+; CHECK-NEXT: movq %rdx, %r14
+; CHECK-NEXT: movq %rsi, %r15
+; CHECK-NEXT: movq %rdi, %r12
+; CHECK-NEXT: movq %rdx, %r13
+; CHECK-NEXT: xorq %rdi, %r13
+; CHECK-NEXT: movq %rcx, %rbp
+; CHECK-NEXT: xorq %rsi, %rbp
+; CHECK-NEXT: movq %r13, %rdi
+; CHECK-NEXT: movq %rbp, %rsi
+; CHECK-NEXT: callq use@PLT
+; CHECK-NEXT: shrdq $1, %rbp, %r13
+; CHECK-NEXT: shrq %rbp
+; CHECK-NEXT: movq %r13, %rdi
+; CHECK-NEXT: movq %rbp, %rsi
+; CHECK-NEXT: callq use@PLT
+; CHECK-NEXT: andq %r15, %rbx
+; CHECK-NEXT: andq %r12, %r14
+; CHECK-NEXT: addq %r13, %r14
+; CHECK-NEXT: adcq %rbp, %rbx
+; CHECK-NEXT: movq %r14, %rax
+; CHECK-NEXT: movq %rbx, %rdx
+; CHECK-NEXT: addq $8, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+start:
+ %xor = xor i128 %y, %x
+ call void @use(i128 %xor)
+ %lshr = lshr i128 %xor, 1
+ call void @use(i128 %lshr)
+ %and = and i128 %y, %x
+ %add = add i128 %lshr, %and
+ ret i128 %add
+}
+
+define i128 @avgflooru_i128_negative(i128 %x, i128 %y) {
+; CHECK-LABEL: avgflooru_i128_negative:
+; CHECK: # %bb.0: # %start
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: andq %rsi, %rcx
+; CHECK-NEXT: notq %rsi
+; CHECK-NEXT: andq %rdi, %rdx
+; CHECK-NEXT: notq %rax
+; CHECK-NEXT: addq %rdx, %rax
+; CHECK-NEXT: adcq %rcx, %rsi
+; CHECK-NEXT: movq %rsi, %rdx
+; CHECK-NEXT: retq
+start:
+ %xor = xor i128 %x, -1
+ %and = and i128 %y, %x
+ %add = add i128 %xor, %and
+ ret i128 %add
+}
+
+define i32 @avgflooru_i128_negative2(i32 %x, i32 %y) {
+; CHECK-LABEL: avgflooru_i128_negative2:
+; CHECK: # %bb.0: # %start
+; CHECK-NEXT: movl %edi, %ecx
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: addq %rcx, %rax
+; CHECK-NEXT: shrq %rax
+; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-NEXT: retq
+start:
+ %xor = xor i32 %y, %x
+ %lshr = lshr i32 %xor, 1
+ %and = and i32 %y, %x
+ %add = add i32 %lshr, %and
+ ret i32 %add
+}
+
+define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) {
+; CHECK-LABEL: avgflooru_i128_vec:
+; CHECK: # %bb.0: # %start
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: .cfi_offset %rbx, -24
+; CHECK-NEXT: .cfi_offset %r14, -16
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT: movq %r10, %r14
+; CHECK-NEXT: xorq %rsi, %r14
+; CHECK-NEXT: movq %rbx, %r11
+; CHECK-NEXT: xorq %rdx, %r11
+; CHECK-NEXT: shrdq $1, %r11, %r14
+; CHECK-NEXT: andq %rdx, %rbx
+; CHECK-NEXT: shrq %r11
+; CHECK-NEXT: andq %rsi, %r10
+; CHECK-NEXT: addq %r14, %r10
+; CHECK-NEXT: adcq %rbx, %r11
+; CHECK-NEXT: movq %r9, %rdx
+; CHECK-NEXT: xorq %rcx, %rdx
+; CHECK-NEXT: movq %rdi, %rsi
+; CHECK-NEXT: xorq %r8, %rsi
+; CHECK-NEXT: shrdq $1, %rsi, %rdx
+; CHECK-NEXT: andq %r8, %rdi
+; CHECK-NEXT: shrq %rsi
+; CHECK-NEXT: andq %rcx, %r9
+; CHECK-NEXT: addq %rdx, %r9
+; CHECK-NEXT: adcq %rdi, %rsi
+; CHECK-NEXT: movq %r9, 16(%rax)
+; CHECK-NEXT: movq %r10, (%rax)
+; CHECK-NEXT: movq %rsi, 24(%rax)
+; CHECK-NEXT: movq %r11, 8(%rax)
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+start:
+ %xor = xor <2 x i128> %y, %x
+ %lshr = lshr <2 x i128> %xor, <i128 1, i128 1>
+ %and = and <2 x i128> %y, %x
+ %add = add <2 x i128> %lshr, %and
+ ret <2 x i128> %add
+}
>From d3a909eb469b3ee12d83b7a76ed6929615d3a70b Mon Sep 17 00:00:00 2001
From: medievalghoul <61852278+medievalghoul at users.noreply.github.com>
Date: Mon, 22 Jul 2024 14:16:50 -0400
Subject: [PATCH 2/3] Uploaded test with optimization
---
llvm/test/CodeGen/AArch64/avg-i128.ll | 57 ++++++++----------
llvm/test/CodeGen/X86/avg-i128.ll | 83 ++++++++++-----------------
2 files changed, 54 insertions(+), 86 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/avg-i128.ll b/llvm/test/CodeGen/AArch64/avg-i128.ll
index d1d2cd7dd0c89..75ee52decbb70 100644
--- a/llvm/test/CodeGen/AArch64/avg-i128.ll
+++ b/llvm/test/CodeGen/AArch64/avg-i128.ll
@@ -4,14 +4,12 @@
define i128 @avgflooru_i128(i128 %x, i128 %y) {
; CHECK-LABEL: avgflooru_i128:
; CHECK: // %bb.0: // %start
-; CHECK-NEXT: eor x8, x3, x1
-; CHECK-NEXT: eor x9, x2, x0
-; CHECK-NEXT: and x10, x2, x0
-; CHECK-NEXT: extr x9, x8, x9, #1
-; CHECK-NEXT: lsr x8, x8, #1
-; CHECK-NEXT: and x11, x3, x1
-; CHECK-NEXT: adds x0, x10, x9
-; CHECK-NEXT: adc x1, x11, x8
+; CHECK-NEXT: adds x9, x0, x2
+; CHECK-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000
+; CHECK-NEXT: adcs x10, x1, x3
+; CHECK-NEXT: csel x1, x8, xzr, hs
+; CHECK-NEXT: extr x0, x10, x9, #1
+; CHECK-NEXT: bfxil x1, x10, #1, #63
; CHECK-NEXT: ret
start:
%xor = xor i128 %y, %x
@@ -47,18 +45,18 @@ define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) {
; CHECK-NEXT: mov x19, x3
; CHECK-NEXT: mov x20, x2
; CHECK-NEXT: bl use
-; CHECK-NEXT: extr x24, x23, x24, #1
-; CHECK-NEXT: lsr x23, x23, #1
-; CHECK-NEXT: mov x0, x24
-; CHECK-NEXT: mov x1, x23
+; CHECK-NEXT: extr x0, x23, x24, #1
+; CHECK-NEXT: lsr x1, x23, #1
; CHECK-NEXT: bl use
-; CHECK-NEXT: and x8, x20, x22
-; CHECK-NEXT: and x9, x19, x21
-; CHECK-NEXT: adds x0, x8, x24
+; CHECK-NEXT: adds x8, x22, x20
+; CHECK-NEXT: mov x10, #-9223372036854775808 // =0x8000000000000000
+; CHECK-NEXT: adcs x9, x21, x19
; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: adc x1, x9, x23
; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: csel x1, x10, xzr, hs
; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: extr x0, x9, x8, #1
+; CHECK-NEXT: bfxil x1, x9, #1, #63
; CHECK-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload
; CHECK-NEXT: ret
start:
@@ -107,23 +105,18 @@ start:
define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) {
; CHECK-LABEL: avgflooru_i128_vec:
; CHECK: // %bb.0: // %start
-; CHECK-NEXT: eor x8, x4, x0
-; CHECK-NEXT: eor x9, x5, x1
-; CHECK-NEXT: eor x11, x6, x2
-; CHECK-NEXT: extr x8, x9, x8, #1
-; CHECK-NEXT: eor x12, x7, x3
-; CHECK-NEXT: and x13, x4, x0
-; CHECK-NEXT: lsr x9, x9, #1
-; CHECK-NEXT: extr x11, x12, x11, #1
-; CHECK-NEXT: and x10, x5, x1
-; CHECK-NEXT: adds x0, x13, x8
-; CHECK-NEXT: lsr x8, x12, #1
-; CHECK-NEXT: and x12, x6, x2
-; CHECK-NEXT: adc x1, x10, x9
-; CHECK-NEXT: adds x10, x12, x11
-; CHECK-NEXT: and x9, x7, x3
+; CHECK-NEXT: adds x8, x0, x4
+; CHECK-NEXT: mov x10, #-9223372036854775808 // =0x8000000000000000
+; CHECK-NEXT: adcs x9, x1, x5
+; CHECK-NEXT: csel x1, x10, xzr, hs
+; CHECK-NEXT: adds x11, x2, x6
+; CHECK-NEXT: extr x0, x9, x8, #1
+; CHECK-NEXT: adcs x12, x3, x7
+; CHECK-NEXT: bfxil x1, x9, #1, #63
+; CHECK-NEXT: csel x3, x10, xzr, hs
+; CHECK-NEXT: extr x10, x12, x11, #1
+; CHECK-NEXT: bfxil x3, x12, #1, #63
; CHECK-NEXT: fmov d0, x10
-; CHECK-NEXT: adc x3, x9, x8
; CHECK-NEXT: mov v0.d[1], x3
; CHECK-NEXT: fmov x2, d0
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/X86/avg-i128.ll b/llvm/test/CodeGen/X86/avg-i128.ll
index 82827720865dc..e0e3283c308d7 100644
--- a/llvm/test/CodeGen/X86/avg-i128.ll
+++ b/llvm/test/CodeGen/X86/avg-i128.ll
@@ -4,17 +4,13 @@
define i128 @avgflooru_i128(i128 %x, i128 %y) {
; CHECK-LABEL: avgflooru_i128:
; CHECK: # %bb.0: # %start
-; CHECK-NEXT: movq %rdx, %rax
-; CHECK-NEXT: xorq %rdi, %rax
-; CHECK-NEXT: movq %rcx, %r8
-; CHECK-NEXT: xorq %rsi, %r8
-; CHECK-NEXT: shrdq $1, %r8, %rax
-; CHECK-NEXT: andq %rsi, %rcx
-; CHECK-NEXT: shrq %r8
-; CHECK-NEXT: andq %rdi, %rdx
+; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: addq %rdx, %rax
-; CHECK-NEXT: adcq %rcx, %r8
-; CHECK-NEXT: movq %r8, %rdx
+; CHECK-NEXT: adcq %rcx, %rsi
+; CHECK-NEXT: setb %cl
+; CHECK-NEXT: shrdq $1, %rsi, %rax
+; CHECK-NEXT: movzbl %cl, %edx
+; CHECK-NEXT: shldq $63, %rsi, %rdx
; CHECK-NEXT: retq
start:
%xor = xor i128 %y, %x
@@ -65,12 +61,13 @@ define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) {
; CHECK-NEXT: movq %r13, %rdi
; CHECK-NEXT: movq %rbp, %rsi
; CHECK-NEXT: callq use@PLT
-; CHECK-NEXT: andq %r15, %rbx
-; CHECK-NEXT: andq %r12, %r14
-; CHECK-NEXT: addq %r13, %r14
-; CHECK-NEXT: adcq %rbp, %rbx
-; CHECK-NEXT: movq %r14, %rax
-; CHECK-NEXT: movq %rbx, %rdx
+; CHECK-NEXT: addq %r14, %r12
+; CHECK-NEXT: adcq %rbx, %r15
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: shrdq $1, %r15, %r12
+; CHECK-NEXT: movzbl %al, %edx
+; CHECK-NEXT: shldq $63, %r15, %rdx
+; CHECK-NEXT: movq %r12, %rax
; CHECK-NEXT: addq $8, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 56
; CHECK-NEXT: popq %rbx
@@ -135,45 +132,23 @@ start:
define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) {
; CHECK-LABEL: avgflooru_i128_vec:
; CHECK: # %bb.0: # %start
-; CHECK-NEXT: pushq %r14
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 24
-; CHECK-NEXT: .cfi_offset %rbx, -24
-; CHECK-NEXT: .cfi_offset %r14, -16
; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r9
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; CHECK-NEXT: movq %r10, %r14
-; CHECK-NEXT: xorq %rsi, %r14
-; CHECK-NEXT: movq %rbx, %r11
-; CHECK-NEXT: xorq %rdx, %r11
-; CHECK-NEXT: shrdq $1, %r11, %r14
-; CHECK-NEXT: andq %rdx, %rbx
-; CHECK-NEXT: shrq %r11
-; CHECK-NEXT: andq %rsi, %r10
-; CHECK-NEXT: addq %r14, %r10
-; CHECK-NEXT: adcq %rbx, %r11
-; CHECK-NEXT: movq %r9, %rdx
-; CHECK-NEXT: xorq %rcx, %rdx
-; CHECK-NEXT: movq %rdi, %rsi
-; CHECK-NEXT: xorq %r8, %rsi
-; CHECK-NEXT: shrdq $1, %rsi, %rdx
-; CHECK-NEXT: andq %r8, %rdi
-; CHECK-NEXT: shrq %rsi
-; CHECK-NEXT: andq %rcx, %r9
-; CHECK-NEXT: addq %rdx, %r9
-; CHECK-NEXT: adcq %rdi, %rsi
-; CHECK-NEXT: movq %r9, 16(%rax)
-; CHECK-NEXT: movq %r10, (%rax)
-; CHECK-NEXT: movq %rsi, 24(%rax)
-; CHECK-NEXT: movq %r11, 8(%rax)
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: popq %r14
-; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rdx
+; CHECK-NEXT: setb %dil
+; CHECK-NEXT: movzbl %dil, %edi
+; CHECK-NEXT: shldq $63, %rdx, %rdi
+; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %r8
+; CHECK-NEXT: setb %r9b
+; CHECK-NEXT: movzbl %r9b, %r9d
+; CHECK-NEXT: shldq $63, %r8, %r9
+; CHECK-NEXT: shldq $63, %rsi, %rdx
+; CHECK-NEXT: shldq $63, %rcx, %r8
+; CHECK-NEXT: movq %r8, 16(%rax)
+; CHECK-NEXT: movq %rdx, (%rax)
+; CHECK-NEXT: movq %r9, 24(%rax)
+; CHECK-NEXT: movq %rdi, 8(%rax)
; CHECK-NEXT: retq
start:
%xor = xor <2 x i128> %y, %x
>From 8881fca9573772d6b89fc6b03451a5e55062b0ef Mon Sep 17 00:00:00 2001
From: medievalghoul <61852278+medievalghoul at users.noreply.github.com>
Date: Mon, 22 Jul 2024 14:18:58 -0400
Subject: [PATCH 3/3] the legalization of i128
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 24 ++++++++++++++++++-
1 file changed, 23 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index c3a20b5044c5f..92795bd37a562 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9318,7 +9318,8 @@ SDValue TargetLowering::expandAVG(SDNode *N, SelectionDAG &DAG) const {
unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
assert((Opc == ISD::AVGFLOORS || Opc == ISD::AVGCEILS ||
Opc == ISD::AVGFLOORU || Opc == ISD::AVGCEILU) &&
- "Unknown AVG node");
+ "Unknown AVG node");
+ EVT SVT = VT.getScalarType();
// If the operands are already extended, we can add+shift.
bool IsExt =
@@ -9352,6 +9353,27 @@ SDValue TargetLowering::expandAVG(SDNode *N, SelectionDAG &DAG) const {
}
}
+ if (Opc == ISD::AVGFLOORU && SVT == MVT::i128) {
+ SDValue UAddWithOverflow = DAG.getNode(ISD::UADDO, dl,
+ DAG.getVTList(VT, MVT::i1), { RHS, LHS });
+
+ SDValue Sum = UAddWithOverflow.getValue(0);
+ SDValue Overflow = UAddWithOverflow.getValue(1);
+
+ // Right shift the sum by 1
+ SDValue One = DAG.getConstant(1, dl, VT);
+ SDValue LShrVal = DAG.getNode(ISD::SRL, dl, VT, Sum, One);
+
+ // Creating the select instruction
+ APInt SignMin = APInt::getSignedMinValue(VT.getSizeInBits());
+ SDValue SignMinVal = DAG.getConstant(SignMin, dl, VT);
+ SDValue ZeroOut = DAG.getConstant(0, dl, VT);
+
+ SDValue SelectVal = DAG.getSelect(dl, VT, Overflow, SignMinVal, ZeroOut);
+
+ return DAG.getNode(ISD::OR, dl, VT, LShrVal, SelectVal);
+ }
+
// avgceils(lhs, rhs) -> sub(or(lhs,rhs),ashr(xor(lhs,rhs),1))
// avgceilu(lhs, rhs) -> sub(or(lhs,rhs),lshr(xor(lhs,rhs),1))
// avgfloors(lhs, rhs) -> add(and(lhs,rhs),ashr(xor(lhs,rhs),1))
More information about the llvm-commits mailing list