[llvm] [AMDGPU] Move `AMDGPUAttributorPass` to full LTO post link stage (PR #102086)

Fri Aug 9 06:31:01 PDT 2024

https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/102086

>From 39e4b4af033ee39038e7a0d265bebcfdb3c213cc Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 9 Aug 2024 12:51:05 +0400
Subject: [PATCH 1/5] TTI: Check legalization cost of abs nodes (#100523)

---
 .../Analysis/CostModel/AMDGPU/arith-ssat.ll   | 72 +++++++++++++++++++
 .../Analysis/CostModel/AMDGPU/arith-usat.ll   | 72 +++++++++++++++++++
 2 files changed, 144 insertions(+)

diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll
index 3c80ada6b51435..df3e302ac4a4d9 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll
@@ -104,6 +104,7 @@ define i32 @add(i32 %arg) {
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V5I64 = call <5 x i64> @llvm.sadd.sat.v5i64(<5 x i64> undef, <5 x i64> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+<<<<<<< HEAD
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
@@ -125,6 +126,29 @@ define i32 @add(i32 %arg) {
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V33I8 = call <33 x i8> @llvm.sadd.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+=======
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V33I8 = call <33 x i8> @llvm.sadd.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+>>>>>>> d7824fab6ee1 (TTI: Check legalization cost of abs nodes (#100523))
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLOW-SIZE-LABEL: 'add'
@@ -226,6 +250,7 @@ define i32 @sub(i32 %arg) {
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V5I64 = call <5 x i64> @llvm.ssub.sat.v5i64(<5 x i64> undef, <5 x i64> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+<<<<<<< HEAD
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
@@ -247,6 +272,29 @@ define i32 @sub(i32 %arg) {
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V33I8 = call <33 x i8> @llvm.ssub.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+=======
+; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V33I8 = call <33 x i8> @llvm.ssub.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+>>>>>>> d7824fab6ee1 (TTI: Check legalization cost of abs nodes (#100523))
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; SLOW-LABEL: 'sub'
@@ -284,6 +332,7 @@ define i32 @sub(i32 %arg) {
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V5I64 = call <5 x i64> @llvm.ssub.sat.v5i64(<5 x i64> undef, <5 x i64> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+<<<<<<< HEAD
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
@@ -305,6 +354,29 @@ define i32 @sub(i32 %arg) {
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V33I8 = call <33 x i8> @llvm.ssub.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+=======
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V33I8 = call <33 x i8> @llvm.ssub.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+>>>>>>> d7824fab6ee1 (TTI: Check legalization cost of abs nodes (#100523))
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLOW-SIZE-LABEL: 'sub'
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll
index f47e03ebac8280..b8d560e0ca3b81 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll
@@ -104,6 +104,7 @@ define i32 @add(i32 %arg) {
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V5I64 = call <5 x i64> @llvm.uadd.sat.v5i64(<5 x i64> undef, <5 x i64> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+<<<<<<< HEAD
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
@@ -125,6 +126,29 @@ define i32 @add(i32 %arg) {
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V33I8 = call <33 x i8> @llvm.uadd.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+=======
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V17I16 = call <17 x i16> @llvm.uadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V33I8 = call <33 x i8> @llvm.uadd.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+>>>>>>> d7824fab6ee1 (TTI: Check legalization cost of abs nodes (#100523))
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLOW-SIZE-LABEL: 'add'
@@ -226,6 +250,7 @@ define i32 @sub(i32 %arg) {
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V5I64 = call <5 x i64> @llvm.usub.sat.v5i64(<5 x i64> undef, <5 x i64> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+<<<<<<< HEAD
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
@@ -247,6 +272,29 @@ define i32 @sub(i32 %arg) {
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V33I8 = call <33 x i8> @llvm.usub.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+=======
+; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V17I16 = call <17 x i16> @llvm.usub.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V33I8 = call <33 x i8> @llvm.usub.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
+; FAST-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+>>>>>>> d7824fab6ee1 (TTI: Check legalization cost of abs nodes (#100523))
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; SLOW-LABEL: 'sub'
@@ -284,6 +332,7 @@ define i32 @sub(i32 %arg) {
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V5I64 = call <5 x i64> @llvm.usub.sat.v5i64(<5 x i64> undef, <5 x i64> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+<<<<<<< HEAD
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
@@ -305,6 +354,29 @@ define i32 @sub(i32 %arg) {
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V33I8 = call <33 x i8> @llvm.usub.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+=======
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V17I16 = call <17 x i16> @llvm.usub.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V33I8 = call <33 x i8> @llvm.usub.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
+>>>>>>> d7824fab6ee1 (TTI: Check legalization cost of abs nodes (#100523))
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLOW-SIZE-LABEL: 'sub'

>From 110403c257821985c61b857cc954efb3f1ffd598 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 9 Aug 2024 12:55:15 +0400
Subject: [PATCH 2/5] AMDGPU: Correct costs of saturating add/sub intrinsics
 (#100808)

These are directly legal with fast instructions.
---
 .../Analysis/CostModel/AMDGPU/arith-ssat.ll   | 72 -------------------
 .../Analysis/CostModel/AMDGPU/arith-usat.ll   | 72 -------------------
 2 files changed, 144 deletions(-)

diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll
index df3e302ac4a4d9..3c80ada6b51435 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-ssat.ll
@@ -104,7 +104,6 @@ define i32 @add(i32 %arg) {
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V5I64 = call <5 x i64> @llvm.sadd.sat.v5i64(<5 x i64> undef, <5 x i64> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-<<<<<<< HEAD
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
@@ -126,29 +125,6 @@ define i32 @add(i32 %arg) {
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V33I8 = call <33 x i8> @llvm.sadd.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
-=======
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.sadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V17I16 = call <17 x i16> @llvm.sadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V33I8 = call <33 x i8> @llvm.sadd.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
->>>>>>> d7824fab6ee1 (TTI: Check legalization cost of abs nodes (#100523))
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLOW-SIZE-LABEL: 'add'
@@ -250,7 +226,6 @@ define i32 @sub(i32 %arg) {
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V5I64 = call <5 x i64> @llvm.ssub.sat.v5i64(<5 x i64> undef, <5 x i64> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-<<<<<<< HEAD
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
@@ -272,29 +247,6 @@ define i32 @sub(i32 %arg) {
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V33I8 = call <33 x i8> @llvm.ssub.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
-=======
-; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V33I8 = call <33 x i8> @llvm.ssub.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
->>>>>>> d7824fab6ee1 (TTI: Check legalization cost of abs nodes (#100523))
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; SLOW-LABEL: 'sub'
@@ -332,7 +284,6 @@ define i32 @sub(i32 %arg) {
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V5I64 = call <5 x i64> @llvm.ssub.sat.v5i64(<5 x i64> undef, <5 x i64> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-<<<<<<< HEAD
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
@@ -354,29 +305,6 @@ define i32 @sub(i32 %arg) {
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V33I8 = call <33 x i8> @llvm.ssub.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
-=======
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.ssub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V17I16 = call <17 x i16> @llvm.ssub.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V33I8 = call <33 x i8> @llvm.ssub.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
->>>>>>> d7824fab6ee1 (TTI: Check legalization cost of abs nodes (#100523))
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLOW-SIZE-LABEL: 'sub'
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll
index b8d560e0ca3b81..f47e03ebac8280 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-usat.ll
@@ -104,7 +104,6 @@ define i32 @add(i32 %arg) {
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V5I64 = call <5 x i64> @llvm.uadd.sat.v5i64(<5 x i64> undef, <5 x i64> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-<<<<<<< HEAD
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
@@ -126,29 +125,6 @@ define i32 @add(i32 %arg) {
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V33I8 = call <33 x i8> @llvm.uadd.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
-=======
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.uadd.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V17I16 = call <17 x i16> @llvm.uadd.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V33I8 = call <33 x i8> @llvm.uadd.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
->>>>>>> d7824fab6ee1 (TTI: Check legalization cost of abs nodes (#100523))
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLOW-SIZE-LABEL: 'add'
@@ -250,7 +226,6 @@ define i32 @sub(i32 %arg) {
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V5I64 = call <5 x i64> @llvm.usub.sat.v5i64(<5 x i64> undef, <5 x i64> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-<<<<<<< HEAD
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
@@ -272,29 +247,6 @@ define i32 @sub(i32 %arg) {
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V33I8 = call <33 x i8> @llvm.usub.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
-=======
-; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V17I16 = call <17 x i16> @llvm.usub.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V33I8 = call <33 x i8> @llvm.usub.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
-; FAST-NEXT:  Cost Model: Found an estimated cost of 264 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
->>>>>>> d7824fab6ee1 (TTI: Check legalization cost of abs nodes (#100523))
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; SLOW-LABEL: 'sub'
@@ -332,7 +284,6 @@ define i32 @sub(i32 %arg) {
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V5I64 = call <5 x i64> @llvm.usub.sat.v5i64(<5 x i64> undef, <5 x i64> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
-<<<<<<< HEAD
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
@@ -354,29 +305,6 @@ define i32 @sub(i32 %arg) {
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V33I8 = call <33 x i8> @llvm.usub.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
-=======
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V9I32 = call <9 x i32> @llvm.usub.sat.v9i32(<9 x i32> undef, <9 x i32> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V17I16 = call <17 x i16> @llvm.usub.sat.v17i16(<17 x i16> undef, <17 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V33I8 = call <33 x i8> @llvm.usub.sat.v33i8(<33 x i8> undef, <33 x i8> undef)
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 132 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef)
->>>>>>> d7824fab6ee1 (TTI: Check legalization cost of abs nodes (#100523))
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLOW-SIZE-LABEL: 'sub'

>From af0b8691fca6de138bff2756e431e71febe80258 Mon Sep 17 00:00:00 2001
From: Matthew Devereau <matthew.devereau at arm.com>
Date: Fri, 9 Aug 2024 11:12:43 +0100
Subject: [PATCH 3/5] Enable logf128 constant folding for hosts with 128bit
 floats (#96287)

Hosts which support a float size of 128 bits can benefit from constant
fp128 folding.
---
 llvm/CMakeLists.txt                   |  2 --
 llvm/cmake/config-ix.cmake            | 17 ++++++-----------
 llvm/include/llvm/ADT/APFloat.h       |  4 ++--
 llvm/include/llvm/Support/float128.h  | 14 ++++++--------
 llvm/lib/Analysis/CMakeLists.txt      |  6 ------
 llvm/lib/Analysis/ConstantFolding.cpp |  4 ++--
 llvm/lib/Support/APFloat.cpp          |  4 ++--
 7 files changed, 18 insertions(+), 33 deletions(-)

diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 51f99cb6962575..3cd6ff4038c69e 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -560,8 +560,6 @@ set(LLVM_USE_STATIC_ZSTD FALSE CACHE BOOL "Use static version of zstd. Can be TR
 
 set(LLVM_ENABLE_CURL "OFF" CACHE STRING "Use libcurl for the HTTP client if available. Can be ON, OFF, or FORCE_ON")
 
-set(LLVM_HAS_LOGF128 "OFF" CACHE STRING "Use logf128 to constant fold fp128 logarithm calls. Can be ON, OFF, or FORCE_ON")
-
 set(LLVM_ENABLE_HTTPLIB "OFF" CACHE STRING "Use cpp-httplib HTTP server library if available. Can be ON, OFF, or FORCE_ON")
 
 set(LLVM_Z3_INSTALL_DIR "" CACHE STRING "Install directory of the Z3 solver.")
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 0aae13e30f2ab4..5e8c7c73a690d7 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -247,17 +247,6 @@ else()
   set(HAVE_LIBEDIT 0)
 endif()
 
-if(LLVM_HAS_LOGF128)
-  include(CheckCXXSymbolExists)
-  check_cxx_symbol_exists(logf128 math.h HAS_LOGF128)
-
-  if(LLVM_HAS_LOGF128 STREQUAL FORCE_ON AND NOT HAS_LOGF128)
-    message(FATAL_ERROR "Failed to configure logf128")
-  endif()
-
-  set(LLVM_HAS_LOGF128 "${HAS_LOGF128}")
-endif()
-
 # function checks
 check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM)
 find_package(Backtrace)
@@ -271,6 +260,12 @@ if(C_SUPPORTS_WERROR_UNGUARDED_AVAILABILITY_NEW)
   set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror=unguarded-availability-new")
 endif()
 
+check_cxx_symbol_exists(logf128 cmath HAS_LOGF128)
+if(HAS_LOGF128)
+    set(LLVM_HAS_LOGF128 On)
+    add_compile_definitions(HAS_LOGF128)
+endif()
+
 # Determine whether we can register EH tables.
 check_symbol_exists(__register_frame "${CMAKE_CURRENT_LIST_DIR}/unwind.h" HAVE_REGISTER_FRAME)
 check_symbol_exists(__deregister_frame "${CMAKE_CURRENT_LIST_DIR}/unwind.h" HAVE_DEREGISTER_FRAME)
diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index 7039e961bff82d..374fbccf59cc26 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -378,7 +378,7 @@ class IEEEFloat final : public APFloatBase {
   Expected<opStatus> convertFromString(StringRef, roundingMode);
   APInt bitcastToAPInt() const;
   double convertToDouble() const;
-#ifdef HAS_IEE754_FLOAT128
+#if defined(HAS_IEE754_FLOAT128)
   float128 convertToQuad() const;
 #endif
   float convertToFloat() const;
@@ -1279,7 +1279,7 @@ class APFloat : public APFloatBase {
   /// \pre The APFloat must be built using semantics, that can be represented by
   /// the host float type without loss of precision. It can be IEEEquad and
   /// shorter semantics, like IEEEdouble and others.
-#ifdef HAS_IEE754_FLOAT128
+#if defined(HAS_IEE754_FLOAT128)
   float128 convertToQuad() const;
 #endif
 
diff --git a/llvm/include/llvm/Support/float128.h b/llvm/include/llvm/Support/float128.h
index e15a98dc5a6779..618b320086ba59 100644
--- a/llvm/include/llvm/Support/float128.h
+++ b/llvm/include/llvm/Support/float128.h
@@ -9,18 +9,16 @@
 #ifndef LLVM_FLOAT128
 #define LLVM_FLOAT128
 
+#include <cmath>
+
 namespace llvm {
 
-#if defined(__clang__) && defined(__FLOAT128__) &&                             \
-    defined(__SIZEOF_INT128__) && !defined(__LONG_DOUBLE_IBM128__)
-#define HAS_IEE754_FLOAT128
-typedef __float128 float128;
-#elif defined(__FLOAT128__) && defined(__SIZEOF_INT128__) &&                   \
-    !defined(__LONG_DOUBLE_IBM128__) &&                                        \
-    (defined(__GNUC__) || defined(__GNUG__))
+#ifdef HAS_LOGF128
+#if !defined(__LONG_DOUBLE_IBM128__) && (__SIZEOF_INT128__ == 16)
+typedef decltype(logf128(0.)) float128;
 #define HAS_IEE754_FLOAT128
-typedef _Float128 float128;
 #endif
+#endif // HAS_LOGF128
 
 } // namespace llvm
 #endif // LLVM_FLOAT128
diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt
index 2cb3547ec40473..63b4f71c0a0e67 100644
--- a/llvm/lib/Analysis/CMakeLists.txt
+++ b/llvm/lib/Analysis/CMakeLists.txt
@@ -161,9 +161,3 @@ add_llvm_component_library(LLVMAnalysis
   Support
   TargetParser
   )
-
-include(CheckCXXSymbolExists)
-check_cxx_symbol_exists(logf128 math.h HAS_LOGF128)
-if(HAS_LOGF128)
- target_compile_definitions(LLVMAnalysis PRIVATE HAS_LOGF128)
-endif()
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 084647b1d9d938..87feb2ecd002f9 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1781,7 +1781,7 @@ Constant *ConstantFoldFP(double (*NativeFP)(double), const APFloat &V,
   return GetConstantFoldFPValue(Result, Ty);
 }
 
-#if defined(HAS_IEE754_FLOAT128) && defined(HAS_LOGF128)
+#if defined(HAS_IEE754_FLOAT128)
 Constant *ConstantFoldFP128(float128 (*NativeFP)(float128), const APFloat &V,
                             Type *Ty) {
   llvm_fenv_clearexcept();
@@ -2114,7 +2114,7 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
     if (IntrinsicID == Intrinsic::canonicalize)
       return constantFoldCanonicalize(Ty, Call, U);
 
-#if defined(HAS_IEE754_FLOAT128) && defined(HAS_LOGF128)
+#if defined(HAS_IEE754_FLOAT128)
     if (Ty->isFP128Ty()) {
       if (IntrinsicID == Intrinsic::log) {
         float128 Result = logf128(Op->getValueAPF().convertToQuad());
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 7f68c5ab9b7cf7..000fb4ca28ba23 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -3749,7 +3749,7 @@ double IEEEFloat::convertToDouble() const {
   return api.bitsToDouble();
 }
 
-#ifdef HAS_IEE754_FLOAT128
+#if defined(HAS_IEE754_FLOAT128)
 float128 IEEEFloat::convertToQuad() const {
   assert(semantics == (const llvm::fltSemantics *)&semIEEEquad &&
          "Float semantics are not IEEEquads");
@@ -5406,7 +5406,7 @@ double APFloat::convertToDouble() const {
   return Temp.getIEEE().convertToDouble();
 }
 
-#ifdef HAS_IEE754_FLOAT128
+#if defined(HAS_IEE754_FLOAT128)
 float128 APFloat::convertToQuad() const {
   if (&getSemantics() == (const llvm::fltSemantics *)&semIEEEquad)
     return getIEEE().convertToQuad();

>From 59fb1dd4990c919b5507de64ec40aa0d6f9f9ec2 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Fri, 9 Aug 2024 15:11:11 +0200
Subject: [PATCH 4/5] Revert "Enable logf128 constant folding for hosts with
 128bit floats (#96287)"

This reverts commit ccb2b011e577e861254f61df9c59494e9e122b38.

Causes buildbot failures, e.g. on ppc64le builders.
---
 llvm/CMakeLists.txt                   |  2 ++
 llvm/cmake/config-ix.cmake            | 17 +++++++++++------
 llvm/include/llvm/ADT/APFloat.h       |  4 ++--
 llvm/include/llvm/Support/float128.h  | 14 ++++++++------
 llvm/lib/Analysis/CMakeLists.txt      |  6 ++++++
 llvm/lib/Analysis/ConstantFolding.cpp |  4 ++--
 llvm/lib/Support/APFloat.cpp          |  4 ++--
 7 files changed, 33 insertions(+), 18 deletions(-)

diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 3cd6ff4038c69e..51f99cb6962575 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -560,6 +560,8 @@ set(LLVM_USE_STATIC_ZSTD FALSE CACHE BOOL "Use static version of zstd. Can be TR
 
 set(LLVM_ENABLE_CURL "OFF" CACHE STRING "Use libcurl for the HTTP client if available. Can be ON, OFF, or FORCE_ON")
 
+set(LLVM_HAS_LOGF128 "OFF" CACHE STRING "Use logf128 to constant fold fp128 logarithm calls. Can be ON, OFF, or FORCE_ON")
+
 set(LLVM_ENABLE_HTTPLIB "OFF" CACHE STRING "Use cpp-httplib HTTP server library if available. Can be ON, OFF, or FORCE_ON")
 
 set(LLVM_Z3_INSTALL_DIR "" CACHE STRING "Install directory of the Z3 solver.")
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 5e8c7c73a690d7..0aae13e30f2ab4 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -247,6 +247,17 @@ else()
   set(HAVE_LIBEDIT 0)
 endif()
 
+if(LLVM_HAS_LOGF128)
+  include(CheckCXXSymbolExists)
+  check_cxx_symbol_exists(logf128 math.h HAS_LOGF128)
+
+  if(LLVM_HAS_LOGF128 STREQUAL FORCE_ON AND NOT HAS_LOGF128)
+    message(FATAL_ERROR "Failed to configure logf128")
+  endif()
+
+  set(LLVM_HAS_LOGF128 "${HAS_LOGF128}")
+endif()
+
 # function checks
 check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM)
 find_package(Backtrace)
@@ -260,12 +271,6 @@ if(C_SUPPORTS_WERROR_UNGUARDED_AVAILABILITY_NEW)
   set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror=unguarded-availability-new")
 endif()
 
-check_cxx_symbol_exists(logf128 cmath HAS_LOGF128)
-if(HAS_LOGF128)
-    set(LLVM_HAS_LOGF128 On)
-    add_compile_definitions(HAS_LOGF128)
-endif()
-
 # Determine whether we can register EH tables.
 check_symbol_exists(__register_frame "${CMAKE_CURRENT_LIST_DIR}/unwind.h" HAVE_REGISTER_FRAME)
 check_symbol_exists(__deregister_frame "${CMAKE_CURRENT_LIST_DIR}/unwind.h" HAVE_DEREGISTER_FRAME)
diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index 374fbccf59cc26..7039e961bff82d 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -378,7 +378,7 @@ class IEEEFloat final : public APFloatBase {
   Expected<opStatus> convertFromString(StringRef, roundingMode);
   APInt bitcastToAPInt() const;
   double convertToDouble() const;
-#if defined(HAS_IEE754_FLOAT128)
+#ifdef HAS_IEE754_FLOAT128
   float128 convertToQuad() const;
 #endif
   float convertToFloat() const;
@@ -1279,7 +1279,7 @@ class APFloat : public APFloatBase {
   /// \pre The APFloat must be built using semantics, that can be represented by
   /// the host float type without loss of precision. It can be IEEEquad and
   /// shorter semantics, like IEEEdouble and others.
-#if defined(HAS_IEE754_FLOAT128)
+#ifdef HAS_IEE754_FLOAT128
   float128 convertToQuad() const;
 #endif
 
diff --git a/llvm/include/llvm/Support/float128.h b/llvm/include/llvm/Support/float128.h
index 618b320086ba59..e15a98dc5a6779 100644
--- a/llvm/include/llvm/Support/float128.h
+++ b/llvm/include/llvm/Support/float128.h
@@ -9,16 +9,18 @@
 #ifndef LLVM_FLOAT128
 #define LLVM_FLOAT128
 
-#include <cmath>
-
 namespace llvm {
 
-#ifdef HAS_LOGF128
-#if !defined(__LONG_DOUBLE_IBM128__) && (__SIZEOF_INT128__ == 16)
-typedef decltype(logf128(0.)) float128;
+#if defined(__clang__) && defined(__FLOAT128__) &&                             \
+    defined(__SIZEOF_INT128__) && !defined(__LONG_DOUBLE_IBM128__)
+#define HAS_IEE754_FLOAT128
+typedef __float128 float128;
+#elif defined(__FLOAT128__) && defined(__SIZEOF_INT128__) &&                   \
+    !defined(__LONG_DOUBLE_IBM128__) &&                                        \
+    (defined(__GNUC__) || defined(__GNUG__))
 #define HAS_IEE754_FLOAT128
+typedef _Float128 float128;
 #endif
-#endif // HAS_LOGF128
 
 } // namespace llvm
 #endif // LLVM_FLOAT128
diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt
index 63b4f71c0a0e67..2cb3547ec40473 100644
--- a/llvm/lib/Analysis/CMakeLists.txt
+++ b/llvm/lib/Analysis/CMakeLists.txt
@@ -161,3 +161,9 @@ add_llvm_component_library(LLVMAnalysis
   Support
   TargetParser
   )
+
+include(CheckCXXSymbolExists)
+check_cxx_symbol_exists(logf128 math.h HAS_LOGF128)
+if(HAS_LOGF128)
+ target_compile_definitions(LLVMAnalysis PRIVATE HAS_LOGF128)
+endif()
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 87feb2ecd002f9..084647b1d9d938 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1781,7 +1781,7 @@ Constant *ConstantFoldFP(double (*NativeFP)(double), const APFloat &V,
   return GetConstantFoldFPValue(Result, Ty);
 }
 
-#if defined(HAS_IEE754_FLOAT128)
+#if defined(HAS_IEE754_FLOAT128) && defined(HAS_LOGF128)
 Constant *ConstantFoldFP128(float128 (*NativeFP)(float128), const APFloat &V,
                             Type *Ty) {
   llvm_fenv_clearexcept();
@@ -2114,7 +2114,7 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
     if (IntrinsicID == Intrinsic::canonicalize)
       return constantFoldCanonicalize(Ty, Call, U);
 
-#if defined(HAS_IEE754_FLOAT128)
+#if defined(HAS_IEE754_FLOAT128) && defined(HAS_LOGF128)
     if (Ty->isFP128Ty()) {
       if (IntrinsicID == Intrinsic::log) {
         float128 Result = logf128(Op->getValueAPF().convertToQuad());
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 000fb4ca28ba23..7f68c5ab9b7cf7 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -3749,7 +3749,7 @@ double IEEEFloat::convertToDouble() const {
   return api.bitsToDouble();
 }
 
-#if defined(HAS_IEE754_FLOAT128)
+#ifdef HAS_IEE754_FLOAT128
 float128 IEEEFloat::convertToQuad() const {
   assert(semantics == (const llvm::fltSemantics *)&semIEEEquad &&
          "Float semantics are not IEEEquads");
@@ -5406,7 +5406,7 @@ double APFloat::convertToDouble() const {
   return Temp.getIEEE().convertToDouble();
 }
 
-#if defined(HAS_IEE754_FLOAT128)
+#ifdef HAS_IEE754_FLOAT128
 float128 APFloat::convertToQuad() const {
   if (&getSemantics() == (const llvm::fltSemantics *)&semIEEEquad)
     return getIEEE().convertToQuad();

>From 659844f5dc23246fd06fd8b9ca667c2a13b0849b Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Mon, 5 Aug 2024 20:27:47 -0400
Subject: [PATCH 5/5] [AMDGPU] Add an option to run `AMDGPUAttributorPass` in
 full LTO

Currently `AMDGPUAttributorPass` is registered in default optimizer pipeline.
This will allow the pass to run in default pipeline as well as at thinLTO post
link stage. However, it will not run in full LTO post link stage. This patch
adds an option to run it in full LTO. It is set to false by default.
---
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 15 +++---
 .../AMDGPU/amdgpu-simplify-libcall-sincos.ll  | 48 +++++++++----------
 llvm/test/CodeGen/AMDGPU/attributor-noopt.ll  |  2 +-
 llvm/test/CodeGen/AMDGPU/inline-attr.ll       | 12 ++---
 .../CodeGen/AMDGPU/print-pipeline-passes.ll   | 10 ++++
 llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll |  2 +-
 6 files changed, 49 insertions(+), 40 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c8fb68d1c0b0c3..fce4f96c11d6e5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -301,6 +301,11 @@ static cl::opt<bool> EnableSIModeRegisterPass(
   cl::init(true),
   cl::Hidden);
 
+static cl::opt<bool>
+    EnableAMDGPUAttributor("amdgpu-attributor-enable",
+                           cl::desc("Enable AMDGPUAttributorPass"),
+                           cl::init(true), cl::Hidden);
+
 // Enable GFX11.5+ s_singleuse_vdst insertion
 static cl::opt<bool>
     EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst",
@@ -734,14 +739,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
         PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
       });
 
-  // FIXME: Why is AMDGPUAttributor not in CGSCC?
-  PB.registerOptimizerLastEPCallback(
-      [this](ModulePassManager &MPM, OptimizationLevel Level) {
-        if (Level != OptimizationLevel::O0) {
-          MPM.addPass(AMDGPUAttributorPass(*this));
-        }
-      });
-
   PB.registerFullLinkTimeOptimizationLastEPCallback(
       [this](ModulePassManager &PM, OptimizationLevel Level) {
         // We want to support the -lto-partitions=N option as "best effort".
@@ -749,6 +746,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
         // module is partitioned for codegen.
         if (EnableLowerModuleLDS)
           PM.addPass(AMDGPULowerModuleLDSPass(*this));
+        if (EnableAMDGPUAttributor && Level != OptimizationLevel::O0)
+          PM.addPass(AMDGPUAttributorPass(*this));
       });
 
   PB.registerRegClassFilterParsingCallback(
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll
index 1358d91ae102c9..a35fbaadddf9ef 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll
@@ -240,7 +240,7 @@ entry:
 
 define void @sincos_v2f32_nocontract(<2 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v2f32_nocontract
-; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] {
+; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call <2 x float> @_Z6sincosDv2_fPU3AS5S_(<2 x float> [[X]], ptr addrspace(5) [[__SINCOS_]])
@@ -298,7 +298,7 @@ entry:
 
 define void @sincos_v2f32(<2 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v2f32
-; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] {
+; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS5S_(<2 x float> [[X]], ptr addrspace(5) [[__SINCOS_]])
@@ -317,7 +317,7 @@ entry:
 
 define void @sincos_v3f32(<3 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v3f32
-; CHECK-SAME: (<3 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] {
+; CHECK-SAME: (<3 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <3 x float>, align 16, addrspace(5)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call contract <3 x float> @_Z6sincosDv3_fPU3AS5S_(<3 x float> [[X]], ptr addrspace(5) [[__SINCOS_]])
@@ -340,7 +340,7 @@ entry:
 
 define void @sincos_v4f32(<4 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v4f32
-; CHECK-SAME: (<4 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] {
+; CHECK-SAME: (<4 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <4 x float>, align 16, addrspace(5)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call contract <4 x float> @_Z6sincosDv4_fPU3AS5S_(<4 x float> [[X]], ptr addrspace(5) [[__SINCOS_]])
@@ -359,7 +359,7 @@ entry:
 
 define void @sincos_v8f32(<8 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v8f32
-; CHECK-SAME: (<8 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] {
+; CHECK-SAME: (<8 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <8 x float>, align 32, addrspace(5)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call contract <8 x float> @_Z6sincosDv8_fPU3AS5S_(<8 x float> [[X]], ptr addrspace(5) [[__SINCOS_]])
@@ -378,7 +378,7 @@ entry:
 
 define void @sincos_v16f32(<16 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v16f32
-; CHECK-SAME: (<16 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] {
+; CHECK-SAME: (<16 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <16 x float>, align 64, addrspace(5)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call contract <16 x float> @_Z6sincosDv16_fPU3AS5S_(<16 x float> [[X]], ptr addrspace(5) [[__SINCOS_]])
@@ -397,7 +397,7 @@ entry:
 
 define void @sincos_f64_nocontract(double %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_f64_nocontract
-; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] {
+; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca double, align 8, addrspace(5)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call double @_Z6sincosdPU3AS5d(double [[X]], ptr addrspace(5) [[__SINCOS_]])
@@ -417,7 +417,7 @@ entry:
 
 define void @sincos_v2f64_nocontract(<2 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v2f64_nocontract
-; CHECK-SAME: (<2 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] {
+; CHECK-SAME: (<2 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <2 x double>, align 16, addrspace(5)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call <2 x double> @_Z6sincosDv2_dPU3AS5S_(<2 x double> [[X]], ptr addrspace(5) [[__SINCOS_]])
@@ -436,7 +436,7 @@ entry:
 
 define void @sincos_f64(double %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_f64
-; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] {
+; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca double, align 8, addrspace(5)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call contract double @_Z6sincosdPU3AS5d(double [[X]], ptr addrspace(5) [[__SINCOS_]])
@@ -455,7 +455,7 @@ entry:
 
 define void @sincos_f64_order1(double %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_f64_order1
-; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] {
+; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca double, align 8, addrspace(5)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call contract double @_Z6sincosdPU3AS5d(double [[X]], ptr addrspace(5) [[__SINCOS_]])
@@ -474,7 +474,7 @@ entry:
 
 define void @sincos_v2f64(<2 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v2f64
-; CHECK-SAME: (<2 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] {
+; CHECK-SAME: (<2 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <2 x double>, align 16, addrspace(5)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call contract <2 x double> @_Z6sincosDv2_dPU3AS5S_(<2 x double> [[X]], ptr addrspace(5) [[__SINCOS_]])
@@ -493,7 +493,7 @@ entry:
 
 define void @sincos_v3f64(<3 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v3f64
-; CHECK-SAME: (<3 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] {
+; CHECK-SAME: (<3 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <3 x double>, align 32, addrspace(5)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call contract <3 x double> @_Z6sincosDv3_dPU3AS5S_(<3 x double> [[X]], ptr addrspace(5) [[__SINCOS_]])
@@ -516,7 +516,7 @@ entry:
 
 define void @sincos_v4f64(<4 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v4f64
-; CHECK-SAME: (<4 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] {
+; CHECK-SAME: (<4 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <4 x double>, align 32, addrspace(5)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call contract <4 x double> @_Z6sincosDv4_dPU3AS5S_(<4 x double> [[X]], ptr addrspace(5) [[__SINCOS_]])
@@ -535,7 +535,7 @@ entry:
 
 define void @sincos_v8f64(<8 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v8f64
-; CHECK-SAME: (<8 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] {
+; CHECK-SAME: (<8 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <8 x double>, align 64, addrspace(5)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call contract <8 x double> @_Z6sincosDv8_dPU3AS5S_(<8 x double> [[X]], ptr addrspace(5) [[__SINCOS_]])
@@ -554,7 +554,7 @@ entry:
 
 define void @sincos_v16f64(<16 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v16f64
-; CHECK-SAME: (<16 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] {
+; CHECK-SAME: (<16 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <16 x double>, align 128, addrspace(5)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call contract <16 x double> @_Z6sincosDv16_dPU3AS5S_(<16 x double> [[X]], ptr addrspace(5) [[__SINCOS_]])
@@ -666,7 +666,7 @@ bb1:
 
 define float @select_sin_or_cos_f32(i1 %cond, float %x) {
 ; CHECK-LABEL: define float @select_sin_or_cos_f32
-; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] {
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]])
@@ -685,7 +685,7 @@ declare void @func(ptr addrspace(1))
 
 define void @sincos_f32_value_is_instr(ptr addrspace(1) %value.ptr, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_f32_value_is_instr
-; CHECK-SAME: (ptr addrspace(1) [[VALUE_PTR:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] {
+; CHECK-SAME: (ptr addrspace(1) [[VALUE_PTR:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
 ; CHECK-NEXT:    tail call void @func(ptr addrspace(1) [[VALUE_PTR]])
@@ -838,7 +838,7 @@ entry:
 
 define void @sincos_v2f32_flag_intersect1(<2 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v2f32_flag_intersect1
-; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4]] {
+; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call nnan contract <2 x float> @_Z6sincosDv2_fPU3AS5S_(<2 x float> [[X]], ptr addrspace(5) [[__SINCOS_]])
@@ -859,7 +859,7 @@ declare void @use_stack_ptrs(ptr addrspace(5), ptr addrspace(5))
 
 define void @sincos_f32_alloca_insertpt(float %x) {
 ; CHECK-LABEL: define void @sincos_f32_alloca_insertpt
-; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR4]] {
+; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ALLOCA0:%.*]] = alloca i32, align 4, addrspace(5)
 ; CHECK-NEXT:    [[ALLOCA1:%.*]] = alloca i32, align 4, addrspace(5)
@@ -884,7 +884,7 @@ entry:
 
 define float @sincos_f32_unused_result_cos(float %x) {
 ; CHECK-LABEL: define float @sincos_f32_unused_result_cos
-; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR6:[0-9]+]] {
+; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SIN:%.*]] = tail call contract float @_Z3sinf(float [[X]])
 ; CHECK-NEXT:    ret float [[SIN]]
@@ -899,7 +899,7 @@ entry:
 
 define float @sincos_f32_unused_result_sin(float %x) {
 ; CHECK-LABEL: define float @sincos_f32_unused_result_sin
-; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR6]] {
+; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR5]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[COS:%.*]] = tail call contract float @_Z3cosf(float [[X]])
 ; CHECK-NEXT:    ret float [[COS]]
@@ -914,7 +914,7 @@ entry:
 
 define void @sincos_f32_repeated_uses(float %x, ptr addrspace(1) %sin_out, ptr addrspace(1) %cos_out) {
 ; CHECK-LABEL: define void @sincos_f32_repeated_uses
-; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) [[SIN_OUT:%.*]], ptr addrspace(1) [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] {
+; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) [[SIN_OUT:%.*]], ptr addrspace(1) [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR6:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
 ; CHECK-NEXT:    [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]])
@@ -947,7 +947,7 @@ entry:
 
 define void @sin_f32_indirect_call_user(float %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out, ptr %func.ptr) {
 ; CHECK-LABEL: define void @sin_f32_indirect_call_user
-; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]], ptr nocapture readonly [[FUNC_PTR:%.*]]) local_unnamed_addr #[[ATTR4]] {
+; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]], ptr nocapture readonly [[FUNC_PTR:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call contract float @_Z3sinf(float [[X]])
 ; CHECK-NEXT:    store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4
@@ -965,7 +965,7 @@ entry:
 
 define void @cos_f32_indirect_call_user(float %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out, ptr %func.ptr) {
 ; CHECK-LABEL: define void @cos_f32_indirect_call_user
-; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]], ptr nocapture readonly [[FUNC_PTR:%.*]]) local_unnamed_addr #[[ATTR4]] {
+; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]], ptr nocapture readonly [[FUNC_PTR:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call contract float @_Z3cosf(float [[X]])
 ; CHECK-NEXT:    store float [[CALL]], ptr addrspace(1) [[COS_OUT]], align 4
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
index 90562e25a3e9c1..9ce0f55255b8fc 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
@@ -1,4 +1,4 @@
-; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O2 | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefix=OPT %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes="lto<O2>" | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefix=OPT %s
 ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=NOOPT,COV4 %s
 ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=NOOPT,COV5 %s
 
diff --git a/llvm/test/CodeGen/AMDGPU/inline-attr.ll b/llvm/test/CodeGen/AMDGPU/inline-attr.ll
index e7a7b8a335d0d3..10b226e29ce220 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-attr.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-attr.ll
@@ -6,14 +6,14 @@
 ; GCN: define amdgpu_kernel void @caller(ptr addrspace(1) nocapture %p) local_unnamed_addr #1 {
 ; GCN: %mul.i = fmul float %load, 1.500000e+01
 
-; UNSAFE: attributes #0 = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
-; UNSAFE: attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="true" }
+; UNSAFE: attributes #0 = { nounwind "unsafe-fp-math"="true" }
+; UNSAFE: attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="true" }
 
-; NOINFS: attributes #0 = { nounwind "amdgpu-waves-per-eu"="4,10" "no-infs-fp-math"="true" "uniform-work-group-size"="false" }
-; NOINFS: attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="false" "uniform-work-group-size"="false" "unsafe-fp-math"="false" }
+; NOINFS: attributes #0 = { nounwind "no-infs-fp-math"="true" }
+; NOINFS: attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="false" "unsafe-fp-math"="false" }
 
-; NONANS: attributes #0 = { nounwind "amdgpu-waves-per-eu"="4,10" "no-nans-fp-math"="true" "uniform-work-group-size"="false" }
-; NONANS: attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "uniform-work-group-size"="false" "unsafe-fp-math"="false" }
+; NONANS: attributes #0 = { nounwind "no-nans-fp-math"="true" }
+; NONANS: attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="true" "unsafe-fp-math"="false" }
 
 declare void @extern() #0
 
diff --git a/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll b/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll
new file mode 100644
index 00000000000000..ab8fcee03d1d80
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll
@@ -0,0 +1,10 @@
+; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto<O1>" -print-pipeline-passes %s -o - | FileCheck %s
+; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto<O2>" -print-pipeline-passes %s -o - | FileCheck %s
+; RUN: opt -mtriple=amdgcn--amdhsa -S -passes="lto<O3>" -print-pipeline-passes %s -o - | FileCheck %s
+
+; CHECK: amdgpu-attributor
+
+define amdgpu_kernel void @kernel() {
+entry:
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
index bf21ed6898ff46..ecc03feff2bf46 100644
--- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -826,5 +826,5 @@ entry:
 ; GCN-PRELINK: declare float @_Z4cbrtf(float) local_unnamed_addr #[[$NOUNWIND_READONLY:[0-9]+]]
 
 ; GCN-PRELINK-DAG: attributes #[[$NOUNWIND]] = { nounwind }
-; GCN-PRELINK-DAG: attributes #[[$NOUNWIND_READONLY]] = { nounwind memory(read) "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; GCN-PRELINK-DAG: attributes #[[$NOUNWIND_READONLY]] = { nounwind memory(read) }
 attributes #0 = { nounwind }