[llvm] a5b662a - [SelectionDAG] Correctly widen bitcast of scalar to vector for big endian

Thu Feb 2 10:01:23 PST 2023

Author: Nemanja Ivanovic
Date: 2023-02-02T12:01:14-06:00
New Revision: a5b662a834fde4188a26fd83ed26745fc4c1bc06

URL: https://github.com/llvm/llvm-project/commit/a5b662a834fde4188a26fd83ed26745fc4c1bc06
DIFF: https://github.com/llvm/llvm-project/commit/a5b662a834fde4188a26fd83ed26745fc4c1bc06.diff

LOG: [SelectionDAG] Correctly widen bitcast of scalar to vector for big endian

For big endian targets that need a node such as this:
v2i8 = bitcast i16:tN

legalized by:

1. Promoting the i16 input type
2. Widening the v2i8 result type

The result will be incorrect because the legalizer will promote
the input type and then produce a scalar_to_vector from that
wider type to a vector of N elements of that type. That puts
the desired bits into the low order bytes of element zero and
they need to be in the high order bytes on big endian systems.
This patch changes the legalization to widen to a vector with
elements of the original scalar size.

Differential revision: https://reviews.llvm.org/D140365

Added: 
    llvm/test/CodeGen/PowerPC/widen-vec-correctly-be.ll

Modified: 
    llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
    llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll
    llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index af5ea1ce5f459..630f5b859f370 100644

--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -4863,7 +4863,16 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
       NewInVT = EVT::getVectorVT(*DAG.getContext(), InEltVT,
                                  WidenSize / InEltVT.getSizeInBits());
     } else {
-      NewInVT = EVT::getVectorVT(*DAG.getContext(), InVT, NewNumParts);
+      // For big endian systems, using the promoted input scalar type
+      // to produce the scalar_to_vector would put the desired bits into
+      // the least significant byte(s) of the wider element zero. This
+      // will mean that the users of the result vector are using incorrect
+      // bits. Use the original input type instead. Although either input
+      // type can be used on little endian systems, for consistency we
+      // use the original type there as well.
+      EVT OrigInVT = N->getOperand(0).getValueType();
+      NewNumParts = WidenSize / OrigInVT.getSizeInBits();
+      NewInVT = EVT::getVectorVT(*DAG.getContext(), OrigInVT, NewNumParts);
     }
 
     if (TLI.isTypeLegal(NewInVT)) {

diff  --git a/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll
index 37e986d17e6f4..fe98e94ae79b1 100644
--- a/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp32_elts.ll
@@ -31,7 +31,8 @@ define i64 @test2elt(i16 %a.coerce) local_unnamed_addr #0 {
 ;
 ; CHECK-P9-LABEL: test2elt:
 ; CHECK-P9:       # %bb.0: # %entry
-; CHECK-P9-NEXT:    mtvsrws v2, r3
+; CHECK-P9-NEXT:    mtfprd f0, r3
+; CHECK-P9-NEXT:    xxswapd v2, vs0
 ; CHECK-P9-NEXT:    vextractub v3, v2, 15
 ; CHECK-P9-NEXT:    vextractub v2, v2, 14
 ; CHECK-P9-NEXT:    xscvuxdsp f0, v3
@@ -44,7 +45,8 @@ define i64 @test2elt(i16 %a.coerce) local_unnamed_addr #0 {
 ;
 ; CHECK-BE-LABEL: test2elt:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    mtvsrws v2, r3
+; CHECK-BE-NEXT:    sldi r3, r3, 48
+; CHECK-BE-NEXT:    mtvsrd v2, r3
 ; CHECK-BE-NEXT:    vextractub v3, v2, 2
 ; CHECK-BE-NEXT:    vextractub v2, v2, 0
 ; CHECK-BE-NEXT:    xscvuxdsp f0, v3
@@ -293,7 +295,8 @@ define i64 @test2elt_signed(i16 %a.coerce) local_unnamed_addr #0 {
 ;
 ; CHECK-P9-LABEL: test2elt_signed:
 ; CHECK-P9:       # %bb.0: # %entry
-; CHECK-P9-NEXT:    mtvsrws v2, r3
+; CHECK-P9-NEXT:    mtfprd f0, r3
+; CHECK-P9-NEXT:    xxswapd v2, vs0
 ; CHECK-P9-NEXT:    vextractub v3, v2, 15
 ; CHECK-P9-NEXT:    vextractub v2, v2, 14
 ; CHECK-P9-NEXT:    vextsh2d v3, v3
@@ -308,7 +311,8 @@ define i64 @test2elt_signed(i16 %a.coerce) local_unnamed_addr #0 {
 ;
 ; CHECK-BE-LABEL: test2elt_signed:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    mtvsrws v2, r3
+; CHECK-BE-NEXT:    sldi r3, r3, 48
+; CHECK-BE-NEXT:    mtvsrd v2, r3
 ; CHECK-BE-NEXT:    vextractub v3, v2, 2
 ; CHECK-BE-NEXT:    vextractub v2, v2, 0
 ; CHECK-BE-NEXT:    vextsh2d v3, v3

diff  --git a/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll
index 5dabe65fd68db..434d0f660f2e9 100644
--- a/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_conv_i8_to_fp64_elts.ll
@@ -13,7 +13,7 @@ define <2 x double> @test2elt(i16 %a.coerce) local_unnamed_addr #0 {
 ; CHECK-P8-LABEL: test2elt:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    addis r4, r2, .LCPI0_0@toc@ha
-; CHECK-P8-NEXT:    mtvsrwz v2, r3
+; CHECK-P8-NEXT:    mtvsrd v2, r3
 ; CHECK-P8-NEXT:    addi r4, r4, .LCPI0_0@toc@l
 ; CHECK-P8-NEXT:    xxlxor v4, v4, v4
 ; CHECK-P8-NEXT:    lxvd2x vs0, 0, r4
@@ -24,7 +24,7 @@ define <2 x double> @test2elt(i16 %a.coerce) local_unnamed_addr #0 {
 ;
 ; CHECK-P9-LABEL: test2elt:
 ; CHECK-P9:       # %bb.0: # %entry
-; CHECK-P9-NEXT:    mtfprwz f0, r3
+; CHECK-P9-NEXT:    mtfprd f0, r3
 ; CHECK-P9-NEXT:    addis r3, r2, .LCPI0_0@toc@ha
 ; CHECK-P9-NEXT:    xxlxor vs2, vs2, vs2
 ; CHECK-P9-NEXT:    addi r3, r3, .LCPI0_0@toc@l
@@ -415,7 +415,7 @@ define <2 x double> @test2elt_signed(i16 %a.coerce) local_unnamed_addr #0 {
 ; CHECK-P8-LABEL: test2elt_signed:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    addis r4, r2, .LCPI4_0@toc@ha
-; CHECK-P8-NEXT:    mtvsrwz v3, r3
+; CHECK-P8-NEXT:    mtvsrd v3, r3
 ; CHECK-P8-NEXT:    addis r3, r2, .LCPI4_1@toc@ha
 ; CHECK-P8-NEXT:    addi r4, r4, .LCPI4_0@toc@l
 ; CHECK-P8-NEXT:    addi r3, r3, .LCPI4_1@toc@l
@@ -431,7 +431,7 @@ define <2 x double> @test2elt_signed(i16 %a.coerce) local_unnamed_addr #0 {
 ;
 ; CHECK-P9-LABEL: test2elt_signed:
 ; CHECK-P9:       # %bb.0: # %entry
-; CHECK-P9-NEXT:    mtvsrwz v2, r3
+; CHECK-P9-NEXT:    mtvsrd v2, r3
 ; CHECK-P9-NEXT:    addis r3, r2, .LCPI4_0@toc@ha
 ; CHECK-P9-NEXT:    addi r3, r3, .LCPI4_0@toc@l
 ; CHECK-P9-NEXT:    lxv vs0, 0(r3)

diff  --git a/llvm/test/CodeGen/PowerPC/widen-vec-correctly-be.ll b/llvm/test/CodeGen/PowerPC/widen-vec-correctly-be.ll
new file mode 100644
index 0000000000000..3aa601df38ffc
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/widen-vec-correctly-be.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:     -mcpu=pwr7 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P9-BE
+define void @test() local_unnamed_addr #0 align 2 {
+; CHECK-BE-LABEL: test:
+; CHECK-BE:       # %bb.0: # %bb
+; CHECK-BE-NEXT:    vspltisw v2, -16
+; CHECK-BE-NEXT:    lhz r3, 0(r3)
+; CHECK-BE-NEXT:    xxlxor vs1, vs1, vs1
+; CHECK-BE-NEXT:    addi r3, r3, 1
+; CHECK-BE-NEXT:    vsrw v2, v2, v2
+; CHECK-BE-NEXT:    sth r3, -32(r1)
+; CHECK-BE-NEXT:    addi r3, r1, -32
+; CHECK-BE-NEXT:    lxvw4x vs0, 0, r3
+; CHECK-BE-NEXT:    addi r3, r1, -16
+; CHECK-BE-NEXT:    xxsel vs0, vs0, vs1, v2
+; CHECK-BE-NEXT:    stxvw4x vs0, 0, r3
+; CHECK-BE-NEXT:    lwz r3, -16(r1)
+; CHECK-BE-NEXT:    stw r3, 0(r3)
+; CHECK-BE-NEXT:    .p2align 4
+; CHECK-BE-NEXT:  .LBB0_1: # %bb9
+; CHECK-BE-NEXT:    #
+; CHECK-BE-NEXT:    b .LBB0_1
+;
+; CHECK-P9-BE-LABEL: test:
+; CHECK-P9-BE:       # %bb.0: # %bb
+; CHECK-P9-BE-NEXT:    lhz r3, 0(r3)
+; CHECK-P9-BE-NEXT:    vspltisw v2, -16
+; CHECK-P9-BE-NEXT:    xxlxor vs0, vs0, vs0
+; CHECK-P9-BE-NEXT:    addi r3, r3, 1
+; CHECK-P9-BE-NEXT:    vsrw v2, v2, v2
+; CHECK-P9-BE-NEXT:    sldi r3, r3, 48
+; CHECK-P9-BE-NEXT:    mtfprd f1, r3
+; CHECK-P9-BE-NEXT:    xxsel v2, vs1, vs0, v2
+; CHECK-P9-BE-NEXT:    xxsldwi vs0, v2, v2, 3
+; CHECK-P9-BE-NEXT:    stfiwx f0, 0, r3
+; CHECK-P9-BE-NEXT:    .p2align 4
+; CHECK-P9-BE-NEXT:  .LBB0_1: # %bb9
+; CHECK-P9-BE-NEXT:    #
+; CHECK-P9-BE-NEXT:    b .LBB0_1
+bb:
+  br i1 false, label %bb1, label %bb2
+
+bb1:                                              ; preds = %bb
+  unreachable
+
+bb2:                                              ; preds = %bb
+  %i = load i32, ptr poison, align 4
+  %i3 = trunc i32 %i to i16
+  %i4 = add i16 %i3, 1
+  %i5 = bitcast i16 %i4 to <2 x i8>
+  %i6 = shufflevector <2 x i8> %i5, <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %i7 = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i8> %i6, <4 x i8> undef
+  %i8 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i8> <i8 undef, i8 undef, i8 0, i8 0>, <4 x i8> %i7
+  br label %bb9
+
+bb9:                                              ; preds = %bb9, %bb2
+  %i10 = phi <4 x i8> [ %i8, %bb2 ], [ poison, %bb9 ]
+  %i11 = bitcast <4 x i8> %i10 to i32
+  store i32 %i11, ptr poison, align 2
+  br label %bb9
+}