[clang] [Clang][RISCV] Handle RVV tuple types correctly as OutputOperands for inline asm (PR #67109)

Fri Sep 22 03:24:08 PDT 2023

https://github.com/eopXD updated https://github.com/llvm/llvm-project/pull/67109

>From dc77e5c7bca87badecc314b24cab4c10be0e02fa Mon Sep 17 00:00:00 2001
From: eopXD <yueh.ting.chen at gmail.com>
Date: Thu, 21 Sep 2023 06:34:57 -0700
Subject: [PATCH 1/3] [Clang][RISCV] Handle RVV tuple types correctly as
 OutputOperand for inline asm

The RVV tuple type maps to an aggregate type with homogeneous scalable
vectors. EmitAsmStmt does not handle this correctly and this commit
attempts to fix it.

Expressing the type as a structure in inline asm calls will complicate
the current code base, so instead, the return type is set to be
a single scalable vector, then reconstructed with `vector.extract` and
`insertvalue`.

A follow-up commit will handle RVV tuple types when they are used as
InputOperands.
---
 clang/lib/CodeGen/CGStmt.cpp                  | 52 ++++++++++++++++++-
 .../rvv-inline-asm.c                          | 41 +++++++++++++++
 2 files changed, 91 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c

diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index 6674aa2409a5947..755f30b9c9e6369 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -29,6 +29,7 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/IR/Assumptions.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Intrinsics.h"
@@ -2392,6 +2393,26 @@ EmitAsmStores(CodeGenFunction &CGF, const AsmStmt &S,
         Tmp = Builder.CreateZExtOrTrunc(Tmp, TruncTy);
       } else if (TruncTy->isVectorTy()) {
         Tmp = Builder.CreateBitCast(Tmp, TruncTy);
+      } else if (TruncTy->isStructTy() && ResultRegQualTys[i]->isRVVType()) {
+        auto *STy = cast<llvm::StructType>(TruncTy);
+        auto *VTy = cast<llvm::ScalableVectorType>(STy->getElementType(0));
+
+        assert(STy->containsHomogeneousScalableVectorTypes() &&
+               "Must be dealing with RVV tuple type");
+
+        unsigned MinElts = VTy->getElementCount().getKnownMinValue();
+        llvm::Value *StructValue = llvm::PoisonValue::get(STy);
+
+        for (unsigned Idx = 0, TupleSize = STy->getNumElements();
+             Idx != TupleSize; ++Idx) {
+          llvm::Value *IdxValue =
+              llvm::ConstantInt::get(CGM.Int64Ty, Idx * MinElts);
+          llvm::Value *SubVec = Builder.CreateExtractVector(VTy, Tmp, IdxValue);
+
+          StructValue = Builder.CreateInsertValue(StructValue, SubVec, Idx);
+        }
+
+        Tmp = StructValue;
       }
     }
 
@@ -2399,7 +2420,13 @@ EmitAsmStores(CodeGenFunction &CGF, const AsmStmt &S,
     // ResultTypeRequiresCast elements correspond to the first
     // ResultTypeRequiresCast.size() elements of RegResults.
     if ((i < ResultTypeRequiresCast.size()) && ResultTypeRequiresCast[i]) {
-      unsigned Size = CGF.getContext().getTypeSize(ResultRegQualTys[i]);
+      unsigned Size;
+      if (ResultRegQualTys[i]->isRVVType() && TruncTy->isStructTy()) {
+        Size = cast<llvm::ScalableVectorType>(
+                   cast<llvm::StructType>(TruncTy)->getElementType(0))
+                   ->getScalarSizeInBits();
+      } else
+        Size = CGF.getContext().getTypeSize(ResultRegQualTys[i]);
       Address A = Dest.getAddress(CGF).withElementType(ResultRegTypes[i]);
       if (CGF.getTargetHooks().isScalarizableAsmOperand(CGF, TruncTy)) {
         Builder.CreateStore(Tmp, A);
@@ -2524,11 +2551,32 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
       ResultRegIsFlagReg.push_back(IsFlagReg);
 
       llvm::Type *Ty = ConvertTypeForMem(QTy);
+      ResultTruncRegTypes.push_back(Ty);
+
+      // Expressing the type as a structure in inline asm calls will complicate
+      // the current code base, so instead, the return type is set to be a
+      // single scalable vector, then reconstructed with `vector.extract` and
+      // `insertvalue`. The type is derived here, and the reconstruction is done
+      // under EmitAsmStores.
+      if (QTy->isRVVType() && isa<llvm::StructType>(Ty)) {
+        // Flatten the structure into a single ScalableVectorType
+        auto *STy = cast<llvm::StructType>(Ty);
+        assert(STy->containsHomogeneousScalableVectorTypes() &&
+               isa<llvm::ScalableVectorType>(STy->getElementType(0)) &&
+               "Dealing with RVV tuple (aggregate with homogeneous scalable "
+               "vectors");
+
+        auto *VecTy = cast<llvm::ScalableVectorType>(STy->getElementType(0));
+
+        Ty = llvm::ScalableVectorType::get(VecTy->getScalarType(),
+                                           STy->getNumElements() *
+                                               VecTy->getMinNumElements());
+      }
+
       const bool RequiresCast = Info.allowsRegister() &&
           (getTargetHooks().isScalarizableAsmOperand(*this, Ty) ||
            Ty->isAggregateType());
 
-      ResultTruncRegTypes.push_back(Ty);
       ResultTypeRequiresCast.push_back(RequiresCast);
 
       if (RequiresCast) {
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c
new file mode 100644
index 000000000000000..cad4f8ed5dcbd48
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c
@@ -0,0 +1,41 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3
+#include <riscv_vector.h>
+
+// RUN: %clang_cc1 -triple riscv64 -target-feature +zve32x -disable-O0-optnone \
+// RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s
+
+// CHECK-LABEL: define dso_local void @foo(
+// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i32> asm "#NOP", "=^vr"() #[[ATTR2:[0-9]+]], !srcloc !4
+// CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> [[TMP0]], i64 0)
+// CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> [[TMP1]], 0
+// CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> [[TMP0]], i64 2)
+// CHECK-NEXT:    [[TMP4:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP2]], <vscale x 2 x i32> [[TMP3]], 1
+// CHECK-NEXT:    ret void
+//
+void foo() {
+  vint32m1x2_t v0;
+  asm ("#NOP" : "=vr" (v0));
+}
+
+// CHECK-LABEL: define dso_local void @bar(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } asm "#NOP", "=^vr,=^vr"() #[[ATTR2]], !srcloc !5
+// CHECK-NEXT:    [[ASMRESULT:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CHECK-NEXT:    [[ASMRESULT1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> [[ASMRESULT]], i64 0)
+// CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> [[TMP1]], 0
+// CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> [[ASMRESULT]], i64 2)
+// CHECK-NEXT:    [[TMP4:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP2]], <vscale x 2 x i32> [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> [[ASMRESULT1]], i64 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> [[TMP5]], 0
+// CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> [[ASMRESULT1]], i64 2)
+// CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP6]], <vscale x 2 x i32> [[TMP7]], 1
+// CHECK-NEXT:    ret void
+//
+void bar() {
+  vint32m1x2_t v0, v2;
+  asm ("#NOP" : "=vr" (v0), "=vr" (v2));
+}

>From d53a8923dfb57616ec1bef9c33d76a3ed614aeb6 Mon Sep 17 00:00:00 2001
From: eopXD <yueh.ting.chen at gmail.com>
Date: Fri, 22 Sep 2023 02:30:58 -0700
Subject: [PATCH 2/3] [Clang][RISCV] Precommit to show current codegen for
 inline asm of RVV tuple type

The generated LLVM IR cannot be handled successfully because a function
argument of tuple type is an aggregate of scalable vectors. It needs to
be flattened into separate arguments.
---
 .../RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c
index cad4f8ed5dcbd48..8447aee1b715403 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c
@@ -39,3 +39,14 @@ void bar() {
   vint32m1x2_t v0, v2;
   asm ("#NOP" : "=vr" (v0), "=vr" (v2));
 }
+
+// CHECK-LABEL: define dso_local void @baz(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void asm sideeffect "#NOP", "^vr"({ <vscale x 2 x i32>, <vscale x 2 x i32> } undef) #[[ATTR3:[0-9]+]], !srcloc !6
+// CHECK-NEXT:    ret void
+//
+void baz() {
+  vint32m1x2_t v2;
+  asm ("#NOP" :: "vr" (v2));
+}

>From bce63fa3fdd0c3c0aa364e72e25da19c8b56cf3a Mon Sep 17 00:00:00 2001
From: eopXD <yueh.ting.chen at gmail.com>
Date: Fri, 22 Sep 2023 02:32:38 -0700
Subject: [PATCH 3/3] [Clang][RISCV] Handle RVV tuple types correctly as
 InputOperand for inline asm

This commit flattens the input operand from an aggregate structure into
separate arguments for RVV tuple types.
---
 clang/lib/CodeGen/CGStmt.cpp                  | 38 +++++++++++++++++--
 .../rvv-inline-asm.c                          |  4 +-
 2 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index 755f30b9c9e6369..d8707914d0c3044 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -2749,10 +2749,40 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
           std::max((uint64_t)LargestVectorWidth,
                    VT->getPrimitiveSizeInBits().getKnownMinValue());
 
-    ArgTypes.push_back(Arg->getType());
-    ArgElemTypes.push_back(ArgElemType);
-    Args.push_back(Arg);
-    Constraints += InputConstraint;
+    // Expand RVV tuple type input operands.
+    if (InputExpr->getType()->isRVVType() && Arg->getType()->isStructTy()) {
+      std::string ExpandedInputContraint;
+
+      auto *STy = cast<llvm::StructType>(Arg->getType());
+
+      assert(STy->containsHomogeneousScalableVectorTypes() &&
+             isa<llvm::ScalableVectorType>(STy->getElementType(0)) &&
+             "Only aggregate type of homogeneous scalable vectors is handled "
+             "here");
+
+      auto *VTy = cast<llvm::ScalableVectorType>(STy->getElementType(0));
+
+      for (unsigned Idx = 0, TupleSize = STy->getNumElements();
+           Idx != TupleSize; ++Idx) {
+        if (ExpandedInputContraint.size())
+          ExpandedInputContraint += ",";
+
+        ExpandedInputContraint += InputConstraint;
+        ArgTypes.push_back(VTy);
+        ArgElemTypes.push_back(ArgElemType);
+
+        llvm::Value *SubVec = Builder.CreateExtractValue(Arg, {Idx});
+
+        Args.push_back(SubVec);
+      }
+
+      Constraints += ExpandedInputContraint;
+    } else {
+      ArgTypes.push_back(Arg->getType());
+      ArgElemTypes.push_back(ArgElemType);
+      Args.push_back(Arg);
+      Constraints += InputConstraint;
+    }
   }
 
   // Append the "input" part of inout constraints.
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c
index 8447aee1b715403..24f403c6625d0aa 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c
@@ -43,7 +43,9 @@ void bar() {
 // CHECK-LABEL: define dso_local void @baz(
 // CHECK-SAME: ) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    call void asm sideeffect "#NOP", "^vr"({ <vscale x 2 x i32>, <vscale x 2 x i32> } undef) #[[ATTR3:[0-9]+]], !srcloc !6
+// CHECK-NEXT:    [[TMP0:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } undef, 0
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } undef, 1
+// CHECK-NEXT:    call void asm sideeffect "#NOP", "^vr,^vr"(<vscale x 2 x i32> [[TMP0]], <vscale x 2 x i32> [[TMP1]]) #[[ATTR3:[0-9]+]], !srcloc !6
 // CHECK-NEXT:    ret void
 //
 void baz() {