1 change: 1 addition & 0 deletions include/fusilli.h
@@ -35,6 +35,7 @@

// Nodes:
#include "fusilli/node/conv_node.h" // IWYU pragma: export
#include "fusilli/node/layernorm_node.h" // IWYU pragma: export
#include "fusilli/node/matmul_node.h" // IWYU pragma: export
#include "fusilli/node/node.h" // IWYU pragma: export
#include "fusilli/node/pointwise_node.h" // IWYU pragma: export
51 changes: 51 additions & 0 deletions include/fusilli/graph/graph.h
@@ -14,7 +14,9 @@
#ifndef FUSILLI_GRAPH_GRAPH_H
#define FUSILLI_GRAPH_GRAPH_H

#include "fusilli/attributes/common.h"
#include "fusilli/attributes/conv_attributes.h"
#include "fusilli/attributes/layernorm_attributes.h"
#include "fusilli/attributes/matmul_attributes.h"
#include "fusilli/attributes/pointwise_attributes.h"
#include "fusilli/attributes/tensor_attributes.h"
@@ -24,6 +26,7 @@
#include "fusilli/backend/handle.h"
#include "fusilli/graph/context.h"
#include "fusilli/node/conv_node.h"
#include "fusilli/node/layernorm_node.h"
#include "fusilli/node/matmul_node.h"
#include "fusilli/node/node.h"
#include "fusilli/node/pointwise_node.h"
@@ -233,6 +236,10 @@ class Graph : public INode {
std::shared_ptr<TensorAttr> convDGrad(const std::shared_ptr<TensorAttr> &dy,
const std::shared_ptr<TensorAttr> &w,
ConvDGradAttr &attributes);
std::array<std::shared_ptr<TensorAttr>, 3>
layernorm(const std::shared_ptr<TensorAttr> &x,
const std::shared_ptr<TensorAttr> &scale,
const std::shared_ptr<TensorAttr> &bias, LayernormAttr &attributes);
std::shared_ptr<TensorAttr> matmul(const std::shared_ptr<TensorAttr> &a,
const std::shared_ptr<TensorAttr> &b,
MatmulAttr &attributes);
@@ -641,6 +648,50 @@ Graph::convDGrad(const std::shared_ptr<TensorAttr> &dy,
return dx;
}

// Create a LayernormNode, populate it with the specified attributes, create
// output tensors and add the node to the graph's sub nodes.
inline std::array<std::shared_ptr<TensorAttr>, 3>
Graph::layernorm(const std::shared_ptr<TensorAttr> &x,
const std::shared_ptr<TensorAttr> &scale,
const std::shared_ptr<TensorAttr> &bias,
LayernormAttr &layernormAttr) {
// Populate names when not set.
if (layernormAttr.getName().empty())
layernormAttr.setName("layernorm_" + std::to_string(subNodes_.size()));
if (x && x->getName().empty())
x->setName(layernormAttr.getName() + "_X");
if (scale && scale->getName().empty())
scale->setName(layernormAttr.getName() + "_SCALE");
if (bias && bias->getName().empty())
bias->setName(layernormAttr.getName() + "_BIAS");

FUSILLI_LOG_LABEL_ENDL("INFO: Adding Layernorm '" << layernormAttr.getName()
<< "' to Graph");

// Set inputs.
layernormAttr.setX(x);
layernormAttr.setSCALE(scale);
layernormAttr.setBIAS(bias);

// Set outputs.
std::shared_ptr<TensorAttr> y = outputTensor(layernormAttr.getName() + "_Y");
std::shared_ptr<TensorAttr> m = nullptr;
std::shared_ptr<TensorAttr> v = nullptr;
if (layernormAttr.getForwardPhase() == NormFwdPhase::TRAINING) {
m = outputTensor(layernormAttr.getName() + "_MEAN");
v = outputTensor(layernormAttr.getName() + "_INV_VARIANCE");
}
layernormAttr.setY(y);
layernormAttr.setMEAN(m);
layernormAttr.setINV_VARIANCE(v);

// Create node and add to Graph's subNodes_.
subNodes_.emplace_back(
std::make_unique<LayernormNode>(std::move(layernormAttr), context));

return {y, m, v};
Contributor (nit):

Suggested change
return {y, m, v};
return {std::move(y), std::move(m), std::move(v)};

}

// Create a MatmulNode, populate it with the specified attributes, create
// output tensors and add the node to the graph's sub nodes.
inline std::shared_ptr<TensorAttr>
263 changes: 263 additions & 0 deletions include/fusilli/node/layernorm_node.h
@@ -0,0 +1,263 @@
// Copyright 2025 Advanced Micro Devices, Inc.
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//===----------------------------------------------------------------------===//
//
// This file contains definitions for the layer normalization node
// `LayernormNode`.
//
//===----------------------------------------------------------------------===//

#ifndef FUSILLI_NODE_LAYERNORM_NODE_H
#define FUSILLI_NODE_LAYERNORM_NODE_H

#include "fusilli/attributes/common.h"
#include "fusilli/attributes/layernorm_attributes.h"
#include "fusilli/attributes/tensor_attributes.h"
#include "fusilli/graph/context.h"
#include "fusilli/node/node.h"
#include "fusilli/support/logging.h"

#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <utility>
#include <vector>

namespace fusilli {

//===----------------------------------------------------------------------===//
// Layer normalization node.
//===----------------------------------------------------------------------===//

class LayernormNode : public NodeCRTP<LayernormNode> {
public:
LayernormAttr layernormAttr;

LayernormNode(LayernormAttr &&attr, const Context &ctx)
: NodeCRTP(ctx), layernormAttr(std::move(attr)) {}

const std::string &getName() const override final {
return layernormAttr.getName();
}
Type getType() const override final { return Type::Layernorm; }

ErrorObject preValidateNode() const override final {
FUSILLI_LOG_LABEL_ENDL("INFO: Pre-Validating LayernormNode '"
<< layernormAttr.getName() << "'");

FUSILLI_RETURN_ERROR_IF(
layernormAttr.getForwardPhase() == NormFwdPhase::NOT_SET,
ErrorCode::AttributeNotSet, "Layernorm forward phase not set");

std::shared_ptr<TensorAttr> xT = layernormAttr.getX();
std::shared_ptr<TensorAttr> sT = layernormAttr.getSCALE();
std::shared_ptr<TensorAttr> bT = layernormAttr.getBIAS();
std::shared_ptr<TensorAttr> eT = layernormAttr.getEPSILON();
std::shared_ptr<TensorAttr> yT = layernormAttr.getY();
std::shared_ptr<TensorAttr> mT = layernormAttr.getMEAN();
std::shared_ptr<TensorAttr> vT = layernormAttr.getINV_VARIANCE();
Comment on lines +57 to +63
Contributor:
These should be references, but I think this mistake is made throughout the codebase.
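
A minimal sketch of the suggestion (the copies are only avoided if the getters themselves return `const std::shared_ptr<TensorAttr> &`, which this diff does not show):

```cpp
// Bind by const reference instead of copying the shared_ptr, avoiding a
// refcount bump per getter call; assumes getX()/getY() return by const
// reference (an assumption, not confirmed by this PR).
const std::shared_ptr<TensorAttr> &xT = layernormAttr.getX();
const std::shared_ptr<TensorAttr> &yT = layernormAttr.getY();
```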


// Ensure mandatory input and output tensors are set.
FUSILLI_RETURN_ERROR_IF(!xT, ErrorCode::AttributeNotSet,
"Layernorm input tensor X not set");
FUSILLI_RETURN_ERROR_IF(!eT, ErrorCode::AttributeNotSet,
"Layernorm input tensor EPSILON not set");
Comment on lines +68 to +69
Contributor:
Is it expected that the user will set this epsilon manually, since it doesn't show up in the signature for Graph::layernorm()?

Member:
How is epsilon set in cudnn, since it doesn't show up in the node API there either? Is it hardcoded? Alexandra had it exposed in the node API, but I asked her to remove it to be consistent with cudnn since this is user facing.

Contributor:
I was looking into this a bit more. Here is a sample in cudnn that uses it. I think it's considered part of the attr rather than data like normal tensors. Neither cudnn nor Alexandra's PR #71 passes epsilon through the variant pack. It is a compile-time constant.

Member:
This makes sense. It looks like we need a setter (setEpsilon) on the LayernormAttr and feed that with the output of graph.tensor(eps_float) like in the cudnn sample:

```cpp
float epsilon_cpu = 1e-05f;
auto epsilon      = graph.tensor(epsilon_cpu);

auto layernorm_options =
    fe::graph::Layernorm_attributes()
        .set_forward_phase(train ? fe::NormFwdPhase_t::TRAINING : fe::NormFwdPhase_t::INFERENCE)
        .set_epsilon(epsilon);
auto [Y, mean, inv_variance] = graph.layernorm(X, scale, bias, layernorm_options);
```
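
For comparison, a hypothetical sketch of the Fusilli-side equivalent, assuming the proposed setter lands on LayernormAttr and that Graph::tensor has (or gains) a scalar-constant overload; neither is part of this PR:

```cpp
// Hypothetical sketch only: setForwardPhase / setEPSILON and the scalar
// graph.tensor(...) overload are assumptions, not API confirmed by this PR.
auto epsilon = graph.tensor(1e-05f); // epsilon as a constant scalar tensor

LayernormAttr layernormAttr;
layernormAttr.setForwardPhase(train ? NormFwdPhase::TRAINING
                                    : NormFwdPhase::INFERENCE);
layernormAttr.setEPSILON(epsilon); // proposed setter, mirrors cudnn's set_epsilon
auto [y, mean, invVariance] = graph.layernorm(x, scale, bias, layernormAttr);
```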

FUSILLI_RETURN_ERROR_IF(!yT, ErrorCode::AttributeNotSet,
"Layernorm output tensor Y not set");

// Shape and layout checks on input tensor.
size_t xRank = xT->getDim().size();
FUSILLI_RETURN_ERROR_IF(
xRank < 2, ErrorCode::InvalidAttribute,
"Layernorm input tensor X must have a rank of at least 2");
FUSILLI_RETURN_ERROR_IF(!xT->isContiguous() && !xT->isChannelsLast(),
ErrorCode::NotImplemented,
"Tensor '" + xT->getName() +
"' is neither contiguous nor channels-last as "
"defined by its stride");

// Shape and layout checks on scale tensor.
if (sT) {
std::vector<int64_t> expectedDim = xT->getDim();
expectedDim[0] = 1;
FUSILLI_RETURN_ERROR_IF(sT->getDim() != expectedDim,
ErrorCode::InvalidAttribute,
"Layernorm input tensor SCALE must have shape as "
"tensor X with single batch");

FUSILLI_RETURN_ERROR_IF(
!sT->isContiguous() && !sT->isChannelsLast(),
ErrorCode::NotImplemented,
"Tensor '" + sT->getName() +
"' is neither contiguous nor channels-last as "
"defined by its stride");
}

// Shape and layout checks on bias tensor.
if (bT) {
std::vector<int64_t> expectedDim = xT->getDim();
expectedDim[0] = 1;
FUSILLI_RETURN_ERROR_IF(bT->getDim() != expectedDim,
ErrorCode::InvalidAttribute,
"Layernorm input tensor BIAS must have shape as "
"tensor X with single batch");
FUSILLI_RETURN_ERROR_IF(
!bT->isContiguous() && !bT->isChannelsLast(),
ErrorCode::NotImplemented,
"Tensor '" + bT->getName() +
"' is neither contiguous nor channels-last as "
"defined by its stride");
}

// Epsilon should be set and be constant scalar.
FUSILLI_RETURN_ERROR_IF(
!eT->isScalar(), ErrorCode::InvalidAttribute,
"Layernorm input tensor EPSILON must be a constant scalar");

// Output tensor checks for training and inference forward phases.
if (isTrainingForwardPhase()) {
FUSILLI_RETURN_ERROR_IF(!mT, ErrorCode::AttributeNotSet,
"Layernorm output tensor MEAN not set");
FUSILLI_RETURN_ERROR_IF(!vT, ErrorCode::AttributeNotSet,
"Layernorm output tensor INV_VARIANCE not set");
} else {
FUSILLI_RETURN_ERROR_IF(mT, ErrorCode::InvalidAttribute,
"Layernorm output tensor MEAN should not be set");
FUSILLI_RETURN_ERROR_IF(
vT, ErrorCode::InvalidAttribute,
"Layernorm output tensor INV_VARIANCE should not be set");
}

return ok();
}

ErrorObject inferPropertiesNode() override final {
FUSILLI_LOG_LABEL_ENDL("INFO: Inferring properties for LayernormNode '"
<< layernormAttr.getName() << "'");

layernormAttr.fillFromContext(context);

std::shared_ptr<TensorAttr> xT = layernormAttr.getX();
std::shared_ptr<TensorAttr> yT = layernormAttr.getY();

const std::vector<int64_t> &xDim = xT->getDim();

// Infer shape of output Y tensor.
if (yT->getDim().empty()) {
yT->setDim(xDim);
}

// Infer stride of output Y tensor.
if (yT->getStride().empty()) {
// When unspecified, preserve the stride order of xT (input tensor).
yT->setStride(xT->getStride());
}

if (isTrainingForwardPhase()) {
const auto &[dim, stride] = getTrainingForwardOutputDimAndStride(xDim);

std::shared_ptr<TensorAttr> mT = layernormAttr.getMEAN();
std::shared_ptr<TensorAttr> vT = layernormAttr.getINV_VARIANCE();

// Infer shape of output MEAN tensor.
if (mT->getDim().empty()) {
mT->setDim(dim);
}
// Infer shape of output INV_VARIANCE tensor.
if (vT->getDim().empty()) {
vT->setDim(dim);
}
// Infer stride of output MEAN tensor.
if (mT->getStride().empty()) {
mT->setStride(stride);
}
// Infer stride of output INV_VARIANCE tensor.
if (vT->getStride().empty()) {
vT->setStride(stride);
}
}

return ok();
}

ErrorObject postValidateNode() const override final {
FUSILLI_LOG_LABEL_ENDL("INFO: Post-Validating LayernormNode '"
<< layernormAttr.getName() << "'");

std::shared_ptr<TensorAttr> xT = layernormAttr.getX();
std::shared_ptr<TensorAttr> yT = layernormAttr.getY();

const std::vector<int64_t> &xDim = xT->getDim();

// Shape check for output Y tensor.
FUSILLI_RETURN_ERROR_IF(
xDim != yT->getDim(), ErrorCode::InvalidAttribute,
"Layernorm output Y tensor must have the same shape as input X tensor");

// Layout check for output Y tensor.
FUSILLI_RETURN_ERROR_IF(!yT->isContiguous() && !yT->isChannelsLast(),
ErrorCode::NotImplemented,
"Tensor '" + yT->getName() +
"' is neither contiguous nor channels-last as "
"defined by its stride");

if (isTrainingForwardPhase()) {
const auto &[dim, stride] = getTrainingForwardOutputDimAndStride(xDim);

std::shared_ptr<TensorAttr> mT = layernormAttr.getMEAN();
std::shared_ptr<TensorAttr> vT = layernormAttr.getINV_VARIANCE();

// Shape check for output MEAN tensor
FUSILLI_RETURN_ERROR_IF(
dim != mT->getDim(), ErrorCode::InvalidAttribute,
"Layernorm output MEAN tensor must have shape [B, 1, ..., 1] with "
"rank equal to shape rank of input X tensor and batch dimension "
"equal to "
"input X tensor batch dimension");
// Shape check for output INV_VARIANCE tensor
FUSILLI_RETURN_ERROR_IF(dim != vT->getDim(), ErrorCode::InvalidAttribute,
"Layernorm output INV_VARIANCE tensor must have "
"shape [B, 1, ..., 1] with "
"rank equal to shape rank of input X tensor and "
"batch dimension equal to "
"input X tensor batch dimension");
// Stride check for output MEAN tensor
FUSILLI_RETURN_ERROR_IF(
stride != mT->getStride(), ErrorCode::InvalidAttribute,
"Layernorm output MEAN tensor must have unit strides");
// Stride check for output INV_VARIANCE tensor
FUSILLI_RETURN_ERROR_IF(
stride != vT->getStride(), ErrorCode::InvalidAttribute,
"Layernorm output INV_VARIANCE tensor must have unit strides");
Comment on lines +229 to +236
Contributor:
Aren't all strides valid because these are both just scalars?

}

return ok();
}

private:
inline bool isTrainingForwardPhase() const {
return layernormAttr.getForwardPhase() == NormFwdPhase::TRAINING;
}

std::pair<std::vector<int64_t>, std::vector<int64_t>>
getTrainingForwardOutputDimAndStride(const std::vector<int64_t> &xDim) const {
// The MEAN and INV_VARIANCE tensors have shape [B, 1, ..., 1]
std::vector<int64_t> dim(xDim.size(), 1);
dim[0] = xDim[0];

// Since MEAN and INV_VARIANCE tensors have shape [B, 1, ..., 1],
// strides are always equal to [1, 1, ..., 1] for both contiguous and
// channels-last layouts.
std::vector<int64_t> stride(dim.size(), 1);
return {dim, stride};
}
};

} // namespace fusilli

#endif // FUSILLI_NODE_LAYERNORM_NODE_H
1 change: 1 addition & 0 deletions include/fusilli/node/node.h
@@ -34,6 +34,7 @@ class INode {
Pointwise,
WGrad,
DGrad,
Layernorm,
Matmul,
};

1 change: 1 addition & 0 deletions tests/CMakeLists.txt
@@ -43,6 +43,7 @@ add_fusilli_tests(
PREFIX fusilli_node_tests
SRCS
test_conv_node.cpp
test_layernorm_node.cpp
test_matmul_node.cpp
test_pointwise_node.cpp
DEPS