
Commit 9ee7e67

Added new subgraph definition paradigm and revised matching logic
1 parent 51197bc commit 9ee7e67

File tree

3 files changed: +230 -40 lines changed

py/torch_tensorrt/dynamo/partitioning/_adjacency_partitioner.py

Lines changed: 47 additions & 40 deletions
@@ -233,7 +233,8 @@ def partition_graph(self) -> torch.fx.GraphModule:
         subgraphs = self.remove_small_acc_subgraphs(subgraphs)
 
         subgraphs = self.break_subgraphs(
-            subgraphs, subgraph_size_budget=self.calculate_size_budget()
+            subgraphs,
+            subgraph_size_budget=500 * 1024 * 1024,  # self.calculate_size_budget()
         )
 
         # Set the number of TRT engines to be generated
@@ -309,6 +310,11 @@ def break_subgraphs(
         """
         This function breaks the subgraphs into smaller subgraphs to save CPU memory.
         """
+        from torch_tensorrt.dynamo.partitioning.fusion_patterns import (
+            get_node_in_fusion_pattern,
+        )
+
+        self.fusion_patterns = get_node_in_fusion_pattern(self.module.graph)
         new_subgraphs = []
         # We throw an error if the remaining memory is almost empty compared to the model size.
         # i.e. if the remaining memory is 4G (budget is 1G) the model size is greater than 40G, we stop the compilation.
@@ -328,9 +334,26 @@ def break_subgraphs(
                 new_subgraphs.append(broken_subgraphs[0])
                 subgraph = broken_subgraphs[1]
             new_subgraphs.append(subgraph)
-
+        self._varify_all_fusion_nodes_in_same_subgraph(new_subgraphs)
         return new_subgraphs
 
+    def _varify_all_fusion_nodes_in_same_subgraph(
+        self, subgraphs: List[Subgraph]
+    ) -> None:
+        node_to_subgraph = {}
+        for i, s in enumerate(subgraphs):
+            for n in s.nodes:
+                node_to_subgraph[n] = i
+
+        fusion_nodes_map_list = [
+            len({node_to_subgraph[n] for n in ns}) == 1
+            for ns in self.fusion_patterns.values()
+        ]
+        assert all(
+            fusion_nodes_map_list
+        ), "All fusion nodes must be in the same subgraph"
+        logger.info("All fusion nodes are in the same subgraph.")
+
     def break_subgraph_by_size(
         self, subgraph: Subgraph, size_to_break: int
     ) -> Tuple[List[Subgraph], int, int]:
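
Note: the new _varify_all_fusion_nodes_in_same_subgraph helper asserts that every fusion group found by get_node_in_fusion_pattern lands entirely inside one subgraph after breaking. A minimal sketch of that membership check on toy data (node names and subgraph contents below are hypothetical, not taken from the commit):

    # Hypothetical stand-ins for Subgraph.nodes and self.fusion_patterns.
    subgraph_nodes = [["conv", "bn", "relu"], ["matmul", "add"]]
    fusion_patterns = {"conv": {"conv", "bn", "relu"}, "bn": {"conv", "bn", "relu"}}

    # Map every node to the index of the subgraph that owns it.
    node_to_subgraph = {n: i for i, nodes in enumerate(subgraph_nodes) for n in nodes}

    # Each fusion group must resolve to exactly one subgraph index.
    assert all(
        len({node_to_subgraph[n] for n in group}) == 1
        for group in fusion_patterns.values()
    ), "All fusion nodes must be in the same subgraph"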
@@ -376,9 +399,13 @@ def step_and_validate(
         while True:
             new_subgraphs = self.validate_and_correct_subgraphs(new_subgraphs)
             nodes_in_first_subgraph = set(new_subgraphs[0].nodes)
+            nodes_in_second_subgraph = set(new_subgraphs[1].nodes)
             leaf_node = self.get_leaf_node(nodes_in_first_subgraph)
             broken_fusion = self.step_if_break_fusion(
-                new_subgraphs, leaf_node, nodes_in_first_subgraph
+                new_subgraphs,
+                leaf_node,
+                nodes_in_first_subgraph,
+                nodes_in_second_subgraph,
             )
             if not broken_fusion or len(new_subgraphs[1].nodes) == 0:
                 break
@@ -390,57 +417,37 @@ def step_if_break_fusion(
         subgraphs: List[Subgraph],
         leaf_nodes: set[torch.fx.Node],
         nodes_in_first_subgraph: set[torch.fx.Node],
+        nodes_in_second_subgraph: set[torch.fx.Node],
     ) -> bool:
 
         def add_nodes(node: torch.fx.Node) -> None:
             """
             This function adds a node and all its previous nodes to the first subgraph and removes it from the second subgraph in post order.
             """
-            if node.op in CALLABLE_NODE_OPS and node not in nodes_in_first_subgraph:
+            if (
+                node.op in CALLABLE_NODE_OPS
+                and node not in nodes_in_first_subgraph
+                and node in nodes_in_second_subgraph
+            ):
+                # Exclude all nodes already in the first subgraph
                 nodes_in_first_subgraph.add(node)
+                nodes_in_second_subgraph.remove(node)
                 for input_node in node._input_nodes:
                     add_nodes(input_node)
                 subgraphs[0].nodes.append(node)
                 subgraphs[1].nodes.remove(node)
 
-        def match_subgraph_and_step(node: torch.fx.Node) -> bool:
-            added_nodes = False
-            for op_list in NON_BREAKABLE_OP_LISTS:
-                for i, op in enumerate(op_list):
-                    if i != len(op_list) - 1 and op in str(node.target):
-                        # Search following ops forward using BFS. We skip search previous ops because
-                        # even if it's just a subset of fusion graph, we still want it to be fused.
-
-                        users = node.users.keys()
-                        matching_nodes: set[torch.fx.Node] = set()
-                        for following_op_idx in range(i + 1, len(op_list)):
-                            matching_nodes = set()
-                            for user in users:
-                                if op_list[following_op_idx] in str(user.target):
-                                    matching_nodes.add(user)
-                            if not matching_nodes:
-                                break
-                            users = set()
-                            for matching_node in matching_nodes:
-                                for next_user in matching_node.users:
-                                    users.add(next_user)
-
-                        for matching_node in matching_nodes:
-                            added_nodes = True
-                            add_nodes(matching_node)
-
-                if added_nodes:
-                    # Early terminate the search if we have found a match because preceeding matches can cover following matches
-                    break
-
-            return True if added_nodes else False
-
-        found_match = False
+        fusion_broken = False
         for leaf in leaf_nodes:
-            if match_subgraph_and_step(leaf):
-                found_match = True
+            for node in self.fusion_patterns.get(leaf, []):
+                if (
+                    node not in nodes_in_first_subgraph
+                    and node in nodes_in_second_subgraph
+                ):
+                    fusion_broken = True
+                    add_nodes(node)
 
-        return found_match
+        return fusion_broken
 
     def get_leaf_node(
         self, nodes_in_first_subgraph: set[torch.fx.Node]
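
Note: the rewritten step_if_break_fusion no longer walks NON_BREAKABLE_OP_LISTS with a BFS over node.users; it looks each leaf up in the precomputed fusion_patterns map and pulls any partner nodes still stranded in the second subgraph across the boundary. A rough sketch of that step on plain Python sets (toy node names, not the fx.Node objects the partitioner actually uses):

    # Hypothetical fusion map and subgraph memberships.
    fusion_patterns = {"conv": {"conv", "relu"}, "relu": {"conv", "relu"}}
    first, second = {"conv"}, {"relu", "matmul"}

    fusion_broken = False
    for leaf in list(first):
        # Partner nodes of this leaf that were left behind in the second subgraph.
        for node in fusion_patterns.get(leaf, []):
            if node not in first and node in second:
                fusion_broken = True
                first.add(node)
                second.remove(node)

    print(fusion_broken, sorted(first), sorted(second))
    # True ['conv', 'relu'] ['matmul']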

py/torch_tensorrt/dynamo/partitioning/fusion_patterns.py

Lines changed: 183 additions & 0 deletions
@@ -0,0 +1,183 @@
+from typing import Dict, List, Set
+
+import torch
+from torch.fx.passes.utils.matcher_utils import SubgraphMatcher
+from torch.ops import aten
+
+
+class ConvBNReLU(torch.nn.Module):  # type: ignore[misc]
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        weight: torch.Tensor,
+        bias: torch.Tensor,
+        stride: List[int],
+        padding: List[int],
+        dilation: List[int],
+        transposed: bool,
+        output_padding: List[int],
+        groups: int,
+        bn_weight: torch.Tensor,
+        bn_bias: torch.Tensor,
+        running_mean: torch.Tensor,
+        running_var: torch.Tensor,
+        momentum: float,
+        eps: float,
+    ) -> torch.Tensor:
+        x = aten.convolution.default(
+            x,
+            weight,
+            bias,
+            stride,
+            padding,
+            dilation,
+            transposed,
+            output_padding,
+            groups,
+        )
+        x = aten._native_batch_norm_legit_no_training.default(
+            x, bn_weight, bn_bias, running_mean, running_var, momentum, eps
+        )[0]
+        x = aten.relu.default(x)
+        return x
+
+
+class ConvReLU(torch.nn.Module):  # type: ignore[misc]
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        weight: torch.Tensor,
+        bias: torch.Tensor,
+        stride: List[int],
+        padding: List[int],
+        dilation: List[int],
+        transposed: bool,
+        output_padding: List[int],
+        groups: int,
+    ) -> torch.Tensor:
+        x = aten.convolution.default(
+            x,
+            weight,
+            bias,
+            stride,
+            padding,
+            dilation,
+            transposed,
+            output_padding,
+            groups,
+        )
+        x = aten.relu.default(x)
+        return x
+
+
+class ConvGelu(torch.nn.Module):  # type: ignore[misc]
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        weight: torch.Tensor,
+        bias: torch.Tensor,
+        stride: List[int],
+        padding: List[int],
+        dilation: List[int],
+        transposed: bool,
+        output_padding: List[int],
+        groups: int,
+    ) -> torch.Tensor:
+        x = aten.convolution.default(
+            x,
+            weight,
+            bias,
+            stride,
+            padding,
+            dilation,
+            transposed,
+            output_padding,
+            groups,
+        )
+        x = aten.gelu.default(x)
+        return x
+
+
+class ConvSilu(torch.nn.Module):  # type: ignore[misc]
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(
+        self, x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor
+    ) -> torch.Tensor:
+        x = aten.convolution.default(
+            x, weight, bias, [1, 1], [1, 1], [1, 1], False, [0, 0], 1
+        )
+        x = aten.silu.default(x)
+        return x
+
+
+class MulAdd(torch.nn.Module):  # type: ignore[misc]
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(
+        self, x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor
+    ) -> torch.Tensor:
+        x = aten.mul.Tensor(x, weight)
+        x = aten.add.Tensor(x, bias)
+        return x
+
+
+class MulMul(torch.nn.Module):  # type: ignore[misc]
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(
+        self, x: torch.Tensor, y: torch.Tensor, z: torch.Tensor
+    ) -> torch.Tensor:
+        x = aten.mul.Tensor(x, y)
+        x = aten.mul.Tensor(x, z)
+        return x
+
+
+All_FUSION_PATTERNS = [
+    ConvBNReLU,
+    ConvReLU,
+    ConvGelu,
+    ConvSilu,
+    MulAdd,
+    MulMul,
+]
+
+
+def get_node_in_fusion_pattern(
+    graph: torch.fx.Graph,
+) -> Dict[torch.fx.Node, Set[torch.fx.Node]]:
+    """
+    This function gets the nodes map of the fusion pattern from the graph.
+    Key: node that appears in the fusion pattern
+    Value: the list of nodes that should be fused together
+    """
+    fusion_nodes = {}
+    for pattern in All_FUSION_PATTERNS:
+        pattern_graph = torch.fx.symbolic_trace(pattern())
+        subgraph_matcher = SubgraphMatcher(pattern_graph.graph)
+        match_result = subgraph_matcher.match(graph)
+        for match in match_result:
+            fusion_group = {
+                node
+                for node in match.nodes_map.values()
+                if node
+                and type(node) == torch.fx.Node
+                and node.op == "call_function"
+                and node not in match.placeholder_nodes
+            }
+            for node in fusion_group:
+                fusion_nodes[node] = fusion_group
+
+    return fusion_nodes
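
Note: a hedged usage sketch of the new helper, not part of the commit. It assumes the fusion_patterns module added here is importable and traces a toy module that calls the same aten ops as the ConvReLU pattern, so SubgraphMatcher should report one match.

    import torch
    from torch.ops import aten
    from torch_tensorrt.dynamo.partitioning.fusion_patterns import (
        get_node_in_fusion_pattern,  # module added by this commit
    )


    class ToyConvReLU(torch.nn.Module):
        # Mirrors the aten-level ConvReLU pattern, with literal conv arguments.
        def forward(
            self, x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor
        ) -> torch.Tensor:
            x = aten.convolution.default(
                x, weight, bias, [1, 1], [0, 0], [1, 1], False, [0, 0], 1
            )
            return aten.relu.default(x)


    gm = torch.fx.symbolic_trace(ToyConvReLU())
    fusion_map = get_node_in_fusion_pattern(gm.graph)
    for node, group in fusion_map.items():
        print(node.name, "->", sorted(n.name for n in group))
    # Expected: the convolution node and the relu node each map to the same
    # two-node fusion group.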

py/torch_tensorrt/dynamo/partitioning/fusion_subgraphs.py

Whitespace-only changes.
