diff --git a/configs/accl/async-pr.py b/configs/accl/async-pr.py
new file mode 100644
index 0000000000..5bd4f76209
--- /dev/null
+++ b/configs/accl/async-pr.py
@@ -0,0 +1,132 @@
+# Copyright (c) 2022 The Regents of the University of California
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import m5
+import argparse
+
+from m5.objects import *
+
+
+def get_inputs():
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument("num_gpts", type=int)
+    argparser.add_argument("num_registers", type=int)
+    argparser.add_argument("cache_size", type=str)
+    argparser.add_argument("r_queue_size", type=int)
+    argparser.add_argument("r_latency", type=str)
+    argparser.add_argument("graph", type=str)
+    argparser.add_argument("alpha", type=float)
+    argparser.add_argument("threshold", type=float)
+    argparser.add_argument(
+        "--simple",
+        dest="simple",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Use simple memory for vertex",
+    )
+    argparser.add_argument(
+        "--sample",
+        dest="sample",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Sample sim stats every 100us",
+    )
+    argparser.add_argument(
+        "--verify",
+        dest="verify",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Print final answer",
+    )
+
+    args = argparser.parse_args()
+
+    return (
+        args.num_gpts,
+        args.num_registers,
+        args.cache_size,
+        args.r_queue_size,
+        args.r_latency,
+        args.graph,
+        args.alpha,
+        args.threshold,
+        args.simple,
+        args.sample,
+        args.verify,
+    )
+
+
+if __name__ == "__m5_main__":
+    (
+        num_gpts,
+        num_registers,
+        cache_size,
+        r_queue_size,
+        r_latency,
+        graph,
+        alpha,
+        threshold,
+        simple,
+        sample,
+        verify,
+    ) = get_inputs()
+
+    if simple:
+        # Import sega_simple here: its SEGA constructor matches this
+        # argument list (sega_simple_pt2pt also requires gpt_per_gpn,
+        # sample_time, and tokens, which this script does not parse).
+        from sega_simple import SEGA
+        system = SEGA(num_gpts, num_registers, cache_size,
+                      r_queue_size, r_latency, graph)
+    else:
+        from sega import SEGA
+        system = SEGA(num_gpts, num_registers, cache_size, graph)
+    root = Root(full_system=False, system=system)
+
+    m5.instantiate()
+
+    system.set_async_mode()
+    system.create_pop_count_directory(64)
+    system.create_async_pr_workload(alpha, threshold)
+    if sample:
+        while True:
+            exit_event = m5.simulate(100000000)
+            print(
+                f"Exited simulation at tick {m5.curTick()} "
+                + f"because {exit_event.getCause()}"
+            )
+            m5.stats.dump()
+            m5.stats.reset()
+            if exit_event.getCause() != "simulate() limit reached":
+                break
+    else:
+        exit_event = m5.simulate()
+        print(
+            f"Exited simulation at tick {m5.curTick()} "
+            + f"because {exit_event.getCause()}"
+        )
+    if verify:
+        system.print_answer()
diff --git a/configs/accl/bc.py b/configs/accl/bc.py
new file mode 100644
index 0000000000..9a0bf298b5
--- /dev/null
+++ b/configs/accl/bc.py
@@ -0,0 +1,162 @@
+# Copyright (c) 2022 The Regents of the University of California
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import m5
+import argparse
+
+from m5.objects import *
+
+
+def get_inputs():
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument("num_gpts", type=int)
+    argparser.add_argument("num_registers", type=int)
+    argparser.add_argument("cache_size", type=str)
+    argparser.add_argument("r_queue_size", type=int)
+    argparser.add_argument("r_latency", type=int)
+    argparser.add_argument("gpt_per_gpn", type=int)
+    argparser.add_argument("graph", type=str)
+    argparser.add_argument("init_addr", type=int)
+    argparser.add_argument("init_value", type=int)
+    argparser.add_argument("sample_time", type=str)
+    argparser.add_argument("tokens", type=int)
+    argparser.add_argument(
+        "--simple",
+        dest="simple",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Use simple memory for vertex",
+    )
+    argparser.add_argument(
+        "--pt2pt",
+        dest="pt2pt",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Use point-to-point (router-based) topology",
+    )
+    argparser.add_argument(
+        "--sample",
+        dest="sample",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Sample sim stats every 100us",
+    )
+    argparser.add_argument(
+        "--verify",
+        dest="verify",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Print final answer",
+    )
+
+    args = argparser.parse_args()
+
+    return (
+        args.num_gpts,
+        args.num_registers,
+        args.cache_size,
+        args.r_queue_size,
+        args.r_latency,
+        args.gpt_per_gpn,
+        args.graph,
+        args.init_addr,
+        args.init_value,
+        args.sample_time,
+        args.tokens,
+        args.simple,
+        args.pt2pt,
+        args.sample,
+        args.verify,
+    )
+
+
+if __name__ == "__m5_main__":
+    (
+        num_gpts,
+        num_registers,
+        cache_size,
+        r_queue_size,
+        r_latency,
+        gpt_per_gpn,
+        graph,
+        init_addr,
+        init_value,
+        sample_time,
+        tokens,
+        simple,
+        pt2pt,
+        sample,
+        verify,
+    ) = get_inputs()
+
+    if simple:
+        if pt2pt:
+            from sega_simple_pt2pt import SEGA
+            system = SEGA(num_gpts, num_registers, cache_size,
+                r_queue_size, r_latency, gpt_per_gpn, graph,
+                sample_time, tokens)
+        else:
+            from sega_simple import SEGA
+            system = SEGA(num_gpts, num_registers, cache_size,
+                r_queue_size, r_latency, graph)
+    else:
+        from sega import SEGA
+        system = SEGA(num_gpts, num_registers, cache_size, graph)
+    root = Root(full_system=False, system=system)
+
+    m5.instantiate()
+
+    system.set_bsp_mode()
+    system.create_pop_count_directory(64)
+    system.create_bc_workload(init_addr, init_value)
+    if sample:
+        while True:
+            exit_event = m5.simulate(100000000)
+            print(
+                f"Exited simulation at tick {m5.curTick()} "
+                + f"because {exit_event.getCause()}"
+            )
+            m5.stats.dump()
+            m5.stats.reset()
+            if exit_event.getCause() != "simulate() limit reached":
+                break
+    else:
+        iterations = 0
+        while True:
+            exit_event = m5.simulate()
+            print(
+                f"Exited simulation at tick {m5.curTick()} "
+                + f"because {exit_event.getCause()}"
+            )
+            iterations += 1
+            if system.work_count() == 0:
+                break
+        print(f"#iterations: {iterations}")
+    if verify:
+        system.print_answer()
diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py
new file mode 100644
index 0000000000..c2150ce751
--- /dev/null
+++ b/configs/accl/bfs.py
@@ -0,0 +1,202 @@
+# Copyright (c) 2022 The Regents of the University of California
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import m5
+import argparse
+
+from m5.objects import *
+
+
+def get_inputs():
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument("num_gpts", type=int)
+    argparser.add_argument("num_registers", type=int)
+    argparser.add_argument("cache_size", type=str)
+    argparser.add_argument("r_queue_size", type=int)
+    argparser.add_argument("r_latency", type=int)
+    argparser.add_argument("gpt_per_gpn", type=int)
+    argparser.add_argument("graph", type=str)
+    argparser.add_argument("init_addr", type=int)
+    argparser.add_argument("init_value", type=int)
+    argparser.add_argument("sample_time", type=str)
+    argparser.add_argument("tokens", type=int)
+    argparser.add_argument(
+        "--tile",
+        dest="tile",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Whether to use temporal partitioning",
+    )
+    argparser.add_argument(
+        "--best",
+        dest="best",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Whether to use best update value for switching slices",
+    )
+    argparser.add_argument(
+        "--visited",
+        dest="visited",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Use visitation version of BFS",
+    )
+    argparser.add_argument(
+        "--simple",
+        dest="simple",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Use simple memory for vertex",
+    )
+    argparser.add_argument(
+        "--pt2pt",
+        dest="pt2pt",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Use point-to-point (router-based) topology",
+    )
+    argparser.add_argument(
+        "--sample",
+        dest="sample",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Sample sim stats every 50us",
+    )
+    argparser.add_argument(
+        "--verify",
+        dest="verify",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Print final answer",
+    )
+
+    args = argparser.parse_args()
+
+    return (
+        args.num_gpts,
+        args.num_registers,
+        args.cache_size,
+        args.r_queue_size,
+        args.r_latency,
+        args.gpt_per_gpn,
+        args.graph,
+        args.init_addr,
+        args.init_value,
+        args.sample_time,
+        args.tokens,
+        args.tile,
+        args.best,
+        args.visited,
+        args.simple,
+        args.pt2pt,
+        args.sample,
+        args.verify,
+    )
+
+
+if __name__ == "__m5_main__":
+    (
+        num_gpts,
+        num_registers,
+        cache_size,
+        r_queue_size,
+        r_latency,
+        gpt_per_gpn,
+        graph,
+        init_addr,
+        init_value,
+        sample_time,
+        tokens,
+        tile,
+        best,
+        visited,
+        simple,
+        pt2pt,
+        sample,
+        verify,
+    ) = get_inputs()
+
+    if simple:
+        if pt2pt:
+            from sega_simple_pt2pt import SEGA
+            system = SEGA(num_gpts, num_registers, cache_size,
+                r_queue_size, r_latency, gpt_per_gpn, graph,
+                sample_time, tokens)
+        else:
+            from sega_simple import SEGA
+            system = SEGA(num_gpts, num_registers, cache_size,
+                r_queue_size, r_latency, graph)
+    else:
+        from sega import SEGA
+        system = SEGA(num_gpts, num_registers, cache_size, graph)
+
+    # Hook the parsed --best flag up to the controller's choose_best
+    # parameter; it was previously parsed but never used.
+    system.set_choose_best(best)
+    root = Root(full_system=False, system=system)
+    m5.instantiate()
+    if tile:
+        system.set_pg_mode()
+    else:
+        system.set_async_mode()
+
+    system.create_pop_count_directory(64)
+    if visited:
+        system.create_bfs_visited_workload(init_addr, init_value)
+    else:
+        system.create_bfs_workload(init_addr, init_value)
+    if sample:
+        while True:
+            exit_event = m5.simulate(50000000)
+            print(
+                f"Exited simulation at tick {m5.curTick()} "
+                + f"because {exit_event.getCause()}"
+            )
+            if exit_event.getCause() == "simulate() limit reached":
+                m5.stats.dump()
+                m5.stats.reset()
+            elif exit_event.getCause() == "Done with all the slices.":
+                break
+            elif exit_event.getCause() == "no update left to process.":
+                break
+    else:
+        while True:
+            exit_event = m5.simulate()
+            print(
+                f"Exited simulation at tick {m5.curTick()} "
+                + f"because {exit_event.getCause()}"
+            )
+            if exit_event.getCause() == "Done with all the slices.":
+                break
+            if exit_event.getCause() == "no update left to process.":
+                break
+    if verify:
+        system.print_answer()
diff --git a/configs/accl/cc.py b/configs/accl/cc.py
new file mode 100644
index 0000000000..03b3d04d46
--- /dev/null
+++ b/configs/accl/cc.py
@@ -0,0 +1,150 @@
+# Copyright (c) 2022 The Regents of the University of California
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import m5
+import argparse
+
+from m5.objects import *
+
+
+def get_inputs():
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument("num_gpts", type=int)
+    argparser.add_argument("num_registers", type=int)
+    argparser.add_argument("cache_size", type=str)
+    argparser.add_argument("r_queue_size", type=int)
+    argparser.add_argument("r_latency", type=int)
+    argparser.add_argument("gpt_per_gpn", type=int)
+    argparser.add_argument("graph", type=str)
+    argparser.add_argument("sample_time", type=str)
+    argparser.add_argument("tokens", type=int)
+    argparser.add_argument(
+        "--simple",
+        dest="simple",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Use simple memory for vertex",
+    )
+    argparser.add_argument(
+        "--pt2pt",
+        dest="pt2pt",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Use point-to-point (router-based) topology",
+    )
+    argparser.add_argument(
+        "--sample",
+        dest="sample",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Sample sim stats every 100us",
+    )
+    argparser.add_argument(
+        "--verify",
+        dest="verify",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Print final answer",
+    )
+
+    args = argparser.parse_args()
+
+    return (
+        args.num_gpts,
+        args.num_registers,
+        args.cache_size,
+        args.r_queue_size,
+        args.r_latency,
+        args.gpt_per_gpn,
+        args.graph,
+        args.sample_time,
+        args.tokens,
+        args.simple,
+        args.pt2pt,
+        args.sample,
+        args.verify,
+    )
+
+
+if __name__ == "__m5_main__":
+    (
+        num_gpts,
+        num_registers,
+        cache_size,
+        r_queue_size,
+        r_latency,
+        gpt_per_gpn,
+        graph,
+        sample_time,
+        tokens,
+        simple,
+        pt2pt,
+        sample,
+        verify,
+    ) = get_inputs()
+
+    if simple:
+        if pt2pt:
+            from sega_simple_pt2pt import SEGA
+            system = SEGA(num_gpts, num_registers, cache_size,
+                r_queue_size, r_latency, gpt_per_gpn, graph,
+                sample_time, tokens)
+        else:
+            from sega_simple import SEGA
+            system = SEGA(num_gpts, num_registers, cache_size,
+                r_queue_size, r_latency, graph)
+    else:
+        from sega import SEGA
+        system = SEGA(num_gpts, num_registers, cache_size, graph)
+    root = Root(full_system=False, system=system)
+
+    m5.instantiate()
+
+    system.set_async_mode()
+    system.create_pop_count_directory(64)
+    system.create_cc_workload()
+    if sample:
+        while True:
+            exit_event = m5.simulate(100000000)
+            print(
+                f"Exited simulation at tick {m5.curTick()} "
+                + f"because {exit_event.getCause()}"
+            )
+            m5.stats.dump()
+            m5.stats.reset()
+            if exit_event.getCause() != "simulate() limit reached":
+                break
+    else:
+        exit_event = m5.simulate()
+        print(
+            f"Exited simulation at tick {m5.curTick()} "
+            + f"because {exit_event.getCause()}"
+        )
+    if verify:
+        system.print_answer()
diff --git a/configs/accl/pr.py b/configs/accl/pr.py
new file mode 100644
index 0000000000..7ef6587ab3
--- /dev/null
+++ b/configs/accl/pr.py
@@ -0,0 +1,173 @@
+# Copyright (c) 2022 The Regents of the University of California
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import m5
+import argparse
+
+from m5.objects import *
+
+
+def get_inputs():
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument("num_gpts", type=int)
+    argparser.add_argument("num_registers", type=int)
+    argparser.add_argument("cache_size", type=str)
+    argparser.add_argument("r_queue_size", type=int)
+    argparser.add_argument("r_latency", type=int)
+    argparser.add_argument("gpt_per_gpn", type=int)
+    argparser.add_argument("graph", type=str)
+    argparser.add_argument("iterations", type=int)
+    argparser.add_argument("alpha", type=float)
+    argparser.add_argument("sample_time", type=str)
+    argparser.add_argument("tokens", type=int)
+    argparser.add_argument("--num_nodes", type=int, default=1)
+    argparser.add_argument("--error_threshold", type=float, default=0.0)
+    argparser.add_argument(
+        "--simple",
+        dest="simple",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Use simple memory for vertex",
+    )
+    argparser.add_argument(
+        "--pt2pt",
+        dest="pt2pt",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Use point-to-point (router-based) topology",
+    )
+    argparser.add_argument(
+        "--sample",
+        dest="sample",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Sample sim stats every 100us",
+    )
+    argparser.add_argument(
+        "--verify",
+        dest="verify",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Print final answer",
+    )
+
+    args = argparser.parse_args()
+
+    return (
+        args.num_gpts,
+        args.num_registers,
+        args.cache_size,
+        args.r_queue_size,
+        args.r_latency,
+        args.gpt_per_gpn,
+        args.graph,
+        args.iterations,
+        args.alpha,
+        args.num_nodes,
+        args.error_threshold,
+        args.sample_time,
+        args.tokens,
+        args.simple,
+        args.pt2pt,
+        args.sample,
+        args.verify,
+    )
+
+
+if __name__ == "__m5_main__":
+    (
+        num_gpts,
+        num_registers,
+        cache_size,
+        r_queue_size,
+        r_latency,
+        gpt_per_gpn,
+        graph,
+        iterations,
+        alpha,
+        num_nodes,
+        error_threshold,
+        sample_time,
+        tokens,
+        simple,
+        pt2pt,
+        sample,
+        verify,
+    ) = get_inputs()
+
+    print(f"error_threshold: {error_threshold}")
+
+    if simple:
+        if pt2pt:
+            from sega_simple_pt2pt import SEGA
+            system = SEGA(num_gpts, num_registers, cache_size,
+                r_queue_size, r_latency, gpt_per_gpn, graph,
+                sample_time, tokens)
+        else:
+            from sega_simple import SEGA
+            system = SEGA(num_gpts, num_registers, cache_size,
+                r_queue_size, r_latency, graph)
+    else:
+        from sega import SEGA
+        system = SEGA(num_gpts, num_registers, cache_size, graph)
+    root = Root(full_system=False, system=system)
+
+    m5.instantiate()
+
+    system.set_bsp_mode()
+    system.create_pop_count_directory(64)
+    system.create_pr_workload(num_nodes, alpha)
+    if sample:
+        while True:
+            exit_event = m5.simulate(100000000)
+            print(
+                f"Exited simulation at tick {m5.curTick()} "
+                + f"because {exit_event.getCause()}"
+            )
+            m5.stats.dump()
+            m5.stats.reset()
+            if exit_event.getCause() != "simulate() limit reached":
+                break
+    else:
+        iteration = 0
+        while iteration < iterations:
+            exit_event = m5.simulate()
+            print(
+                f"Exited simulation at tick {m5.curTick()} "
+                + f"because {exit_event.getCause()}"
+            )
+            iteration += 1
+            print(f"error: {system.get_pr_error()}")
+            if system.get_pr_error() < error_threshold:
+                break
+            if system.work_count() == 0:
+                break
+        print(f"#iterations: {iteration}")
+    if verify:
+        system.print_answer()
diff --git a/configs/accl/sega.py b/configs/accl/sega.py
new file mode 100644
index 0000000000..17d84bd86c
--- /dev/null
+++ b/configs/accl/sega.py
@@ -0,0 +1,275 @@
+# Copyright (c) 2022 The Regents of the University of California
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +from math import log +from m5.objects import * + + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append( + AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i, + ) + ) + return ret, intlv_low_bit + intlv_bits - 1 + + +class GPT(SubSystem): + def __init__(self, register_file_size: int, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=register_file_size, + examine_window=8, + rd_per_cycle=4, + reduce_per_cycle=32, + wr_per_cycle=4, + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + max_resp_per_cycle=8, + pending_pull_limit=64, + active_buffer_size=80, + post_push_wb_queue_size=64, + transitions_per_cycle=4, + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=1024, + examine_window=12, + max_propagates_per_cycle=8, + update_queue_size=64, + ) + + self.vertex_mem_ctrl = HBMCtrl( + dram=HBM_2000_4H_1x64(), + dram_2=HBM_2000_4H_1x64(), + ) + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine, + ) + + def getRespPort(self): + return self.wl_engine.in_ports + + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + + def setReqPort(self, port): + self.push_engine.out_ports = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + + def set_vertex_range(self, vertex_ranges): + self.vertex_mem_ctrl.dram.range = vertex_ranges[0] + self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] + + def set_vertex_pch_bit(self, pch_bit): + self.vertex_mem_ctrl.pch_bit = pch_bit + + +class EdgeMemory(SubSystem): + def __init__(self, size: str): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + self.clk_domain.voltage_domain = VoltageDomain() + + self.mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) + ) + self.xbar = NoncoherentXBar( + width=64, frontend_latency=1, forward_latency=1, response_latency=1 + ) + self.xbar.mem_side_ports = self.mem_ctrl.port + + def set_image(self, image): + self.mem_ctrl.dram.image_file = image + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port + + +class SEGAController(SubSystem): + def __init__(self, mirror_bw): + super().__init__() + self.map_mem = SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth="1024GiB/s", + range=AddrRange(start=0, size="4GiB"), + in_addr_map=False, + ) + self.controller = CenteralController( + choose_best=False, + mirrors_mem=SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth=mirror_bw, + range=AddrRange(start=0, size="16GiB"), + in_addr_map=False, + ), + ) + self.controller.mem_port = self.controller.mirrors_mem.port + self.controller.mirrors_map_mem = self.map_mem.port + + def set_choose_best(self, choose_best): + self.controller.choose_best = choose_best + + def set_vertices_image(self, vertices): + self.controller.vertex_image_file = vertices + + def 
set_aux_images(self, mirrors, mirrors_map): + self.controller.mirrors_mem.image_file = mirrors + self.map_mem.image_file = mirrors_map + + def set_mpu_vector(self, mpu_vector): + self.controller.mpu_vector = mpu_vector + + +class SEGA(System): + def __init__( + self, + num_gpts, + num_registers, + cache_size, + graph_path, + ): + super(SEGA, self).__init__() + assert num_gpts != 0 + assert num_gpts % 2 == 0 + assert (num_gpts & (num_gpts - 1)) == 0 + + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2GHz" + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + self.ctrl = SEGAController("256GiB/s") + self.ctrl.set_vertices_image(f"{graph_path}/vertices") + + edge_mem = [] + for i in range(int(num_gpts / 2)): + mem = EdgeMemory("4GiB") + mem.set_image(f"{graph_path}/edgelist_{i}") + edge_mem.append(mem) + self.edge_mem = edge_mem + # Building the GPTs + vertex_ranges, pch_bit = interleave_addresses( + AddrRange(start=0, size="4GiB"), 2 * num_gpts, 32 + ) + gpts = [] + for i in range(num_gpts): + gpt = GPT(num_registers, cache_size) + gpt.set_vertex_range( + [vertex_ranges[i], vertex_ranges[i + num_gpts]] + ) + gpt.set_vertex_pch_bit(pch_bit) + gpt.setEdgeMemPort( + self.edge_mem[i % (int(num_gpts / 2))].getPort() + ) + gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) + self.gpts = gpts + + self.ctrl.set_mpu_vector([gpt.mpu for gpt in self.gpts]) + + def work_count(self): + return self.ctrl.controller.workCount() + + def set_async_mode(self): + self.ctrl.controller.setAsyncMode() + + def set_bsp_mode(self): + self.ctrl.controller.setBSPMode() + + def set_pg_mode(self): + self.ctrl.controller.setPGMode() + + def set_aux_images(self, mirrors, mirrors_map): + self.ctrl.set_aux_images(mirrors, mirrors_map) + + def set_choose_best(self, choose_best): + self.ctrl.set_choose_best(choose_best) + + def create_pop_count_directory(self, atoms_per_block): + self.ctrl.controller.createPopCountDirectory(atoms_per_block) + + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.controller.createBFSWorkload(init_addr, init_value) + + def create_bfs_visited_workload(self, init_addr, init_value): + self.ctrl.controller.createBFSVisitedWorkload(init_addr, init_value) + + def create_sssp_workload(self, init_addr, init_value): + self.ctrl.controller.createSSSPWorkload(init_addr, init_value) + + def create_cc_workload(self): + self.ctrl.controller.createCCWorkload() + + def create_async_pr_workload(self, alpha, threshold): + self.ctrl.controller.createAsyncPRWorkload(alpha, threshold) + + def create_pr_workload(self, num_nodes, alpha): + self.ctrl.controller.createPRWorkload(num_nodes, alpha) + + def get_pr_error(self): + return self.ctrl.controller.getPRError() + + def create_bc_workload(self, init_addr, init_value): + self.ctrl.controller.createBCWorkload(init_addr, init_value) + + def print_answer(self): + self.ctrl.controller.printAnswerToHostSimout() diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py new file mode 100644 index 0000000000..08f0f181ba --- /dev/null +++ b/configs/accl/sega_simple.py @@ -0,0 +1,267 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from math import log +from m5.objects import * + + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append( + AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i, + ) + ) + return ret + + +class GPT(SubSystem): + def __init__(self, register_file_size: int, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=register_file_size, + examine_window=8, + rd_per_cycle=4, + reduce_per_cycle=32, + wr_per_cycle=4, + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + max_resp_per_cycle=8, + pending_pull_limit=64, + active_buffer_size=80, + post_push_wb_queue_size=64, + transitions_per_cycle=4, + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=1024, + examine_window=12, + max_propagates_per_cycle=8, + update_queue_size=64, + ) + + self.vertex_mem_ctrl = SimpleMemory( + latency="120ns", bandwidth="256GiB/s" + ) + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine, + ) + + def getRespPort(self): + return self.wl_engine.in_ports + + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + + def setReqPort(self, port): + self.push_engine.out_ports = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + + +class EdgeMemory(SubSystem): + def __init__(self, size: str): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + 
self.clk_domain.clock = "2.4GHz"
+        self.clk_domain.voltage_domain = VoltageDomain()
+
+        self.mem_ctrl = MemCtrl(
+            dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False)
+        )
+        self.xbar = NoncoherentXBar(
+            width=64, frontend_latency=1, forward_latency=1, response_latency=1
+        )
+        self.xbar.mem_side_ports = self.mem_ctrl.port
+
+    def set_image(self, image):
+        self.mem_ctrl.dram.image_file = image
+
+    def getPort(self):
+        return self.xbar.cpu_side_ports
+
+    def setPort(self, port):
+        self.xbar.cpu_side_ports = port
+
+
+class SEGAController(SubSystem):
+    def __init__(self, mirror_bw):
+        super().__init__()
+        self.map_mem = SimpleMemory(
+            latency="0ns",
+            latency_var="0ns",
+            bandwidth="1024GiB/s",
+            range=AddrRange(start=0, size="4GiB"),
+            in_addr_map=False,
+        )
+        self.controller = CenteralController(
+            choose_best=False,
+            mirrors_mem=SimpleMemory(
+                latency="0ns",
+                latency_var="0ns",
+                bandwidth=mirror_bw,
+                range=AddrRange(start=0, size="16GiB"),
+                in_addr_map=False,
+            ),
+        )
+        self.controller.mem_port = self.controller.mirrors_mem.port
+        self.controller.mirrors_map_mem = self.map_mem.port
+
+    def set_choose_best(self, choose_best):
+        self.controller.choose_best = choose_best
+
+    def set_vertices_image(self, vertices):
+        self.controller.vertex_image_file = vertices
+
+    def set_aux_images(self, mirrors, mirrors_map):
+        self.controller.mirrors_mem.image_file = mirrors
+        self.map_mem.image_file = mirrors_map
+
+    def set_mpu_vector(self, mpu_vector):
+        self.controller.mpu_vector = mpu_vector
+
+
+class SEGA(System):
+    def __init__(self, num_gpts, num_registers, cache_size,
+                 r_queue_size, r_latency, graph_path):
+        super(SEGA, self).__init__()
+        assert num_gpts != 0
+        assert num_gpts % 2 == 0
+        assert (num_gpts & (num_gpts - 1)) == 0
+
+        self.clk_domain = SrcClockDomain()
+        self.clk_domain.clock = "2GHz"
+        self.clk_domain.voltage_domain = VoltageDomain()
+        self.cache_line_size = 32
+        self.mem_mode = "timing"
+
+        self.ctrl = SEGAController("256GiB/s")
+        self.ctrl.set_vertices_image(f"{graph_path}/vertices")
+
+        edge_mem = []
+        for i in range(int(num_gpts / 2)):
+            mem = EdgeMemory("4GiB")
+            mem.set_image(f"{graph_path}/edgelist_{i}")
+            edge_mem.append(mem)
+        self.edge_mem = edge_mem
+        # Building the GPTs
+        vertex_ranges = interleave_addresses(
+            AddrRange(start=0, size="4GiB"), num_gpts, 32
+        )
+        gpts = []
+        for i in range(num_gpts):
+            gpt = GPT(num_registers, cache_size)
+            gpt.set_vertex_range(vertex_ranges[i])
+            gpt.setEdgeMemPort(
+                self.edge_mem[i % (int(num_gpts / 2))].getPort()
+            )
+            gpts.append(gpt)
+        # Creating the interconnect among mpus
+        for gpt_0 in gpts:
+            for gpt_1 in gpts:
+                gpt_0.setReqPort(gpt_1.getRespPort())
+        self.gpts = gpts
+
+        # Register the MPUs with the controller through its accessor;
+        # assigning ctrl.mpu_vector directly would only set an attribute
+        # on the SubSystem wrapper, not on the controller itself.
+        self.ctrl.set_mpu_vector([gpt.mpu for gpt in self.gpts])
+
+    def work_count(self):
+        return self.ctrl.controller.workCount()
+
+    def set_async_mode(self):
+        self.ctrl.controller.setAsyncMode()
+
+    def set_bsp_mode(self):
+        self.ctrl.controller.setBSPMode()
+
+    def set_pg_mode(self):
+        self.ctrl.controller.setPGMode()
+
+    def set_aux_images(self, mirrors, mirrors_map):
+        self.ctrl.set_aux_images(mirrors, mirrors_map)
+
+    def set_choose_best(self, choose_best):
+        self.ctrl.set_choose_best(choose_best)
+
+    def create_pop_count_directory(self, atoms_per_block):
+        self.ctrl.controller.createPopCountDirectory(atoms_per_block)
+
+    def create_bfs_workload(self, init_addr, init_value):
+        self.ctrl.controller.createBFSWorkload(init_addr, init_value)
+
+    def create_bfs_visited_workload(self, init_addr, init_value):
+        self.ctrl.controller.createBFSVisitedWorkload(init_addr, init_value)
+
+    def create_sssp_workload(self, init_addr, init_value):
+        self.ctrl.controller.createSSSPWorkload(init_addr, init_value)
+
+    def create_cc_workload(self):
+        self.ctrl.controller.createCCWorkload()
+
+    def create_async_pr_workload(self, alpha, threshold):
+        self.ctrl.controller.createAsyncPRWorkload(alpha, threshold)
+
+    def create_pr_workload(self, num_nodes, alpha):
+        self.ctrl.controller.createPRWorkload(num_nodes, alpha)
+
+    def get_pr_error(self):
+        return self.ctrl.controller.getPRError()
+
+    def create_bc_workload(self, init_addr, init_value):
+        self.ctrl.controller.createBCWorkload(init_addr, init_value)
+
+    def print_answer(self):
+        self.ctrl.controller.printAnswerToHostSimout()
\ No newline at end of file
diff --git a/configs/accl/sega_simple_pt2pt.py b/configs/accl/sega_simple_pt2pt.py
new file mode 100644
index 0000000000..5b7309d44f
--- /dev/null
+++ b/configs/accl/sega_simple_pt2pt.py
@@ -0,0 +1,302 @@
+# Copyright (c) 2022 The Regents of the University of California
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +from math import log +from m5.objects import * + + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append( + AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i, + ) + ) + return ret + + +class GPT(SubSystem): + def __init__(self, register_file_size: int, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=register_file_size, + examine_window=8, + rd_per_cycle=4, + reduce_per_cycle=32, + wr_per_cycle=4, + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + max_resp_per_cycle=8, + pending_pull_limit=64, + active_buffer_size=80, + post_push_wb_queue_size=64, + transitions_per_cycle=4, + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=1024, + examine_window=12, + max_propagates_per_cycle=8, + update_queue_size=64, + ) + + self.vertex_mem_ctrl = SimpleMemory( + latency="120ns", bandwidth="256GiB/s" + ) + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine, + ) + + def getRespPort(self): + return self.wl_engine.in_ports + + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + + def setReqPort(self, port): + self.push_engine.out_ports = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + + +class EdgeMemory(SubSystem): + def __init__(self, size: str): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + self.clk_domain.voltage_domain = VoltageDomain() + + self.mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) + ) + self.xbar = NoncoherentXBar( + width=64, frontend_latency=1, forward_latency=1, response_latency=1 + ) + self.xbar.mem_side_ports = self.mem_ctrl.port + + def set_image(self, image): + self.mem_ctrl.dram.image_file = image + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port + + +class SEGAController(SubSystem): + def __init__(self, mirror_bw): + super().__init__() + self.map_mem = SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth="1024GiB/s", + range=AddrRange(start=0, size="4GiB"), + in_addr_map=False, + ) + self.controller = CenteralController( + choose_best=False, + mirrors_mem=SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth=mirror_bw, + range=AddrRange(start=0, size="16GiB"), + in_addr_map=False, + ), + ) + self.controller.mem_port = self.controller.mirrors_mem.port + self.controller.mirrors_map_mem = self.map_mem.port + + def set_choose_best(self, choose_best): + self.controller.choose_best = choose_best + + def set_vertices_image(self, vertices): + self.controller.vertex_image_file = vertices + + def set_aux_images(self, mirrors, mirrors_map): + self.controller.mirrors_mem.image_file = mirrors + self.map_mem.image_file = mirrors_map + + def set_mpu_vector(self, mpu_vector): + 
self.controller.mpu_vector = mpu_vector
+
+    def set_router_vector(self, router_vector):
+        self.controller.router_vector = router_vector
+
+
+class SEGA(System):
+    def __init__(self, num_gpts, num_registers, cache_size,
+                 r_queue_size, r_latency, gpt_per_gpn, graph_path,
+                 sample_time, tokens):
+        super(SEGA, self).__init__()
+        assert num_gpts != 0
+        assert num_gpts % 2 == 0
+        assert (num_gpts & (num_gpts - 1)) == 0
+
+        self.clk_domain = SrcClockDomain()
+        self.clk_domain.clock = "2GHz"
+        self.clk_domain.voltage_domain = VoltageDomain()
+        self.cache_line_size = 32
+        self.mem_mode = "timing"
+
+        gpts = []
+        routers = []
+
+        self.ctrl = SEGAController("256GiB/s")
+        self.ctrl.set_vertices_image(f"{graph_path}/vertices")
+
+        edge_mem = []
+        for i in range(int(num_gpts / 2)):
+            mem = EdgeMemory("16GiB")
+            mem.set_image(f"{graph_path}/edgelist_{i}")
+            edge_mem.append(mem)
+        self.edge_mem = edge_mem
+        # Building the GPTs
+        vertex_ranges = interleave_addresses(
+            AddrRange(start=0, size="4GiB"), num_gpts, 32
+        )
+
+        for i in range(num_gpts):
+            gpt = GPT(num_registers, cache_size)
+            gpt.set_vertex_range(vertex_ranges[i])
+            gpt.setEdgeMemPort(
+                self.edge_mem[i % (int(num_gpts / 2))].getPort()
+            )
+            gpts.append(gpt)
+
+        # Creating one router per GPN
+        for i in range(int(num_gpts / gpt_per_gpn)):
+            routers.append(RouterEngine(
+                gpn_queue_size=r_queue_size,
+                gpt_queue_size=r_queue_size,
+                router_latency=r_latency))
+        self.routers = routers
+        # Directly connecting GPTs that share a GPN
+        for i in range(len(gpts)):
+            for j in range(len(gpts)):
+                if int(i / gpt_per_gpn) == int(j / gpt_per_gpn):
+                    gpts[i].setReqPort(gpts[j].getRespPort())
+        # Connecting each GPT to its GPN's router
+        for i in range(len(gpts)):
+            for j in range(len(routers)):
+                if int(i / gpt_per_gpn) == j:
+                    gpts[i].setRespPort(routers[j].gpt_req_side)
+                    gpts[i].setReqPort(routers[j].gpt_resp_side)
+        # Connecting the routers point-to-point
+        for r_0 in routers:
+            for r_1 in routers:
+                if r_0 != r_1:
+                    r_0.gpn_resp_side = r_1.gpn_req_side
+        self.gpts = gpts
+
+        self.ctrl.set_mpu_vector([gpt.mpu for gpt in self.gpts])
+        self.ctrl.set_router_vector([r for r in self.routers])
+
+    def work_count(self):
+        return self.ctrl.controller.workCount()
+
+    def set_async_mode(self):
+        self.ctrl.controller.setAsyncMode()
+
+    def set_bsp_mode(self):
+        self.ctrl.controller.setBSPMode()
+
+    def set_pg_mode(self):
+        self.ctrl.controller.setPGMode()
+
+    def set_aux_images(self, mirrors, mirrors_map):
+        self.ctrl.set_aux_images(mirrors, mirrors_map)
+
+    def set_choose_best(self, choose_best):
+        self.ctrl.set_choose_best(choose_best)
+
+    def create_pop_count_directory(self, atoms_per_block):
+        self.ctrl.controller.createPopCountDirectory(atoms_per_block)
+
+    def create_bfs_workload(self, init_addr, init_value):
+        self.ctrl.controller.createBFSWorkload(init_addr, init_value)
+
+    def create_bfs_visited_workload(self, init_addr, init_value):
+        self.ctrl.controller.createBFSVisitedWorkload(init_addr, init_value)
+
+    def create_sssp_workload(self, init_addr, init_value):
+        self.ctrl.controller.createSSSPWorkload(init_addr, init_value)
+
+    def create_cc_workload(self):
+        self.ctrl.controller.createCCWorkload()
+
+    def create_async_pr_workload(self, alpha, threshold):
+        self.ctrl.controller.createAsyncPRWorkload(alpha, threshold)
+
+    def create_pr_workload(self, num_nodes, alpha):
+        self.ctrl.controller.createPRWorkload(num_nodes, alpha)
+
+    def get_pr_error(self):
+        return self.ctrl.controller.getPRError()
+
+    def create_bc_workload(self, init_addr, init_value):
+        self.ctrl.controller.createBCWorkload(init_addr, init_value)
+
+    def print_answer(self):
+        self.ctrl.controller.printAnswerToHostSimout()
diff --git a/configs/accl/sssp.py b/configs/accl/sssp.py
new file mode 100644
index 0000000000..08581bbb81
--- /dev/null
+++ b/configs/accl/sssp.py
@@ -0,0 +1,156 @@
+# Copyright (c) 2022 The Regents of the University of California
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import m5
+import argparse
+
+from m5.objects import *
+
+
+def get_inputs():
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument("num_gpts", type=int)
+    argparser.add_argument("num_registers", type=int)
+    argparser.add_argument("cache_size", type=str)
+    argparser.add_argument("r_queue_size", type=int)
+    argparser.add_argument("r_latency", type=int)
+    argparser.add_argument("gpt_per_gpn", type=int)
+    argparser.add_argument("graph", type=str)
+    argparser.add_argument("init_addr", type=int)
+    argparser.add_argument("init_value", type=int)
+    argparser.add_argument("sample_time", type=str)
+    argparser.add_argument("tokens", type=int)
+    argparser.add_argument(
+        "--simple",
+        dest="simple",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Use simple memory for vertex",
+    )
+    argparser.add_argument(
+        "--pt2pt",
+        dest="pt2pt",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Use point-to-point (router-based) topology",
+    )
+    argparser.add_argument(
+        "--sample",
+        dest="sample",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Sample sim stats every 100us",
+    )
+    argparser.add_argument(
+        "--verify",
+        dest="verify",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Print final answer",
+    )
+
+    args = argparser.parse_args()
+
+    return (
+        args.num_gpts,
+        args.num_registers,
+        args.cache_size,
+        args.r_queue_size,
+        args.r_latency,
+        args.gpt_per_gpn,
+        args.graph,
+        args.init_addr,
+        args.init_value,
+        args.sample_time,
+        args.tokens,
+        args.simple,
+        args.pt2pt,
+        args.sample,
+        args.verify,
+    )
+
+
+if __name__ == "__m5_main__":
+    (
+        num_gpts,
+        num_registers,
+        cache_size,
+        r_queue_size,
+        r_latency,
+        gpt_per_gpn,
+        graph,
+        init_addr,
+        init_value,
+        sample_time,
+        tokens,
+        simple,
+        pt2pt,
+        sample,
+        verify,
+    ) = get_inputs()
+
+    if simple:
+        if pt2pt:
+            from sega_simple_pt2pt import SEGA
+            system = SEGA(num_gpts, num_registers, cache_size,
+                r_queue_size, r_latency, gpt_per_gpn, graph,
+                sample_time, tokens)
+        else:
+            from sega_simple import SEGA
+            system = SEGA(num_gpts, num_registers, cache_size,
+                r_queue_size, r_latency, graph)
+    else:
+        from sega import SEGA
+        system = SEGA(num_gpts, num_registers, cache_size, graph)
+    root = Root(full_system=False, system=system)
+
+    m5.instantiate()
+
+    system.set_async_mode()
+    system.create_pop_count_directory(64)
+    system.create_sssp_workload(init_addr, init_value)
+    if sample:
+        while True:
+            exit_event = m5.simulate(100000000)
+            print(
+                f"Exited simulation at tick {m5.curTick()} "
+                + f"because {exit_event.getCause()}"
+            )
+            m5.stats.dump()
+            m5.stats.reset()
+            if exit_event.getCause() != "simulate() limit reached":
+                break
+    else:
+        exit_event = m5.simulate()
+        print(
+            f"Exited simulation at tick {m5.curTick()} "
+            + f"because {exit_event.getCause()}"
+        )
+    if verify:
+        system.print_answer()
diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md
new file mode 100644
index 0000000000..ebfca7e794
--- /dev/null
+++ b/src/accl/graph/TODO.md
@@ -0,0 +1,8 @@
+# TODO Items
+
+* We might need to revisit the fact that we can insert an item into a queue
+  on the same cycle that another event is consuming an item from that queue.
+* Move the check for `wl.degree == 0` to the coalesce engine.
+* Fix the retry system between the memory queue and the coalesce engine.
+* Update inheritance: there is not enough reason for PushEngine and
+  CoalesceEngine to be of the same type (i.e. delete BaseMemEngine).
diff --git a/src/accl/graph/base/BaseReduceEngine.py b/src/accl/graph/base/BaseReduceEngine.py new file mode 100644 index 0000000000..0585c36e48 --- /dev/null +++ b/src/accl/graph/base/BaseReduceEngine.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseReduceEngine(ClockedObject): + abstract = True + type = 'BaseReduceEngine' + cxx_header = "accl/graph/base/base_reduce_engine.hh" + cxx_class = 'gem5::BaseReduceEngine' + + system = Param.System(Parent.any, 'System this Engine is a part of') diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript new file mode 100644 index 0000000000..35111c34d2 --- /dev/null +++ b/src/accl/graph/base/SConscript @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2016 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Import("*") + +SimObject("BaseReduceEngine.py", sim_objects=["BaseReduceEngine"]) + +Source("base_reduce_engine.cc") +Source("graph_workload.cc") diff --git a/src/accl/graph/base/base_reduce_engine.cc b/src/accl/graph/base/base_reduce_engine.cc new file mode 100644 index 0000000000..ade95800d2 --- /dev/null +++ b/src/accl/graph/base/base_reduce_engine.cc @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/base/base_reduce_engine.hh" + +namespace gem5 +{ + +BaseReduceEngine::BaseReduceEngine(const Params &params): + ClockedObject(params), + system(params.system), + _requestorId(system->getRequestorId(this)) +{} + +BaseReduceEngine::~BaseReduceEngine() +{} + +} diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh new file mode 100644 index 0000000000..268bb60b76 --- /dev/null +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ + +#include "params/BaseReduceEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class BaseReduceEngine : public ClockedObject +{ + private: + System* system; + + protected: + + const RequestorID _requestorId; + + public: + PARAMS(BaseReduceEngine); + BaseReduceEngine(const Params &params); + ~BaseReduceEngine(); + + RequestorID requestorId() { return _requestorId; } +}; + +} + +#endif // __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh new file mode 100644 index 0000000000..f1a26f6ac2 --- /dev/null +++ b/src/accl/graph/base/data_structs.hh @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ +#define __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ + +#include "base/cprintf.hh" +#include "base/intmath.hh" + +#include <cstdint> +#include <cstring> +#include <deque> +#include <string> + +namespace gem5 +{ + +struct __attribute__ ((packed)) WorkListItem +{ + uint32_t tempProp : 32; + uint32_t prop : 32; + uint32_t edgeIndex : 32; + uint32_t degree : 30; + bool activeNow: 1; + bool activeFuture: 1; + + std::string to_string() + { + return csprintf("WorkListItem{tempProp: %u, prop: %u, edgeIndex: %u, " + "degree: %u, activeNow: %s, activeFuture: %s}", + tempProp, prop, edgeIndex, degree, + activeNow ? "true" : "false", + activeFuture ? "true" : "false"); + } + + WorkListItem(): + tempProp(0), + prop(0), + edgeIndex(0), + degree(0), + activeNow(false), + activeFuture(false) + {} + + WorkListItem(uint32_t temp_prop, uint32_t prop, + uint32_t degree, uint32_t edge_index, + bool active_now, bool active_future): + tempProp(temp_prop), prop(prop), edgeIndex(edge_index), degree(degree), + activeNow(active_now), activeFuture(active_future) + {} + +}; + +struct __attribute__ ((packed)) Edge +{ + uint16_t weight : 16; + uint64_t neighbor : 48; + + std::string to_string() + { + return csprintf("Edge{weight: %u, neighbor: %lu}", weight, neighbor); + } + + Edge(): weight(0), neighbor(0) {} + + Edge(uint16_t weight, uint64_t neighbor): + weight(weight), + neighbor(neighbor) + {} +}; + +struct __attribute__ ((packed)) MirrorVertex +{ + uint32_t vertexId : 32; + uint32_t prop : 32; + uint32_t edgeIndex : 32; + uint32_t degree : 30; + bool activeNow: 1; + bool activeNext: 1; + + std::string to_string() + { + return csprintf("MirrorVertex{vertexId: %u, prop: %u, edgeIndex: %u, " + "degree: %u, activeNow: %s, activeNext: %s}", + vertexId, prop, edgeIndex, degree, + activeNow ? "true" : "false", + activeNext ?
"true" : "false"); + } + MirrorVertex(): + vertexId(-1), + prop(-1), + edgeIndex(-1), + degree(-1), + activeNow(false), + activeNext(false) + {} + + MirrorVertex(uint32_t vertex_id, uint32_t prop, uint32_t degree, + uint32_t edge_index, bool active_now, bool active_next): + vertexId(vertex_id), prop(prop), edgeIndex(edge_index), + degree(degree), activeNow(active_now), activeNext(active_next) + {} + +}; + +static_assert(isPowerOf2(sizeof(WorkListItem))); +static_assert(isPowerOf2(sizeof(Edge))); +static_assert(isPowerOf2(sizeof(MirrorVertex))); + +struct MetaEdge { + uint64_t src; + uint64_t dst; + uint32_t weight; + uint32_t value; + + MetaEdge(): src(0), dst(0), weight(0), value(0) + {} + MetaEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value): + src(src), dst(dst), weight(weight), value(value) + {} + + std::string to_string() + { + return csprintf("MetaEdge{src: %lu, dst:%lu, weight: %u, value: %u}", + src, dst, weight, value); + } +}; + +struct Update { + uint64_t src; + uint64_t dst; + uint32_t value; + + Update(): src(0), dst(0), value(0) + {} + Update(uint64_t src, uint64_t dst, uint32_t value): + src(src), dst(dst), value(value) + {} + + std::string to_string() + { + return csprintf("Update{src: %lu, dst:%lu, value: %u}", + src, dst, value); + } +}; + +template +class UniqueFIFO +{ + private: + int cap; + int pop; + + int* added; + int* deleted; + std::deque container; + + public: + UniqueFIFO() { + cap = 0; + pop = 0; + added = nullptr; + deleted = nullptr; + } + + UniqueFIFO(int size) { + cap = size; + pop = 0; + + added = (int*) new int [cap]; + deleted = (int*) new int [cap]; + + for (int i = 0; i < cap; i++) { + added[i] = 0; + deleted[i] = 0; + } + container.clear(); + } + + ~UniqueFIFO() { + delete [] added; + delete [] deleted; + } + + void fix_front() { + while(true) { + T elem = container.front(); + if (deleted[elem] > 0) { + deleted[elem]--; + added[elem]--; + container.pop_front(); + } else { + assert(deleted[elem] == 0); + assert(added[elem] == 1); + break; + } + } + } + + T front() { + fix_front(); + return container.front(); + } + + size_t size() { + return pop; + } + + void clear() { + pop = 0; + for (int i = 0; i < cap; i++) { + added[i] = 0; + deleted[i] = 0; + } + container.clear(); + } + + bool empty() { + return size() == 0; + } + + bool find(T item) { + assert(added[item] >= 0); + assert(deleted[item] >= 0); + int diff = added[item] - deleted[item]; + assert((diff == 0) || (diff == 1)); + return (diff == 1); + } + + void push_back(T item) { + if (!find(item)) { + added[item]++; + pop++; + container.push_back(item); + } + } + + void pop_front() { + T elem = front(); + assert(added[elem] == 1); + added[elem] = 0; + pop--; + container.pop_front(); + } + + void erase(T item) { + assert(find(item)); + deleted[item]++; + pop--; + } + + void operator=(const UniqueFIFO& rhs) { + cap = rhs.cap; + pop = rhs.pop; + container = rhs.container; + added = (int*) new int [cap]; + deleted = (int*) new int [cap]; + std::memcpy(added, rhs.added, cap * sizeof(int)); + std::memcpy(deleted, rhs.deleted, cap * sizeof(int)); + } +}; + +} + +#endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc new file mode 100644 index 0000000000..fd802cf275 --- /dev/null +++ b/src/accl/graph/base/graph_workload.cc @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/base/graph_workload.hh" + +#include <algorithm> +#include <cassert> +#include <cmath> +#include <cstring> + +#include "base/cprintf.hh" +#include "base/intmath.hh" +#include "base/logging.hh" + +namespace gem5 +{ + +template <typename T> +float +writeToFloat(T value) +{ + assert(sizeof(T) == sizeof(float)); + float float_form; + std::memcpy(&float_form, &value, sizeof(float)); + return float_form; +} + +template <typename T> +T +readFromFloat(float value) +{ + assert(sizeof(T) == sizeof(float)); + T float_bits; + std::memcpy(&float_bits, &value, sizeof(float)); + return float_bits; +} + +void +BFSWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + size_t pkt_size = pkt->getSize(); + uint64_t aligned_addr = roundDown(initAddr, pkt_size); + + if (pkt->getAddr() == aligned_addr) { + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); + WorkListItem new_wl = items[index]; + new_wl.tempProp = initValue; + if (activeCondition(new_wl, items[index])) { + new_wl.activeNow = true; + dir->activate(aligned_addr); + } + items[index] = new_wl; + + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); + } +} + +uint32_t +BFSWorkload::reduce(uint32_t update, uint32_t value) +{ + return std::min(update, value); +} + +uint32_t +BFSWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value + 1; +} + +bool +BFSWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) +{ + return (new_wl.tempProp < old_wl.tempProp) && (old_wl.degree > 0); +} + +uint32_t +BFSWorkload::apply(WorkListItem& wl) +{ + wl.prop = wl.tempProp; + return wl.prop; +} + +std::string +BFSWorkload::printWorkListItem(const WorkListItem wl) +{ + return csprintf( + "WorkListItem{tempProp: %u, prop: %u, degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + wl.tempProp, wl.prop, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ?
"true" : "false"); +} + +uint32_t +BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) { + return value; +} + +void +CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + Addr pkt_addr = pkt->getAddr(); + size_t pkt_size = pkt->getSize(); + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + bool atom_active = false; + for (int i = 0; i < num_elements; i++) { + WorkListItem new_wl = items[i]; + new_wl.tempProp = (int) (pkt_addr / sizeof(WorkListItem)) + i; + new_wl.activeNow = activeCondition(new_wl, items[i]); + atom_active |= new_wl.activeNow; + items[i] = new_wl; + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); +} + +uint32_t +SSSPWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value + weight; +} + +void +PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + int num_elements = pkt->getSize() / sizeof(WorkListItem); + WorkListItem items[num_elements]; + pkt->writeDataToBlock((uint8_t*) items, pkt->getSize()); + + bool atom_active = false; + for (int index = 0; index < num_elements; index++) { + WorkListItem new_wl = items[index]; + new_wl.tempProp = readFromFloat(0); + new_wl.prop = readFromFloat(1 - alpha); + new_wl.activeNow = activeCondition(new_wl, items[index]); + atom_active |= new_wl.activeNow; + items[index] = new_wl; + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt->getSize()); +} + +uint32_t +PRWorkload::reduce(uint32_t update, uint32_t value) +{ + float update_float = writeToFloat(update); + float value_float = writeToFloat(value); + return readFromFloat(update_float + value_float); +} + +uint32_t +PRWorkload::propagate(uint32_t value, uint32_t weight) +{ + float value_float = writeToFloat(value); + float weight_float = writeToFloat(weight); + if (weight == 0) { + weight_float = 1.0; + } + return readFromFloat(alpha * value_float * weight_float); +} + +bool +PRWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) +{ + float temp_float = writeToFloat(new_wl.tempProp); + float prop_float = writeToFloat(new_wl.prop); + float dist = std::abs(temp_float - prop_float); + return (dist >= threshold) && (new_wl.degree > 0); +} + +uint32_t +PRWorkload::apply(WorkListItem& wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + float delta = (temp_float - prop_float) / wl.degree; + wl.prop = wl.tempProp; + return readFromFloat(delta); +} + +std::string +PRWorkload::printWorkListItem(const WorkListItem wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + return csprintf( + "WorkListItem{tempProp: %f, prop: %f, degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + temp_float, prop_float, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? 
"true" : "false"); +} + +void +BSPPRWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + size_t pkt_size = pkt->getSize(); + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + bool atom_active = false; + for (int i = 0; i < num_elements; i++) { + WorkListItem new_wl = items[i]; + new_wl.tempProp = readFromFloat((1 - alpha)/numNodes); + new_wl.prop = readFromFloat(1/numNodes); + new_wl.activeNow = activeCondition(new_wl, items[i]); + atom_active |= new_wl.activeNow; + items[i] = new_wl; + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); +} + +uint32_t +BSPPRWorkload::reduce(uint32_t update, uint32_t value) +{ + float update_float = writeToFloat(update); + float value_float = writeToFloat(value); + return readFromFloat(update_float + value_float); +} + +uint32_t +BSPPRWorkload::propagate(uint32_t value, uint32_t weight) +{ + float value_float = writeToFloat(value); + return readFromFloat(alpha * value_float); +} + +bool +BSPPRWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) +{ + return (old_wl.degree > 0); +} + +uint32_t +BSPPRWorkload::apply(WorkListItem& wl) +{ + float prop_float = writeToFloat(wl.prop); + float delta = prop_float / wl.degree; + uint32_t delta_uint = readFromFloat(delta); + return delta_uint; +} + +void +BSPPRWorkload::interIterationInit(WorkListItem& wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + error += std::abs(temp_float - prop_float); + wl.prop = wl.tempProp; + wl.tempProp = readFromFloat((1 - alpha) / numNodes); + wl.activeFuture = (wl.degree > 0); +} + +std::string +BSPPRWorkload::printWorkListItem(const WorkListItem wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + return csprintf( + "WorkListItem{tempProp: %f, prop: %f, degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + temp_float, prop_float, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? "true" : "false"); +} + +void +BSPBCWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + int pkt_size = pkt->getSize(); + int aligned_addr = roundDown(initAddr, pkt_size); + + if (aligned_addr == pkt->getAddr()) { + int num_elements = pkt_size / sizeof(WorkListItem); + WorkListItem items[num_elements]; + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + int index = (initAddr - aligned_addr) / sizeof(WorkListItem); + WorkListItem new_wl = items[index]; + uint32_t prop = 0; + prop |= initValue; + // NOTE: Depth of the initial vertex is 0. 
+ prop &= countMask; + new_wl.tempProp = prop; + new_wl.prop = prop; + if (activeCondition(new_wl, items[index])) { + new_wl.activeNow = true; + dir->activate(aligned_addr); + } + items[index] = new_wl; + + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); + } +} + +uint32_t +BSPBCWorkload::reduce(uint32_t update, uint32_t value) +{ + uint32_t update_depth = (update & depthMask) >> 24; + uint32_t update_count = (update & countMask); + uint32_t value_depth = (value & depthMask) >> 24; + uint32_t value_count = (value & countMask); + if (value_depth == 255) { + value_depth = currentDepth; + value_count = 0; + } + if (value_depth == currentDepth) { + value_count += update_count; + } + uint32_t ret = 0; + ret |= value_count; + warn_if(value_count > 16777215, "value count has grown bigger than 16777215." + " This means the algorithm result might not be correct." + " However, the traversal will not be affected." + " Therefore, performance metrics can still be used."); + // HACK: Make sure to always set the depth correctly even if count + // exceeds the 2^24-1 limit. Here we reset the depth section of ret. + ret &= countMask; + // NOTE: Now that the depth is securely reset we can copy the correct value. + ret |= (value_depth << 24); + return ret; +} + +uint32_t +BSPBCWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value; +} + +uint32_t +BSPBCWorkload::apply(WorkListItem& wl) +{ + return wl.prop; +} + +void +BSPBCWorkload::interIterationInit(WorkListItem& wl) +{ + wl.prop = wl.tempProp; +} + +bool +BSPBCWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) +{ + uint32_t depth = (new_wl.tempProp & depthMask) >> 24; + return (depth == currentDepth) && (new_wl.degree > 0); +} + +std::string +BSPBCWorkload::printWorkListItem(WorkListItem wl) +{ + uint32_t temp_depth = (wl.tempProp & depthMask) >> 24; + uint32_t temp_count = (wl.tempProp & countMask); + uint32_t depth = (wl.prop & depthMask) >> 24; + uint32_t count = (wl.prop & countMask); + return csprintf( + "WorkListItem{tempProp: (depth: %d, count: %d), " + "prop: (depth: %d, count: %d), degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + temp_depth, temp_count, depth, count, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? "true" : "false"); +} + +} // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh new file mode 100644 index 0000000000..481cfc146f --- /dev/null +++ b/src/accl/graph/base/graph_workload.hh @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ +#define __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ + +#include <cstdint> +#include <cstring> +#include <string> + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/work_directory.hh" +#include "mem/packet.hh" + + +namespace gem5 +{ + +class GraphWorkload +{ + public: + GraphWorkload() {} + // Virtual destructor: workloads are owned and deleted polymorphically. + virtual ~GraphWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir) = 0; + virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; + virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; + virtual uint32_t apply(WorkListItem& wl) = 0; + virtual bool betterThan(uint32_t lhs, uint32_t rhs) { return true; } + virtual void iterate() = 0; + virtual void interIterationInit(WorkListItem& wl) = 0; + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl) = 0; + virtual std::string printWorkListItem(const WorkListItem wl) = 0; +}; + +class BFSWorkload : public GraphWorkload +{ + private: + uint64_t initAddr; + uint32_t initValue; + + public: + BFSWorkload(uint64_t init_addr, uint32_t init_value): + initAddr(init_addr), initValue(init_value) + {} + + ~BFSWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual bool betterThan(uint32_t lhs, uint32_t rhs) override { return lhs < rhs; } + virtual void iterate() {} + virtual void interIterationInit(WorkListItem& wl) {} + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + +class BFSVisitedWorkload : public BFSWorkload +{ + public: + BFSVisitedWorkload(Addr init_addr, uint32_t init_value): + BFSWorkload(init_addr, init_value) + {} + virtual uint32_t propagate(uint32_t value, uint32_t weight) override; +}; + +class CCWorkload : public BFSVisitedWorkload +{ + public: + CCWorkload(): BFSVisitedWorkload(0, 0) {} + virtual void init(PacketPtr pkt, WorkDirectory* dir); +}; + +class SSSPWorkload : public BFSWorkload +{ + public: + SSSPWorkload(Addr init_addr, uint32_t init_value): + BFSWorkload(init_addr, init_value) + {} + virtual uint32_t propagate(uint32_t value, uint32_t weight) override; +}; + +class PRWorkload : public GraphWorkload +{ + private: + float alpha; + float threshold; + + public: + PRWorkload(float alpha, float threshold): + alpha(alpha), threshold(threshold) + {} + + ~PRWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); +
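+    // Asynchronous PR has no supersteps: iterate() and interIterationInit()
+    // below are deliberately no-ops; convergence is instead governed by the
+    // |tempProp - prop| >= threshold test in activeCondition().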
virtual void iterate() {} + virtual void interIterationInit(WorkListItem& wl) {}; + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + +class BSPPRWorkload : public GraphWorkload +{ + private: + int numNodes; + float alpha; + float prevError; + float error; + + public: + BSPPRWorkload(int num_nodes, float alpha): + numNodes(num_nodes), alpha(alpha), prevError(0), error(0) + {} + + ~BSPPRWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() { prevError = error; error = 0; } + virtual void interIterationInit(WorkListItem& wl); + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); + + float getError() { return prevError; } +}; + +class BSPBCWorkload : public GraphWorkload +{ + private: + Addr initAddr; + uint32_t initValue; + + int currentDepth; + + uint32_t depthMask; + uint32_t countMask; + public: + BSPBCWorkload(Addr init_addr, uint32_t init_value): + initAddr(init_addr), initValue(init_value), + currentDepth(0), depthMask(4278190080), countMask(16777215) + {} + + ~BSPBCWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() { currentDepth++; } + virtual void interIterationInit(WorkListItem& wl); + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + +} + +#endif // __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ diff --git a/src/accl/graph/sega/BaseMemoryEngine.py b/src/accl/graph/sega/BaseMemoryEngine.py new file mode 100644 index 0000000000..10d8b708f0 --- /dev/null +++ b/src/accl/graph/sega/BaseMemoryEngine.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseMemoryEngine(ClockedObject): + abstract = True + type = 'BaseMemoryEngine' + cxx_header = "accl/graph/sega/base_memory_engine.hh" + cxx_class = 'gem5::BaseMemoryEngine' + + system = Param.System(Parent.any, 'System this Engine is a part of') + mem_port = RequestPort("Port to communicate with the memory") + + attached_memory_atom_size = Param.Int(64, "The atom size of the attached " + "memory.") diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py new file mode 100644 index 0000000000..619e76f1ee --- /dev/null +++ b/src/accl/graph/sega/CenteralController.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
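+
+# The CenteralController orchestrates the whole accelerator: it points at the
+# vertex image to load, owns the vectors of MPUs and RouterEngines, and
+# exposes execution-mode selection (async/BSP/PG) plus per-algorithm
+# workload-creation hooks (BFS, SSSP, CC, PR, BC) to the Python run scripts
+# through the cxx_exports listed below.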
+ +from m5.params import * +from m5.proxy import * +from m5.util.pybind import PyBindMethod +from m5.objects.BaseMemoryEngine import BaseMemoryEngine + +class CenteralController(BaseMemoryEngine): + type = 'CenteralController' + cxx_header = "accl/graph/sega/centeral_controller.hh" + cxx_class = 'gem5::CenteralController' + + mirrors_map_mem = RequestPort("Port to a memory storing mirrors map file.") + + choose_best = Param.Bool("Whether to prefer the best update " + "value for choosing the next slice") + + vertex_image_file = Param.String("Path to the vertex image file.") + + mirrors_mem = Param.SimpleMemory("Memory to store the vertex mirrors.") + + mpu_vector = VectorParam.MPU("All mpus in the system.") + + router_vector = VectorParam.RouterEngine("All Routers in the system.") + + cxx_exports = [ + PyBindMethod("setAsyncMode"), + PyBindMethod("setBSPMode"), + PyBindMethod("setPGMode"), + PyBindMethod("createPopCountDirectory"), + PyBindMethod("createBFSWorkload"), + PyBindMethod("createBFSVisitedWorkload"), + PyBindMethod("createSSSPWorkload"), + PyBindMethod("createCCWorkload"), + PyBindMethod("createAsyncPRWorkload"), + PyBindMethod("createPRWorkload"), + PyBindMethod("createBCWorkload"), + PyBindMethod("workCount"), + PyBindMethod("getPRError"), + PyBindMethod("printAnswerToHostSimout") + ] diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py new file mode 100644 index 0000000000..bb45802c1d --- /dev/null +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -0,0 +1,50 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.BaseMemoryEngine import BaseMemoryEngine + +class CoalesceEngine(BaseMemoryEngine): + type = 'CoalesceEngine' + cxx_header = "accl/graph/sega/coalesce_engine.hh" + cxx_class = 'gem5::CoalesceEngine' + + cache_size = Param.MemorySize("Size of the internal SRAM array.") + + max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " + "requestor in each cycle. 
Used to limit b/w.") + pending_pull_limit = Param.Int("Maximum number of pending pull processes.") + active_buffer_size = Param.Int("Maximum number of active memory " + "atoms ready to send updates. This parameter " + "and post_push_wb_queue_size should be set " + "in tandem. Probably, they should be equal.") + post_push_wb_queue_size = Param.Int("Maximum number of pending write-backs " + "after the apply process for applications " + "that require the apply process to happen " + "exactly before pushing the edgePointer " + "to the PushEngine.") + transitions_per_cycle = Param.Int("Max number of transitions in a cycle") diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py new file mode 100644 index 0000000000..8d2453b01c --- /dev/null +++ b/src/accl/graph/sega/MPU.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.SimObject import SimObject + +class MPU(SimObject): + type = "MPU" + cxx_header = "accl/graph/sega/mpu.hh" + cxx_class = "gem5::MPU" + + system = Param.System(Parent.any, "System this MPU is a part of") + + wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " + "MPU object.") + coalesce_engine = Param.CoalesceEngine(NULL, "Internal CoalesceEngine for " + "each instance of MPU object.") + push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " + "instance of MPU object.") + diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py new file mode 100644 index 0000000000..2174f943f4 --- /dev/null +++ b/src/accl/graph/sega/PushEngine.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved.
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.BaseMemoryEngine import BaseMemoryEngine + +class PushEngine(BaseMemoryEngine): + type = 'PushEngine' + cxx_header = "accl/graph/sega/push_engine.hh" + cxx_class = 'gem5::PushEngine' + + push_req_queue_size = Param.Int("Size of the queue used to " + "store push requests.") + # resp_queue_size should probably be + # significantly bigger than push_req_queue_size + resp_queue_size = Param.Int("Size of the response queue in the " + "push engine where it stores the " + "edges read from memory.") + + examine_window = Param.Int("Number of edges at the front of the edge queue" + " to examine in order to propagate.") + + max_propagates_per_cycle = Param.Int("Maximum number of propagates " + "done per cycle.") + + update_queue_size = Param.Int("Maximum number of entries " + "for each update queue.") + + out_ports = VectorRequestPort("Outgoing ports to all MPUs") diff --git a/src/accl/graph/sega/RouterEngine.py b/src/accl/graph/sega/RouterEngine.py new file mode 100644 index 0000000000..2b895b9323 --- /dev/null +++ b/src/accl/graph/sega/RouterEngine.py @@ -0,0 +1,50 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission.
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class RouterEngine(ClockedObject): + type = "RouterEngine" + cxx_header = "accl/graph/sega/router_engine.hh" + cxx_class = "gem5::RouterEngine" + + system = Param.System(Parent.any, "System this Engine is a part of") + + gpt_req_side = VectorRequestPort("Outgoing ports to local GPTs") + gpt_resp_side = VectorResponsePort("Incoming ports from local GPTs") + + gpn_req_side = VectorRequestPort("Outgoing ports to remote GPNs") + gpn_resp_side = VectorResponsePort("Incoming ports from remote GPNs") + gpt_queue_size = Param.Int(64, "Queue size on the gpt side") + gpn_queue_size = Param.Int(64, "Queue size on the gpn side") + token = Param.Int("Number of tokens sent per time sample.") + router_latency = Param.Cycles(5, "Router latency; " + "SerDes or E-O-E latencies can be added here.") + + sample_time = Param.Latency("50us", "Intervals to sample traffic") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript new file mode 100644 index 0000000000..e0a3f8d28f --- /dev/null +++ b/src/accl/graph/sega/SConscript @@ -0,0 +1,58 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2016 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +Import("*") + +SimObject("BaseMemoryEngine.py", sim_objects=["BaseMemoryEngine"]) +SimObject("CenteralController.py", sim_objects=["CenteralController"]) +SimObject("CoalesceEngine.py", sim_objects=["CoalesceEngine"]) +SimObject("MPU.py", sim_objects=["MPU"]) +SimObject("PushEngine.py", sim_objects=["PushEngine"]) +SimObject("WLEngine.py", sim_objects=["WLEngine"]) +SimObject("RouterEngine.py", sim_objects=["RouterEngine"]) + +Source("base_memory_engine.cc") +Source("centeral_controller.cc") +Source("coalesce_engine.cc") +Source("enums.cc") +Source("mpu.cc") +Source("push_engine.cc") +Source("wl_engine.cc") +Source("router_engine.cc") + +DebugFlag("BaseMemoryEngine") +DebugFlag("CenteralController") +DebugFlag("CacheBlockState") +DebugFlag("CoalesceEngine") +DebugFlag("PushEngine") +DebugFlag("SEGAStructureSize") +DebugFlag("MSDebug") +DebugFlag("WLEngine") +DebugFlag("RouterEngine") + +CompoundFlag("MPU", ["CoalesceEngine", "PushEngine", + "WLEngine", "BaseMemoryEngine", "RouterEngine"]) diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py new file mode 100644 index 0000000000..f9ea4488df --- /dev/null +++ b/src/accl/graph/sega/WLEngine.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.BaseReduceEngine import BaseReduceEngine + +class WLEngine(BaseReduceEngine): + type = 'WLEngine' + cxx_header = "accl/graph/sega/wl_engine.hh" + cxx_class = 'gem5::WLEngine' + + in_ports = VectorResponsePort("Incoming Ports to receive updates from " + "remote outside") + + update_queue_size = Param.Int("Size of the queue WLEngine stores " + "the incoming updates") + + register_file_size = Param.Int("Number of internal registers the " + "WLEngine has. 
It can service as " "many updates as this queue has " "entries at the same time.") + + examine_window = Param.Int("Number of updates at the front of the update " "queue examined for reading.") + rd_per_cycle = Param.Int("Maximum number of reads per cycle.") + reduce_per_cycle = Param.Int("Maximum number of reduces per cycle.") + wr_per_cycle = Param.Int("Maximum number of writes per cycle.") + + diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc new file mode 100644 index 0000000000..9f704f71e9 --- /dev/null +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/base_memory_engine.hh" + +#include "debug/BaseMemoryEngine.hh" +#include "debug/SEGAStructureSize.hh" + +namespace gem5 +{ + +BaseMemoryEngine::BaseMemoryEngine(const BaseMemoryEngineParams &params): + ClockedObject(params), + system(params.system), + _requestorId(system->getRequestorId(this)), + memPort(name() + ".mem_port", this), + peerMemoryAtomSize(params.attached_memory_atom_size) +{} + +BaseMemoryEngine::~BaseMemoryEngine() +{} + +Port& +BaseMemoryEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "mem_port") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +void +BaseMemoryEngine::init() +{ + AddrRangeList memory_ranges = memPort.getAddrRanges(); + + assert(memory_ranges.size() == 1); + + peerMemoryRange = memory_ranges.front(); + + DPRINTF(BaseMemoryEngine, "%s: The range attached to this engine is " + "%s. The range is %s interleaved.\n", __func__, + peerMemoryRange.to_string(), + peerMemoryRange.interleaved() ?
"" : "not"); +} + +void +BaseMemoryEngine::MemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + DPRINTF(BaseMemoryEngine, "%s: Sending pakcet: %s to " + "the memory.\n", __func__, pkt->print()); + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + DPRINTF(BaseMemoryEngine, "%s: MemPort blocked.\n", __func__); + } else { + DPRINTF(BaseMemoryEngine, "%s: Packet sent successfully.\n", __func__); + owner->recvMemRetry(); + } +} + +bool +BaseMemoryEngine::MemPort::recvTimingResp(PacketPtr pkt) +{ + return owner->handleMemResp(pkt); +} + +void +BaseMemoryEngine::MemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), + "Received retry without a blockedPacket"); + + _blocked = false; + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); +} + +PacketPtr +BaseMemoryEngine::createReadPacket(Addr addr, unsigned int size) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} + +PacketPtr +BaseMemoryEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + +} diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh new file mode 100644 index 0000000000..afe7fd0433 --- /dev/null +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__
+#define __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__
+
+#include <functional>
+
+#include "base/addr_range.hh"
+#include "mem/packet.hh"
+#include "mem/port.hh"
+#include "params/BaseMemoryEngine.hh"
+#include "sim/clocked_object.hh"
+#include "sim/system.hh"
+
+namespace gem5
+{
+
+class BaseMemoryEngine : public ClockedObject
+{
+  protected:
+    class MemoryEvent : public EventFunctionWrapper
+    {
+      private:
+        bool _pending;
+        int _prevState;
+
+      public:
+        MemoryEvent(const std::function<void(void)> &callback,
+                    const std::string &name):
+            EventFunctionWrapper(callback, name),
+            _pending(false), _prevState(0)
+        {}
+        bool pending() { return _pending; }
+        void sleep() { _pending = true; }
+        void wake() { _pending = false; }
+        void setPrevState(int state) { _prevState = state; }
+        int getPrevState() { return _prevState; }
+    };
+
+    class MemPort : public RequestPort
+    {
+      private:
+        BaseMemoryEngine* owner;
+        bool _blocked;
+        PacketPtr blockedPacket;
+
+      public:
+        MemPort(const std::string& name, BaseMemoryEngine* owner):
+            RequestPort(name, owner), owner(owner),
+            _blocked(false), blockedPacket(nullptr)
+        {}
+
+        void sendPacket(PacketPtr pkt);
+        bool blocked() { return _blocked; }
+
+      protected:
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual void recvReqRetry();
+    };
+
+    System* system;
+    const RequestorID _requestorId;
+
+    MemPort memPort;
+    AddrRange peerMemoryRange;
+    size_t peerMemoryAtomSize;
+
+    virtual void recvMemRetry() = 0;
+    virtual bool handleMemResp(PacketPtr pkt) = 0;
+
+    PacketPtr createReadPacket(Addr addr, unsigned int size);
+    PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data);
+
+  public:
+    PARAMS(BaseMemoryEngine);
+
+    BaseMemoryEngine(const Params &params);
+    ~BaseMemoryEngine();
+
+    Port& getPort(const std::string &if_name,
+                  PortID idx=InvalidPortID) override;
+
+    AddrRangeList getAddrRanges() { return memPort.getAddrRanges(); }
+
+    virtual void recvFunctional(PacketPtr pkt) = 0;
+
+    virtual void init() override;
+};
+
+}
+
+#endif // __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__
diff --git a/src/accl/graph/sega/busyMaskErr b/src/accl/graph/sega/busyMaskErr
new file mode 100644
index 0000000000..316fcd37d9
--- /dev/null
+++ b/src/accl/graph/sega/busyMaskErr
@@ -0,0 +1,16 @@
+gem5/build/NULL/gem5.opt -re --outdir=debug --debug-flags=CacheBlockState gem5/configs/accl/sega.py 1 1KiB /home/fariborz/SEGA/graphs/test/scale_21/binaries/mpu_1/ 0 0
+
+32964143000: system.gpts.coalesce_engine: handleMemResp: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}.
+32964143000: system.gpts.coalesce_engine: handleMemResp: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 2, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}.
+32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 2, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}.
+32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964145000}.
+32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: true, pendingWB: false, lastChangedTick: 32964145000}.
+32964146000: system.gpts.coalesce_engine: processNextApplyEvent: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: true, pendingWB: false, lastChangedTick: 32964145000}.
+32964146000: system.gpts.coalesce_engine: processNextApplyEvent: cacheBlock[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: true, lastChangedTick: 32964146000}.
+32964146000: system.gpts.coalesce_engine: recvWLRead: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: true, lastChangedTick: 32964146000}.
+32964146000: system.gpts.coalesce_engine: recvWLRead: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 1, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964146000}.
+32964147000: system.gpts.coalesce_engine: processNextWriteBack: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 1, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964146000}.
+
+// This assertion would be hit even though it should not be: the read hit at
+// tick 32964146000 sets busyMask after the write back has already been
+// scheduled on that same tick.
+// It is fixed by a hack in recvWLRead when hit in the cache.
+assert(cacheBlocks[block_index].busyMask == 0);
diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc
new file mode 100644
index 0000000000..09b57b6ff6
--- /dev/null
+++ b/src/accl/graph/sega/centeral_controller.cc
@@ -0,0 +1,521 @@
+/*
+ * Copyright (c) 2021 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "accl/graph/sega/centeral_controller.hh"
+
+#include <cmath>
+#include <iostream>
+
+#include "base/cprintf.hh"
+#include "base/loader/memory_image.hh"
+#include "base/loader/object_file.hh"
+#include "debug/CenteralController.hh"
+#include "mem/packet_access.hh"
+#include "sim/sim_exit.hh"
+
+namespace gem5
+{
+
+CenteralController::CenteralController(const Params& params):
+    BaseMemoryEngine(params),
+    mapPort("map_port", this, 1), mode(ProcessingMode::NOT_SET),
+    mirrorsMem(params.mirrors_mem), currentSliceId(0), totalUpdatesLeft(0),
+    chooseBest(params.choose_best),
+    nextSliceSwitchEvent([this] { processNextSliceSwitchEvent(); }, name()),
+    stats(*this)
+{
+    uint64_t total_cache_size = 0;
+    for (auto mpu : params.mpu_vector) {
+        mpuVector.push_back(mpu);
+        mpu->registerCenteralController(this);
+        total_cache_size += mpu->getCacheSize();
+    }
+
+    // for (auto router : params.router_vector) {
+    //     routerVector.push_back(router);
+    //     router->registerCenteralController(this);
+    // }
+    verticesPerSlice = std::floor(total_cache_size / sizeof(WorkListItem));
+}
+
+Port&
+CenteralController::getPort(const std::string& if_name, PortID idx)
+{
+    if (if_name == "mirrors_map_mem") {
+        return mapPort;
+    } else if (if_name == "mem_port") {
+        return BaseMemoryEngine::getPort("mem_port", idx);
+    } else {
+        return ClockedObject::getPort(if_name, idx);
+    }
+}
+
+void
+CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value)
+{
+    workload = new BFSWorkload(init_addr, init_value);
+}
+
+void
+CenteralController::createBFSVisitedWorkload(Addr init_addr, uint32_t init_value)
+{
+    workload = new BFSVisitedWorkload(init_addr, init_value);
+}
+
+void
+CenteralController::createSSSPWorkload(Addr init_addr, uint32_t init_value)
+{
+    workload = new SSSPWorkload(init_addr, init_value);
+}
+
+void
+CenteralController::createCCWorkload()
+{
+    workload = new CCWorkload();
+}
+
+void
+CenteralController::createAsyncPRWorkload(float alpha, float threshold)
+{
+    workload = new PRWorkload(alpha, threshold);
+}
+
+void
+CenteralController::createPRWorkload(int num_nodes, float alpha)
+{
+    workload = new BSPPRWorkload(num_nodes, alpha);
+}
+
+void
+CenteralController::createBCWorkload(Addr init_addr, uint32_t init_value)
+{
+    workload = new BSPBCWorkload(init_addr, init_value);
+}
+
+void
+CenteralController::createPopCountDirectory(int atoms_per_block)
+{
+    fatal_if(mode == ProcessingMode::NOT_SET, "You should set the processing "
+             "mode by calling setAsyncMode, setBSPMode, or setPGMode.");
+    if (mode == ProcessingMode::ASYNCHRONOUS) {
+        for (auto mpu: mpuVector) {
+            mpu->createAsyncPopCountDirectory(atoms_per_block);
+        }
+    }
+    if (mode == ProcessingMode::BULK_SYNCHRONOUS) {
+        for (auto mpu: mpuVector) {
+            mpu->createBSPPopCountDirectory(atoms_per_block);
+        }
+    }
+    if (mode == ProcessingMode::POLY_GRAPH) {
+        for (auto mpu: mpuVector) {
+            mpu->createAsyncPopCountDirectory(atoms_per_block);
+        }
+    }
+}
+
+void
+CenteralController::startup()
+{
+    unsigned int vertex_atom = mpuVector.front()->vertexAtomSize();
+    for (auto mpu: mpuVector) {
+        for (auto range: mpu->getAddrRanges()) {
+            mpuAddrMap.insert(range, mpu);
+        }
+        mpu->setProcessingMode(mode);
+        mpu->recvWorkload(workload);
+    }
+
+    const auto& vertex_file = params().vertex_image_file;
+    if (vertex_file == "")
+        return;
+
+    auto* object = loader::createObjectFile(vertex_file, true);
+    fatal_if(!object, "%s: Could not load %s.",
name(), vertex_file);
+
+    loader::debugSymbolTable.insert(*object->symtab().globals());
+    loader::MemoryImage vertex_image = object->buildImage();
+    maxVertexAddr = vertex_image.maxAddr();
+
+    int num_total_vertices = (maxVertexAddr / sizeof(WorkListItem));
+    numTotalSlices = std::ceil((double) num_total_vertices / verticesPerSlice);
+
+    numPendingUpdates = new int [numTotalSlices];
+    bestPendingUpdate = new uint32_t [numTotalSlices];
+    for (int i = 0; i < numTotalSlices; i++) {
+        numPendingUpdates[i] = 0;
+        bestPendingUpdate[i] = -1;
+    }
+
+    PortProxy vertex_proxy(
+        [this](PacketPtr pkt) {
+            auto routing_entry = mpuAddrMap.contains(pkt->getAddr());
+            routing_entry->second->recvFunctional(pkt);
+        }, vertex_atom);
+
+    panic_if(!vertex_image.write(vertex_proxy),
+             "%s: Unable to write image.", name());
+
+    for (auto mpu: mpuVector) {
+        mpu->postMemInitSetup();
+        if (!mpu->running() && (mpu->workCount() > 0)) {
+            mpu->start();
+        }
+    }
+    workload->iterate();
+}
+
+void
+CenteralController::ReqPort::sendPacket(PacketPtr pkt)
+{
+    panic_if(blockedPacket != nullptr,
+             "Should never try to send if blocked!");
+    // If we can't send the packet across the port, store it for later.
+    if (!sendTimingReq(pkt)) {
+        DPRINTF(CenteralController, "%s: Port %d: Packet %s "
+                "is blocked.\n", __func__, _id, pkt->print());
+        blockedPacket = pkt;
+    } else {
+        DPRINTF(CenteralController, "%s: Port %d: Packet %s "
+                "sent.\n", __func__, _id, pkt->print());
+    }
+}
+
+bool
+CenteralController::ReqPort::recvTimingResp(PacketPtr pkt)
+{
+    panic("recvTimingResp should not be called at all");
+}
+
+void
+CenteralController::ReqPort::recvReqRetry()
+{
+    panic("recvReqRetry should not be called at all");
+}
+
+void
+CenteralController::recvDoneSignal()
+{
+    bool done = true;
+    for (auto mpu : mpuVector) {
+        done &= mpu->done();
+    }
+
+    // for (auto router : routerVector) {
+    //     done &= router->done();
+    // }
+
+    if (done && mode == ProcessingMode::ASYNCHRONOUS) {
+        exitSimLoopNow("no update left to process.");
+    }
+
+    if (done && mode == ProcessingMode::BULK_SYNCHRONOUS) {
+        for (auto mpu: mpuVector) {
+            mpu->postConsumeProcess();
+            mpu->swapDirectories();
+            if (!mpu->running() && (mpu->workCount() > 0)) {
+                mpu->start();
+            }
+        }
+        workload->iterate();
+        exitSimLoopNow("finished an iteration.");
+    }
+
+    if (done && mode == ProcessingMode::POLY_GRAPH) {
+        DPRINTF(CenteralController, "%s: Received done signal.\n", __func__);
+        exitSimLoopNow("Finished processing a slice.");
+        if (!nextSliceSwitchEvent.scheduled()) {
+            schedule(nextSliceSwitchEvent, nextCycle());
+        }
+    }
+}
+
+int
+CenteralController::chooseNextSlice()
+{
+    int ret_slice_id = -1;
+    int max_pending_count = 0;
+    // TODO: Make this generalizable for all workloads.
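+    // NOTE: Two selection policies are implemented below. When chooseBest
+    // is set, the next slice is the one holding the globally best pending
+    // update (ties broken by pending-update count); otherwise it is the
+    // slice with the most pending updates (ties broken by update value).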
+    uint32_t best_update = -1;
+    for (int i = 0; i < numTotalSlices; i++) {
+        if (numPendingUpdates[i] > max_pending_count) {
+            max_pending_count = numPendingUpdates[i];
+        }
+        if (numPendingUpdates[i] > 0 &&
+            workload->betterThan(bestPendingUpdate[i], best_update)) {
+            best_update = bestPendingUpdate[i];
+        }
+    }
+    if (chooseBest) {
+        int max_count = 0;
+        for (int i = 0; i < numTotalSlices; i++) {
+            if (numPendingUpdates[i] > max_count &&
+                bestPendingUpdate[i] == best_update) {
+                max_count = numPendingUpdates[i];
+                ret_slice_id = i;
+            }
+        }
+    } else {
+        uint32_t best_value = -1;
+        for (int i = 0; i < numTotalSlices; i++) {
+            if (numPendingUpdates[i] == max_pending_count &&
+                workload->betterThan(bestPendingUpdate[i], best_value)) {
+                best_value = bestPendingUpdate[i];
+                ret_slice_id = i;
+            }
+        }
+    }
+    return ret_slice_id;
+}
+
+void
+CenteralController::processNextSliceSwitchEvent()
+{
+    int vertex_atom = mpuVector.front()->vertexAtomSize();
+    int vertices_per_atom = (int) vertex_atom / sizeof(WorkListItem);
+    int bytes_accessed = 0;
+    int updates_generated_total = 0;
+    for (int dst_id = 0; dst_id < numTotalSlices; dst_id++) {
+        if (dst_id == currentSliceId) {
+            continue;
+        }
+        int updates_generated = 0;
+        Addr start_pointer = (currentSliceId * numTotalSlices + dst_id) * sizeof(uint64_t);
+        Addr end_pointer = (currentSliceId * numTotalSlices + dst_id + 1) * sizeof(uint64_t);
+        PacketPtr start = createReadPacket(start_pointer, sizeof(uint64_t));
+        PacketPtr end = createReadPacket(end_pointer, sizeof(uint64_t));
+        mapPort.sendFunctional(start);
+        mapPort.sendFunctional(end);
+        Addr start_addr = start->getLE<uint64_t>();
+        Addr end_addr = end->getLE<uint64_t>();
+        delete start;
+        delete end;
+        DPRINTF(CenteralController, "%s: %d->%d: [%lu, %lu].\n", __func__,
+                currentSliceId, dst_id, start_addr, end_addr);
+
+        int num_bytes = end_addr - start_addr;
+        int num_mirrors = (int) (end_addr - start_addr) / sizeof(MirrorVertex);
+        MirrorVertex* mirrors = new MirrorVertex [num_mirrors];
+
+        PacketPtr read_mirrors = createReadPacket(start_addr, num_bytes);
+        memPort.sendFunctional(read_mirrors);
+        read_mirrors->writeData((uint8_t*) mirrors);
+        delete read_mirrors;
+
+        WorkListItem vertices [vertices_per_atom];
+        for (int i = 0; i < num_mirrors; i++) {
+            Addr org_addr = mirrors[i].vertexId * sizeof(WorkListItem);
+            Addr aligned_org_addr = roundDown(org_addr, vertex_atom);
+            int wl_offset = (int) (org_addr - aligned_org_addr) / sizeof(WorkListItem);
+            PacketPtr read_org = createReadPacket(aligned_org_addr, vertex_atom);
+            auto routing_entry = mpuAddrMap.contains(aligned_org_addr);
+            routing_entry->second->recvFunctional(read_org);
+            read_org->writeDataToBlock((uint8_t*) vertices, vertex_atom);
+            delete read_org;
+            if (vertices[wl_offset].tempProp != vertices[wl_offset].prop) {
+                assert(vertices[wl_offset].degree == 0);
+                vertices[wl_offset].prop = vertices[wl_offset].tempProp;
+            }
+            if (mirrors[i].prop != vertices[wl_offset].prop) {
+                mirrors[i].prop = vertices[wl_offset].prop;
+                if (!mirrors[i].activeNow) {
+                    mirrors[i].activeNow = true;
+                    numPendingUpdates[dst_id]++;
+                    totalUpdatesLeft++;
+                    updates_generated++;
+                }
+                if (workload->betterThan(mirrors[i].prop,
+                                         bestPendingUpdate[dst_id])) {
+                    bestPendingUpdate[dst_id] = mirrors[i].prop;
+                }
+            }
+        }
+        PacketPtr write_mirrors =
+            createWritePacket(start_addr, num_bytes, (uint8_t*) mirrors);
+        memPort.sendFunctional(write_mirrors);
+        delete write_mirrors;
+        delete [] mirrors;
+        DPRINTF(CenteralController, "%s: Done scattering updates from slice "
+                "%d to slice %d.\n", __func__, currentSliceId, dst_id);
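+        // NOTE: Each mirror range is read once and written back once during
+        // the scatter phase, hence the 2x contribution to bytes_accessed.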
+        DPRINTF(CenteralController, "%s: Generated %d updates from slice "
+                "%d to slice %d.\n", __func__,
+                updates_generated, currentSliceId, dst_id);
+        updates_generated_total += updates_generated;
+        bytes_accessed += 2 * num_bytes;
+    }
+    DPRINTF(CenteralController, "%s: Done with slice %d.\n", __func__, currentSliceId);
+    DPRINTF(CenteralController, "%s: Generated a total of %d updates.\n",
+            __func__, updates_generated_total);
+    DPRINTF(CenteralController, "%s: There are a total of %d "
+            "updates left.\n", __func__, totalUpdatesLeft);
+    if (totalUpdatesLeft > 0) {
+        currentSliceId = chooseNextSlice();
+    } else {
+        exitSimLoopNow("Done with all the slices.");
+        return;
+    }
+    DPRINTF(CenteralController, "%s: Chose %d as the "
+            "next slice.\n", __func__, currentSliceId);
+
+    for (int src_id = 0; src_id < numTotalSlices; src_id++) {
+        if (src_id == currentSliceId) {
+            continue;
+        }
+        Addr start_pointer = (src_id * numTotalSlices + currentSliceId) * sizeof(uint64_t);
+        Addr end_pointer = (src_id * numTotalSlices + currentSliceId + 1) * sizeof(uint64_t);
+        PacketPtr start = createReadPacket(start_pointer, sizeof(uint64_t));
+        PacketPtr end = createReadPacket(end_pointer, sizeof(uint64_t));
+        mapPort.sendFunctional(start);
+        mapPort.sendFunctional(end);
+        Addr start_addr = start->getLE<uint64_t>();
+        Addr end_addr = end->getLE<uint64_t>();
+        delete start;
+        delete end;
+
+        int num_bytes = end_addr - start_addr;
+        int num_mirrors = (int) (end_addr - start_addr) / sizeof(MirrorVertex);
+        MirrorVertex* mirrors = new MirrorVertex [num_mirrors];
+
+        PacketPtr read_mirrors = createReadPacket(start_addr, num_bytes);
+        memPort.sendFunctional(read_mirrors);
+        read_mirrors->writeData((uint8_t*) mirrors);
+        delete read_mirrors;
+        for (int i = 0; i < num_mirrors; i++) {
+            if (mirrors[i].activeNow) {
+                Addr org_addr = mirrors[i].vertexId * sizeof(WorkListItem);
+                auto routing_entry = mpuAddrMap.contains(org_addr);
+                routing_entry->second->recvMirrorPush(org_addr, mirrors[i].prop,
+                                    mirrors[i].edgeIndex, mirrors[i].degree);
+                mirrors[i].activeNow = false;
+                numPendingUpdates[currentSliceId]--;
+                totalUpdatesLeft--;
+            }
+        }
+        PacketPtr write_mirrors =
+            createWritePacket(start_addr, num_bytes, (uint8_t*) mirrors);
+        memPort.sendFunctional(write_mirrors);
+        delete write_mirrors;
+        delete [] mirrors;
+        DPRINTF(CenteralController, "%s: Done gathering updates from slice "
+                "%d to slice %d.\n", __func__, src_id, currentSliceId);
+        bytes_accessed += num_bytes;
+    }
+
+    double mirror_mem_bw = mirrorsMem->getBW();
+    Tick time_to_switch = bytes_accessed * mirror_mem_bw;
+    stats.switchTicks += time_to_switch;
+    stats.switchedBytes += bytes_accessed;
+    stats.numSwitches++;
+    for (auto mpu: mpuVector) {
+        mpu->startProcessingMirrors(time_to_switch);
+    }
+    exitSimLoopNow("Done with slice switch.");
+}
+
+bool
+CenteralController::handleMemResp(PacketPtr pkt)
+{
+    panic("handleMemResp should not be called at all");
+}
+
+void
+CenteralController::recvMemRetry()
+{
+    panic("recvMemRetry should not be called at all");
+}
+
+void
+CenteralController::recvFunctional(PacketPtr pkt)
+{
+    panic("recvFunctional should not be called at all");
+}
+
+int
+CenteralController::workCount()
+{
+    int work_count = 0;
+    for (auto mpu: mpuVector) {
+        work_count += mpu->workCount();
+    }
+    return work_count;
+}
+
+float
+CenteralController::getPRError()
+{
+    BSPPRWorkload* pr_workload = dynamic_cast<BSPPRWorkload*>(workload);
+    return pr_workload->getError();
+}
+
+void
+CenteralController::printAnswerToHostSimout()
+{
+    unsigned int vertex_atom =
mpuVector.front()->vertexAtomSize();
+    int num_items = vertex_atom / sizeof(WorkListItem);
+    WorkListItem items[num_items];
+    for (Addr addr = 0; addr < maxVertexAddr; addr += vertex_atom)
+    {
+        PacketPtr pkt = createReadPacket(addr, vertex_atom);
+        auto routing_entry = mpuAddrMap.contains(pkt->getAddr());
+        routing_entry->second->recvFunctional(pkt);
+        pkt->writeDataToBlock((uint8_t*) items, vertex_atom);
+        for (int i = 0; i < num_items; i++) {
+            std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i,
+                                    workload->printWorkListItem(items[i]));
+
+            std::cout << print << std::endl;
+        }
+        delete pkt;
+    }
+}
+
+CenteralController::ControllerStats::ControllerStats(CenteralController& _ctrl):
+    statistics::Group(&_ctrl), ctrl(_ctrl),
+    ADD_STAT(numSwitches, statistics::units::Count::get(),
+             "Number of slice switches completed."),
+    ADD_STAT(switchedBytes, statistics::units::Byte::get(),
+             "Number of bytes accessed during slice switching."),
+    ADD_STAT(switchTicks, statistics::units::Tick::get(),
+             "Number of ticks spent switching slices."),
+    ADD_STAT(switchSeconds, statistics::units::Second::get(),
+             "Number of seconds spent switching slices.")
+{
+}
+
+void
+CenteralController::ControllerStats::regStats()
+{
+    using namespace statistics;
+
+    switchSeconds = switchTicks / simFreq;
+}
+
+}
diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh
new file mode 100644
index 0000000000..ac06b76edc
--- /dev/null
+++ b/src/accl/graph/sega/centeral_controller.hh
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2021 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__
+#define __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__
+
+#include <unordered_map>
+#include <vector>
+
+#include "accl/graph/base/data_structs.hh"
+#include "accl/graph/base/graph_workload.hh"
+#include "accl/graph/sega/base_memory_engine.hh"
+#include "accl/graph/sega/enums.hh"
+#include "accl/graph/sega/mpu.hh"
+#include "accl/graph/sega/router_engine.hh"
+#include "base/addr_range.hh"
+#include "base/intmath.hh"
+#include "mem/simple_mem.hh"
+#include "params/CenteralController.hh"
+
+namespace gem5
+{
+
+class CenteralController : public BaseMemoryEngine
+{
+  private:
+    class ReqPort : public RequestPort
+    {
+      private:
+        CenteralController* owner;
+        PacketPtr blockedPacket;
+        PortID _id;
+
+      public:
+        ReqPort(const std::string& name, CenteralController* owner, PortID id):
+            RequestPort(name, owner),
+            owner(owner), blockedPacket(nullptr), _id(id)
+        {}
+        void sendPacket(PacketPtr pkt);
+        bool blocked() { return (blockedPacket != nullptr); }
+
+      protected:
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual void recvReqRetry();
+    };
+
+    ReqPort mapPort;
+    Addr maxVertexAddr;
+    ProcessingMode mode;
+
+    memory::SimpleMemory* mirrorsMem;
+
+    std::vector<MPU*> mpuVector;
+    AddrRangeMap<MPU*> mpuAddrMap;
+    std::vector<RouterEngine*> routerVector;
+
+    std::unordered_map<MPU*, AddrRangeList> addrRangeListMap;
+
+    int currentSliceId;
+    int numTotalSlices;
+    int verticesPerSlice;
+    int totalUpdatesLeft;
+
+    bool chooseBest;
+    int* numPendingUpdates;
+    uint32_t* bestPendingUpdate;
+    int chooseNextSlice();
+
+    EventFunctionWrapper nextSliceSwitchEvent;
+    void processNextSliceSwitchEvent();
+
+    struct ControllerStats : public statistics::Group
+    {
+        ControllerStats(CenteralController& ctrl);
+
+        void regStats() override;
+
+        CenteralController& ctrl;
+
+        statistics::Scalar numSwitches;
+        statistics::Scalar switchedBytes;
+        statistics::Scalar switchTicks;
+        statistics::Formula switchSeconds;
+    };
+    ControllerStats stats;
+
+  protected:
+    virtual void recvMemRetry() override;
+    virtual bool handleMemResp(PacketPtr pkt) override;
+
+  public:
+    GraphWorkload* workload;
+
+    PARAMS(CenteralController);
+    CenteralController(const Params& params);
+    Port& getPort(const std::string& if_name,
+                  PortID idx = InvalidPortID) override;
+
+    virtual void startup() override;
+
+    virtual void recvFunctional(PacketPtr pkt) override;
+
+    void setAsyncMode() { mode = ProcessingMode::ASYNCHRONOUS; }
+    void setBSPMode() { mode = ProcessingMode::BULK_SYNCHRONOUS; }
+    void setPGMode() { mode = ProcessingMode::POLY_GRAPH; }
+
+    void createPopCountDirectory(int atoms_per_block);
+
+    void createBFSWorkload(Addr init_addr, uint32_t init_value);
+    void createBFSVisitedWorkload(Addr init_addr, uint32_t init_value);
+    void createSSSPWorkload(Addr init_addr, uint32_t init_value);
+    void createCCWorkload();
+    void createAsyncPRWorkload(float alpha, float threshold);
+    void createPRWorkload(int num_nodes, float alpha);
+    void createBCWorkload(Addr init_addr, uint32_t init_value);
+
+    void recvDoneSignal();
+
+    int workCount();
+    float getPRError();
+    void printAnswerToHostSimout();
+};
+
+}
+
+#endif // __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__
diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc
new file mode 100644
index 0000000000..5e0c8c8095
--- /dev/null
+++ b/src/accl/graph/sega/coalesce_engine.cc
@@ -0,0 +1,1322 @@
+/*
+ * Copyright (c) 2020 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "accl/graph/sega/coalesce_engine.hh"
+
+#include <tuple>
+
+#include "accl/graph/sega/mpu.hh"
+#include "base/intmath.hh"
+#include "debug/CacheBlockState.hh"
+#include "debug/CoalesceEngine.hh"
+#include "debug/SEGAStructureSize.hh"
+#include "mem/packet_access.hh"
+#include "sim/sim_exit.hh"
+
+namespace gem5
+{
+
+CoalesceEngine::CoalesceEngine(const Params &params):
+    BaseMemoryEngine(params), mode(ProcessingMode::NOT_SET), lastAtomAddr(0),
+    numLines((int) (params.cache_size / peerMemoryAtomSize)),
+    numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))),
+    lastReadTick(0), onTheFlyReqs(0), maxRespPerCycle(params.max_resp_per_cycle),
+    numReceivedPulls(0), numScheduledPulls(0), pendingPullLimit(params.pending_pull_limit),
+    pendingPullReads(0), activeBufferSize(params.active_buffer_size),
+    postPushWBQueueSize(params.post_push_wb_queue_size),
+    transitionsPerCycle(params.transitions_per_cycle),
+    nextMemoryEvent([this] {
+        processNextMemoryEvent();
+    }, name() + ".nextMemoryEvent"),
+    nextResponseEvent([this] {
+        processNextResponseEvent();
+    }, name() + ".nextResponseEvent"),
+    nextApplyEvent([this] {
+        processNextApplyEvent();
+    }, name() + ".nextApplyEvent"),
+    nextDoneSignalEvent([this] {
+        processNextDoneSignalEvent();
+    }, name() + ".nextDoneSignalEvent"),
+    stats(*this)
+{
+    assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine));
+    cacheBlocks = new Block [numLines];
+    for (int i = 0; i < numLines; i++) {
+        cacheBlocks[i] = Block(numElementsPerLine);
+    }
+    numActiveBlocksNow = UniqueFIFO<int>(numLines);
+    numActiveBlocksNext = UniqueFIFO<int>(numLines);
+
+    activeBuffer.clear();
+    postPushWBQueue.clear();
+    blocksTouchedThisTick.clear();
+}
+
+void
+CoalesceEngine::registerMPU(MPU* mpu)
+{
+    owner = mpu;
+}
+
+
+// NOTE: Used for initializing memory and reading the final answer
+void
+CoalesceEngine::recvFunctional(PacketPtr pkt)
+{
+    if (pkt->isRead()) {
+        assert(pkt->getSize() == peerMemoryAtomSize);
+        Addr addr = pkt->getAddr();
+        int block_index = getBlockIndex(addr);
+
+        if
((cacheBlocks[block_index].addr == addr) &&
+            (cacheBlocks[block_index].valid)) {
+            assert(cacheBlocks[block_index].state == CacheState::IDLE);
+
+            pkt->makeResponse();
+            pkt->setDataFromBlock(
+                (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize);
+        } else {
+            memPort.sendFunctional(pkt);
+        }
+    } else {
+        graphWorkload->init(pkt, currentDirectory);
+        if (pkt->getAddr() > lastAtomAddr) {
+            lastAtomAddr = pkt->getAddr();
+        }
+        memPort.sendFunctional(pkt);
+    }
+}
+
+void
+CoalesceEngine::postMemInitSetup()
+{
+    currentDirectory->setLastAtomAddr(lastAtomAddr);
+}
+
+void
+CoalesceEngine::postConsumeProcess()
+{
+    Addr last_local_atom_addr = peerMemoryRange.removeIntlvBits(lastAtomAddr);
+    for (Addr local_addr = 0; local_addr <= last_local_atom_addr; local_addr += peerMemoryAtomSize) {
+        Addr addr = peerMemoryRange.addIntlvBits(local_addr);
+        int block_index = getBlockIndex(addr);
+        if (cacheBlocks[block_index].addr == addr) {
+            assert(cacheBlocks[block_index].valid);
+            assert(!cacheBlocks[block_index].hasConflict);
+            assert(cacheBlocks[block_index].state == CacheState::IDLE);
+            bool atom_active_future_before = false;
+            bool atom_active_future_after = false;
+            for (int index = 0; index < numElementsPerLine; index++) {
+                assert(!cacheBlocks[block_index].items[index].activeNow);
+                atom_active_future_before |= cacheBlocks[block_index].items[index].activeFuture;
+                graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]);
+                atom_active_future_after |= cacheBlocks[block_index].items[index].activeFuture;
+                if (cacheBlocks[block_index].items[index].activeFuture) {
+                    cacheBlocks[block_index].items[index].activeFuture = false;
+                    cacheBlocks[block_index].items[index].activeNow = true;
+                    cacheBlocks[block_index].dirty = true;
+                }
+            }
+            if (!atom_active_future_before && atom_active_future_after) {
+                numActiveBlocksNext.push_back(block_index);
+            }
+            if (atom_active_future_before && !atom_active_future_after) {
+                numActiveBlocksNext.erase(block_index);
+            }
+        } else {
+            WorkListItem items[numElementsPerLine];
+            PacketPtr read_pkt = createReadPacket(addr, peerMemoryAtomSize);
+            memPort.sendFunctional(read_pkt);
+            read_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize);
+            bool atom_active_future_before = false;
+            bool atom_active_future_after = false;
+            for (int index = 0; index < numElementsPerLine; index++) {
+                assert(!items[index].activeNow);
+                atom_active_future_before |= items[index].activeFuture;
+                graphWorkload->interIterationInit(items[index]);
+                atom_active_future_after |= items[index].activeFuture;
+                if (items[index].activeFuture) {
+                    items[index].activeFuture = false;
+                    items[index].activeNow = true;
+                }
+            }
+            if (!atom_active_future_before && atom_active_future_after) {
+                futureDirectory->activate(addr);
+            }
+            if (atom_active_future_before && !atom_active_future_after) {
+                futureDirectory->deactivate(addr);
+            }
+            PacketPtr write_pkt = createWritePacket(addr, peerMemoryAtomSize, (uint8_t*) items);
+            memPort.sendFunctional(write_pkt);
+            delete read_pkt;
+            delete write_pkt;
+        }
+    }
+}
+
+void
+CoalesceEngine::createAsyncPopCountDirectory(int atoms_per_block)
+{
+    currentDirectory = new PopCountDirectory(
+        peerMemoryRange, atoms_per_block, peerMemoryAtomSize);
+    futureDirectory = nullptr;
+}
+
+void
+CoalesceEngine::createBSPPopCountDirectory(int atoms_per_block)
+{
+    currentDirectory = new PopCountDirectory(
+        peerMemoryRange, atoms_per_block, peerMemoryAtomSize);
+    futureDirectory = new PopCountDirectory(
+        peerMemoryRange, atoms_per_block, peerMemoryAtomSize);
+}
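+
+// NOTE: In BULK_SYNCHRONOUS mode activity is double-buffered: the "Now"
+// structures (numActiveBlocksNow, currentDirectory) track the frontier of
+// the iteration being consumed, while the "Next" structures
+// (numActiveBlocksNext, futureDirectory) accumulate vertices activated for
+// the following iteration; swapDirectories exchanges the two at an
+// iteration boundary.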
+
+void
+CoalesceEngine::swapDirectories()
+{
+    assert(currentDirectory->empty());
+    assert(numActiveBlocksNow.empty());
+    WorkDirectory* temp = currentDirectory;
+    currentDirectory = futureDirectory;
+    futureDirectory = temp;
+
+    numActiveBlocksNow.clear();
+    numActiveBlocksNow = numActiveBlocksNext;
+    numActiveBlocksNext.clear();
+}
+
+bool
+CoalesceEngine::done()
+{
+    return memAccBuffer.empty() && numActiveBlocksNow.empty() &&
+           activeBuffer.empty() && currentDirectory->empty() && (onTheFlyReqs == 0);
+}
+
+bool
+CoalesceEngine::enoughSpace()
+{
+    return (activeBuffer.size() + pendingPullReads + numScheduledPulls) < activeBufferSize;
+}
+
+bool
+CoalesceEngine::pullCondition()
+{
+    bool enough_space = enoughSpace();
+    bool schedule_limit = numScheduledPulls < pendingPullLimit;
+    return enough_space && schedule_limit;
+}
+
+// addr should be aligned to peerMemoryAtomSize
+int
+CoalesceEngine::getBlockIndex(Addr addr)
+{
+    assert((addr % peerMemoryAtomSize) == 0);
+    Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr);
+    return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines;
+}
+
+ReadReturnStatus
+CoalesceEngine::recvWLRead(Addr addr)
+{
+    Addr aligned_addr = roundDown(addr, peerMemoryAtomSize);
+    assert(aligned_addr % peerMemoryAtomSize == 0);
+    int block_index = getBlockIndex(aligned_addr);
+    assert(block_index < numLines);
+    if (lastReadTick < curTick()) {
+        blocksTouchedThisTick.clear();
+        lastReadTick = curTick();
+    }
+    int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem);
+    assert(wl_offset < numElementsPerLine);
+    DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. "
+            "This request maps to cacheBlocks[%d], aligned_addr: "
+            "%lu, and wl_offset: %d.\n", __func__, addr,
+            block_index, aligned_addr, wl_offset);
+    DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__,
+            block_index, cacheBlocks[block_index].to_string());
+
+    if ((cacheBlocks[block_index].addr == aligned_addr) &&
+        (cacheBlocks[block_index].valid)) {
+        // Hit
+        DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr);
+        stats.readHits++;
+        assert(cacheBlocks[block_index].state != CacheState::INVALID);
+        responseQueue.push_back(std::make_tuple(
+            addr, cacheBlocks[block_index].items[wl_offset], curTick()));
+
+        DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) "
+                "to responseQueue. responseQueue.size = %d.\n",
+                __func__, addr,
+                graphWorkload->printWorkListItem(
+                    cacheBlocks[block_index].items[wl_offset]),
+                responseQueue.size());
+        DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) "
+                "to responseQueue. responseQueue.size = %d.\n",
+                __func__, addr,
+                graphWorkload->printWorkListItem(
+                    cacheBlocks[block_index].items[wl_offset]),
+                responseQueue.size());
+        // TODO: Stat to count the number of WLItems that have been touched.
+        cacheBlocks[block_index].busyMask |= (1 << wl_offset);
+        cacheBlocks[block_index].state = CacheState::BUSY;
+        // HACK: If a read happens on the same cycle as another operation such
+        // as apply, set lastChangedTick to half a cycle later so that
+        // operations scheduled by the original operation (apply in this
+        // example) are invalidated.
For more details, refer to
+        // "accl/graph/sega/busyMaskErr".
+        cacheBlocks[block_index].lastChangedTick =
+            curTick() + (Tick) (clockPeriod() / 2);
+        DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__,
+                block_index, cacheBlocks[block_index].to_string());
+
+        blocksTouchedThisTick.insert(block_index);
+        if (!nextResponseEvent.scheduled()) {
+            schedule(nextResponseEvent, nextCycle());
+        }
+
+        stats.numVertexReads++;
+        return ReadReturnStatus::ACCEPT;
+    } else if ((cacheBlocks[block_index].addr == aligned_addr) &&
+               (cacheBlocks[block_index].state == CacheState::PENDING_DATA)) {
+        // Hit under miss
+        DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n",
+                __func__, addr);
+        stats.readHitUnderMisses++;
+        assert(!cacheBlocks[block_index].valid);
+        assert(cacheBlocks[block_index].busyMask == 0);
+        assert(!cacheBlocks[block_index].dirty);
+
+        assert(MSHR.find(block_index) != MSHR.end());
+        MSHR[block_index].push_back(addr);
+        DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to MSHR "
+                "for cacheBlocks[%d].\n", __func__, addr, block_index);
+        DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__,
+                block_index, cacheBlocks[block_index].to_string());
+        blocksTouchedThisTick.insert(block_index);
+
+        stats.numVertexReads++;
+        return ReadReturnStatus::ACCEPT;
+    } else {
+        // miss
+        assert(cacheBlocks[block_index].addr != aligned_addr);
+        DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr);
+        stats.readMisses++;
+        if (blocksTouchedThisTick.find(block_index) != blocksTouchedThisTick.end()) {
+            DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has already been "
+                    "accessed this tick.\n", __func__, block_index);
+            return ReadReturnStatus::REJECT_ROLL;
+        }
+        if (cacheBlocks[block_index].state != CacheState::INVALID) {
+            // conflict miss
+            DPRINTF(CoalesceEngine, "%s: Addr: %lu has conflict with "
+                    "Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr);
+            cacheBlocks[block_index].hasConflict = true;
+            if (cacheBlocks[block_index].state == CacheState::IDLE) {
+                if (cacheBlocks[block_index].dirty) {
+                    DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is dirty.\n",
+                            __func__, block_index);
+                    cacheBlocks[block_index].state = CacheState::PENDING_WB;
+                    cacheBlocks[block_index].lastChangedTick = curTick();
+                    memAccBuffer.emplace_back(
+                        [this] (int block_index, Tick schedule_tick) {
+                            processNextWriteBack(block_index, schedule_tick);
+                        }, block_index, curTick());
+                    if ((!nextMemoryEvent.pending()) &&
+                        (!nextMemoryEvent.scheduled())) {
+                        schedule(nextMemoryEvent, nextCycle());
+                    }
+                    DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is now "
+                            "pending write back.\n", __func__, block_index);
+                } else {
+                    // NOTE: The cache block could still be active but not
+                    // dirty. If it is active, we only have to hand its
+                    // active tracking back to the work directory; the data
+                    // itself can be thrown away.
+                    DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is not dirty.\n",
+                            __func__, block_index);
+                    bool atom_active_now = false;
+                    bool atom_active_future = false;
+                    for (int index = 0; index < numElementsPerLine; index++) {
+                        atom_active_now |= cacheBlocks[block_index].items[index].activeNow;
+                        atom_active_future |= cacheBlocks[block_index].items[index].activeFuture;
+                    }
+                    if (atom_active_now) {
+                        DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is active now.\n",
+                                __func__, block_index);
+                        numActiveBlocksNow.erase(block_index);
+                        int count = currentDirectory->activate(cacheBlocks[block_index].addr);
+                        stats.currentFrontierSize.sample(currentDirectory->workCount());
+                        stats.countActiveBlocksNow.sample(count);
+                    }
+                    if (atom_active_future) {
+                        DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is active next.\n",
+                                __func__, block_index);
+                        numActiveBlocksNext.erase(block_index);
+                        int count = futureDirectory->activate(cacheBlocks[block_index].addr);
+                        stats.futureFrontierSize.sample(futureDirectory->workCount());
+                        stats.countActiveBlocksNext.sample(count);
+                    }
+                    // NOTE: Bring the cache line to invalid state.
+                    // NOTE: The line above where we set hasConflict to true
+                    // does not matter anymore since we reset the cache line.
+                    cacheBlocks[block_index].reset();
+                    DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is reset.\n",
+                            __func__, block_index);
+                }
+                blocksTouchedThisTick.insert(block_index);
+                return ReadReturnStatus::REJECT_NO_ROLL;
+            } else {
+                blocksTouchedThisTick.insert(block_index);
+                stats.numConflicts++;
+                return ReadReturnStatus::REJECT_ROLL;
+            }
+        } else {
+            // cold miss
+            assert(MSHR.find(block_index) == MSHR.end());
+            cacheBlocks[block_index].addr = aligned_addr;
+            cacheBlocks[block_index].busyMask = 0;
+            cacheBlocks[block_index].valid = false;
+            cacheBlocks[block_index].dirty = false;
+            cacheBlocks[block_index].hasConflict = false;
+            cacheBlocks[block_index].state = CacheState::PENDING_DATA;
+            cacheBlocks[block_index].lastChangedTick = curTick();
+
+            MSHR[block_index].push_back(addr);
+            memAccBuffer.emplace_back(
+                [this] (int block_index, Tick schedule_tick) {
+                    processNextRead(block_index, schedule_tick);
+                }, block_index, curTick());
+            if ((!nextMemoryEvent.pending()) &&
+                (!nextMemoryEvent.scheduled())) {
+                schedule(nextMemoryEvent, nextCycle());
+            }
+            blocksTouchedThisTick.insert(block_index);
+            return ReadReturnStatus::ACCEPT;
+        }
+    }
+}
+
+bool
+CoalesceEngine::handleMemResp(PacketPtr pkt)
+{
+    assert(pkt->isResponse());
+    DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n",
+            __func__, pkt->print());
+
+    onTheFlyReqs--;
+    if (pkt->isWrite()) {
+        DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__);
+        delete pkt;
+    } else {
+        assert(pkt->isRead());
+        Addr addr = pkt->getAddr();
+        int block_index = getBlockIndex(addr);
+        ReadPurpose* purpose = pkt->findNextSenderState<ReadPurpose>();
+
+        // NOTE: Regardless of where the pkt will go we have to release the
+        // reserved space for this pkt in the activeBuffer in case
+        // it was read from memory for placement in the activeBuffer.
+        // NOTE: We also have to stop tracking the address in pendingPullAddrs.
+        if (purpose->dest() == ReadDestination::READ_FOR_PUSH) {
+            pendingPullReads--;
+            pendingPullAddrs.erase(addr);
+        }
+        if (cacheBlocks[block_index].addr == addr) {
+            // If it is in the cache, line should be in PENDING_DATA state.
+            // Regardless of the purpose for which it was read, it should
+            // be placed in the cache array.
+            assert(cacheBlocks[block_index].busyMask == 0);
+            assert(!cacheBlocks[block_index].valid);
+            assert(!cacheBlocks[block_index].dirty);
+            assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA);
+
+            // NOTE: Since it is in PENDING_DATA state it
+            // should have an entry in the MSHR.
+            assert(MSHR.find(block_index) != MSHR.end());
+
+            pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items,
+                                  peerMemoryAtomSize);
+
+            cacheBlocks[block_index].valid = true;
+            // HACK: In case the pkt was read for push but it was allocated
+            // for in the cache later on, we should cancel the future
+            // processNextRead for this block. We could set lastChangedTick
+            // to curTick() like usual. However, there is no way to ensure
+            // that processNextRead will not be called on the same tick
+            // as the pkt arrives from the memory. Therefore, we will set
+            // the lastChangedTick to half a cycle before the actual time.
+            // We move that back in time because it would be fine if
+            // processNextRead happened before the pkt arrived. processNextRead
+            // actually will check if there is a pending read for push for
+            // the address it's trying to populate.
+            if (purpose->dest() == ReadDestination::READ_FOR_PUSH) {
+                cacheBlocks[block_index].lastChangedTick =
+                    curTick() - (Tick) (clockPeriod() / 2);
+            } else {
+                cacheBlocks[block_index].lastChangedTick = curTick();
+            }
+
+            // NOTE: If the atom is active we have to deactivate the tracking
+            // of this atom in the memory since it's not in memory anymore.
+            // Since it is going to the cache, cache will be responsible for
+            // tracking this. Push to numActiveBlocksNow for simulator speed
+            // instead of having to search for active blocks in the cache.
+            bool atom_active_now = false;
+            bool atom_active_future = false;
+            for (int index = 0; index < numElementsPerLine; index++) {
+                atom_active_now |= cacheBlocks[block_index].items[index].activeNow;
+                atom_active_future |= cacheBlocks[block_index].items[index].activeFuture;
+            }
+            if (atom_active_now) {
+                int count = currentDirectory->deactivate(addr);
+                numActiveBlocksNow.push_back(block_index);
+                stats.currentFrontierSize.sample(currentDirectory->workCount());
+                stats.countActiveBlocksNow.sample(count);
+            }
+            if (atom_active_future) {
+                int count = futureDirectory->deactivate(addr);
+                numActiveBlocksNext.push_back(block_index);
+                stats.futureFrontierSize.sample(futureDirectory->workCount());
+                stats.countActiveBlocksNext.sample(count);
+            }
+
+            assert(MSHR.find(block_index) != MSHR.end());
+            for (auto it = MSHR[block_index].begin();
+                 it != MSHR[block_index].end();) {
+                Addr miss_addr = *it;
+                Addr aligned_miss_addr =
+                    roundDown(miss_addr, peerMemoryAtomSize);
+
+                assert(aligned_miss_addr == cacheBlocks[block_index].addr);
+                int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem);
+                DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for "
+                        "cacheBlocks[%d] can be serviced with the received "
+                        "packet.\n", __func__, miss_addr, block_index);
+                responseQueue.push_back(std::make_tuple(miss_addr,
+                    cacheBlocks[block_index].items[wl_offset], curTick()));
+                DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) "
+                        "to responseQueue. responseQueue.size = %d.\n",
+                        __func__, miss_addr,
+                        graphWorkload->printWorkListItem(
+                            cacheBlocks[block_index].items[wl_offset]),
+                        responseQueue.size());
+                DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) "
+                        "to responseQueue. responseQueue.size = %d.\n",
+                        __func__, miss_addr,
+                        graphWorkload->printWorkListItem(
+                            cacheBlocks[block_index].items[wl_offset]),
+                        responseQueue.size());
+                cacheBlocks[block_index].busyMask |= (1 << wl_offset);
+                DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__,
+                        block_index, cacheBlocks[block_index].to_string());
+                it = MSHR[block_index].erase(it);
+            }
+            MSHR.erase(block_index);
+
+            cacheBlocks[block_index].state = CacheState::BUSY;
+            if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) {
+                schedule(nextResponseEvent, nextCycle());
+            }
+            delete pkt;
+        } else {
+            assert(purpose->dest() == ReadDestination::READ_FOR_PUSH);
+            // There should be enough room in activeBuffer to place this pkt.
+            // REMEMBER: If dest == READ_FOR_PUSH we release the reserved space.
+            // So at this point in code we should have at least one free entry
+            // in the active buffer which is reserved for this pkt.
+            assert(activeBuffer.size() + pendingPullReads < activeBufferSize);
+
+            WorkListItem items[numElementsPerLine];
+            pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize);
+            bool atom_active_now = false;
+            bool atom_active_future = false;
+            for (int index = 0; index < numElementsPerLine; index++) {
+                atom_active_now |= items[index].activeNow;
+                atom_active_future |= items[index].activeFuture;
+            }
+            if (atom_active_now) {
+                int count = currentDirectory->deactivate(addr);
+                stats.currentFrontierSize.sample(currentDirectory->workCount());
+                stats.countActiveBlocksNow.sample(count);
+                if (atom_active_future) {
+                    int count = futureDirectory->deactivate(addr);
+                    stats.futureFrontierSize.sample(futureDirectory->workCount());
+                    stats.countActiveBlocksNext.sample(count);
+                }
+                activeBuffer.emplace_back(pkt, curTick());
+            } else {
+                stats.wastefulBytesRead += pkt->getSize();
+                delete pkt;
+            }
+
+            if (pullCondition()) {
+                memAccBuffer.emplace_back(
+                    [this] (int ignore, Tick schedule_tick) {
+                        processNextVertexPull(ignore, schedule_tick);
+                    }, -1, curTick());
+                if ((!nextMemoryEvent.pending()) &&
+                    (!nextMemoryEvent.scheduled())) {
+                    schedule(nextMemoryEvent, nextCycle());
+                }
+                numScheduledPulls++;
+            }
+        }
+        delete purpose;
+    }
+
+    if (done() && !nextDoneSignalEvent.scheduled()) {
+        schedule(nextDoneSignalEvent, nextCycle());
+    }
+    return true;
+}
+
+void
+CoalesceEngine::processNextResponseEvent()
+{
+    int num_responses_sent = 0;
+
+    Addr addr_response;
+    WorkListItem worklist_response;
+    Tick response_queueing_tick;
+    while (true) {
+        std::tie(addr_response, worklist_response, response_queueing_tick) =
+            responseQueue.front();
+        Tick waiting_ticks = curTick() - response_queueing_tick;
+        if (ticksToCycles(waiting_ticks) < 1) {
+            break;
+        }
+        owner->handleIncomingWL(addr_response, worklist_response);
+        num_responses_sent++;
+        DPRINTF(CoalesceEngine,
+                "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n",
+                __func__,
+                graphWorkload->printWorkListItem(worklist_response),
+                addr_response);
+
+        responseQueue.pop_front();
+        DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue."
+                " responseQueue.size = %d.\n", __func__,
+                responseQueue.size());
+        DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. "
+                "responseQueue.size = %d.\n", __func__,
+                responseQueue.size());
+        stats.responseQueueLatency.sample(
+            waiting_ticks * 1e9 / getClockFrequency());
+        if (num_responses_sent >= maxRespPerCycle) {
+            // TODO: Add the condition to check that front of queue can be
+            // sent to WLEngine, i.e. it has at least been in the queue for
+            // one cycle.
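+            // NOTE: Responses drain at a rate of at most maxRespPerCycle per
+            // cycle, and only after sitting in responseQueue for at least one
+            // full cycle; a non-empty queue at this cutoff is recorded as a
+            // response port shortage.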
+            if (!responseQueue.empty()) {
+                stats.responsePortShortage++;
+            }
+            break;
+        }
+        if (responseQueue.empty()) {
+            break;
+        }
+    }
+
+    if ((!nextResponseEvent.scheduled()) &&
+        (!responseQueue.empty())) {
+        schedule(nextResponseEvent, nextCycle());
+    }
+}
+
+void
+CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl)
+{
+    Addr aligned_addr = roundDown(addr, peerMemoryAtomSize);
+    int block_index = getBlockIndex(aligned_addr);
+    int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem);
+    DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with "
+            "wl: %s. This request maps to cacheBlocks[%d], "
+            "aligned_addr: %lu, and wl_offset: %d.\n",
+            __func__, addr, graphWorkload->printWorkListItem(wl),
+            block_index, aligned_addr, wl_offset);
+    DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__,
+            block_index, cacheBlocks[block_index].to_string());
+    DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s "
+            "with Addr: %lu.\n", __func__,
+            graphWorkload->printWorkListItem(wl), addr);
+
+    // NOTE: Design does not allow for write misses.
+    assert(cacheBlocks[block_index].addr == aligned_addr);
+    // cache state asserts
+    assert(cacheBlocks[block_index].busyMask != 0);
+    assert(cacheBlocks[block_index].valid);
+    assert(cacheBlocks[block_index].state == CacheState::BUSY);
+
+    // respective bit in busyMask for wl is set.
+    assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) ==
+           (1 << wl_offset));
+
+    if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) {
+        cacheBlocks[block_index].dirty |= true;
+    }
+
+    bool active = graphWorkload->activeCondition(wl, cacheBlocks[block_index].items[wl_offset]);
+    cacheBlocks[block_index].items[wl_offset] = wl;
+    if (mode == ProcessingMode::ASYNCHRONOUS || mode == ProcessingMode::POLY_GRAPH) {
+        cacheBlocks[block_index].items[wl_offset].activeNow |= active;
+        if (active && (!numActiveBlocksNow.find(block_index))) {
+            numActiveBlocksNow.push_back(block_index);
+            if (!owner->running()) {
+                owner->start();
+            }
+        }
+    }
+    if (mode == ProcessingMode::BULK_SYNCHRONOUS) {
+        cacheBlocks[block_index].items[wl_offset].activeFuture |= active;
+        if (active && (!numActiveBlocksNext.find(block_index))) {
+            numActiveBlocksNext.push_back(block_index);
+        }
+    }
+
+    cacheBlocks[block_index].busyMask &= ~(1 << wl_offset);
+    cacheBlocks[block_index].lastChangedTick = curTick();
+    DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n",
+            __func__, block_index, wl_offset,
+            graphWorkload->printWorkListItem(
+                cacheBlocks[block_index].items[wl_offset]));
+    DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__,
+            block_index, cacheBlocks[block_index].to_string());
+
+    if (cacheBlocks[block_index].busyMask == 0) {
+        if (cacheBlocks[block_index].hasConflict) {
+            if (cacheBlocks[block_index].dirty) {
+                cacheBlocks[block_index].state = CacheState::PENDING_WB;
+                cacheBlocks[block_index].lastChangedTick = curTick();
+                memAccBuffer.emplace_back(
+                    [this] (int block_index, Tick schedule_tick) {
+                        processNextWriteBack(block_index, schedule_tick);
+                    }, block_index, curTick());
+                if ((!nextMemoryEvent.pending()) &&
+                    (!nextMemoryEvent.scheduled())) {
+                    schedule(nextMemoryEvent, nextCycle());
+                }
+            } else {
+                bool atom_active_now = false;
+                bool atom_active_future = false;
+                for (int index = 0; index < numElementsPerLine; index++) {
+                    atom_active_now |= cacheBlocks[block_index].items[index].activeNow;
+                    atom_active_future |= cacheBlocks[block_index].items[index].activeFuture;
+                }
+                if (atom_active_now) {
numActiveBlocksNow.erase(block_index);
+                    int count = currentDirectory->activate(cacheBlocks[block_index].addr);
+                    stats.currentFrontierSize.sample(currentDirectory->workCount());
+                    stats.countActiveBlocksNow.sample(count);
+                }
+                if (atom_active_future) {
+                    numActiveBlocksNext.erase(block_index);
+                    int count = futureDirectory->activate(cacheBlocks[block_index].addr);
+                    stats.futureFrontierSize.sample(futureDirectory->workCount());
+                    stats.countActiveBlocksNext.sample(count);
+                }
+                cacheBlocks[block_index].reset();
+            }
+        } else {
+            cacheBlocks[block_index].state = CacheState::IDLE;
+            cacheBlocks[block_index].lastChangedTick = curTick();
+        }
+    }
+    DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__,
+            block_index, cacheBlocks[block_index].to_string());
+    stats.numVertexWrites++;
+
+    if ((cacheBlocks[block_index].state == CacheState::IDLE) &&
+        done() && !nextDoneSignalEvent.scheduled()) {
+        schedule(nextDoneSignalEvent, nextCycle());
+    }
+}
+
+void
+CoalesceEngine::processNextMemoryEvent()
+{
+    int num_transitions = 0;
+    std::unordered_set<int> transitions;
+    MemoryFunctionDeque temp_deque;
+    temp_deque.clear();
+
+    while (true) {
+        if (memPort.blocked()) {
+            while (!temp_deque.empty()) {
+                memAccBuffer.push_front(temp_deque.back());
+                temp_deque.pop_back();
+            }
+            stats.numMemoryBlocks++;
+            nextMemoryEvent.sleep();
+            return;
+        }
+        DPRINTF(CoalesceEngine, "%s: Processing another "
+                "memory function.\n", __func__);
+        std::function<void(int, Tick)> function;
+        int input;
+        Tick tick;
+        std::tie(function, input, tick) = memAccBuffer.front();
+        if ((transitions.find(input) == transitions.end()) || (input == -1)) {
+            function(input, tick);
+            memAccBuffer.pop_front();
+            transitions.insert(input);
+            stats.memAccBufferLat.sample((curTick() - tick) * 1e9 / getClockFrequency());
+            DPRINTF(CoalesceEngine, "%s: Popped a function from memAccBuffer. "
+                    "memAccBuffer.size = %d.\n", __func__, memAccBuffer.size());
+            num_transitions++;
+        } else {
+            temp_deque.emplace_back(function, input, tick);
+            memAccBuffer.pop_front();
+        }
+        if ((num_transitions >= transitionsPerCycle) || memAccBuffer.empty()) {
+            break;
+        }
+    }
+
+    while (!temp_deque.empty()) {
+        memAccBuffer.push_front(temp_deque.back());
+        temp_deque.pop_back();
+    }
+
+    assert(!nextMemoryEvent.pending());
+    assert(!nextMemoryEvent.scheduled());
+    if ((!memAccBuffer.empty())) {
+        schedule(nextMemoryEvent, nextCycle());
+    }
+
+    if (done() && !nextDoneSignalEvent.scheduled()) {
+        schedule(nextDoneSignalEvent, nextCycle());
+    }
+}
+
+void
+CoalesceEngine::processNextRead(int block_index, Tick schedule_tick)
+{
+    DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n",
+            __func__, block_index);
+    DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n",
+            __func__, block_index, cacheBlocks[block_index].to_string());
+    // A cache block should not be touched while it's waiting for data.
+    // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick);
+    // TODO: Figure out if this is still necessary.
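+    // NOTE: Stale-event filtering: every deferred memory function carries
+    // the tick at which it was scheduled, e.g. the lambdas queued in
+    // recvWLRead:
+    //
+    //     memAccBuffer.emplace_back(
+    //         [this] (int block_index, Tick schedule_tick) {
+    //             processNextRead(block_index, schedule_tick);
+    //         }, block_index, curTick());
+    //
+    // If the block's lastChangedTick has moved past schedule_tick, another
+    // transition superseded this function and it drops itself below.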
+    if (cacheBlocks[block_index].lastChangedTick != schedule_tick) {
+        return;
+    }
+
+    assert(cacheBlocks[block_index].busyMask == 0);
+    assert(!cacheBlocks[block_index].valid);
+    assert(!cacheBlocks[block_index].dirty);
+    assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA);
+
+    bool need_send_pkt = true;
+
+    // NOTE: Search postPushWBQueue
+    for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end();)
+    {
+        PacketPtr wb_pkt = std::get<0>(*wb);
+        if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) {
+            wb_pkt->writeDataToBlock(
+                (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize);
+            cacheBlocks[block_index].valid = true;
+            cacheBlocks[block_index].dirty = true;
+            cacheBlocks[block_index].lastChangedTick = curTick();
+            // NOTE: If an atom is in the postPushWBQueue,
+            // then it is definitely not currently active.
+            bool atom_active_future = false;
+            for (int index = 0; index < numElementsPerLine; index++)
+            {
+                assert(!cacheBlocks[block_index].items[index].activeNow);
+                atom_active_future |= cacheBlocks[block_index].items[index].activeFuture;
+            }
+            if (atom_active_future) {
+                numActiveBlocksNext.push_back(block_index);
+            }
+
+            need_send_pkt = false;
+            wb = postPushWBQueue.erase(wb);
+            delete wb_pkt;
+        } else {
+            wb++;
+        }
+    }
+    // NOTE: Search activeBuffer
+    for (auto ab = activeBuffer.begin(); ab != activeBuffer.end();) {
+        PacketPtr ab_pkt = std::get<0>(*ab);
+        if (cacheBlocks[block_index].addr == ab_pkt->getAddr()) {
+            ab_pkt->writeDataToBlock(
+                (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize);
+
+            cacheBlocks[block_index].valid = true;
+            cacheBlocks[block_index].dirty = true;
+            cacheBlocks[block_index].lastChangedTick = curTick();
+            // If an atom is in the activeBuffer,
+            // then it is definitely currently active.
+            numActiveBlocksNow.push_back(block_index);
+            // NOTE: Residence in the activeBuffer does not
+            // signify anything about future activity.
+            bool atom_active_future = false;
+            for (int index = 0; index < numElementsPerLine; index++)
+            {
+                atom_active_future |= cacheBlocks[block_index].items[index].activeFuture;
+            }
+            if (atom_active_future) {
+                numActiveBlocksNext.push_back(block_index);
+            }
+
+            need_send_pkt = false;
+            ab = activeBuffer.erase(ab);
+            delete ab_pkt;
+            if (pullCondition()) {
+                memAccBuffer.emplace_back(
+                    [this] (int ignore, Tick schedule_tick) {
+                        processNextVertexPull(ignore, schedule_tick);
+                    }, -1, curTick());
+                numScheduledPulls++;
+            }
+        } else {
+            ab++;
+        }
+    }
+    if (!need_send_pkt) {
+        for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) {
+            Addr miss_addr = *it;
+            Addr aligned_miss_addr =
+                roundDown(miss_addr, peerMemoryAtomSize);
+            assert(aligned_miss_addr == cacheBlocks[block_index].addr);
+            int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem);
+            DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for "
+                    "cacheBlocks[%d] can be serviced with the received "
+                    "packet.\n", __func__, miss_addr, block_index);
+            // TODO: Make this block of code into a function
+            responseQueue.push_back(std::make_tuple(miss_addr,
+                cacheBlocks[block_index].items[wl_offset], curTick()));
+            DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) "
+                    "to responseQueue. responseQueue.size = %d.\n",
+                    __func__, miss_addr,
+                    graphWorkload->printWorkListItem(
+                        cacheBlocks[block_index].items[wl_offset]),
+                    responseQueue.size());
+            DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) "
+                    "to responseQueue. responseQueue.size = %d.\n",
+                    __func__, miss_addr,
+                    graphWorkload->printWorkListItem(
+                        cacheBlocks[block_index].items[wl_offset]),
+                    responseQueue.size());
+            cacheBlocks[block_index].busyMask |= (1 << wl_offset);
+            DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n",
+                    __func__, block_index,
+                    cacheBlocks[block_index].to_string());
+            it = MSHR[block_index].erase(it);
+        }
+        assert(MSHR[block_index].empty());
+        MSHR.erase(block_index);
+        if ((!nextResponseEvent.scheduled()) &&
+            (!responseQueue.empty())) {
+            schedule(nextResponseEvent, nextCycle());
+        }
+        cacheBlocks[block_index].state = CacheState::BUSY;
+    }
+
+    if (pendingPullAddrs.find(cacheBlocks[block_index].addr) !=
+        pendingPullAddrs.end()) {
+        need_send_pkt = false;
+    }
+
+    if (need_send_pkt) {
+        PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr,
+                                         peerMemoryAtomSize);
+        ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_CACHE);
+        pkt->pushSenderState(purpose);
+        DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, "
+                "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize());
+        memPort.sendPacket(pkt);
+        onTheFlyReqs++;
+    }
+}
+
+void
+CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick)
+{
+    DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n",
+            __func__, block_index);
+    DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__,
+            block_index, cacheBlocks[block_index].to_string());
+
+    if (schedule_tick == cacheBlocks[block_index].lastChangedTick) {
+        assert(cacheBlocks[block_index].busyMask == 0);
+        assert(cacheBlocks[block_index].valid);
+        assert(cacheBlocks[block_index].dirty);
+        assert(cacheBlocks[block_index].hasConflict);
+        assert(cacheBlocks[block_index].state == CacheState::PENDING_WB);
+
+        // NOTE: If the atom we're writing back is active, we have to
+        // stop tracking it in the cache and start tracking it in the memory.
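+        // Illustrative note (not from the original source): the write-back
+        // path below picks one of three destinations. For example, an atom
+        // with activeNow set goes to the activeBuffer when there is space;
+        // otherwise its address is recorded in currentDirectory (and in
+        // futureDirectory if activeFuture is also set) before the write
+        // packet is sent to memory.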
+        bool atom_active_now = false;
+        bool atom_active_future = false;
+        for (int index = 0; index < numElementsPerLine; index++) {
+            atom_active_now |= cacheBlocks[block_index].items[index].activeNow;
+            atom_active_future |= cacheBlocks[block_index].items[index].activeFuture;
+        }
+
+        PacketPtr pkt = createWritePacket(
+            cacheBlocks[block_index].addr, peerMemoryAtomSize,
+            (uint8_t*) cacheBlocks[block_index].items);
+        DPRINTF(CoalesceEngine, "%s: Created a write packet to "
+                "Addr: %lu, size = %d.\n", __func__,
+                pkt->getAddr(), pkt->getSize());
+        if (atom_active_future) {
+            numActiveBlocksNext.erase(block_index);
+        }
+        if (atom_active_now) {
+            numActiveBlocksNow.erase(block_index);
+            if (enoughSpace()) {
+                activeBuffer.emplace_back(pkt, curTick());
+            } else {
+                int count = currentDirectory->activate(cacheBlocks[block_index].addr);
+                stats.currentFrontierSize.sample(currentDirectory->workCount());
+                stats.countActiveBlocksNow.sample(count);
+                if (atom_active_future) {
+                    int count = futureDirectory->activate(cacheBlocks[block_index].addr);
+                    stats.futureFrontierSize.sample(futureDirectory->workCount());
+                    stats.countActiveBlocksNext.sample(count);
+                }
+                memPort.sendPacket(pkt);
+                onTheFlyReqs++;
+            }
+        } else {
+            if (atom_active_future) {
+                int count = futureDirectory->activate(cacheBlocks[block_index].addr);
+                stats.futureFrontierSize.sample(futureDirectory->workCount());
+                stats.countActiveBlocksNext.sample(count);
+            }
+            memPort.sendPacket(pkt);
+            onTheFlyReqs++;
+        }
+        cacheBlocks[block_index].reset();
+        DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__,
+                block_index, cacheBlocks[block_index].to_string());
+    } else {
+        DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been touched since "
+                "this write back was scheduled. Ignoring the write back "
+                "scheduled at tick %lu in favor of the function scheduled "
+                "later.\n", __func__, block_index, schedule_tick);
+    }
+}
+
+void
+CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick)
+{
+    if (!postPushWBQueue.empty()) {
+        PacketPtr wb_pkt;
+        Tick pkt_tick;
+        std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front();
+        if (schedule_tick == pkt_tick) {
+            WorkListItem items[numElementsPerLine];
+            wb_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize);
+            bool atom_active_future = false;
+            for (int index = 0; index < numElementsPerLine; index++) {
+                atom_active_future |= items[index].activeFuture;
+            }
+            if (atom_active_future) {
+                futureDirectory->activate(wb_pkt->getAddr());
+            }
+            memPort.sendPacket(wb_pkt);
+            onTheFlyReqs++;
+            postPushWBQueue.pop_front();
+        }
+    }
+}
+
+void
+CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick)
+{
+    DPRINTF(CoalesceEngine, "%s: processNextVertexPull called.\n", __func__);
+    numScheduledPulls--;
+    if (!currentDirectory->empty()) {
+        Addr addr = currentDirectory->getNextWork();
+        int block_index = getBlockIndex(addr);
+
+        bool in_cache = cacheBlocks[block_index].addr == addr;
+        bool in_active_buffer = false;
+        for (auto ab = activeBuffer.begin(); ab != activeBuffer.end(); ab++) {
+            PacketPtr pkt = std::get<0>(*ab);
+            in_active_buffer |= (pkt->getAddr() == addr);
+        }
+        bool in_write_buffer = false;
+        for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++)
+        {
+            PacketPtr pkt = std::get<0>(*wb);
+            in_write_buffer |= (pkt->getAddr() == addr);
+        }
+        bool repeat_work = pendingPullAddrs.find(addr) != pendingPullAddrs.end();
+
+        if (!in_cache && !in_active_buffer && !in_write_buffer && !repeat_work) {
+            PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize);
+            ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_PUSH);
+            pkt->pushSenderState(purpose);
+            memPort.sendPacket(pkt);
+            onTheFlyReqs++;
+            pendingPullReads++;
+            pendingPullAddrs.insert(addr);
+        }
+    }
+}
+
+void
+CoalesceEngine::recvMemRetry()
+{
+    DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__);
+
+    if (!nextMemoryEvent.pending()) {
+        DPRINTF(CoalesceEngine, "%s: Not pending MemRetry.\n", __func__);
+        return;
+    }
+    assert(!nextMemoryEvent.scheduled());
+    nextMemoryEvent.wake();
+    schedule(nextMemoryEvent, nextCycle());
+}
+
+int
+CoalesceEngine::workCount()
+{
+    return numActiveBlocksNow.size() + currentDirectory->workCount() + activeBuffer.size();
+}
+
+void
+CoalesceEngine::recvVertexPull()
+{
+    numReceivedPulls++;
+    DPRINTF(CoalesceEngine, "%s: Received a vertex pull. numReceivedPulls: %d.\n", __func__, numReceivedPulls);
+
+    stats.verticesPulled++;
+    stats.lastVertexPullTime = curTick() - stats.lastResetTick;
+    if (!nextApplyEvent.scheduled()) {
+        schedule(nextApplyEvent, nextCycle());
+    }
+}
+
+void
+CoalesceEngine::processNextApplyEvent()
+{
+    if ((!activeBuffer.empty()) &&
+        (postPushWBQueue.size() < postPushWBQueueSize)) {
+        PacketPtr pkt;
+        Tick entrance_tick;
+        WorkListItem items[numElementsPerLine];
+
+        std::tie(pkt, entrance_tick) = activeBuffer.front();
+        pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize);
+
+        for (int index = 0; (index < numElementsPerLine) && (numReceivedPulls > 0); index++) {
+            if (items[index].activeNow) {
+                Addr addr = pkt->getAddr() + index * sizeof(WorkListItem);
+                uint32_t delta = graphWorkload->apply(items[index]);
+                items[index].activeNow = false;
+                owner->recvVertexPush(addr, delta, items[index].edgeIndex,
+                                      items[index].degree);
+                numReceivedPulls--;
+                stats.verticesPushed++;
+                stats.lastVertexPushTime = curTick() - stats.lastResetTick;
+            }
+        }
+        pkt->deleteData();
+        pkt->allocate();
+        pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize);
+
+        bool atom_active_now = false;
+        for (int index = 0; index < numElementsPerLine; index++) {
+            atom_active_now |= items[index].activeNow;
+        }
+        // NOTE: If the atom is not active anymore.
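+        // Illustrative note (not from the original source): once every item
+        // in this atom has been applied and pushed, the atom is retired from
+        // the activeBuffer into the postPushWBQueue and a write back of the
+        // updated values is scheduled through memAccBuffer.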
+        if (!atom_active_now) {
+            PacketPtr wb_pkt = createWritePacket(pkt->getAddr(),
+                peerMemoryAtomSize, (uint8_t*) items);
+            postPushWBQueue.emplace_back(wb_pkt, curTick());
+            activeBuffer.pop_front();
+            memAccBuffer.emplace_back(
+                [this] (int ignore, Tick schedule_tick) {
+                    processNextPostPushWB(ignore, schedule_tick);
+                }, -1, curTick());
+            if ((!nextMemoryEvent.pending()) &&
+                (!nextMemoryEvent.scheduled())) {
+                schedule(nextMemoryEvent, nextCycle());
+            }
+            delete pkt;
+        }
+    } else if (!numActiveBlocksNow.empty()) {
+        int num_visited_indices = 0;
+        int initial_fifo_length = numActiveBlocksNow.size();
+        while (true) {
+            int block_index = numActiveBlocksNow.front();
+            if (cacheBlocks[block_index].state == CacheState::IDLE) {
+                for (int index = 0; (index < numElementsPerLine) && (numReceivedPulls > 0); index++) {
+                    if (cacheBlocks[block_index].items[index].activeNow) {
+                        Addr addr = cacheBlocks[block_index].addr + index * sizeof(WorkListItem);
+                        uint32_t delta = graphWorkload->apply(cacheBlocks[block_index].items[index]);
+                        cacheBlocks[block_index].items[index].activeNow = false;
+                        cacheBlocks[block_index].dirty = true;
+                        owner->recvVertexPush(addr, delta,
+                            cacheBlocks[block_index].items[index].edgeIndex,
+                            cacheBlocks[block_index].items[index].degree);
+                        numReceivedPulls--;
+                        stats.verticesPushed++;
+                        stats.lastVertexPushTime = curTick() - stats.lastResetTick;
+                    }
+                }
+
+                bool atom_active_now = false;
+                for (int index = 0; index < numElementsPerLine; index++) {
+                    atom_active_now |= cacheBlocks[block_index].items[index].activeNow;
+                }
+                // NOTE: If no item in this block is active anymore,
+                // stop tracking it in numActiveBlocksNow.
+                if (!atom_active_now) {
+                    numActiveBlocksNow.erase(block_index);
+                }
+                break;
+            }
+            // NOTE: If the block at the front of numActiveBlocksNow
+            // is not in IDLE state, then roll that index to the back.
+            numActiveBlocksNow.pop_front();
+            numActiveBlocksNow.push_back(block_index);
+            // NOTE: If we have visited all the items initially in the FIFO.
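+            // Illustrative note (not from the original source): with
+            // initial_fifo_length = 3 and all three blocks BUSY, the loop
+            // rotates each index to the back once and the counter below
+            // reaches 3, so the loop gives up for this cycle instead of
+            // spinning forever.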
+            num_visited_indices++;
+            if (num_visited_indices == initial_fifo_length) {
+                break;
+            }
+        }
+    } else {
+        DPRINTF(CoalesceEngine, "%s: Could not find work to apply.\n", __func__);
+        stats.worklessCycles++;
+    }
+
+    if (pullCondition()) {
+        memAccBuffer.emplace_back(
+            [this] (int ignore, Tick schedule_tick) {
+                processNextVertexPull(ignore, schedule_tick);
+            }, -1, curTick());
+        if ((!nextMemoryEvent.pending()) &&
+            (!nextMemoryEvent.scheduled())) {
+            schedule(nextMemoryEvent, nextCycle());
+        }
+        numScheduledPulls++;
+    }
+
+    if ((numReceivedPulls > 0) && (!nextApplyEvent.scheduled())) {
+        schedule(nextApplyEvent, nextCycle());
+    }
+}
+
+void
+CoalesceEngine::processNextDoneSignalEvent()
+{
+    if (done()) {
+        owner->recvDoneSignal();
+    }
+}
+
+CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine& _coalesce):
+    statistics::Group(&_coalesce), coalesce(_coalesce), lastResetTick(0),
+    ADD_STAT(numVertexReads, statistics::units::Count::get(),
+             "Number of memory vertices read from cache."),
+    ADD_STAT(numVertexWrites, statistics::units::Count::get(),
+             "Number of memory vertices written to cache."),
+    ADD_STAT(readHits, statistics::units::Count::get(),
+             "Number of cache hits."),
+    ADD_STAT(readMisses, statistics::units::Count::get(),
+             "Number of cache misses."),
+    ADD_STAT(readHitUnderMisses, statistics::units::Count::get(),
+             "Number of cache hit under misses."),
+    ADD_STAT(numConflicts, statistics::units::Count::get(),
+             "Number of conflicts raised by reads in the cache."),
+    ADD_STAT(responsePortShortage, statistics::units::Count::get(),
+             "Number of times a response has been "
+             "delayed because of port shortage."),
+    ADD_STAT(numMemoryBlocks, statistics::units::Count::get(),
+             "Number of times memory bandwidth was not available."),
+    ADD_STAT(wastefulBytesRead, statistics::units::Byte::get(),
+             "Number of bytes read that were not used by the coalesce "
+             "engine."),
+    ADD_STAT(verticesPulled, statistics::units::Count::get(),
+             "Number of times a pull request has been sent by the "
+             "PushEngine."),
+    ADD_STAT(verticesPushed, statistics::units::Count::get(),
+             "Number of times a vertex has been pushed to the PushEngine."),
+    ADD_STAT(lastVertexPullTime, statistics::units::Tick::get(),
+             "Time of the last pull request. (Relative to reset_stats)"),
+    ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(),
+             "Time of the last vertex push. (Relative to reset_stats)"),
+    ADD_STAT(worklessCycles, statistics::units::Count::get(),
+             "Cycles in which the coalesce engine could not find work to "
+             "apply."),
+    ADD_STAT(hitRate, statistics::units::Ratio::get(),
+             "Hit rate in the cache."),
+    ADD_STAT(vertexPullBW, statistics::units::Rate::get(),
+             "Rate at which pull requests arrive."),
+    ADD_STAT(vertexPushBW, statistics::units::Rate::get(),
+             "Rate at which vertices are pushed."),
+    ADD_STAT(currentFrontierSize, statistics::units::Count::get(),
+             "Histogram of the length of the current bitvector."),
+    ADD_STAT(futureFrontierSize, statistics::units::Count::get(),
+             "Histogram of the length of the future bitvector."),
+    ADD_STAT(countActiveBlocksNow, statistics::units::Count::get(),
+             "Histogram of the popCount values in the current directory."),
+    ADD_STAT(countActiveBlocksNext, statistics::units::Count::get(),
+             "Histogram of the popCount values in the future directory."),
+    ADD_STAT(responseQueueLatency, statistics::units::Second::get(),
+             "Histogram of the response latency to WLEngine. (ns)"),
+    ADD_STAT(memAccBufferLat, statistics::units::Second::get(),
+             "Histogram of the latency of processing a memory function.")
+{
+}
+
+void
+CoalesceEngine::CoalesceStats::regStats()
+{
+    using namespace statistics;
+
+    hitRate = (readHits + readHitUnderMisses) /
+        (readHits + readHitUnderMisses + readMisses);
+
+    vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime;
+
+    vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime;
+
+    currentFrontierSize.init(64);
+    futureFrontierSize.init(64);
+    countActiveBlocksNow.init(64);
+    countActiveBlocksNext.init(64);
+    responseQueueLatency.init(64);
+    memAccBufferLat.init(64);
+}
+
+void
+CoalesceEngine::CoalesceStats::resetStats()
+{
+    statistics::Group::resetStats();
+
+    lastResetTick = curTick();
+}
+
+} // namespace gem5
diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh
new file mode 100644
index 0000000000..3a9e463595
--- /dev/null
+++ b/src/accl/graph/sega/coalesce_engine.hh
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2020 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__
+#define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__
+
+#include <deque>
+#include <functional>
+#include <tuple>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "accl/graph/base/data_structs.hh"
+#include "accl/graph/base/graph_workload.hh"
+#include "accl/graph/sega/base_memory_engine.hh"
+#include "accl/graph/sega/enums.hh"
+#include "accl/graph/sega/work_directory.hh"
+#include "base/cprintf.hh"
+#include "base/statistics.hh"
+#include "params/CoalesceEngine.hh"
+
+namespace gem5
+{
+
+typedef std::deque<std::tuple<std::function<void(int, Tick)>, int, Tick>>
+    MemoryFunctionDeque;
+
+class MPU;
+
+class CoalesceEngine : public BaseMemoryEngine
+{
+  private:
+    struct Block
+    {
+        WorkListItem* items;
+        Addr addr;
+        uint64_t busyMask;
+        bool valid;
+        bool dirty;
+        bool hasConflict;
+        CacheState state;
+        Tick lastChangedTick;
+        Block() {}
+        Block(int num_elements):
+            addr(-1),
+            busyMask(0),
+            valid(false),
+            dirty(false),
+            hasConflict(false),
+            state(CacheState::INVALID),
+            lastChangedTick(0)
+        {
+            items = new WorkListItem [num_elements];
+        }
+
+        void reset() {
+            addr = -1;
+            busyMask = 0;
+            valid = false;
+            dirty = false;
+            hasConflict = false;
+            state = CacheState::INVALID;
+            lastChangedTick = 0;
+        }
+
+        std::string to_string() {
+            return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, "
+                "dirty: %s, hasConflict: %s, state: %s, lastChangedTick: %lu}",
+                addr, busyMask, valid ? "true" : "false",
+                dirty ? "true" : "false", hasConflict ? "true" : "false",
+                cacheStateStrings[state], lastChangedTick);
+        }
+    };
+
+    struct ReadPurpose : public Packet::SenderState
+    {
+        ReadDestination _dest;
+        ReadPurpose(ReadDestination dest): _dest(dest) {}
+        ReadDestination dest() { return _dest; }
+    };
+
+    MPU* owner;
+    ProcessingMode mode;
+    WorkDirectory* currentDirectory;
+    WorkDirectory* futureDirectory;
+    GraphWorkload* graphWorkload;
+
+    Addr lastAtomAddr;
+
+    int numLines;
+    int numElementsPerLine;
+    Block* cacheBlocks;
+
+    Tick lastReadTick;
+    std::unordered_set<int> blocksTouchedThisTick;
+
+    int onTheFlyReqs;
+    std::unordered_map<int, std::deque<Addr>> MSHR;
+
+    // Response route to WLEngine
+    int maxRespPerCycle;
+    std::deque<std::tuple<Addr, WorkListItem, Tick>> responseQueue;
+
+    // Tracking work in cache
+    int numReceivedPulls;
+    // NOTE: Remember to erase from these upon eviction from cache
+    UniqueFIFO<int> numActiveBlocksNow;
+    UniqueFIFO<int> numActiveBlocksNext;
+
+    int numScheduledPulls;
+    int pendingPullLimit;
+    int pendingPullReads;
+    // Addresses of atoms with an in-flight READ_FOR_PUSH read. Used to
+    // avoid scheduling repeat pulls for the same atom.
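+    // Illustrative note (not from the original source): an address parked
+    // in pendingPullAddrs is skipped by processNextVertexPull and also
+    // suppresses a redundant cache fill in processNextRead; pullCondition()
+    // presumably throttles against pendingPullLimit so only a bounded
+    // number of READ_FOR_PUSH reads are outstanding at a time.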
+    std::unordered_set<Addr> pendingPullAddrs;
+
+    int activeBufferSize;
+    int postPushWBQueueSize;
+    std::deque<std::tuple<PacketPtr, Tick>> activeBuffer;
+    std::deque<std::tuple<PacketPtr, Tick>> postPushWBQueue;
+
+    bool enoughSpace();
+    bool pullCondition();
+    int getBlockIndex(Addr addr);
+
+    int transitionsPerCycle;
+    MemoryFunctionDeque memAccBuffer;
+
+    MemoryEvent nextMemoryEvent;
+    void processNextMemoryEvent();
+    void processNextRead(int block_index, Tick schedule_tick);
+    void processNextWriteBack(int block_index, Tick schedule_tick);
+    void processNextVertexPull(int ignore, Tick schedule_tick);
+    void processNextPostPushWB(int ignore, Tick schedule_tick);
+
+    EventFunctionWrapper nextResponseEvent;
+    void processNextResponseEvent();
+
+    EventFunctionWrapper nextApplyEvent;
+    void processNextApplyEvent();
+
+    EventFunctionWrapper nextDoneSignalEvent;
+    void processNextDoneSignalEvent();
+
+    struct CoalesceStats : public statistics::Group
+    {
+        CoalesceStats(CoalesceEngine& coalesce);
+
+        virtual void regStats() override;
+
+        virtual void resetStats() override;
+
+        CoalesceEngine &coalesce;
+
+        Tick lastResetTick;
+
+        statistics::Scalar numVertexReads;
+        statistics::Scalar numVertexWrites;
+        statistics::Scalar readHits;
+        statistics::Scalar readMisses;
+        statistics::Scalar readHitUnderMisses;
+        statistics::Scalar numConflicts;
+        statistics::Scalar responsePortShortage;
+        statistics::Scalar numMemoryBlocks;
+        statistics::Scalar wastefulBytesRead;
+        statistics::Scalar verticesPulled;
+        statistics::Scalar verticesPushed;
+        statistics::Scalar lastVertexPullTime;
+        statistics::Scalar lastVertexPushTime;
+        statistics::Scalar worklessCycles;
+
+        statistics::Formula hitRate;
+        statistics::Formula vertexPullBW;
+        statistics::Formula vertexPushBW;
+
+        statistics::Histogram currentFrontierSize;
+        statistics::Histogram futureFrontierSize;
+        statistics::Histogram countActiveBlocksNow;
+        statistics::Histogram countActiveBlocksNext;
+        statistics::Histogram responseQueueLatency;
+        statistics::Histogram memAccBufferLat;
+    };
+
+    CoalesceStats stats;
+
+  protected:
+    virtual void recvMemRetry() override;
+    virtual bool handleMemResp(PacketPtr pkt) override;
+
+  public:
+    PARAMS(CoalesceEngine);
+    CoalesceEngine(const Params &params);
+    void registerMPU(MPU* mpu);
+
+    void setProcessingMode(ProcessingMode _mode) { mode = _mode; }
+    void createAsyncPopCountDirectory(int atoms_per_block);
+    void createBSPPopCountDirectory(int atoms_per_block);
+    void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; }
+
+    virtual void recvFunctional(PacketPtr pkt) override;
+    void postMemInitSetup();
+    void postConsumeProcess();
+    void swapDirectories();
+
+    ReadReturnStatus recvWLRead(Addr addr);
+    void recvWLWrite(Addr addr, WorkListItem wl);
+
+    int workCount();
+    int futureWorkCount();
+    void recvVertexPull();
+
+    bool done();
+};
+
+} // namespace gem5
+
+#endif // __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__
diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc
new file mode 100644
index 0000000000..ba57b387f4
--- /dev/null
+++ b/src/accl/graph/sega/enums.cc
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2020 The Regents of the University of California.
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/enums.hh" + +namespace gem5 +{ + +const char* registerStateStrings[NUM_REGISTER_STATE] = { + "PENDING_READ", + "PENDING_REDUCE", + "PENDING_WRITE" +}; + +const char* cacheStateStrings[NUM_CACHE_STATE] = { + "INVALID", + "PENDING_DATA", + "BUSY", + "IDLE", + "PENDING_WB" +}; + +const char* readReturnStatusStrings[NUM_READ_RETURN_STATUS] = +{ + "ACCEPT", + "REJECT_ROLL", + "REJECT_NO_ROLL" +}; + +const char* readDestinationStrings[NUM_READ_DESTINATION] = +{ + "READ_FOR_CACHE", + "READ_FOR_PUSH" +}; + +} // namespace gem5 diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh new file mode 100644 index 0000000000..0f654c5386 --- /dev/null +++ b/src/accl/graph/sega/enums.hh @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_ENUMS_HH__ +#define __ACCL_GRAPH_SEGA_ENUMS_HH__ + +namespace gem5 +{ + +enum RegisterState +{ + PENDING_READ, + PENDING_REDUCE, + PENDING_WRITE, + NUM_REGISTER_STATE +}; +extern const char* registerStateStrings[NUM_REGISTER_STATE]; + +enum CacheState +{ + INVALID, + PENDING_DATA, + BUSY, + IDLE, + PENDING_WB, + NUM_CACHE_STATE +}; +extern const char* cacheStateStrings[NUM_CACHE_STATE]; + +enum ReadReturnStatus +{ + ACCEPT, + REJECT_ROLL, + REJECT_NO_ROLL, + NUM_READ_RETURN_STATUS +}; +extern const char* readReturnStatusStrings[NUM_READ_RETURN_STATUS]; + +enum ReadDestination +{ + READ_FOR_CACHE, + READ_FOR_PUSH, + NUM_READ_DESTINATION +}; +extern const char* readDestinationStrings[NUM_READ_DESTINATION]; + +enum ProcessingMode +{ + NOT_SET, + ASYNCHRONOUS, + BULK_SYNCHRONOUS, + POLY_GRAPH, + NUM_PROCESSING_MODE +}; +extern const char* processingModeStrings[NUM_PROCESSING_MODE]; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_ENUMS_HH__ diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc new file mode 100644 index 0000000000..a5063cf685 --- /dev/null +++ b/src/accl/graph/sega/mpu.cc @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/mpu.hh" + +#include "accl/graph/sega/centeral_controller.hh" +#include "debug/MPU.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +MPU::MPU(const Params& params): + SimObject(params), + system(params.system), + wlEngine(params.wl_engine), + coalesceEngine(params.coalesce_engine), + pushEngine(params.push_engine) +{ + wlEngine->registerMPU(this); + coalesceEngine->registerMPU(this); + pushEngine->registerMPU(this); +} + +void +MPU::registerCenteralController(CenteralController* centeral_controller) +{ + centeralController = centeral_controller; +} + +bool +MPU::handleIncomingUpdate(PacketPtr pkt) +{ + return wlEngine->handleIncomingUpdate(pkt); +} + +void +MPU::handleIncomingWL(Addr addr, WorkListItem wl) +{ + wlEngine->handleIncomingWL(addr, wl); +} + +void +MPU::recvWLWrite(Addr addr, WorkListItem wl) +{ + coalesceEngine->recvWLWrite(addr, wl); +} + +void +MPU::recvWorkload(GraphWorkload* workload) +{ + coalesceEngine->recvWorkload(workload); + pushEngine->recvWorkload(workload); + wlEngine->recvWorkload(workload); +} + +void +MPU::recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree) +{ + pushEngine->recvVertexPush(addr, delta, edge_index, degree); +} + +void +MPU::recvMirrorPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree) +{ + pushEngine->recvMirrorPush(addr, delta, edge_index, degree); +} + +void +MPU::recvDoneSignal() +{ + if (done()) { + centeralController->recvDoneSignal(); + } +} + +bool +MPU::done() +{ + return wlEngine->done() && coalesceEngine->done() && pushEngine->done(); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh new file mode 100644 index 0000000000..4afb2081ca --- /dev/null +++ b/src/accl/graph/sega/mpu.hh @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_MPU_HH__ +#define __ACCL_GRAPH_SEGA_MPU_HH__ + +#include +#include + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/coalesce_engine.hh" +#include "accl/graph/sega/enums.hh" +#include "accl/graph/sega/push_engine.hh" +#include "accl/graph/sega/wl_engine.hh" +#include "base/addr_range.hh" +#include "mem/packet.hh" +#include "sim/sim_object.hh" +#include "sim/system.hh" +#include "params/MPU.hh" + +namespace gem5 +{ + +class CenteralController; + +class MPU : public SimObject +{ + private: + System* system; + CenteralController* centeralController; + + WLEngine* wlEngine; + CoalesceEngine* coalesceEngine; + PushEngine* pushEngine; + + public: + PARAMS(MPU); + MPU(const Params& params); + void registerCenteralController(CenteralController* centeral_controller); + + void setProcessingMode(ProcessingMode mode) { coalesceEngine->setProcessingMode(mode); } + void createAsyncPopCountDirectory(int atoms_per_block) { coalesceEngine->createAsyncPopCountDirectory(atoms_per_block); } + void createBSPPopCountDirectory(int atoms_per_block) { coalesceEngine->createBSPPopCountDirectory(atoms_per_block); } + + unsigned int vertexAtomSize() { return coalesceEngine->params().attached_memory_atom_size; } + AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } + uint64_t getCacheSize() { return coalesceEngine->params().cache_size; } + void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } + void postMemInitSetup() { coalesceEngine->postMemInitSetup(); } + void postConsumeProcess() { coalesceEngine->postConsumeProcess(); } + void swapDirectories() { coalesceEngine->swapDirectories(); } + + bool handleIncomingUpdate(PacketPtr pkt); + + void handleIncomingWL(Addr addr, WorkListItem wl); + ReadReturnStatus recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } + void recvWLWrite(Addr addr, WorkListItem wl); + void recvWorkload(GraphWorkload* Workload); + + int workCount() { return coalesceEngine->workCount(); } + void recvVertexPull() { return coalesceEngine->recvVertexPull(); } + bool running() { return pushEngine->running(); } + void start() { return pushEngine->start(); } + void recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); + + void recvMirrorPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); + void startProcessingMirrors(Tick time_to_wait) { pushEngine->startProcessingMirrors(time_to_wait); } + + void recvDoneSignal(); + bool done(); +}; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_MPU_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc new file mode 100644 index 0000000000..6040989070 --- /dev/null +++ b/src/accl/graph/sega/push_engine.cc @@ -0,0 +1,567 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/push_engine.hh" + +#include "accl/graph/sega/mpu.hh" +#include "base/intmath.hh" +#include "debug/PushEngine.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +PushEngine::PushEngine(const Params& params): + BaseMemoryEngine(params), + _running(false), + lastIdleEntranceTick(0), + numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), + onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), + examineWindow(params.examine_window), + maxPropagatesPerCycle(params.max_propagates_per_cycle), + updateQueueSize(params.update_queue_size), + nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), + nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), + nextPropagateEvent([this] { processNextPropagateEvent(); }, name()), + nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()), + stats(*this) +{ + destinationQueues.clear(); + for (int i = 0; i < params.port_out_ports_connection_count; ++i) { + outPorts.emplace_back(name() + ".out_ports" + std::to_string(i), this, i); + destinationQueues.emplace_back(); + destinationQueues[i].clear(); + sourceAndValueMaps.emplace_back(); + sourceAndValueMaps[i].clear(); + } +} + +Port& +PushEngine::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "out_ports") { + return outPorts[idx]; + } else if (if_name == "mem_port") { + return BaseMemoryEngine::getPort(if_name, idx); + } else { + return ClockedObject::getPort(if_name, idx); + } +} + +void +PushEngine::init() +{ + localAddrRange = owner->getAddrRanges(); + for (int i = 0; i < outPorts.size(); i++){ + AddrRangeList range_list = outPorts[i].getAddrRanges(); + for (auto range: range_list) { + portAddrMap.insert(range, i); + } + } +} + +void +PushEngine::registerMPU(MPU* mpu) +{ + owner = mpu; +} + +void +PushEngine::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(blockedPacket != nullptr, + "Should never try to send if blocked!"); + // If we can't send the packet across the port, 
+    // store it for later.
+    if (!sendTimingReq(pkt)) {
+        DPRINTF(PushEngine, "%s: Packet is blocked.\n", __func__);
+        blockedPacket = pkt;
+    }
+}
+
+bool
+PushEngine::ReqPort::recvTimingResp(PacketPtr pkt)
+{
+    panic("recvTimingResp called on the request port.");
+}
+
+void
+PushEngine::ReqPort::recvReqRetry()
+{
+    panic_if(blockedPacket == nullptr,
+             "Received retry without a blockedPacket.");
+
+    DPRINTF(PushEngine, "%s: ReqPort %d received a reqRetry. "
+            "blockedPacket: %s.\n", __func__, _id, blockedPacket->print());
+    PacketPtr pkt = blockedPacket;
+    blockedPacket = nullptr;
+    sendPacket(pkt);
+    if (blockedPacket == nullptr) {
+        DPRINTF(PushEngine, "%s: blockedPacket sent successfully.\n", __func__);
+        owner->recvReqRetry();
+    }
+}
+
+void
+PushEngine::recvReqRetry()
+{
+    DPRINTF(PushEngine, "%s: Received a reqRetry.\n", __func__);
+    if (!nextUpdatePushEvent.scheduled()) {
+        schedule(nextUpdatePushEvent, nextCycle());
+    }
+}
+
+bool
+PushEngine::vertexSpace()
+{
+    return (edgePointerQueueSize == 0) ||
+        ((edgePointerQueue.size() + numPendingPulls) < edgePointerQueueSize);
+}
+
+bool
+PushEngine::workLeft()
+{
+    return ((owner->workCount() - numPendingPulls) > 0);
+}
+
+bool
+PushEngine::done()
+{
+    bool empty_update_queues = true;
+    for (int i = 0; i < outPorts.size(); i++) {
+        empty_update_queues &= destinationQueues[i].empty();
+    }
+    return empty_update_queues && metaEdgeQueue.empty() &&
+        (onTheFlyMemReqs == 0) && edgePointerQueue.empty();
+}
+
+void
+PushEngine::start()
+{
+    assert(!_running);
+    // assert(!nextVertexPullEvent.scheduled());
+
+    _running = true;
+    // stats.numIdleCycles += ticksToCycles(curTick() - lastIdleEntranceTick);
+    // NOTE: We might have to check for size availability here.
+    assert(workLeft());
+    if (vertexSpace() && !nextVertexPullEvent.scheduled()) {
+        schedule(nextVertexPullEvent, nextCycle());
+    }
+}
+
+void
+PushEngine::processNextVertexPullEvent()
+{
+    if (workLeft()) {
+        numPendingPulls++;
+        owner->recvVertexPull();
+        if (vertexSpace() && (!nextVertexPullEvent.scheduled())) {
+            schedule(nextVertexPullEvent, nextCycle());
+        }
+    } else {
+        _running = false;
+        lastIdleEntranceTick = curTick();
+        DPRINTF(PushEngine, "%s: In idle state now.\n", __func__);
+    }
+}
+
+void
+PushEngine::recvVertexPush(Addr addr, uint32_t delta,
+                           uint32_t edge_index, uint32_t degree)
+{
+    assert(degree > 0);
+    assert((edgePointerQueueSize == 0) ||
+        ((edgePointerQueue.size() + numPendingPulls) <= edgePointerQueueSize));
+
+    Addr start_addr = edge_index * sizeof(Edge);
+    Addr end_addr = start_addr + (degree * sizeof(Edge));
+    EdgeReadInfoGen info_gen(addr, delta, start_addr, end_addr,
+                             sizeof(Edge), peerMemoryAtomSize);
+
+    edgePointerQueue.emplace_back(info_gen, curTick());
+    stats.edgePointerQueueLength.sample(edgePointerQueue.size());
+    numPendingPulls--;
+
+    if (vertexSpace() && (!nextVertexPullEvent.scheduled())) {
+        schedule(nextVertexPullEvent, nextCycle());
+    }
+
+    if ((!nextMemoryReadEvent.pending()) &&
+        (!nextMemoryReadEvent.scheduled())) {
+        schedule(nextMemoryReadEvent, nextCycle());
+    }
+}
+
+void
+PushEngine::recvMirrorPush(Addr addr, uint32_t delta,
+                           uint32_t edge_index, uint32_t degree)
+{
+    Addr start_addr = edge_index * sizeof(Edge);
+    Addr end_addr = start_addr + (degree * sizeof(Edge));
+    EdgeReadInfoGen info_gen(addr, delta, start_addr, end_addr,
+                             sizeof(Edge), peerMemoryAtomSize);
+
+    edgePointerQueue.emplace_back(info_gen, curTick());
+    stats.edgePointerQueueLength.sample(edgePointerQueue.size());
+}
+
+void
+PushEngine::startProcessingMirrors(Tick time_to_wait)
+{
+    assert(!nextMemoryReadEvent.pending());
+    assert(!nextMemoryReadEvent.scheduled());
+    Cycles wait = ticksToCycles(time_to_wait);
+    if (!edgePointerQueue.empty()) {
+        schedule(nextMemoryReadEvent, clockEdge(wait));
+    }
+}
+
+void
+PushEngine::processNextMemoryReadEvent()
+{
+    if (memPort.blocked()) {
+        nextMemoryReadEvent.sleep();
+        return;
+    }
+    Addr aligned_addr, offset;
+    int num_edges;
+
+    EdgeReadInfoGen& curr_info = std::get<0>(edgePointerQueue.front());
+    Tick entrance_tick = std::get<1>(edgePointerQueue.front());
+    std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo();
+    if (metaEdgeQueue.size() < (edgeQueueSize - (onTheFlyMemReqs + num_edges)))
+    {
+        DPRINTF(PushEngine, "%s: Current packet information generated by "
+                "EdgeReadInfoGen. aligned_addr: %lu, offset: %lu, "
+                "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges);
+
+        PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize);
+        PushInfo push_info = {curr_info.src(), curr_info.delta(), offset, num_edges};
+        reqInfoMap[pkt->req] = push_info;
+        memPort.sendPacket(pkt);
+        onTheFlyMemReqs += num_edges;
+
+        curr_info.iterate();
+        if (curr_info.done()) {
+            DPRINTF(PushEngine, "%s: Current EdgeReadInfoGen is done.\n", __func__);
+            stats.edgePointerQueueLatency.sample(
+                (curTick() - entrance_tick) * 1e9 / getClockFrequency());
+            edgePointerQueue.pop_front();
+            stats.edgePointerQueueLength.sample(edgePointerQueue.size());
+            DPRINTF(PushEngine, "%s: Popped curr_info from edgePointerQueue. "
+                    "edgePointerQueue.size() = %u.\n", __func__, edgePointerQueue.size());
+        }
+    }
+
+    if (vertexSpace() && (!nextVertexPullEvent.scheduled())) {
+        schedule(nextVertexPullEvent, nextCycle());
+    }
+
+    if (!edgePointerQueue.empty()) {
+        assert(!nextMemoryReadEvent.pending());
+        assert(!nextMemoryReadEvent.scheduled());
+        schedule(nextMemoryReadEvent, nextCycle());
+    }
+}
+
+void
+PushEngine::recvMemRetry()
+{
+    if (nextMemoryReadEvent.pending()) {
+        DPRINTF(PushEngine, "%s: Received a memory retry.\n", __func__);
+        nextMemoryReadEvent.wake();
+        schedule(nextMemoryReadEvent, nextCycle());
+    }
+}
+
+bool
+PushEngine::handleMemResp(PacketPtr pkt)
+{
+    // TODO: in case we need to edit edges, get rid of second statement.
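+    // Illustrative example (not from the original source): assuming 64-byte
+    // atoms and sizeof(Edge) == 8, a vertex whose edges start mid-atom with
+    // push_info.offset = 16 and push_info.numElements = 3 yields edges at
+    // pkt_data[16..23], [24..31], and [32..39]; the remaining five slots in
+    // the atom count toward numWastefulEdgesRead below.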
+    assert(pkt->isResponse() && (!pkt->isWrite()));
+
+    uint8_t pkt_data [peerMemoryAtomSize];
+    PushInfo push_info = reqInfoMap[pkt->req];
+    pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize);
+
+    for (int i = 0; i < push_info.numElements; i++) {
+        Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge));
+        Addr edge_dst = edge->neighbor;
+        uint32_t edge_weight = edge->weight;
+        MetaEdge meta_edge(
+            push_info.src, edge_dst, edge_weight, push_info.value);
+        metaEdgeQueue.emplace_back(meta_edge, curTick());
+    }
+    stats.edgeQueueLength.sample(metaEdgeQueue.size());
+    stats.numWastefulEdgesRead +=
+        (peerMemoryAtomSize / sizeof(Edge)) - push_info.numElements;
+
+    onTheFlyMemReqs -= push_info.numElements;
+    reqInfoMap.erase(pkt->req);
+
+    delete pkt;
+
+    if (!nextPropagateEvent.scheduled()) {
+        schedule(nextPropagateEvent, nextCycle());
+    }
+    return true;
+}
+
+void
+PushEngine::processNextPropagateEvent()
+{
+    int num_propagates = 0;
+    int num_tries = 0;
+    int num_reads = 0;
+    std::deque<std::tuple<MetaEdge, Tick>> temp_edge;
+    for (int i = 0; i < examineWindow; i++) {
+        if (metaEdgeQueue.empty()) {
+            break;
+        }
+        temp_edge.push_back(metaEdgeQueue.front());
+        metaEdgeQueue.pop_front();
+    }
+    int max_visits = temp_edge.size();
+
+    while (true) {
+        MetaEdge meta_edge;
+        Tick entrance_tick;
+        std::tie(meta_edge, entrance_tick) = temp_edge.front();
+
+        DPRINTF(PushEngine, "%s: The edge to process is %s.\n",
+                __func__, meta_edge.to_string());
+
+        uint32_t update_value =
+            graphWorkload->propagate(meta_edge.value, meta_edge.weight);
+        temp_edge.pop_front();
+        num_tries++;
+
+        if (enqueueUpdate(meta_edge.src, meta_edge.dst, update_value)) {
+            DPRINTF(PushEngine, "%s: Sent %s to port queues.\n",
+                    __func__, meta_edge.to_string());
+            num_reads++;
+            stats.numPropagates++;
+            stats.edgeQueueLatency.sample(
+                (curTick() - entrance_tick) * 1e9 / getClockFrequency());
+        } else {
+            temp_edge.emplace_back(meta_edge, entrance_tick);
+            stats.updateQueueFull++;
+        }
+        num_propagates++;
+
+        if (temp_edge.empty()) {
+            break;
+        }
+        if (num_tries >= max_visits) {
+            break;
+        }
+    }
+
+    while (!temp_edge.empty()) {
+        metaEdgeQueue.push_front(temp_edge.back());
+        temp_edge.pop_back();
+    }
+
+    stats.numPropagatesHist.sample(num_propagates);
+
+    assert(!nextPropagateEvent.scheduled());
+    if (!metaEdgeQueue.empty()) {
+        schedule(nextPropagateEvent, nextCycle());
+    }
+}
+
+bool
+PushEngine::enqueueUpdate(Addr src, Addr dst, uint32_t value)
+{
+    Addr aligned_dst = roundDown(dst, owner->vertexAtomSize());
+    AddrRange update_range(aligned_dst, aligned_dst + owner->vertexAtomSize());
+    auto entry = portAddrMap.contains(update_range);
+    PortID port_id = entry->second;
+
+    DPRINTF(PushEngine, "%s: Update{src: %lu, dst: %lu, value: %u} "
+            "belongs to port %d.\n",
+            __func__, src, dst, value, port_id);
+    DPRINTF(PushEngine, "%s: There are %d updates already "
+            "in queue for port %d.\n", __func__,
+            destinationQueues[port_id].size(), port_id);
+
+    assert(destinationQueues[port_id].size() == sourceAndValueMaps[port_id].size());
+
+    int num_updates = 0;
+    for (auto queue: destinationQueues) {
+        num_updates += queue.size();
+    }
+
+    if (sourceAndValueMaps[port_id].find(dst) != sourceAndValueMaps[port_id].end()) {
+        DPRINTF(PushEngine, "%s: Found an existing update "
+                "for dst: %lu.\n", __func__, dst);
+        Addr prev_src;
+        uint32_t prev_val;
+        std::tie(prev_src, prev_val) = sourceAndValueMaps[port_id][dst];
+        uint32_t new_val = graphWorkload->reduce(value, prev_val);
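+        // Illustrative example (not from the original source): if the queue
+        // already holds (src: A, value: 5) for this dst and the new value is
+        // 3, a min-style reduce (e.g. BFS/SSSP) stores (A, 3) below, so two
+        // in-flight updates to the same vertex cost one network packet.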
+        sourceAndValueMaps[port_id][dst] = std::make_tuple(prev_src, new_val);
+        DPRINTF(PushEngine, "%s: Coalesced Update{src: %lu, dst: %lu, value: %u} "
+                "with Update{src: %lu, dst: %lu, value: %u} to "
+                "Update{src: %lu, dst: %lu, value: %u}.\n", __func__,
+                src, dst, value, prev_src, dst, prev_val,
+                prev_src, dst, new_val);
+        stats.updateQueueCoalescions++;
+        return true;
+    } else if (num_updates < (updateQueueSize * destinationQueues.size())) {
+        DPRINTF(PushEngine, "%s: There is a free entry available "
+                "in queue for port %d.\n", __func__, port_id);
+        destinationQueues[port_id].emplace_back(dst, curTick());
+        sourceAndValueMaps[port_id][dst] = std::make_tuple(src, value);
+        DPRINTF(PushEngine, "%s: Emplaced Update{src: %lu, dst: %lu, value: %u} "
+                "at the back of queue for port %d. "
+                "Size of queue for port %d is %d.\n", __func__,
+                src, dst, value, port_id, port_id,
+                destinationQueues[port_id].size());
+        stats.updateQueueLength.sample(destinationQueues[port_id].size());
+        if (!nextUpdatePushEvent.scheduled()) {
+            schedule(nextUpdatePushEvent, nextCycle());
+        }
+        return true;
+    }
+    DPRINTF(PushEngine, "%s: Destination queue for port %d is blocked.\n",
+            __func__, port_id);
+    return false;
+}
+
+template<typename T> PacketPtr
+PushEngine::createUpdatePacket(Addr addr, T value)
+{
+    RequestPtr req = std::make_shared<Request>(addr, sizeof(T), 0, 0);
+    // Dummy PC to have PC-based prefetchers latch on; get entropy into higher
+    // bits
+    req->setPC(((Addr) 1) << 2);
+
+    PacketPtr pkt = new Packet(req, MemCmd::UpdateWL);
+
+    pkt->allocate();
+    // pkt->setData(data);
+    pkt->setLE(value);
+
+    return pkt;
+}
+
+void
+PushEngine::processNextUpdatePushEvent()
+{
+    int next_time_send = 0;
+
+    for (int i = 0; i < outPorts.size(); i++) {
+        if (outPorts[i].blocked()) {
+            DPRINTF(PushEngine, "%s: Port %d blocked.\n", __func__, i);
+            continue;
+        }
+        DPRINTF(PushEngine, "%s: Port %d available.\n", __func__, i);
+        if (destinationQueues[i].empty()) {
+            DPRINTF(PushEngine, "%s: Respective queue for "
+                    "port %d is empty.\n", __func__, i);
+            continue;
+        }
+        Addr dst;
+        Tick entrance_tick;
+        std::tie(dst, entrance_tick) = destinationQueues[i].front();
+        Addr src;
+        uint32_t value;
+        std::tie(src, value) = sourceAndValueMaps[i][dst];
+
+        PacketPtr pkt = createUpdatePacket(dst, value);
+        outPorts[i].sendPacket(pkt);
+        destinationQueues[i].pop_front();
+        sourceAndValueMaps[i].erase(dst);
+        DPRINTF(PushEngine, "%s: Sent Update{src: %lu, dst: %lu, value: %u} to "
+                "port %d. Respective queue size is %d.\n", __func__,
+                src, dst, value, i, destinationQueues[i].size());
+        if (destinationQueues[i].size() > 0) {
+            next_time_send += 1;
+        }
+        stats.numUpdates++;
+    }
+
+    assert(!nextUpdatePushEvent.scheduled());
+    if (next_time_send > 0) {
+        schedule(nextUpdatePushEvent, nextCycle());
+    }
+}
+
+PushEngine::PushStats::PushStats(PushEngine& _push):
+    statistics::Group(&_push), push(_push),
+    ADD_STAT(numPropagates, statistics::units::Count::get(),
+             "Number of propagate operations done."),
+    ADD_STAT(updateQueueFull, statistics::units::Count::get(),
+             "Number of times the update queue returns false."),
+    ADD_STAT(numNetBlocks, statistics::units::Count::get(),
+             "Number of updates blocked by network."),
+    // ADD_STAT(numIdleCycles, statistics::units::Count::get(),
+    //          "Number of cycles PushEngine has been idle."),
+    ADD_STAT(updateQueueCoalescions, statistics::units::Count::get(),
+             "Number of coalescions in the update queues."),
+    ADD_STAT(numUpdates, statistics::units::Count::get(),
+             "Number of updates sent to the network."),
+    ADD_STAT(numWastefulEdgesRead, statistics::units::Count::get(),
+             "Number of wasteful edges read from edge memory."),
+    ADD_STAT(TEPS, statistics::units::Rate::get(),
+             "Traversed Edges Per Second."),
+    ADD_STAT(edgePointerQueueLatency, statistics::units::Second::get(),
+             "Histogram of the latency of the edgePointerQueue."),
+    ADD_STAT(edgePointerQueueLength, statistics::units::Count::get(),
+             "Histogram of the size of the edgePointerQueue."),
+    ADD_STAT(edgeQueueLatency, statistics::units::Second::get(),
+             "Histogram of the latency of the metaEdgeQueue."),
+    ADD_STAT(edgeQueueLength, statistics::units::Count::get(),
+             "Histogram of the size of the metaEdgeQueue."),
+    ADD_STAT(updateQueueLength, statistics::units::Count::get(),
+             "Histogram of the length of updateQueues."),
+    ADD_STAT(numPropagatesHist, statistics::units::Count::get(),
+             "Histogram of number of propagates sent.")
+{
+}
+
+void
+PushEngine::PushStats::regStats()
+{
+    using namespace statistics;
+
+    TEPS = numPropagates / simSeconds;
+
+    edgePointerQueueLatency.init(64);
+    edgePointerQueueLength.init(64);
+    edgeQueueLatency.init(64);
+    edgeQueueLength.init(64);
+    updateQueueLength.init(64);
+    numPropagatesHist.init(1 + push.params().max_propagates_per_cycle);
+}
+
+} // namespace gem5
diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh
new file mode 100644
index 0000000000..7170d2d22e
--- /dev/null
+++ b/src/accl/graph/sega/push_engine.hh
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2021 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__
+#define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__
+
+#include <deque>
+#include <tuple>
+#include <unordered_map>
+#include <vector>
+
+#include "accl/graph/base/data_structs.hh"
+#include "accl/graph/base/graph_workload.hh"
+#include "accl/graph/sega/base_memory_engine.hh"
+#include "accl/graph/sega/enums.hh"
+#include "base/addr_range_map.hh"
+#include "base/intmath.hh"
+#include "params/PushEngine.hh"
+
+namespace gem5
+{
+
+class MPU;
+
+class PushEngine : public BaseMemoryEngine
+{
+  private:
+    class ReqPort : public RequestPort
+    {
+      private:
+        PushEngine* owner;
+        PacketPtr blockedPacket;
+        PortID _id;
+
+      public:
+        ReqPort(const std::string& name, PushEngine* owner, PortID id) :
+            RequestPort(name, owner),
+            owner(owner), blockedPacket(nullptr), _id(id)
+        {}
+        void sendPacket(PacketPtr pkt);
+        bool blocked() { return (blockedPacket != nullptr); }
+
+      protected:
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual void recvReqRetry();
+    };
+
+    class EdgeReadInfoGen {
+      private:
+        Addr _src;
+        uint32_t _delta;
+
+        Addr _start;
+        Addr _end;
+        size_t _step;
+        size_t _atom;
+
+      public:
+        EdgeReadInfoGen(Addr src, uint32_t delta, Addr start,
+                        Addr end, size_t step, size_t atom):
+            _src(src), _delta(delta), _start(start),
+            _end(end), _step(step), _atom(atom)
+        {}
+
+        Addr src() { return _src; }
+        uint32_t delta() { return _delta; }
+
+        std::tuple<Addr, Addr, int> nextReadPacketInfo()
+        {
+            panic_if(done(), "Should not call nextPacketInfo when done.\n");
+            Addr aligned_addr = roundDown(_start, _atom);
+            Addr offset = _start - aligned_addr;
+            int num_items = 0;
+
+            if (_end > (aligned_addr + _atom)) {
+                num_items = (_atom - offset) / _step;
+            } else {
+                num_items = (_end - _start) / _step;
+            }
+
+            return std::make_tuple(aligned_addr, offset, num_items);
+        }
+
+        void iterate()
+        {
+            panic_if(done(), "Should not call iterate when done.\n");
+            Addr aligned_addr = roundDown(_start, _atom);
+            _start = aligned_addr + _atom;
+        }
+
+        bool done() { return (_start >= _end); }
+    };
+
+    struct PushInfo {
+        Addr src;
+        uint32_t value;
+        Addr offset;
+        int numElements;
+    };
+
+    MPU* owner;
+    GraphWorkload* graphWorkload;
+
+    bool _running;
+    Tick lastIdleEntranceTick;
+
+    AddrRangeList localAddrRange;
+
+    int numPendingPulls;
+    int edgePointerQueueSize;
+    std::deque<std::tuple<EdgeReadInfoGen, Tick>> edgePointerQueue;
+    std::unordered_map<RequestPtr, PushInfo> reqInfoMap;
+
+    int onTheFlyMemReqs;
+    int edgeQueueSize;
+    int examineWindow;
+    int maxPropagatesPerCycle;
+    std::deque<std::tuple<MetaEdge, Tick>> metaEdgeQueue;
+
+    int updateQueueSize;
+    template<typename T> PacketPtr createUpdatePacket(Addr addr, T value);
+    bool enqueueUpdate(Addr src, Addr dst, uint32_t value);
+    std::vector<std::deque<std::tuple<Addr, Tick>>> destinationQueues;
+    std::vector<std::unordered_map<Addr, std::tuple<Addr, uint32_t>>>
+        sourceAndValueMaps;
+    AddrRangeMap<PortID> portAddrMap;
+    std::vector<ReqPort> outPorts;
+
+    bool vertexSpace();
+    bool workLeft();
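+    // Illustrative note (not from the original source): destinationQueues[i]
+    // and sourceAndValueMaps[i] form one logical update queue per out port:
+    // the deque preserves arrival order of destination addresses, while the
+    // map holds the latest coalesced (src, value) pair for each destination.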
+ + EventFunctionWrapper nextVertexPullEvent; + void processNextVertexPullEvent(); + + MemoryEvent nextMemoryReadEvent; + void processNextMemoryReadEvent(); + + EventFunctionWrapper nextPropagateEvent; + void processNextPropagateEvent(); + + EventFunctionWrapper nextUpdatePushEvent; + void processNextUpdatePushEvent(); + + struct PushStats : public statistics::Group + { + PushStats(PushEngine& push); + + void regStats() override; + + PushEngine &push; + + statistics::Scalar numMemoryBlocks; + statistics::Scalar numPropagates; + statistics::Scalar updateQueueFull; + statistics::Scalar numNetBlocks; + statistics::Scalar updateQueueCoalescions; + statistics::Scalar numUpdates; + statistics::Scalar numWastefulEdgesRead; + + statistics::Formula TEPS; + + statistics::Histogram edgePointerQueueLatency; + statistics::Histogram edgePointerQueueLength; + statistics::Histogram edgeQueueLatency; + statistics::Histogram edgeQueueLength; + statistics::Histogram updateQueueLength; + statistics::Histogram numPropagatesHist; + }; + + PushStats stats; + + protected: + virtual void recvMemRetry(); + virtual bool handleMemResp(PacketPtr pkt); + + public: + PARAMS(PushEngine); + PushEngine(const Params& params); + Port& getPort(const std::string& if_name, + PortID idx = InvalidPortID) override; + virtual void init() override; + void registerMPU(MPU* mpu); + + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } + virtual void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + + void start(); + bool running() { return _running; } + void recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); + void recvMirrorPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); + void startProcessingMirrors(Tick time_to_wait); + + void recvReqRetry(); + + bool done(); +}; + +} + +#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/sega/router_engine.cc b/src/accl/graph/sega/router_engine.cc new file mode 100644 index 0000000000..e26cc06645 --- /dev/null +++ b/src/accl/graph/sega/router_engine.cc @@ -0,0 +1,750 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "accl/graph/sega/router_engine.hh"
+
+#include "accl/graph/sega/centeral_controller.hh"
+#include "base/trace.hh"
+#include "debug/RouterEngine.hh"
+#include "sim/stats.hh"
+
+namespace gem5
+{
+
+RouterEngine::RouterEngine(const Params &params):
+    ClockedObject(params),
+    system(params.system),
+    gptQSize(params.gpt_queue_size),
+    gpnQSize(params.gpn_queue_size),
+    emptyQueues(false),
+    routerLatency(params.router_latency),
+    start(0),
+    sampleTime(params.sample_time),
+    tokens(params.token),
+    nextGPTGPNEvent([this] { processNextGPTGPNEvent(); }, name()),
+    nextInternalRequestEvent(
+        [this] { processNextInternalRequestEvent(); }, name()),
+    nextGPNGPTEvent([this] { processNextGPNGPTEvent(); }, name()),
+    nextExternalRequestEvent(
+        [this] { processNextExternalRequestEvent(); }, name()),
+    nextTrafficTrackEvent(
+        [this] { processNextTrafficTrackEvent(); }, name()),
+    stats(*this)
+{
+    for (int i = 0; i < params.port_gpt_req_side_connection_count; ++i) {
+        gptReqPorts.emplace_back(
+            name() + ".gpt_req_side" + std::to_string(i), this, i);
+    }
+    for (int i = 0; i < params.port_gpt_resp_side_connection_count; ++i) {
+        gptRespPorts.emplace_back(
+            name() + ".gpt_resp_side" + std::to_string(i), this, i);
+    }
+    for (int i = 0; i < params.port_gpn_req_side_connection_count; ++i) {
+        gpnReqPorts.emplace_back(
+            name() + ".gpn_req_side" + std::to_string(i), this, i);
+    }
+    for (int i = 0; i < params.port_gpn_resp_side_connection_count; ++i) {
+        gpnRespPorts.emplace_back(
+            name() + ".gpn_resp_side" + std::to_string(i), this, i);
+    }
+    for (int i = 0; i < gpnReqPorts.size(); ++i) {
+        inFlightTraffic.push_back(0);
+        tokenVector.push_back(tokens);
+        sample.push_back(0);
+    }
+}
+
+AddrRangeList
+RouterEngine::GPTRespPort::getAddrRanges() const
+{
+    return owner->getGPNRanges();
+}
+
+AddrRangeList
+RouterEngine::GPNRespPort::getAddrRanges() const
+{
+    return owner->getGPTRanges();
+}
+
+AddrRangeList
+RouterEngine::getGPNRanges()
+{
+    AddrRangeList ret;
+    for (auto &gpnPort : gpnReqPorts) {
+        for (auto &addr_range : gpnPort.getAddrRanges()) {
+            ret.push_back(addr_range);
+        }
+    }
+    return ret;
+}
+
+AddrRangeList
+RouterEngine::getGPTRanges()
+{
+    AddrRangeList ret;
+    for (auto &gptPort : gptReqPorts) {
+        for (auto &addr_range : gptPort.getAddrRanges()) {
+            ret.push_back(addr_range);
+        }
+    }
+    return ret;
+}
+
+bool
+RouterEngine::GPTReqPort::recvTimingResp(PacketPtr pkt)
+{
+    panic("Not implemented yet!");
+    return 0;
+}
+
+void
+RouterEngine::GPTReqPort::recvReqRetry()
+{
+    // We should have a blocked packet if this function is called.
+    assert(blockedPacket != nullptr);
+    DPRINTF(RouterEngine, "%s: Retrying blocked packet %s.\n",
+            __func__, blockedPacket->print());
+    PacketPtr pkt = blockedPacket;
+    blockedPacket = nullptr;
+    sendPacket(pkt);
+    if (blockedPacket == nullptr) {
+        DPRINTF(RouterEngine, "%s: blockedPacket sent successfully.\n",
+                __func__);
+        owner->recvReqRetry();
+    }
+}
+
+bool
+RouterEngine::GPNReqPort::recvTimingResp(PacketPtr pkt)
+{
+    panic("Not implemented yet!");
+    return 0;
+}
+
+void
+RouterEngine::GPNReqPort::recvReqRetry()
+{
+    // We should have a blocked packet if this function is called.
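+    // Resend it; if the port is still busy, sendPacket() below simply
+    // re-stashes the packet and this callback will fire again later.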
+    assert(blockedPacket != nullptr);
+    PacketPtr pkt = blockedPacket;
+    blockedPacket = nullptr;
+
+    sendPacket(pkt);
+
+    owner->wakeUpInternal();
+}
+
+void
+RouterEngine::GPNReqPort::sendPacket(PacketPtr pkt)
+{
+    panic_if(blocked(), "Should never try to send if blocked!");
+    // If we can't send the packet across the port, store it for later.
+    if (!sendTimingReq(pkt)) {
+        DPRINTF(RouterEngine, "%s: The GPNReq port is blocked.\n", __func__);
+        blockedPacket = pkt;
+    }
+}
+
+void
+RouterEngine::GPTReqPort::sendPacket(PacketPtr pkt)
+{
+    panic_if(blocked(), "Should never try to send if blocked!");
+    // If we can't send the packet across the port, store it for later.
+    if (!sendTimingReq(pkt)) {
+        DPRINTF(RouterEngine, "%s: The GPTReq port is blocked.\n", __func__);
+        blockedPacket = pkt;
+    }
+}
+
+Tick
+RouterEngine::GPTRespPort::recvAtomic(PacketPtr pkt)
+{
+    panic("Not implemented yet!");
+}
+
+void
+RouterEngine::GPTRespPort::checkRetryReq()
+{
+    if (needSendRetryReq) {
+        needSendRetryReq = false;
+        sendRetryReq();
+    }
+}
+
+bool
+RouterEngine::GPTRespPort::recvTimingReq(PacketPtr pkt)
+{
+    if (!owner->handleRequest(id(), pkt)) {
+        DPRINTF(RouterEngine, "%s: Router rejected packet %#x.\n",
+                __func__, pkt->getAddr());
+        needSendRetryReq = true;
+        return false;
+    }
+    return true;
+}
+
+void
+RouterEngine::recvReqRetry()
+{
+    DPRINTF(RouterEngine, "%s: Received a reqRetry.\n", __func__);
+    if (!nextExternalRequestEvent.scheduled()) {
+        schedule(nextExternalRequestEvent, nextCycle());
+    }
+}
+
+bool
+RouterEngine::handleRequest(PortID portId, PacketPtr pkt)
+{
+    auto &queue = gptReqQueues[portId];
+    bool accepted = false;
+    if (queue.size() < gptQSize) {
+        DPRINTF(RouterEngine, "%s: gptReqQueues[%lu] size is: %d.\n",
+                __func__, portId, queue.size());
+        queue.push(pkt);
+        accepted = true;
+    } else {
+        DPRINTF(RouterEngine, "%s: gptReqQueues[%lu] is full.\n",
+                __func__, portId);
+        accepted = false;
+    }
+
+    if (accepted && (!nextGPTGPNEvent.scheduled())) {
+        schedule(nextGPTGPNEvent, nextCycle());
+    }
+    DPRINTF(RouterEngine, "%s: GPT sent req to router: accepted: %d.\n",
+            __func__, accepted);
+    return accepted;
+}
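+
+// Route every queued GPT request to the GPN request port whose address
+// range contains the packet's destination; a packet stays queued while the
+// matching gpnRespQueue is full, and a retry is sent back to the GPTs once
+// something drains.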
+void
+RouterEngine::processNextGPTGPNEvent()
+{
+    bool found = false;
+    bool queues_none_empty = false;
+    DPRINTF(RouterEngine, "%s: Trying to send a request from GPT to GPN.\n",
+            __func__);
+    for (auto &queue: gptReqQueues) {
+        if (!queue.second.empty()) {
+            PacketPtr pkt = queue.second.front();
+            Addr pkt_addr = pkt->getAddr();
+            for (int i = 0; i < gpnReqPorts.size(); i++) {
+                AddrRangeList addr_list = routerAddrMap[gpnReqPorts[i].id()];
+                if ((contains(addr_list, pkt_addr))) {
+                    if (gpnRespQueues[gpnReqPorts[i].id()].size() < gpnQSize) {
+                        gpnRespQueues[gpnReqPorts[i].id()].push(pkt);
+                        DPRINTF(RouterEngine, "%s: Pushing the pkt %#x to "
+                                "gpnRespQueue[%d]. gpnRespQueue size is: %d\n",
+                                __func__, pkt->getAddr(), i,
+                                gpnRespQueues[gpnReqPorts[i].id()].size());
+                        queue.second.pop();
+                        DPRINTF(RouterEngine, "%s: gptReqQueue size is: %d.\n",
+                                __func__, queue.second.size());
+                        found |= true;
+                        if ((!nextInternalRequestEvent.scheduled())) {
+                            schedule(nextInternalRequestEvent, nextCycle());
+                        }
+                    // queue is full
+                    } else {
+                        DPRINTF(RouterEngine, "%s: Packet %#x: "
+                                "gpnRespQueue[%d] is full.\n",
+                                __func__, pkt->getAddr(), i);
+                        found |= false;
+                    }
+                }
+            }
+        }
+        if (found) {
+            checkGPTRetryReq();
+        }
+    }
+
+    for (auto &queue: gptReqQueues) {
+        if (!queue.second.empty()) {
+            queues_none_empty = true;
+        }
+    }
+
+    if (queues_none_empty) {
+        DPRINTF(RouterEngine, "%s: The gptReqQueues is not empty.\n",
+                __func__);
+    } else {
+        DPRINTF(RouterEngine, "%s: The gptReqQueues is empty.\n", __func__);
+    }
+
+    if (queues_none_empty && (!nextGPTGPNEvent.scheduled())) {
+        schedule(nextGPTGPNEvent, nextCycle());
+    }
+}
+
+void
+RouterEngine::processNextInternalRequestEvent()
+{
+    DPRINTF(RouterEngine, "%s: Sending a request between two routers.\n",
+            __func__);
+    bool none_empty_queue = false;
+    int id;
+    for (auto &queue: gpnRespQueues) {
+        if (!queue.second.empty()) {
+            id = gpnReqPorts[queue.first].id();
+            if (!gpnReqPorts[queue.first].blocked() && (tokenVector[id] != 0)) {
+                if ((curCycle() -
+                    internalLatency[gpnReqPorts[queue.first].id()])
+                    < routerLatency) {
+                    continue;
+                }
+                PacketPtr pkt = queue.second.front();
+                DPRINTF(RouterEngine, "%s: Sending packet %#x to "
+                        "router: %d.\n", __func__, pkt->getAddr(),
+                        gpnReqPorts[queue.first].id());
+                gpnReqPorts[queue.first].sendPacket(pkt);
+                inFlightTraffic[queue.first]++;
+                queue.second.pop();
+                internalLatency[gpnReqPorts[queue.first].id()] = curCycle();
+                stats.internalAcceptedTraffic[gpnReqPorts[queue.first].id()]++;
+                stats.totalInternalTraffic[gpnReqPorts[queue.first].id()] +=
+                    pkt->getSize();
+                tokenVector[id]--;
+            } else if (tokenVector[id] == 0) {
+                DPRINTF(RouterEngine, "%s: Ran out of tokens for "
+                        "port id %d.\n", __func__, id);
+                stats.bandwidthBlocked[id]++;
+            } else {
+                DPRINTF(RouterEngine, "%s: port id %d is blocked.\n",
+                        __func__, gpnReqPorts[queue.first].id());
+                stats.internalBlockedTraffic[gpnReqPorts[queue.first].id()]++;
+            }
+        }
+    }
+
+    for (auto &queue: gpnRespQueues) {
+        if (!queue.second.empty()) {
+            none_empty_queue = true;
+            break;
+        }
+    }
+
+    if (none_empty_queue) {
+        DPRINTF(RouterEngine, "%s: The gpnRespQueues is not empty.\n",
+                __func__);
+    } else {
+        DPRINTF(RouterEngine, "%s: The gpnRespQueues is empty.\n", __func__);
+    }
+
+    Tick next_schedule = nextCycle() + cyclesToTicks(routerLatency);
+    for (auto itr = internalLatency.begin();
+         itr != internalLatency.end();
+         itr++)
+    {
+        if (cyclesToTicks(itr->second + routerLatency) < next_schedule) {
+            if ((itr->second + routerLatency) <= curCycle()) {
+                next_schedule = nextCycle();
+                break;
+            } else {
+                next_schedule = std::min(
+                                    cyclesToTicks(itr->second + routerLatency),
+                                    next_schedule);
+            }
+        }
+    }
+
+    if (none_empty_queue && (!nextInternalRequestEvent.scheduled())) {
+        schedule(nextInternalRequestEvent, next_schedule);
+    }
+
+    if (!nextTrafficTrackEvent.scheduled() && (start == 0)) {
+        start = 1;
+        schedule(nextTrafficTrackEvent, next_schedule);
+    }
+}
+
+void
+RouterEngine::processNextTrafficTrackEvent()
+{
+    for (auto &queue: gpnRespQueues) {
+        stats.internalTrafficHist[queue.first]->sample(
+            inFlightTraffic[queue.first]);
+        sample[queue.first]++;
+        inFlightTraffic[queue.first] = 0;
+    }
+
+    for (int i = 0; i < gpnReqPorts.size(); i++) {
+        tokenVector[i] = tokens;
+    }
+
+    if (!nextTrafficTrackEvent.scheduled()) {
+        schedule(nextTrafficTrackEvent, curTick() + sampleTime);
+    }
+}
+
+void
+RouterEngine::GPTRespPort::recvFunctional(PacketPtr pkt)
+{
+    panic("Not implemented yet!");
+}
+
+void
+RouterEngine::GPTRespPort::recvRespRetry()
+{
+    panic("Not implemented yet!");
+}
yet!"); +} + +Tick +RouterEngine::GPNRespPort::recvAtomic(PacketPtr pkt) { + panic("Not implemented yet!"); +} + +void +RouterEngine::GPNRespPort::checkRetryReq() { + if (needSendRetryReq) { + needSendRetryReq = false; + sendRetryReq(); + } +} + +bool +RouterEngine::GPNRespPort::recvTimingReq(PacketPtr pkt) { + if (!owner->handleRemoteRequest(id(), pkt)) { + DPRINTF(RouterEngine, "%s: Router Rejected the packet %s.\n", + __func__, pkt->getAddr()); + needSendRetryReq = true; + return false; + } + return true; +} + +bool +RouterEngine::handleRemoteRequest(PortID id, PacketPtr pkt) { + bool accepted = false; + if (gpnReqQueues[id].size() < gpnQSize) { + gpnReqQueues[id].push(pkt); + accepted = true; + } else { + accepted = false; + } + + if (accepted && (!nextGPNGPTEvent.scheduled())) { + schedule(nextGPNGPTEvent, nextCycle()); + } + + DPRINTF(RouterEngine, "%s: The remote packet: %s is accepted: %d.\n", + __func__, pkt->getAddr(), accepted); + return accepted; +} + +void +RouterEngine::processNextGPNGPTEvent() +{ + bool found = false; + bool queues_none_empty = false; + for (auto &queue: gpnReqQueues) { + if (!queue.second.empty()) { + PacketPtr pkt = queue.second.front(); + Addr pkt_addr = pkt->getAddr(); + for (int i = 0; i < gptReqPorts.size(); i++) { + AddrRangeList addr_list = gptAddrMap[gptReqPorts[i].id()]; + if ((contains(addr_list, pkt_addr))) { + if (gptRespQueues[gptReqPorts[i].id()].size() < gptQSize) { + gptRespQueues[gptReqPorts[i].id()].push(pkt); + DPRINTF(RouterEngine, "%s: The size of " + "gptRespQueues[%d] is %d.\n", __func__, i, + gptRespQueues[gptReqPorts[i].id()].size()); + DPRINTF(RouterEngine, + "%s: Sending pkt %s to GPT %d.\n", + __func__, pkt->getAddr(), i); + queue.second.pop(); + found |= true; + if ((!nextExternalRequestEvent.scheduled())) { + schedule(nextExternalRequestEvent, nextCycle()); + } + } else { + DPRINTF(RouterEngine, + "%s: gptRespQueues[%d] is full.\n", + __func__, pkt->getAddr(), i); + found |= false; + } + } + } + } + if (found) { + checkGPNRetryReq(); + } + } + + for (auto &queue: gpnReqQueues) { + if (!queue.second.empty()) { + queues_none_empty = true; + } + } + + if (queues_none_empty) { + DPRINTF(RouterEngine, "%s: gpnReqQueues is not empty.\n", __func__); + } else { + DPRINTF(RouterEngine, "%s: gpnReqQueues is empty.\n", __func__); + } + + if (queues_none_empty && (!nextGPNGPTEvent.scheduled())) { + schedule(nextGPNGPTEvent, nextCycle()); + } +} + +void +RouterEngine::processNextExternalRequestEvent() +{ + DPRINTF(RouterEngine, "%s: Sending the request to the GPT.\n", __func__); + bool none_empty_queue = false; + for (auto &queue: gptRespQueues) { + if (!queue.second.empty()) { + if (!gptReqPorts[queue.first].blocked()) { + if ((curCycle() - + externalLatency[gptReqPorts[queue.first].id()]) + < routerLatency) { + continue; + } + stats.externalAcceptedTraffic[gptReqPorts[queue.first].id()]++; + PacketPtr pkt = queue.second.front(); + DPRINTF(RouterEngine, "%s: gptRespQueues[%d] is not empty. 
" + "the size is: %d.\n", __func__, + gptReqPorts[queue.first].id() ,queue.second.size()); + DPRINTF(RouterEngine, "%s: Sending packet %s to GPT: %d.\n", + __func__, pkt->getAddr(),gptReqPorts[queue.first].id()); + gptReqPorts[queue.first].sendPacket(pkt); + queue.second.pop(); + externalLatency[gptReqPorts[queue.first].id()] = curCycle(); + } + else { + stats.externalBlockedTraffic[gptReqPorts[queue.first].id()]++; + } + } + } + + for (auto &queue: gptRespQueues) { + DPRINTF(RouterEngine, "%s: gptRespQueues[%d] size is: %d.\n", __func__, + gptReqPorts[queue.first].id() ,queue.second.size()); + if (!queue.second.empty()) { + none_empty_queue = true; + break; + } + } + + if (none_empty_queue) { + DPRINTF(RouterEngine, "%s: The gptRespQueues is not empty.\n", + __func__); + } else { + DPRINTF(RouterEngine, "%s: The gptRespQueues is empty.\n", __func__); + } + + Tick next_schedule = cyclesToTicks(curCycle() + routerLatency); + for (auto itr = externalLatency.begin(); + itr != externalLatency.end(); itr++) + { + if (cyclesToTicks(itr->second + routerLatency) < next_schedule) { + if ((itr->second + routerLatency) <= curCycle()) { + next_schedule = nextCycle(); + break; + } else { + next_schedule = std::min( + cyclesToTicks(itr->second + routerLatency), + next_schedule); + } + } + } + + if (none_empty_queue) { + if (!nextExternalRequestEvent.scheduled()) { + schedule(nextExternalRequestEvent, next_schedule); + } + } +} + +void +RouterEngine::GPNRespPort::recvFunctional(PacketPtr pkt) +{ + panic("Not implemented yet!"); +} + +void +RouterEngine::GPNRespPort::recvRespRetry() +{ + panic("Not implemented yet!"); +} + +void +RouterEngine::wakeUpInternal() +{ + if ((!nextInternalRequestEvent.scheduled())) { + for (auto &queue: gpnRespQueues) { + if (!queue.second.empty()) { + schedule(nextInternalRequestEvent, nextCycle()); + return; + } + } + } +} + +void +RouterEngine::checkGPTRetryReq() +{ + for (int i = 0; i < gptRespPorts.size(); i++) { + gptRespPorts[i].checkRetryReq(); + } +} + +void +RouterEngine::checkGPNRetryReq() +{ + for (int i = 0; i < gpnRespPorts.size(); i++) { + gpnRespPorts[i].checkRetryReq(); + } +} + +RouterEngine::RouterEngineStat::RouterEngineStat(RouterEngine &_router) + : statistics::Group(&_router), + router(_router), + ADD_STAT(internalBlockedTraffic, statistics::units::Count::get(), + "Number of packets blocked between routers."), + ADD_STAT(externalBlockedTraffic, statistics::units::Count::get(), + "Number of external packets blocked."), + ADD_STAT(internalAcceptedTraffic, statistics::units::Count::get(), + "Number of packet passed between routers."), + ADD_STAT(externalAcceptedTraffic, statistics::units::Count::get(), + "Number of external packets passed."), + ADD_STAT(bandwidthBlocked, statistics::units::Count::get(), + "Number of packets blocked due to lack of."), + ADD_STAT(totalInternalTraffic, statistics::units::Count::get(), + "Total traffic sent from the internal port") + // , + // ADD_STAT(internalTrafficVector, statistics::units::Count::get(), + // "Number of requests sent in internal link") +{} + +void +RouterEngine::RouterEngineStat::regStats() +{ + using namespace statistics; + + internalBlockedTraffic.init(router.gpnReqPorts.size()); + externalBlockedTraffic.init(router.gptReqPorts.size()); + internalAcceptedTraffic.init(router.gpnReqPorts.size()); + externalAcceptedTraffic.init(router.gptReqPorts.size()); + bandwidthBlocked.init(router.gpnReqPorts.size()); + totalInternalTraffic.init(router.gpnReqPorts.size()); + // 
+RouterEngine::RouterEngineStat::RouterEngineStat(RouterEngine &_router)
+    : statistics::Group(&_router),
+      router(_router),
+      ADD_STAT(internalBlockedTraffic, statistics::units::Count::get(),
+               "Number of packets blocked between routers."),
+      ADD_STAT(externalBlockedTraffic, statistics::units::Count::get(),
+               "Number of external packets blocked."),
+      ADD_STAT(internalAcceptedTraffic, statistics::units::Count::get(),
+               "Number of packet passed between routers."),
+      ADD_STAT(externalAcceptedTraffic, statistics::units::Count::get(),
+               "Number of external packets passed."),
+      ADD_STAT(bandwidthBlocked, statistics::units::Count::get(),
+               "Number of packets blocked due to lack of tokens."),
+      ADD_STAT(totalInternalTraffic, statistics::units::Count::get(),
+               "Total traffic sent from the internal port")
+{}
+
+void
+RouterEngine::RouterEngineStat::regStats()
+{
+    using namespace statistics;
+
+    internalBlockedTraffic.init(router.gpnReqPorts.size());
+    externalBlockedTraffic.init(router.gptReqPorts.size());
+    internalAcceptedTraffic.init(router.gpnReqPorts.size());
+    externalAcceptedTraffic.init(router.gptReqPorts.size());
+    bandwidthBlocked.init(router.gpnReqPorts.size());
+    totalInternalTraffic.init(router.gpnReqPorts.size());
+
+    for (uint32_t i = 0; i < router.gpnReqPorts.size(); ++i) {
+        internalTrafficHist.push_back(new statistics::Histogram(this));
+        internalTrafficHist[i]
+            ->init(20000)
+            .name(csprintf("internal_traffic_hist_%i", i))
+            .desc("In-flight internal traffic sampled per window.")
+            .flags(nozero);
+
+        internalPortBW.push_back(new statistics::Formula(this,
+            csprintf("average_internal_BW_%d", i).c_str(),
+            "Internal BW (GB/s)"));
+
+        *internalPortBW[i] =
+            totalInternalTraffic[i] / (simSeconds * 1e9);
+    }
+}
+
+} // namespace gem5
diff --git a/src/accl/graph/sega/router_engine.hh b/src/accl/graph/sega/router_engine.hh
new file mode 100644
index 0000000000..5c06ecc862
--- /dev/null
+++ b/src/accl/graph/sega/router_engine.hh
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2021 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ACCL_GRAPH_SEGA_ROUTER_ENGINE_HH__
+#define __ACCL_GRAPH_SEGA_ROUTER_ENGINE_HH__
+
+#include <queue>
+
+#include "base/types.hh"
+#include "mem/packet.hh"
+#include "mem/port.hh"
+#include "params/RouterEngine.hh"
+#include "sim/clocked_object.hh"
+#include "sim/system.hh"
+
+namespace gem5
+{
+
+class CenteralController;
+
+class RouterEngine : public ClockedObject
+{
+  private:
+    class GPTReqPort : public RequestPort
+    {
+      private:
+        RouterEngine* owner;
+        PacketPtr blockedPacket;
+        PortID _id;
+
+      public:
+        GPTReqPort(const std::string& name, RouterEngine* owner, PortID id) :
+            RequestPort(name, owner),
+            owner(owner), blockedPacket(nullptr), _id(id)
+        {}
+        void sendPacket(PacketPtr pkt);
+        bool blocked() { return (blockedPacket != nullptr); }
+        PortID id() { return _id; }
+
+      protected:
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual void recvReqRetry();
+    };
+
+    class GPNReqPort : public RequestPort
+    {
+      private:
+        RouterEngine* owner;
+        PacketPtr blockedPacket;
+        PortID _id;
+
+      public:
+        GPNReqPort(const std::string& name, RouterEngine* owner, PortID id) :
+            RequestPort(name, owner),
+            owner(owner), blockedPacket(nullptr), _id(id)
+        {}
+        void sendPacket(PacketPtr pkt);
+        bool blocked() { return (blockedPacket != nullptr); }
+        PortID id() { return _id; }
+
+      protected:
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual void recvReqRetry();
+    };
+
+    class GPTRespPort : public ResponsePort
+    {
+      private:
+        RouterEngine* owner;
+        bool needSendRetryReq;
+        PortID _id;
+
+      public:
+        GPTRespPort(const std::string& name, RouterEngine* owner, PortID id):
+            ResponsePort(name, owner),
+            owner(owner), needSendRetryReq(false), _id(id)
+        {}
+        virtual AddrRangeList getAddrRanges() const;
+
+        PortID id() { return _id; }
+        void checkRetryReq();
+
+      protected:
+        virtual bool recvTimingReq(PacketPtr pkt);
+        virtual Tick recvAtomic(PacketPtr pkt);
+        virtual void recvFunctional(PacketPtr pkt);
+        virtual void recvRespRetry();
+    };
+
+    class GPNRespPort : public ResponsePort
+    {
+      private:
+        RouterEngine* owner;
+        bool needSendRetryReq;
+        PortID _id;
+
+      public:
+        GPNRespPort(const std::string& name, RouterEngine* owner, PortID id):
+            ResponsePort(name, owner),
+            owner(owner), needSendRetryReq(false), _id(id)
+        {}
+        virtual AddrRangeList getAddrRanges() const;
+
+        PortID id() { return _id; }
+        void checkRetryReq();
+
+      protected:
+        virtual bool recvTimingReq(PacketPtr pkt);
+        virtual Tick recvAtomic(PacketPtr pkt);
+        virtual void recvFunctional(PacketPtr pkt);
+        virtual void recvRespRetry();
+    };
+
+    System* system;
+    CenteralController* centeralController;
+    bool handleRequest(PortID portId, PacketPtr pkt);
+    bool handleRemoteRequest(PortID portId, PacketPtr pkt);
+    void wakeUpInternal();
+    void wakeUpExternal();
+    void checkRetryExternal();
+    void checkRetryInternal();
+    std::vector<GPTReqPort> gptReqPorts;
+    std::vector<GPTRespPort> gptRespPorts;
+
+    std::vector<GPNReqPort> gpnReqPorts;
+    std::vector<GPNRespPort> gpnRespPorts;
+
+    std::unordered_map<PortID, AddrRangeList> gptAddrMap;
+    std::unordered_map<PortID, AddrRangeList> routerAddrMap;
+
+    std::unordered_map<PortID, std::queue<PacketPtr>> gptReqQueues;
+    std::unordered_map<PortID, std::queue<PacketPtr>> gpnRespQueues;
+
+    std::unordered_map<PortID, std::queue<PacketPtr>> gptRespQueues;
+    std::unordered_map<PortID, std::queue<PacketPtr>> gpnReqQueues;
+
+    std::unordered_map<PortID, Cycles> externalLatency;
+    std::unordered_map<PortID, Cycles> internalLatency;
+    std::vector<uint32_t> inFlightTraffic;
+    std::vector<int> tokenVector;
+    std::vector<int> sample;
+
+    const uint32_t gptQSize;
+    const uint32_t gpnQSize;
+    bool emptyQueues;
+    const Cycles routerLatency;
+    int start;
+    Tick sampleTime;
+    int tokens;
+
+    EventFunctionWrapper nextGPTGPNEvent;
+    void processNextGPTGPNEvent();
+
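+    // Flow through the router (see router_engine.cc): local updates land
+    // in gptReqQueues and nextGPTGPNEvent routes them into gpnRespQueues;
+    // nextInternalRequestEvent drains those to peer routers through the
+    // GPN request ports. Remote updates arrive in gpnReqQueues,
+    // nextGPNGPTEvent routes them into gptRespQueues, and
+    // nextExternalRequestEvent delivers them to the local GPTs.
+    // nextTrafficTrackEvent periodically samples traffic and refills the
+    // token buckets.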
+    EventFunctionWrapper nextInternalRequestEvent;
+    void processNextInternalRequestEvent();
+
+    EventFunctionWrapper nextGPNGPTEvent;
+    void processNextGPNGPTEvent();
+
+    EventFunctionWrapper nextExternalRequestEvent;
+    void processNextExternalRequestEvent();
+
+    EventFunctionWrapper nextTrafficTrackEvent;
+    void processNextTrafficTrackEvent();
+
+    struct RouterEngineStat : public statistics::Group
+    {
+        RouterEngineStat(RouterEngine &router);
+
+        void regStats() override;
+
+        RouterEngine &router;
+
+        statistics::Vector internalBlockedTraffic;
+        statistics::Vector externalBlockedTraffic;
+        statistics::Vector internalAcceptedTraffic;
+        statistics::Vector externalAcceptedTraffic;
+        statistics::Vector bandwidthBlocked;
+        statistics::Vector totalInternalTraffic;
+        std::vector<statistics::Histogram *> internalTrafficHist;
+        std::vector<statistics::Formula *> internalPortBW;
+    };
+    RouterEngineStat stats;
+
+  public:
+    PARAMS(RouterEngine);
+    RouterEngine(const Params &params);
+    void registerCenteralController(CenteralController* centeral_controller);
+    virtual void init() override;
+    virtual void startup() override;
+    Port& getPort(const std::string& if_name,
+                  PortID idx = InvalidPortID) override;
+
+    AddrRangeList getGPNRanges();
+    AddrRangeList getGPTRanges();
+    void recvReqRetry();
+
+    void checkGPTRetryReq();
+    void checkGPNRetryReq();
+    bool done();
+};
+
+} // namespace gem5
+
+#endif // __ACCL_GRAPH_SEGA_ROUTER_ENGINE_HH__
diff --git a/src/accl/graph/sega/state_machine.md b/src/accl/graph/sega/state_machine.md
new file mode 100644
index 0000000000..203c47cf02
--- /dev/null
+++ b/src/accl/graph/sega/state_machine.md
@@ -0,0 +1 @@
+# CoalesceEngine Block state machine
\ No newline at end of file
diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc
new file mode 100644
index 0000000000..8e5ccc9ebe
--- /dev/null
+++ b/src/accl/graph/sega/wl_engine.cc
@@ -0,0 +1,499 @@
+/*
+ * Copyright (c) 2020 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "accl/graph/sega/wl_engine.hh"
+
+#include <algorithm>
+#include <random>
+#include <tuple>
+
+#include "accl/graph/sega/mpu.hh"
+#include "debug/SEGAStructureSize.hh"
+#include "debug/WLEngine.hh"
+#include "mem/packet_access.hh"
+#include "sim/sim_exit.hh"
+
+namespace gem5
+{
+
+WLEngine::WLEngine(const WLEngineParams& params):
+    BaseReduceEngine(params),
+    updateQueueSize(params.update_queue_size),
+    examineWindow(params.examine_window),
+    maxReadsPerCycle(params.rd_per_cycle),
+    maxReducesPerCycle(params.reduce_per_cycle),
+    maxWritesPerCycle(params.wr_per_cycle),
+    registerFileSize(params.register_file_size),
+    nextReadEvent([this]{ processNextReadEvent(); }, name()),
+    nextReduceEvent([this]{ processNextReduceEvent(); }, name()),
+    nextWriteEvent([this] { processNextWriteEvent(); }, name()),
+    nextDoneSignalEvent([this] { processNextDoneSignalEvent(); }, name()),
+    stats(*this)
+{
+    for (int i = 0; i < params.port_in_ports_connection_count; ++i) {
+        inPorts.emplace_back(
+            name() + ".in_ports" + std::to_string(i), this, i);
+    }
+}
+
+Port&
+WLEngine::getPort(const std::string& if_name, PortID idx)
+{
+    if (if_name == "in_ports") {
+        return inPorts[idx];
+    } else {
+        return ClockedObject::getPort(if_name, idx);
+    }
+}
+
+void
+WLEngine::init()
+{
+    for (int i = 0; i < inPorts.size(); i++){
+        inPorts[i].sendRangeChange();
+    }
+}
+
+void
+WLEngine::registerMPU(MPU* mpu)
+{
+    owner = mpu;
+}
+
+AddrRangeList
+WLEngine::getAddrRanges()
+{
+    return owner->getAddrRanges();
+}
+
+void
+WLEngine::recvFunctional(PacketPtr pkt)
+{
+    owner->recvFunctional(pkt);
+}
+
+AddrRangeList
+WLEngine::RespPort::getAddrRanges() const
+{
+    return owner->getAddrRanges();
+}
+
+void
+WLEngine::RespPort::checkRetryReq()
+{
+    if (needSendRetryReq) {
+        needSendRetryReq = false;
+        sendRetryReq();
+    }
+}
+
+bool
+WLEngine::RespPort::recvTimingReq(PacketPtr pkt)
+{
+    if (!owner->handleIncomingUpdate(pkt)) {
+        needSendRetryReq = true;
+        return false;
+    }
+
+    return true;
+}
+
+Tick
+WLEngine::RespPort::recvAtomic(PacketPtr pkt)
+{
+    panic("recvAtomic unimpl.");
+}
+
+void
+WLEngine::RespPort::recvFunctional(PacketPtr pkt)
+{
+    owner->recvFunctional(pkt);
+}
+
+void
+WLEngine::RespPort::recvRespRetry()
+{
+    panic("recvRespRetry from response port is called.");
+}
+
+void
+WLEngine::checkRetryReq()
+{
+    std::vector<int> random_shuffle;
+    for (int i = 0; i < inPorts.size(); i++) {
+        random_shuffle.push_back(i);
+    }
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::shuffle(random_shuffle.begin(), random_shuffle.end(), gen);
+
+    for (int i = 0; i < inPorts.size(); i++) {
+        inPorts[random_shuffle[i]].checkRetryReq();
+    }
+}
+
+bool
+WLEngine::done()
+{
+    return registerFile.empty() && updateQueue.empty();
+}
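+
+// Update path through the WLEngine: handleIncomingUpdate() coalesces
+// same-address updates in valueMap and queues new addresses in
+// updateQueue; processNextReadEvent() allocates a register and asks the
+// coalesce engine for the vertex; handleIncomingWL() delivers the
+// WorkListItem and queues the reduction; processNextReduceEvent() and
+// processNextWriteEvent() finish the reduce and write-back.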
", + "Current value is: %u.\n", __func__, + update_addr, valueMap[update_addr]); + valueMap[update_addr] = + graphWorkload->reduce(update_value, valueMap[update_addr]); + stats.numIncomingUpdates++; + stats.updateQueueCoalescions++; + } else { + assert((updateQueueSize == 0) || (updateQueue.size() <= updateQueueSize)); + if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { + return false; + } else { + updateQueue.emplace_back(update_addr, curTick()); + valueMap[update_addr] = update_value; + stats.numIncomingUpdates++; + DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, update_addr, update_value, + updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, update_addr, update_value, + updateQueue.size(), updateQueueSize); + } + } + + // delete the packet since it's not needed anymore. + delete pkt; + + if (!nextReadEvent.scheduled()) { + schedule(nextReadEvent, nextCycle()); + } + return true; +} + +void +WLEngine::processNextReadEvent() +{ + std::deque> temp_queue; + for (int i = 0; i < examineWindow; i++) { + if (updateQueue.empty()) { + break; + } + temp_queue.push_back(updateQueue.front()); + updateQueue.pop_front(); + } + + int num_reads = 0; + int num_popped = 0; + int num_tries = 0; + int max_visits = temp_queue.size(); + while (true) { + Addr update_addr; + Tick enter_tick; + std::tie(update_addr, enter_tick) = temp_queue.front(); + + uint32_t update_value = valueMap[update_addr]; + DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. " + "(addr: %lu, value: %u).\n", __func__, update_addr, update_value); + if ((registerFile.find(update_addr) == registerFile.end())) { + DPRINTF(WLEngine, "%s: No register already allocated for addr: %lu " + "in registerFile.\n", __func__, update_addr); + if (registerFile.size() < registerFileSize) { + DPRINTF(WLEngine, "%s: There are free registers available in the " + "registerFile.\n", __func__); + ReadReturnStatus read_status = owner->recvWLRead(update_addr); + if (read_status == ReadReturnStatus::ACCEPT) { + DPRINTF(WLEngine, "%s: CoalesceEngine returned true for read " + "request to addr: %lu.\n", __func__, update_addr); + registerFile[update_addr] = std::make_tuple(RegisterState::PENDING_READ, update_value); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, value: %u) " + "to registerFile. registerFile.size = %d, " + "registerFileSize = %d.\n", __func__, update_addr, + update_value, registerFile.size(), registerFileSize); + DPRINTF(WLEngine, "%s: Added (addr: %lu, value: %u) " + "to registerFile. registerFile.size = %d, " + "registerFileSize = %d.\n", __func__, update_addr, + update_value, registerFile.size(), registerFileSize); + temp_queue.pop_front(); + valueMap.erase(update_addr); + num_reads++; + num_popped++; + stats.updateQueueLatency.sample( + (curTick() - enter_tick) * 1e9 / getClockFrequency()); + DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, temp_queue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. 
" + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + vertexReadTime[update_addr] = curTick(); + } else { + if (read_status == ReadReturnStatus::REJECT_ROLL) { + temp_queue.pop_front(); + temp_queue.emplace_back(update_addr, enter_tick); + DPRINTF(WLEngine, "%s: Received a reject from cache. " + "Rolling the update.\n", __func__); + stats.numUpdateRolls++; + } else { + temp_queue.pop_front(); + temp_queue.emplace_back(update_addr, enter_tick); + DPRINTF(WLEngine, "%s: Received a reject with no roll " + "from cache. Rolling the update anyway.\n", __func__); + } + } + } else { + DPRINTF(WLEngine, "%s: There are no free registers " + "available in the registerFile.\n", __func__); + temp_queue.pop_front(); + temp_queue.emplace_back(update_addr, enter_tick); + stats.registerShortage++; + } + } else { + DPRINTF(WLEngine, "%s: A register has already been allocated for " + "addr: %lu in registerFile. registerFile[%lu] = %u.\n", __func__, + update_addr, update_addr, std::get<1>(registerFile[update_addr])); + RegisterState state = std::get<0>(registerFile[update_addr]); + if (state == RegisterState::PENDING_WRITE) { + // NOTE: If it's pending write, let it be written. + DPRINTF(WLEngine, "%s: Respective register for addr: " + "%lu is pending a write to the cache. Rolling " + "the update.\n", __func__, update_addr); + temp_queue.pop_front(); + temp_queue.emplace_back(update_addr, enter_tick); + } else { + uint32_t curr_value = std::get<1>(registerFile[update_addr]); + uint32_t new_value = graphWorkload->reduce(update_value, curr_value); + registerFile[update_addr] = std::make_tuple(state, new_value); + DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" + " registerFile. registerFile[%lu] = %u.\n", __func__, + update_value, update_addr, std::get<1>(registerFile[update_addr])); + stats.registerFileCoalescions++; + temp_queue.pop_front(); + valueMap.erase(update_addr); + num_popped++; + stats.updateQueueLatency.sample( + (curTick() - enter_tick) * 1e9 / getClockFrequency()); + DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + } + } + + num_tries++; + if (num_reads >= maxReadsPerCycle) { + if (!temp_queue.empty()) { + stats.numReadPortShortage++; + } + break; + } + if (num_tries >= max_visits) { + break; + } + if (temp_queue.empty()) { + break; + } + } + + while (!temp_queue.empty()) { + updateQueue.push_front(temp_queue.back()); + temp_queue.pop_back(); + } + if (num_popped > 0) { + checkRetryReq(); + } + if (!updateQueue.empty() && !nextReadEvent.scheduled()) { + schedule(nextReadEvent, nextCycle()); + } +} + +void +WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) +{ + assert(workListFile.size() <= registerFileSize); + assert(std::get<0>(registerFile[addr]) == RegisterState::PENDING_READ); + + workListFile[addr] = wl; + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) to " + "workListFile. workListFile.size = %d.\n", __func__, addr, + graphWorkload->printWorkListItem(wl), workListFile.size()); + DPRINTF(WLEngine, "%s: Added (addr: %lu, wl: %s) to " + "workListFile. 
workListFile.size = %d.\n", __func__, addr,
+            graphWorkload->printWorkListItem(wl), workListFile.size());
+
+    uint32_t value = std::get<1>(registerFile[addr]);
+    registerFile[addr] =
+        std::make_tuple(RegisterState::PENDING_REDUCE, value);
+    toReduce.push_back(addr);
+
+    stats.vertexReadLatency.sample(
+        ((curTick() - vertexReadTime[addr]) * 1e9) / getClockFrequency());
+    vertexReadTime.erase(addr);
+
+    if (!nextReduceEvent.scheduled()) {
+        schedule(nextReduceEvent, nextCycle());
+    }
+}
+
+void
+WLEngine::processNextReduceEvent()
+{
+    int num_reduces = 0;
+    while (true) {
+        Addr addr = toReduce.front();
+        assert(std::get<0>(registerFile[addr]) ==
+               RegisterState::PENDING_REDUCE);
+        uint32_t update_value = std::get<1>(registerFile[addr]);
+        DPRINTF(WLEngine, "%s: Reducing for addr: %lu, update_value: %u, "
+                "temp_prop: %s.\n", __func__, addr,
+                update_value, workListFile[addr].tempProp);
+        workListFile[addr].tempProp =
+            graphWorkload->reduce(update_value, workListFile[addr].tempProp);
+        DPRINTF(WLEngine, "%s: Reduction result: %s.\n", __func__,
+                graphWorkload->printWorkListItem(workListFile[addr]));
+        registerFile[addr] =
+            std::make_tuple(RegisterState::PENDING_WRITE, update_value);
+        num_reduces++;
+        stats.numReductions++;
+        toReduce.pop_front();
+        toWrite.push_back(addr);
+
+        if (num_reduces >= maxReducesPerCycle) {
+            if (!toReduce.empty()) {
+                stats.numReducerShortage++;
+            }
+            break;
+        }
+        if (toReduce.empty()) {
+            break;
+        }
+    }
+
+    if (!toWrite.empty() && !nextWriteEvent.scheduled()) {
+        schedule(nextWriteEvent, nextCycle());
+    }
+
+    if (!toReduce.empty() && !nextReduceEvent.scheduled()) {
+        schedule(nextReduceEvent, nextCycle());
+    }
+}
+
+void
+WLEngine::processNextWriteEvent()
+{
+    int num_writes = 0;
+    while (true) {
+        Addr addr = toWrite.front();
+        assert(std::get<0>(registerFile[addr]) ==
+               RegisterState::PENDING_WRITE);
+        owner->recvWLWrite(addr, workListFile[addr]);
+        registerFile.erase(addr);
+        workListFile.erase(addr);
+        toWrite.pop_front();
+        num_writes++;
+        if (num_writes >= maxWritesPerCycle) {
+            if (!toWrite.empty()) {
+                stats.numWritePortShortage++;
+            }
+            break;
+        }
+        if (toWrite.empty()) {
+            break;
+        }
+    }
+
+    if (done() && !nextDoneSignalEvent.scheduled()) {
+        schedule(nextDoneSignalEvent, nextCycle());
+    }
+
+    if (!toWrite.empty() && !nextWriteEvent.scheduled()) {
+        schedule(nextWriteEvent, nextCycle());
+    }
+}
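+
+// done() is re-checked when the event actually fires: a new update can
+// arrive in the cycle between scheduling the done signal and delivering
+// it, in which case the signal is silently dropped.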
cycle."), + ADD_STAT(numIncomingUpdates, statistics::units::Count::get(), + "Number of inocoming updates for each GPT."), + ADD_STAT(vertexReadLatency, statistics::units::Second::get(), + "Histogram of the latency of reading a vertex (ns)."), + ADD_STAT(updateQueueLatency, statistics::units::Second::get(), + "Histogram of the latency of dequeuing an update (ns).") +{ +} + +void +WLEngine::WorkListStats::regStats() +{ + using namespace statistics; + + vertexReadLatency.init(64); + updateQueueLatency.init(64); + +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh new file mode 100644 index 0000000000..ad67f19cb5 --- /dev/null +++ b/src/accl/graph/sega/wl_engine.hh @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef __ACCL_GRAPH_SEGA_WL_ENGINE_HH__
+#define __ACCL_GRAPH_SEGA_WL_ENGINE_HH__
+
+#include <deque>
+#include <unordered_map>
+
+#include "accl/graph/base/base_reduce_engine.hh"
+#include "accl/graph/base/data_structs.hh"
+#include "accl/graph/base/graph_workload.hh"
+#include "accl/graph/sega/enums.hh"
+#include "base/statistics.hh"
+#include "params/WLEngine.hh"
+
+namespace gem5
+{
+
+class MPU;
+
+class WLEngine : public BaseReduceEngine
+{
+  private:
+    class RespPort : public ResponsePort
+    {
+      private:
+        WLEngine* owner;
+        bool needSendRetryReq;
+        PortID _id;
+
+      public:
+        RespPort(const std::string& name, WLEngine* owner, PortID id):
+            ResponsePort(name, owner),
+            owner(owner), needSendRetryReq(false), _id(id)
+        {}
+        virtual AddrRangeList getAddrRanges() const;
+
+        PortID id() { return _id; }
+        void checkRetryReq();
+
+      protected:
+        virtual bool recvTimingReq(PacketPtr pkt);
+        virtual Tick recvAtomic(PacketPtr pkt);
+        virtual void recvFunctional(PacketPtr pkt);
+        virtual void recvRespRetry();
+    };
+
+    MPU* owner;
+    GraphWorkload* graphWorkload;
+
+    std::vector<RespPort> inPorts;
+
+    int updateQueueSize;
+    std::deque<std::tuple<Addr, Tick>> updateQueue;
+    std::unordered_map<Addr, uint32_t> valueMap;
+
+    int examineWindow;
+    int maxReadsPerCycle;
+    int maxReducesPerCycle;
+    int maxWritesPerCycle;
+
+    int registerFileSize;
+    std::unordered_map<Addr, std::tuple<RegisterState, uint32_t>>
+                                                            registerFile;
+    std::unordered_map<Addr, WorkListItem> workListFile;
+    std::deque<Addr> toReduce;
+    std::deque<Addr> toWrite;
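+
+    // Register lifecycle, as driven by wl_engine.cc: an address enters
+    // registerFile as PENDING_READ when its vertex read is issued, becomes
+    // PENDING_REDUCE when handleIncomingWL() delivers the WorkListItem,
+    // becomes PENDING_WRITE after the reduction, and is erased once the
+    // result is written back through the MPU.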
+
+    std::unordered_map<Addr, Tick> vertexReadTime;
+
+    EventFunctionWrapper nextReadEvent;
+    void processNextReadEvent();
+
+    EventFunctionWrapper nextReduceEvent;
+    void processNextReduceEvent();
+
+    EventFunctionWrapper nextWriteEvent;
+    void processNextWriteEvent();
+
+    EventFunctionWrapper nextDoneSignalEvent;
+    void processNextDoneSignalEvent();
+
+    struct WorkListStats : public statistics::Group
+    {
+        WorkListStats(WLEngine& worklist);
+
+        void regStats() override;
+
+        WLEngine &wl;
+        statistics::Scalar updateQueueCoalescions;
+        statistics::Scalar registerShortage;
+        statistics::Scalar numUpdateRolls;
+        statistics::Scalar numReadPortShortage;
+        statistics::Scalar registerFileCoalescions;
+        statistics::Scalar numReductions;
+        statistics::Scalar numReducerShortage;
+        statistics::Scalar numWritePortShortage;
+        statistics::Scalar numIncomingUpdates;
+
+        statistics::Histogram vertexReadLatency;
+        statistics::Histogram updateQueueLatency;
+    };
+
+    WorkListStats stats;
+
+  public:
+    PARAMS(WLEngine);
+    WLEngine(const Params& params);
+    Port& getPort(const std::string& if_name,
+                  PortID idx = InvalidPortID) override;
+    virtual void init() override;
+    void registerMPU(MPU* mpu);
+
+    AddrRangeList getAddrRanges();
+    void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; }
+    void recvFunctional(PacketPtr pkt);
+
+    bool handleIncomingUpdate(PacketPtr pkt);
+    void handleIncomingWL(Addr addr, WorkListItem wl);
+
+    void checkRetryReq();
+
+    bool done();
+};
+
+} // namespace gem5
+
+#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__
diff --git a/src/accl/graph/sega/work_directory.hh b/src/accl/graph/sega/work_directory.hh
new file mode 100644
index 0000000000..620e97f654
--- /dev/null
+++ b/src/accl/graph/sega/work_directory.hh
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2020 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__
+#define __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__
+
+#include <cassert>
+
+#include "accl/graph/base/data_structs.hh"
+#include "base/addr_range.hh"
+#include "base/types.hh"
+
+namespace gem5
+{
+
+class WorkDirectory
+{
+  public:
+    virtual int activate(Addr atom_addr) = 0;
+    virtual int deactivate(Addr atom_addr) = 0;
+    virtual Addr getNextWork() = 0;
+
+    virtual int workCount() = 0;
+    bool empty() { return workCount() == 0; }
+
+    virtual void setLastAtomAddr(Addr atom_addr) = 0;
+};
+
+class PopCountDirectory: public WorkDirectory
+{
+  private:
+    AddrRange memoryRange;
+
+    int numAtomsPerBlock;
+    int memoryAtomSize;
+    int blockSize;
+
+    uint32_t _workCount;
+
+    int numCounters;
+    int lastCounterIndex;
+    uint32_t* popCount;
+
+    int prevIndex;
+    uint32_t currentCounter;
+
+    UniqueFIFO<int> activeBlockIndices;
+
+    int getIndexFromAtomAddr(Addr atom_addr)
+    {
+        assert((atom_addr % memoryAtomSize) == 0);
+        Addr trimmed_addr = memoryRange.removeIntlvBits(atom_addr);
+        int index = (int) (trimmed_addr / blockSize);
+        return index;
+    }
+
+    Addr getAtomAddrFromIndex(int block_index, int atom_index)
+    {
+        Addr block_addr = block_index * blockSize;
+        Addr trimmed_addr = block_addr + atom_index * memoryAtomSize;
+        return memoryRange.addIntlvBits(trimmed_addr);
+    }
+
+  public:
+    PopCountDirectory(AddrRange mem_range, int atoms_per_block,
+                      int atom_size):
+        WorkDirectory(),
+        memoryRange(mem_range), numAtomsPerBlock(atoms_per_block),
+        memoryAtomSize(atom_size), _workCount(0),
+        prevIndex(-1), currentCounter(0)
+    {
+        blockSize = numAtomsPerBlock * memoryAtomSize;
+        numCounters = (int) (memoryRange.size() / blockSize);
+        lastCounterIndex = numCounters - 1;
+        popCount = new uint32_t [numCounters];
+        for (int index = 0; index < numCounters; index++) {
+            popCount[index] = 0;
+        }
+        activeBlockIndices = UniqueFIFO<int>(numCounters);
+    }
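+
+    // Worked example (illustrative values): with memoryAtomSize = 64 and
+    // numAtomsPerBlock = 8, blockSize is 512, so block 0 covers the
+    // de-interleaved atoms at 0, 64, ..., 448. activate(0x40) bumps
+    // popCount[0] to 1 and queues block 0; each getNextWork() call then
+    // returns the next atom of the front block, and after numAtomsPerBlock
+    // consecutive reads the block rotates to the back of the FIFO so other
+    // blocks make progress.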
+
+    // CAUTION: This should only be called when the work
+    // directory **is not** tracking the atom with atom_addr
+    virtual int activate(Addr atom_addr)
+    {
+        int index = getIndexFromAtomAddr(atom_addr);
+        uint32_t prev_count = popCount[index];
+        popCount[index]++;
+        _workCount++;
+        activeBlockIndices.push_back(index);
+        assert(popCount[index] > prev_count);
+        assert(popCount[index] <= numAtomsPerBlock);
+        return popCount[index];
+    }
+
+    // CAUTION: This should only be called when the work
+    // directory **is** tracking the atom with atom_addr
+    virtual int deactivate(Addr atom_addr)
+    {
+        int index = getIndexFromAtomAddr(atom_addr);
+        uint32_t prev_count = popCount[index];
+        popCount[index]--;
+        _workCount--;
+        if (popCount[index] == 0) {
+            activeBlockIndices.erase(index);
+        }
+        assert(popCount[index] < prev_count);
+        assert(popCount[index] <= numAtomsPerBlock);
+        return popCount[index];
+    }
+
+    virtual int workCount() { return _workCount; }
+
+    void setLastAtomAddr(Addr atom_addr)
+    {
+        lastCounterIndex = getIndexFromAtomAddr(atom_addr);
+    }
+
+    // CAUTION: This directory only tracks active vertices in the memory
+    // and it does not have any information on the state of the cache
+    // and/or the active buffer or the write buffer. Therefore, it might
+    // generate a read request to an address that might be in any of those.
+    // In that case, the generated address should be ignored.
+    virtual Addr getNextWork()
+    {
+        // Why ask directory if it's empty?
+        assert(!activeBlockIndices.empty());
+        int front_index = activeBlockIndices.front();
+        assert(popCount[front_index] > 0);
+        if ((prevIndex != -1) && (prevIndex != front_index)) {
+            currentCounter = 0;
+        }
+        if (currentCounter == numAtomsPerBlock) {
+            currentCounter = 0;
+            activeBlockIndices.pop_front();
+            activeBlockIndices.push_back(front_index);
+        }
+        int current_index = activeBlockIndices.front();
+        Addr ret_addr = getAtomAddrFromIndex(current_index, currentCounter);
+        prevIndex = current_index;
+        currentCounter++;
+        return ret_addr;
+    }
+};
+
+} // namespace gem5
+
+#endif // __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__
diff --git a/src/base/addr_range.hh b/src/base/addr_range.hh
index 07bd255d26..3c5c150b29 100644
--- a/src/base/addr_range.hh
+++ b/src/base/addr_range.hh
@@ -48,6 +48,7 @@
 
 #include "base/bitfield.hh"
 #include "base/cprintf.hh"
+#include "base/intmath.hh"
 #include "base/logging.hh"
 #include "base/types.hh"
 
@@ -732,6 +733,40 @@ class AddrRange
     {
         return !(*this == r);
     }
+
+    friend AddrRange
+    mergePseudoChannelRanges(AddrRange left, AddrRange right, int pch_bit)
+    {
+        assert(left.interleaved());
+        assert(right.interleaved());
+        assert(left.mergesWith(right));
+
+        uint8_t old_left_match = left.intlvMatch;
+        uint8_t new_left_match = 0;
+        uint8_t old_right_match = right.intlvMatch;
+        uint8_t new_right_match = 0;
+        int new_bits = left.masks.size() - 1;
+
+        // assumption: masks is sorted in ascending order
+        std::vector<Addr> new_masks;
+        for (auto mask: left.masks) {
+            uint64_t lsb_mask = (mask ^ (mask - 1)) + 1;
+            if ((lsb_mask >> 1) != (1ULL << pch_bit)) {
+                new_masks.push_back(mask);
+                new_left_match |= ((old_left_match & 1) << new_bits);
+                new_left_match >>= 1;
+                new_right_match |= ((old_right_match & 1) << new_bits);
+                new_right_match >>= 1;
+            }
+            old_left_match >>= 1;
+            old_right_match >>= 1;
+        }
+        panic_if(new_left_match != new_right_match,
+                 "The two ranges can not be a pseudo channel pair "
+                 "given the pseudochannel bit position of params.pch_bit.");
+
+        return AddrRange(left._start, left._end, new_masks, new_left_match);
+    }
 };
 
 static inline AddrRangeList
@@ -817,6 +852,16 @@ RangeSize(Addr start, Addr size)
     return AddrRange(start, start + size);
 }
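+
+// Convenience helper: true if any range in the list contains addr.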
+inline bool
+contains(const AddrRangeList &range_list, Addr addr)
+{
+    for (const auto &range: range_list) {
+        if (range.contains(addr)) {
+            return true;
+        }
+    }
+    return false;
+}
+
 } // namespace gem5
 
 #endif // __BASE_ADDR_RANGE_HH__
diff --git a/src/base/statistics.hh b/src/base/statistics.hh
index 24cbf714f5..15aeff892e 100644
--- a/src/base/statistics.hh
+++ b/src/base/statistics.hh
@@ -1052,7 +1052,7 @@ class VectorBase : public DataWrapVec
     Proxy
     operator[](off_type index)
     {
-        assert (index < size());
+        // assert (index < size());
         return Proxy(this->self(), index);
     }
 };
diff --git a/src/mem/HBMCtrl.py b/src/mem/HBMCtrl.py
index 0c7c1ea919..f32ffe6f0a 100644
--- a/src/mem/HBMCtrl.py
+++ b/src/mem/HBMCtrl.py
@@ -42,9 +42,9 @@ class HBMCtrl(MemCtrl):
     # HBMCtrl has been tested with two HBM_2000_4H_1x64 interfaces
     dram_2 = Param.DRAMInterface("DRAM memory interface")
 
+    pch_bit = Param.Int("Position of PseudoChannel bit in addresses.")
+
     # For mixed traffic, HBMCtrl with HBM_2000_4H_1x64 interfaaces
     # gives the best results with following min_r/w_per_switch
     min_reads_per_switch = 64
     min_writes_per_switch = 64
-
-    partitioned_q = Param.Bool(False, "split queues for pseudo channels")
diff --git a/src/mem/dram_interface.cc b/src/mem/dram_interface.cc
index d745fe5a29..d8c6da0a2d 100644
--- a/src/mem/dram_interface.cc
+++ b/src/mem/dram_interface.cc
@@ -1068,13 +1068,14 @@ DRAMInterface::minBankPrep(const MemPacketQueue& queue,
 
             // latest Tick for which ACT can occur without
             // incurring additoinal delay on the data bus
-            const Tick tRCD = ctrl->inReadBusState(false) ?
-                              tRCD_RD : tRCD_WR;
+            const Tick tRCD = ctrl->inReadBusState(false,
+                              (MemInterface*)(this)) ? tRCD_RD : tRCD_WR;
 
             const Tick hidden_act_max =
                         std::max(min_col_at - tRCD, curTick());
 
             // When is the earliest the R/W burst can issue?
-            const Tick col_allowed_at = ctrl->inReadBusState(false) ?
+            const Tick col_allowed_at = ctrl->inReadBusState(false,
+                              (MemInterface*)(this)) ? 
diff --git a/src/mem/dram_interface.cc b/src/mem/dram_interface.cc index d745fe5a29..d8c6da0a2d 100644 --- a/src/mem/dram_interface.cc +++ b/src/mem/dram_interface.cc @@ -1068,13 +1068,14 @@ DRAMInterface::minBankPrep(const MemPacketQueue& queue, // latest Tick for which ACT can occur without // incurring additional delay on the data bus - const Tick tRCD = ctrl->inReadBusState(false) ? - tRCD_RD : tRCD_WR; + const Tick tRCD = ctrl->inReadBusState(false, + (MemInterface*)(this)) ? tRCD_RD : tRCD_WR; const Tick hidden_act_max = std::max(min_col_at - tRCD, curTick()); // When is the earliest the R/W burst can issue? - const Tick col_allowed_at = ctrl->inReadBusState(false) ? + const Tick col_allowed_at = ctrl->inReadBusState(false, + (MemInterface*)(this)) ? ranks[i]->banks[j].rdAllowedAt : ranks[i]->banks[j].wrAllowedAt; Tick col_at = std::max(col_allowed_at, act_at + tRCD); @@ -1180,10 +1181,10 @@ bool DRAMInterface::Rank::isQueueEmpty() const { // check commands in Q based on current bus direction - bool no_queued_cmds = (dram.ctrl->inReadBusState(true) && - (readEntries == 0)) - || (dram.ctrl->inWriteBusState(true) && - (writeEntries == 0)); + bool no_queued_cmds = (dram.ctrl->inReadBusState(true, + (MemInterface*)(this)) && (readEntries == 0)) || + (dram.ctrl->inWriteBusState(true, + (MemInterface*)(this)) && (writeEntries == 0)); return no_queued_cmds; } @@ -1669,7 +1670,7 @@ DRAMInterface::Rank::processPowerEvent() // completed refresh event, ensure next request is scheduled if (!(dram.ctrl->requestEventScheduled(dram.pseudoChannel))) { DPRINTF(DRAM, "Scheduling next request after refreshing" - " rank %d\n", rank); + " rank %d, PC %d\n", rank, dram.pseudoChannel); dram.ctrl->restartScheduler(curTick(), dram.pseudoChannel); } } @@ -1831,7 +1832,8 @@ DRAMInterface::Rank::resetStats() { bool DRAMInterface::Rank::forceSelfRefreshExit() const { return (readEntries != 0) || - (dram.ctrl->inWriteBusState(true) && (writeEntries != 0)); + (dram.ctrl->inWriteBusState(true, (MemInterface*)(this)) + && (writeEntries != 0)); } void diff --git a/src/mem/hbm_ctrl.cc b/src/mem/hbm_ctrl.cc index 99618c4b5f..e0d0922333 100644 --- a/src/mem/hbm_ctrl.cc +++ b/src/mem/hbm_ctrl.cc @@ -45,14 +45,14 @@ namespace memory HBMCtrl::HBMCtrl(const HBMCtrlParams &p) : MemCtrl(p), + pchBit(p.pch_bit), retryRdReqPC1(false), retryWrReqPC1(false), nextReqEventPC1([this] {processNextReqEvent(pc1Int, respQueuePC1, respondEventPC1, nextReqEventPC1, retryWrReqPC1);}, name()), respondEventPC1([this] {processRespondEvent(pc1Int, respQueuePC1, respondEventPC1, retryRdReqPC1); }, name()), - pc1Int(p.dram_2), - partitionedQ(p.partitioned_q) + pc1Int(p.dram_2) { DPRINTF(MemCtrl, "Setting up HBM controller\n"); @@ -69,17 +69,8 @@ HBMCtrl::HBMCtrl(const HBMCtrlParams &p) : pc0Int->setCtrl(this, commandWindow, 0); pc1Int->setCtrl(this, commandWindow, 1); - if (partitionedQ) { - writeHighThreshold = (writeBufferSize * (p.write_high_thresh_perc/2) - / 100.0); - writeLowThreshold = (writeBufferSize * (p.write_low_thresh_perc/2) - / 100.0); - } else { - writeHighThreshold = (writeBufferSize * p.write_high_thresh_perc - / 100.0); - writeLowThreshold = (writeBufferSize * p.write_low_thresh_perc - / 100.0); - } + writeHighThreshold = (writeBufferSize/2 * p.write_high_thresh_perc)/100.0; + writeLowThreshold = (writeBufferSize/2 * p.write_low_thresh_perc)/100.0; } void @@ -155,9 +146,9 @@ HBMCtrl::writeQueueFullPC0(unsigned int neededEntries) const { DPRINTF(MemCtrl, "Write queue limit %d, PC0 size %d, entries needed %d\n", - writeBufferSize, writeQueueSizePC0, neededEntries); + writeBufferSize/2, pc0Int->writeQueueSize, neededEntries); - unsigned int wrsize_new = (writeQueueSizePC0 + neededEntries); + unsigned int wrsize_new = (pc0Int->writeQueueSize + neededEntries); return wrsize_new > (writeBufferSize/2); } @@ -166,9 +157,9 @@ HBMCtrl::writeQueueFullPC1(unsigned int neededEntries) const { DPRINTF(MemCtrl, "Write queue limit %d, PC1 size %d, entries needed %d\n", - writeBufferSize, writeQueueSizePC1, neededEntries); + writeBufferSize/2, pc1Int->writeQueueSize, neededEntries); - unsigned int wrsize_new = (writeQueueSizePC1 + neededEntries); + unsigned int wrsize_new = (pc1Int->writeQueueSize + neededEntries); return wrsize_new > (writeBufferSize/2); }
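The writeQueueFullPC0/PC1 checks above (and the read-side checks that follow) all implement the same halved-capacity rule, since each pseudo channel now owns half of the controller's buffers. A minimal sketch of that rule (names are illustrative, not gem5's):

    #include <cassert>

    // Each pseudo channel owns half of the controller's write buffer;
    // a request is refused when its bursts would overflow that half.
    bool pcWriteQueueFull(unsigned pc_queue_size, unsigned needed_entries,
                          unsigned write_buffer_size)
    {
        return pc_queue_size + needed_entries > write_buffer_size / 2;
    }

    int main()
    {
        // With a 64-entry buffer, each pseudo channel may hold 32 entries.
        assert(!pcWriteQueueFull(30, 2, 64));
        assert(pcWriteQueueFull(31, 2, 64));
        return 0;
    }

The read-side checks follow the same pattern, but also count the per-channel response queue against the half-sized limit.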
@@ -177,10 +168,10 @@ HBMCtrl::readQueueFullPC0(unsigned int neededEntries) const { DPRINTF(MemCtrl, "Read queue limit %d, PC0 size %d, entries needed %d\n", - readBufferSize, readQueueSizePC0 + respQueue.size(), + readBufferSize/2, pc0Int->readQueueSize + respQueue.size(), neededEntries); - unsigned int rdsize_new = readQueueSizePC0 + respQueue.size() + unsigned int rdsize_new = pc0Int->readQueueSize + respQueue.size() + neededEntries; return rdsize_new > (readBufferSize/2); } @@ -190,26 +181,14 @@ HBMCtrl::readQueueFullPC1(unsigned int neededEntries) const { DPRINTF(MemCtrl, "Read queue limit %d, PC1 size %d, entries needed %d\n", - readBufferSize, readQueueSizePC1 + respQueuePC1.size(), + readBufferSize/2, pc1Int->readQueueSize + respQueuePC1.size(), neededEntries); - unsigned int rdsize_new = readQueueSizePC1 + respQueuePC1.size() + unsigned int rdsize_new = pc1Int->readQueueSize + respQueuePC1.size() + neededEntries; return rdsize_new > (readBufferSize/2); } -bool -HBMCtrl::readQueueFull(unsigned int neededEntries) const -{ - DPRINTF(MemCtrl, - "HBMCtrl: Read queue limit %d, entries needed %d\n", - readBufferSize, neededEntries); - - unsigned int rdsize_new = totalReadQueueSize + respQueue.size() + - respQueuePC1.size() + neededEntries; - return rdsize_new > readBufferSize; -} - bool HBMCtrl::recvTimingReq(PacketPtr pkt) { @@ -233,7 +212,7 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) bool is_pc0; - // TODO: make the interleaving bit across pseudo channels a parameter - if (bits(pkt->getAddr(), 6) == 0) { + // The interleaving bit across pseudo channels is the pch_bit parameter. + if (bits(pkt->getAddr(), pchBit) == 0) { is_pc0 = true; } else { is_pc0 = false; @@ -254,9 +233,7 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) // check local buffers and do not accept if full if (pkt->isWrite()) { if (is_pc0) { - if (partitionedQ ? writeQueueFullPC0(pkt_count) : - writeQueueFull(pkt_count)) - { + if (writeQueueFullPC0(pkt_count)) { DPRINTF(MemCtrl, "Write queue full, not accepting\n"); // remember that we have to retry this port MemCtrl::retryWrReq = true; @@ -264,13 +241,15 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) return false; } else { addToWriteQueue(pkt, pkt_count, pc0Int); + if (!nextReqEvent.scheduled()) { + DPRINTF(MemCtrl, "Request scheduled immediately\n"); + schedule(nextReqEvent, curTick()); + } stats.writeReqs++; stats.bytesWrittenSys += size; } } else { - if (partitionedQ ? writeQueueFullPC1(pkt_count) : - writeQueueFull(pkt_count)) - { + if (writeQueueFullPC1(pkt_count)) { DPRINTF(MemCtrl, "Write queue full, not accepting\n"); // remember that we have to retry this port retryWrReqPC1 = true; @@ -278,6 +257,10 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) return false; } else { addToWriteQueue(pkt, pkt_count, pc1Int); + if (!nextReqEventPC1.scheduled()) { + DPRINTF(MemCtrl, "Request scheduled immediately\n"); + schedule(nextReqEventPC1, curTick()); + } stats.writeReqs++; stats.bytesWrittenSys += size; } @@ -288,11 +271,10 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) assert(size != 0); if (is_pc0) { - if (partitionedQ ? readQueueFullPC0(pkt_count) : - HBMCtrl::readQueueFull(pkt_count)) { + if (readQueueFullPC0(pkt_count)) { DPRINTF(MemCtrl, "Read queue full, not accepting\n"); // remember that we have to retry this port - retryRdReqPC1 = true; + MemCtrl::retryRdReq = true; stats.numRdRetry++; return false; } else { @@ -307,8 +289,7 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) stats.bytesReadSys += size; } } else { - if (partitionedQ ?
readQueueFullPC1(pkt_count) : - HBMCtrl::readQueueFull(pkt_count)) { + if (readQueueFullPC1(pkt_count)) { DPRINTF(MemCtrl, "Read queue full, not accepting\n"); // remember that we have to retry this port retryRdReqPC1 = true; @@ -492,8 +473,11 @@ AddrRangeList HBMCtrl::getAddrRanges() { AddrRangeList ranges; - ranges.push_back(pc0Int->getAddrRange()); - ranges.push_back(pc1Int->getAddrRange()); + AddrRange pc0Int_range = pc0Int->getAddrRange(); + AddrRange pc1Int_range = pc1Int->getAddrRange(); + ranges.push_back( + mergePseudoChannelRanges(pc0Int_range, pc1Int_range, pchBit) + ); return ranges; } diff --git a/src/mem/hbm_ctrl.hh b/src/mem/hbm_ctrl.hh index c9045f0ae7..58cbd57c3b 100644 --- a/src/mem/hbm_ctrl.hh +++ b/src/mem/hbm_ctrl.hh @@ -72,7 +72,8 @@ class HBMCtrl : public MemCtrl } private: - + // Position of the pseudochannel bit in addresses. + int pchBit; /** * Remember if we have to retry a request for second pseudo channel. */ @@ -144,7 +145,6 @@ class HBMCtrl : public MemCtrl */ bool readQueueFullPC0(unsigned int pkt_count) const; bool readQueueFullPC1(unsigned int pkt_count) const; - bool readQueueFull(unsigned int pkt_count) const; /** * Check if the write queue partition of both pseudo diff --git a/src/mem/mem_ctrl.cc b/src/mem/mem_ctrl.cc index c65d68a5a7..731ce7be39 100644 --- a/src/mem/mem_ctrl.cc +++ b/src/mem/mem_ctrl.cc @@ -72,7 +72,6 @@ MemCtrl::MemCtrl(const MemCtrlParams &p) : writeLowThreshold(writeBufferSize * p.write_low_thresh_perc / 100.0), minWritesPerSwitch(p.min_writes_per_switch), minReadsPerSwitch(p.min_reads_per_switch), - writesThisTime(0), readsThisTime(0), memSchedPolicy(p.mem_sched_policy), frontendLatency(p.static_frontend_latency), backendLatency(p.static_backend_latency), @@ -212,7 +211,7 @@ MemCtrl::addToReadQueue(PacketPtr pkt, for (int cnt = 0; cnt < pkt_count; ++cnt) { unsigned size = std::min((addr | (burst_size - 1)) + 1, base_addr + pkt->getSize()) - addr; - stats.readPktSize[ceilLog2(size)]++; + // stats.readPktSize[ceilLog2(size)]++; stats.readBursts++; stats.requestorReadAccesses[pkt->requestorId()]++; @@ -277,6 +276,8 @@ MemCtrl::addToReadQueue(PacketPtr pkt, logRequest(MemCtrl::READ, pkt->requestorId(), pkt->qosValue(), mem_pkt->addr, 1); + mem_intr->readQueueSize++; + // Update stats stats.avgRdQLen = totalReadQueueSize + respQueue.size(); } @@ -349,6 +350,8 @@ MemCtrl::addToWriteQueue(PacketPtr pkt, unsigned int pkt_count, logRequest(MemCtrl::WRITE, pkt->requestorId(), pkt->qosValue(), mem_pkt->addr, 1); + mem_intr->writeQueueSize++; + assert(totalWriteQueueSize == isInWriteQueue.size()); // Update stats @@ -575,6 +578,9 @@ MemCtrl::chooseNext(MemPacketQueue& queue, Tick extra_col_delay, // check if there is a packet going to a free rank for (auto i = queue.begin(); i != queue.end(); ++i) { MemPacket* mem_pkt = *i; + if (mem_pkt->pseudoChannel != mem_intr->pseudoChannel) { + continue; + } if (packetReady(mem_pkt, mem_intr)) { ret = i; break; @@ -761,28 +767,28 @@ MemCtrl::verifyMultiCmd(Tick cmd_tick, Tick max_cmds_per_burst, } bool -MemCtrl::inReadBusState(bool next_state) const +MemCtrl::inReadBusState(bool next_state, MemInterface* mem_intr) const { // check the bus state if (next_state) { // use busStateNext to get the state that will be used // for the next burst - return (busStateNext == MemCtrl::READ); + return (mem_intr->busStateNext == MemCtrl::READ); } else { - return (busState == MemCtrl::READ); + return (mem_intr->busState == MemCtrl::READ); } } bool -MemCtrl::inWriteBusState(bool next_state) const 
+MemCtrl::inWriteBusState(bool next_state, MemInterface* mem_intr) const { // check the bus state if (next_state) { // use busStateNext to get the state that will be used // for the next burst - return (busStateNext == MemCtrl::WRITE); + return (mem_intr->busStateNext == MemCtrl::WRITE); } else { - return (busState == MemCtrl::WRITE); + return (mem_intr->busState == MemCtrl::WRITE); } } @@ -813,13 +819,13 @@ MemCtrl::doBurstAccess(MemPacket* mem_pkt, MemInterface* mem_intr) // Update the common bus stats if (mem_pkt->isRead()) { - ++readsThisTime; + ++(mem_intr->readsThisTime); // Update latency stats stats.requestorReadTotalLat[mem_pkt->requestorId()] += mem_pkt->readyTime - mem_pkt->entryTime; stats.requestorReadBytes[mem_pkt->requestorId()] += mem_pkt->size; } else { - ++writesThisTime; + ++(mem_intr->writesThisTime); stats.requestorWriteBytes[mem_pkt->requestorId()] += mem_pkt->size; stats.requestorWriteTotalLat[mem_pkt->requestorId()] += mem_pkt->readyTime - mem_pkt->entryTime; @@ -836,8 +842,8 @@ MemCtrl::memBusy(MemInterface* mem_intr) { // Default to busy status and update based on interface specifics // Default state of unused interface is 'true' bool mem_busy = true; - bool all_writes_nvm = mem_intr->numWritesQueued == totalWriteQueueSize; - bool read_queue_empty = totalReadQueueSize == 0; + bool all_writes_nvm = mem_intr->numWritesQueued == mem_intr->writeQueueSize; + bool read_queue_empty = mem_intr->readQueueSize == 0; mem_busy = mem_intr->isBusy(read_queue_empty, all_writes_nvm); if (mem_busy) { // if all ranks are refreshing wait for them to finish @@ -884,32 +890,32 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, } // detect bus state change - bool switched_cmd_type = (busState != busStateNext); + bool switched_cmd_type = (mem_intr->busState != mem_intr->busStateNext); // record stats recordTurnaroundStats(); DPRINTF(MemCtrl, "QoS Turnarounds selected state %s %s\n", - (busState==MemCtrl::READ)?"READ":"WRITE", + (mem_intr->busState==MemCtrl::READ)?"READ":"WRITE", switched_cmd_type?"[turnaround triggered]":""); if (switched_cmd_type) { - if (busState == MemCtrl::READ) { + if (mem_intr->busState == MemCtrl::READ) { DPRINTF(MemCtrl, "Switching to writes after %d reads with %d reads " - "waiting\n", readsThisTime, totalReadQueueSize); - stats.rdPerTurnAround.sample(readsThisTime); - readsThisTime = 0; + "waiting\n", mem_intr->readsThisTime, mem_intr->readQueueSize); + stats.rdPerTurnAround.sample(mem_intr->readsThisTime); + mem_intr->readsThisTime = 0; } else { DPRINTF(MemCtrl, "Switching to reads after %d writes with %d writes " - "waiting\n", writesThisTime, totalWriteQueueSize); - stats.wrPerTurnAround.sample(writesThisTime); - writesThisTime = 0; + "waiting\n", mem_intr->writesThisTime, mem_intr->writeQueueSize); + stats.wrPerTurnAround.sample(mem_intr->writesThisTime); + mem_intr->writesThisTime = 0; } } // updates current state - busState = busStateNext; + mem_intr->busState = mem_intr->busStateNext; nonDetermReads(mem_intr); @@ -918,18 +924,18 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, } // when we get here it is either a read or a write - if (busState == READ) { + if (mem_intr->busState == READ) { // track if we should switch or not bool switch_to_writes = false; - if (totalReadQueueSize == 0) { + if (mem_intr->readQueueSize == 0) { // In the case there is no read request to go next, // trigger writes if we have passed the low threshold (or // if we are draining) - if (!(totalWriteQueueSize == 0) && + if (!(mem_intr->writeQueueSize == 0) && 
(drainState() == DrainState::Draining || - totalWriteQueueSize > writeLowThreshold)) { + mem_intr->writeQueueSize > writeLowThreshold)) { DPRINTF(MemCtrl, "Switching to writes due to read queue empty\n"); @@ -1004,6 +1010,7 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, mem_pkt->qosValue(), mem_pkt->getAddr(), 1, mem_pkt->readyTime - mem_pkt->entryTime); + mem_intr->readQueueSize--; // Insert into response queue. It will be sent back to the // requestor at its readyTime @@ -1022,8 +1029,8 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, // there are no other writes that can issue // Also ensure that we've issued a minimum defined number // of reads before switching, or have emptied the readQ - if ((totalWriteQueueSize > writeHighThreshold) && - (readsThisTime >= minReadsPerSwitch || totalReadQueueSize == 0) + if ((mem_intr->writeQueueSize > writeHighThreshold) && + (mem_intr->readsThisTime >= minReadsPerSwitch || mem_intr->readQueueSize == 0) && !(nvmWriteBlock(mem_intr))) { switch_to_writes = true; } @@ -1038,7 +1045,7 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, // draining), or because the writes hit the high threshold if (switch_to_writes) { // transition to writing - busStateNext = WRITE; + mem_intr->busStateNext = WRITE; } } else { @@ -1092,6 +1099,7 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, mem_pkt->qosValue(), mem_pkt->getAddr(), 1, mem_pkt->readyTime - mem_pkt->entryTime); + mem_intr->writeQueueSize--; // remove the request from the queue - the iterator is no longer valid writeQueue[mem_pkt->qosValue()].erase(to_write); @@ -1105,15 +1113,15 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, // If we are interfacing to NVM and have filled the writeRespQueue, // with only NVM writes in Q, then switch to reads bool below_threshold = - totalWriteQueueSize + minWritesPerSwitch < writeLowThreshold; + mem_intr->writeQueueSize + minWritesPerSwitch < writeLowThreshold; - if (totalWriteQueueSize == 0 || + if (mem_intr->writeQueueSize == 0 || (below_threshold && drainState() != DrainState::Draining) || - (totalReadQueueSize && writesThisTime >= minWritesPerSwitch) || - (totalReadQueueSize && (nvmWriteBlock(mem_intr)))) { + (mem_intr->readQueueSize && mem_intr->writesThisTime >= minWritesPerSwitch) || + (mem_intr->readQueueSize && (nvmWriteBlock(mem_intr)))) { // turn the bus back around for reads again - busStateNext = MemCtrl::READ; + mem_intr->busStateNext = MemCtrl::READ; // note that we switch back to reads also in the idle // case, which eventually will check for any draining and @@ -1126,7 +1134,7 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, if (!next_req_event.scheduled()) schedule(next_req_event, std::max(mem_intr->nextReqTime, curTick())); - if (retry_wr_req && totalWriteQueueSize < writeBufferSize) { + if (retry_wr_req && mem_intr->writeQueueSize < writeBufferSize) { retry_wr_req = false; port.sendRetryReq(); } @@ -1400,7 +1408,7 @@ MemCtrl::drain() { // if there is anything in any of our internal queues, keep track // of that as well - if (!(!totalWriteQueueSize && !totalReadQueueSize && respQueue.empty() && + if (!(!totalWriteQueueSize && !totalReadQueueSize && respQEmpty() && allIntfDrained())) { DPRINTF(Drain, "Memory controller not drained, write: %d, read: %d," diff --git a/src/mem/mem_ctrl.hh b/src/mem/mem_ctrl.hh index fe5d478280..fffd05405e 100644 --- a/src/mem/mem_ctrl.hh +++ b/src/mem/mem_ctrl.hh @@ -515,8 +515,6 @@ class MemCtrl : public qos::MemCtrl uint32_t writeLowThreshold; const uint32_t
minWritesPerSwitch; const uint32_t minReadsPerSwitch; - uint32_t writesThisTime; - uint32_t readsThisTime; /** * Memory controller configuration initialized based on parameter @@ -762,7 +760,7 @@ class MemCtrl : public qos::MemCtrl * @param next_state Check either the current or next bus state * @return True when bus is currently in a read state */ - bool inReadBusState(bool next_state) const; + bool inReadBusState(bool next_state, MemInterface* mem_intr) const; /** * Check the current direction of the memory channel @@ -770,7 +768,7 @@ * @param next_state Check either the current or next bus state * @return True when bus is currently in a write state */ - bool inWriteBusState(bool next_state) const; + bool inWriteBusState(bool next_state, MemInterface* mem_intr) const; Port &getPort(const std::string &if_name, PortID idx=InvalidPortID) override; diff --git a/src/mem/mem_interface.hh b/src/mem/mem_interface.hh index 8d6f4fe52b..b0f762fc80 100644 --- a/src/mem/mem_interface.hh +++ b/src/mem/mem_interface.hh @@ -189,6 +189,28 @@ class MemInterface : public AbstractMemory Tick nextBurstAt = 0; Tick nextReqTime = 0; + /** + * Reads/writes performed by the controller for this interface before + * bus direction is switched + */ + uint32_t readsThisTime = 0; + uint32_t writesThisTime = 0; + + /** + * Read/write packets in the read/write queue for this interface. + * qos/mem_ctrl.hh has similar counters, but they track all packets + * in the controller for all memory interfaces connected to the + * controller. + */ + uint32_t readQueueSize = 0; + uint32_t writeQueueSize = 0; + + /** Current bus state of this interface */ + MemCtrl::BusState busState = MemCtrl::READ; + + /** Bus state to be used for the next request event on this interface */ + MemCtrl::BusState busStateNext = MemCtrl::READ; + /** * pseudo channel number used for HBM modeling */ diff --git a/src/mem/nvm_interface.cc b/src/mem/nvm_interface.cc index b2c4073cd9..e77cf59202 100644 --- a/src/mem/nvm_interface.cc +++ b/src/mem/nvm_interface.cc @@ -402,9 +402,11 @@ NVMInterface::processReadReadyEvent() bool NVMInterface::burstReady(MemPacket* pkt) const { - bool read_rdy = pkt->isRead() && (ctrl->inReadBusState(true)) && - (pkt->readyTime <= curTick()) && (numReadDataReady > 0); - bool write_rdy = !pkt->isRead() && !ctrl->inReadBusState(true) && + bool read_rdy = pkt->isRead() && (ctrl->inReadBusState(true, + (MemInterface*)(this))) && + (pkt->readyTime <= curTick()) && (numReadDataReady > 0); + bool write_rdy = !pkt->isRead() && !ctrl->inReadBusState(true, + (MemInterface*)(this)) && !writeRespQueueFull(); return (read_rdy || write_rdy); } @@ -613,7 +615,7 @@ NVMInterface::isBusy(bool read_queue_empty, bool all_writes_nvm) // Only assert busy for the write case when there are also // no reads in Q and the write queue only contains NVM commands // This allows the bus state to switch and service reads - return (ctrl->inReadBusState(true) ? + return (ctrl->inReadBusState(true, (MemInterface*)(this)) ? (numReadDataReady == 0) && !read_queue_empty : writeRespQueueFull() && read_queue_empty && all_writes_nvm);
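The recurring inReadBusState(true, (MemInterface*)(this)) pattern above reflects the central refactor of this patch: bus direction is now tracked per memory interface rather than once per controller, so the two pseudo channels can point in different directions at the same time. A reduced model of that relationship (types simplified; these are not the actual gem5 classes):

    #include <cassert>

    // Reduced model of the refactor: each interface carries its own bus
    // direction, and the controller consults whichever interface asks.
    enum class BusState { READ, WRITE };

    struct MemInterfaceModel
    {
        BusState busState = BusState::READ;
        BusState busStateNext = BusState::READ;
    };

    struct MemCtrlModel
    {
        bool
        inReadBusState(bool next_state, const MemInterfaceModel *intr) const
        {
            return (next_state ? intr->busStateNext : intr->busState)
                == BusState::READ;
        }
    };

    int main()
    {
        MemCtrlModel ctrl;
        MemInterfaceModel pc0, pc1;
        pc1.busStateNext = BusState::WRITE;  // pseudo channels can diverge
        assert(ctrl.inReadBusState(true, &pc0));
        assert(!ctrl.inReadBusState(true, &pc1));
        return 0;
    }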
diff --git a/src/mem/packet.cc b/src/mem/packet.cc index 31dc330cab..daf9d18e88 100644 --- a/src/mem/packet.cc +++ b/src/mem/packet.cc @@ -237,6 +237,7 @@ MemCmd::commandInfo[] = { {IsRead, IsResponse}, InvalidCmd, "HTMReqResp" }, { {IsRead, IsRequest}, InvalidCmd, "HTMAbort" }, { {IsRequest}, InvalidCmd, "TlbiExtSync" }, + { {IsRequest, HasData}, InvalidCmd, "UpdateWL" } }; AddrRange diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 9238dbec00..5332ee32a2 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -148,6 +148,8 @@ class MemCmd HTMAbort, // Tlb shootdown TlbiExtSync, + // MPU Accelerator + UpdateWL, NUM_MEM_CMDS }; diff --git a/src/mem/port_proxy.cc b/src/mem/port_proxy.cc index 19e1a53e84..55145ab7d7 100644 --- a/src/mem/port_proxy.cc +++ b/src/mem/port_proxy.cc @@ -56,7 +56,7 @@ PortProxy::PortProxy(const RequestPort &port, unsigned int cache_line_size) : void PortProxy::readBlobPhys(Addr addr, Request::Flags flags, - void *p, int size) const + void *p, Addr size) const { for (ChunkGenerator gen(addr, size, _cacheLineSize); !gen.done(); gen.next()) { @@ -73,7 +73,7 @@ PortProxy::readBlobPhys(Addr addr, Request::Flags flags, void PortProxy::writeBlobPhys(Addr addr, Request::Flags flags, - const void *p, int size) const + const void *p, Addr size) const { for (ChunkGenerator gen(addr, size, _cacheLineSize); !gen.done(); gen.next()) { @@ -90,7 +90,7 @@ PortProxy::writeBlobPhys(Addr addr, Request::Flags flags, void PortProxy::memsetBlobPhys(Addr addr, Request::Flags flags, - uint8_t v, int size) const + uint8_t v, Addr size) const { // quick and dirty... uint8_t *buf = new uint8_t[size]; diff --git a/src/mem/port_proxy.hh b/src/mem/port_proxy.hh index 29f6ba60a4..8cd21322ea 100644 --- a/src/mem/port_proxy.hh +++ b/src/mem/port_proxy.hh @@ -120,19 +120,19 @@ class PortProxy : FunctionalRequestProtocol * Read size bytes of memory at physical address and store in p. */ void readBlobPhys(Addr addr, Request::Flags flags, - void *p, int size) const; + void *p, Addr size) const; /** * Write size bytes from p to physical address. */ void writeBlobPhys(Addr addr, Request::Flags flags, - const void *p, int size) const; + const void *p, Addr size) const; /** * Fill size bytes starting at physical addr with byte value val. */ void memsetBlobPhys(Addr addr, Request::Flags flags, - uint8_t v, int size) const; + uint8_t v, Addr size) const; @@ -143,7 +143,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. */ virtual bool - tryReadBlob(Addr addr, void *p, int size) const + tryReadBlob(Addr addr, void *p, Addr size) const { readBlobPhys(addr, 0, p, size); return true; } @@ -154,7 +154,7 @@ * Returns true on success and false on failure. */ virtual bool - tryWriteBlob(Addr addr, const void *p, int size) const + tryWriteBlob(Addr addr, const void *p, Addr size) const { writeBlobPhys(addr, 0, p, size); return true; } @@ -165,7 +165,7 @@ * Returns true on success and false on failure. */ virtual bool - tryMemsetBlob(Addr addr, uint8_t val, int size) const + tryMemsetBlob(Addr addr, uint8_t val, Addr size) const { memsetBlobPhys(addr, 0, val, size); return true; } @@ -179,7 +179,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryReadBlob, but insists on success.
*/ void - readBlob(Addr addr, void *p, int size) const + readBlob(Addr addr, void *p, Addr size) const { if (!tryReadBlob(addr, p, size)) fatal("readBlob(%#x, ...) failed", addr); @@ -189,7 +189,7 @@ * Same as tryWriteBlob, but insists on success. */ void - writeBlob(Addr addr, const void *p, int size) const + writeBlob(Addr addr, const void *p, Addr size) const { if (!tryWriteBlob(addr, p, size)) fatal("writeBlob(%#x, ...) failed", addr); @@ -199,7 +199,7 @@ * Same as tryMemsetBlob, but insists on success. */ void - memsetBlob(Addr addr, uint8_t v, int size) const + memsetBlob(Addr addr, uint8_t v, Addr size) const { if (!tryMemsetBlob(addr, v, size)) fatal("memsetBlob(%#x, ...) failed", addr); diff --git a/src/mem/simple_mem.hh b/src/mem/simple_mem.hh index fc6d6849d5..f57ef33629 100644 --- a/src/mem/simple_mem.hh +++ b/src/mem/simple_mem.hh @@ -178,7 +178,6 @@ class SimpleMemory : public AbstractMemory std::unique_ptr<Packet> pendingDelete; public: - SimpleMemory(const SimpleMemoryParams &p); DrainState drain() override; @@ -187,6 +186,8 @@ class SimpleMemory : public AbstractMemory PortID idx=InvalidPortID) override; void init() override; + double getBW() { return bandwidth; } + protected: Tick recvAtomic(PacketPtr pkt); Tick recvAtomicBackdoor(PacketPtr pkt, MemBackdoorPtr &_backdoor); diff --git a/src/mem/translating_port_proxy.cc b/src/mem/translating_port_proxy.cc index 8ab859f40d..bc698c1a07 100644 --- a/src/mem/translating_port_proxy.cc +++ b/src/mem/translating_port_proxy.cc @@ -86,7 +86,7 @@ TranslatingPortProxy::tryOnBlob(BaseMMU::Mode mode, TranslationGenPtr gen, } bool -TranslatingPortProxy::tryReadBlob(Addr addr, void *p, int size) const +TranslatingPortProxy::tryReadBlob(Addr addr, void *p, Addr size) const { constexpr auto mode = BaseMMU::Read; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( @@ -99,7 +99,7 @@ TranslatingPortProxy::tryReadBlob(Addr addr, void *p, int size) const bool TranslatingPortProxy::tryWriteBlob( - Addr addr, const void *p, int size) const + Addr addr, const void *p, Addr size) const { constexpr auto mode = BaseMMU::Write; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( @@ -111,7 +111,7 @@ TranslatingPortProxy::tryWriteBlob( } bool -TranslatingPortProxy::tryMemsetBlob(Addr addr, uint8_t v, int size) const +TranslatingPortProxy::tryMemsetBlob(Addr addr, uint8_t v, Addr size) const { constexpr auto mode = BaseMMU::Write; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( diff --git a/src/mem/translating_port_proxy.hh b/src/mem/translating_port_proxy.hh index bedb57a3ce..7e619784b1 100644 --- a/src/mem/translating_port_proxy.hh +++ b/src/mem/translating_port_proxy.hh @@ -77,16 +77,16 @@ class TranslatingPortProxy : public PortProxy /** Version of tryReadBlob that translates virt->phys and deals * with page boundaries. */ - bool tryReadBlob(Addr addr, void *p, int size) const override; + bool tryReadBlob(Addr addr, void *p, Addr size) const override; /** Version of tryWriteBlob that translates virt->phys and deals * with page boundaries. */ - bool tryWriteBlob(Addr addr, const void *p, int size) const override; + bool tryWriteBlob(Addr addr, const void *p, Addr size) const override; /** * Fill size bytes starting at addr with byte value val. */ - bool tryMemsetBlob(Addr address, uint8_t v, int size) const override; + bool tryMemsetBlob(Addr address, uint8_t v, Addr size) const override; }; } // namespace gem5
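The int-to-Addr widening across the port proxy API matters once a single blob can exceed 2 GiB: a signed 32-bit size wraps (and goes negative) where a 64-bit Addr-sized count stays exact. A small standalone illustration of the failure mode (plain C++, no gem5 dependencies):

    #include <cassert>
    #include <cstdint>

    int main()
    {
        // A 3 GiB blob: representable as a 64-bit size, but truncated
        // when forced through a 32-bit int (negative on the common
        // two's-complement platforms gem5 targets).
        uint64_t size = 3ULL * 1024 * 1024 * 1024;
        int narrow = static_cast<int>(size);
        assert(size == 0xC0000000ULL);
        assert(narrow < 0);
        return 0;
    }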
diff --git a/src/python/gem5/components/memory/hbm.py b/src/python/gem5/components/memory/hbm.py index 35497c2f89..75db1f9fde 100644 --- a/src/python/gem5/components/memory/hbm.py +++ b/src/python/gem5/components/memory/hbm.py @@ -122,7 +122,6 @@ def _interleave_addresses(self): # for interleaving across pseudo channels (at 64B currently) mask_list.insert(0, 1 << 6) for i, ctrl in enumerate(self.mem_ctrl): - ctrl.partitioned_q = False ctrl.dram.range = AddrRange( start=self._mem_range.start, size=self._mem_range.size(),
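The mask_list.insert(0, 1 << 6) above fixes pseudo channel interleaving at address bit 6, which is also the value HBMCtrl's new pch_bit parameter must be given for recvTimingReq to steer packets consistently. A sketch of the steering test itself (a single-bit version of the decode in HBMCtrl::recvTimingReq; the addresses are arbitrary):

    #include <cassert>
    #include <cstdint>

    // A packet goes to pseudo channel 0 when bit pch_bit of its
    // address is clear, and to pseudo channel 1 otherwise.
    bool isPC0(uint64_t addr, int pch_bit)
    {
        return ((addr >> pch_bit) & 1) == 0;
    }

    int main()
    {
        const int pch_bit = 6;  // must match the 1 << 6 interleaving mask
        assert(isPC0(0x000, pch_bit));   // bit 6 clear -> PC0
        assert(!isPC0(0x040, pch_bit));  // bit 6 set   -> PC1
        return 0;
    }

If the configuration script and the controller ever disagree on this bit, requests are routed to a pseudo channel whose interface does not own the address, so keeping the two values tied to one parameter is the safer design.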