diff --git a/configs/accl/async-pr.py b/configs/accl/async-pr.py
new file mode 100644
index 0000000000..5bd4f76209
--- /dev/null
+++ b/configs/accl/async-pr.py
@@ -0,0 +1,132 @@
+# Copyright (c) 2022 The Regents of the University of California
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import m5
+import argparse
+
+from m5.objects import *
+
+
+def get_inputs():
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument("num_gpts", type=int)
+    argparser.add_argument("num_registers", type=int)
+    argparser.add_argument("cache_size", type=str)
+    argparser.add_argument("r_queue_size", type=int)
+    argparser.add_argument("r_latency", type=str)
+    argparser.add_argument("graph", type=str)
+    argparser.add_argument("alpha", type=float)
+    argparser.add_argument("threshold", type=float)
+    argparser.add_argument(
+        "--simple",
+        dest="simple",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Use simple memory for vertex",
+    )
+    argparser.add_argument(
+        "--sample",
+        dest="sample",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Sample sim stats every 100us",
+    )
+    argparser.add_argument(
+        "--verify",
+        dest="verify",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Print final answer",
+    )
+
+    args = argparser.parse_args()
+
+    return (
+        args.num_gpts,
+        args.num_registers,
+        args.cache_size,
+        args.r_queue_size,
+        args.r_latency,
+        args.graph,
+        args.alpha,
+        args.threshold,
+        args.simple,
+        args.sample,
+        args.verify,
+    )
+
+
+if __name__ == "__m5_main__":
+    (
+        num_gpts,
+        num_registers,
+        cache_size,
+        r_queue_size,
+        r_latency,
+        graph,
+        alpha,
+        threshold,
+        simple,
+        sample,
+        verify,
+    ) = get_inputs()
+
+    if simple:
+        # Import sega_simple here: its SEGA constructor matches this
+        # argument list (sega_simple_pt2pt also requires gpt_per_gpn,
+        # sample_time, and tokens, which this script does not parse).
+        from sega_simple import SEGA
+        system = SEGA(num_gpts, num_registers, cache_size,
+                      r_queue_size, r_latency, graph)
+    else:
+        from sega import SEGA
+        system = SEGA(num_gpts, num_registers, cache_size, graph)
+    root = Root(full_system=False, system=system)
+
+    m5.instantiate()
+
+    system.set_async_mode()
+    system.create_pop_count_directory(64)
+    system.create_async_pr_workload(alpha, threshold)
+    if sample:
+        while True:
+            exit_event = m5.simulate(100000000)
+            print(
+                f"Exited simulation at tick {m5.curTick()} "
+                + f"because {exit_event.getCause()}"
+            )
+            m5.stats.dump()
+            m5.stats.reset()
+            if exit_event.getCause() != "simulate() limit reached":
+                break
+    else:
+        exit_event = m5.simulate()
+        print(
+            f"Exited simulation at tick {m5.curTick()} "
+            + f"because {exit_event.getCause()}"
+        )
+    if verify:
+        system.print_answer()
diff --git a/configs/accl/bc.py b/configs/accl/bc.py
new file mode 100644
index 0000000000..9a0bf298b5
--- /dev/null
+++ b/configs/accl/bc.py
@@ -0,0 +1,162 @@
+# Copyright (c) 2022 The Regents of the University of California
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import m5
+import argparse
+
+from m5.objects import *
+
+
+def get_inputs():
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument("num_gpts", type=int)
+    argparser.add_argument("num_registers", type=int)
+    argparser.add_argument("cache_size", type=str)
+    argparser.add_argument("r_queue_size", type=int)
+    argparser.add_argument("r_latency", type=int)
+    argparser.add_argument("gpt_per_gpn", type=int)
+    argparser.add_argument("graph", type=str)
+    argparser.add_argument("init_addr", type=int)
+    argparser.add_argument("init_value", type=int)
+    argparser.add_argument("sample_time", type=str)
+    argparser.add_argument("tokens", type=int)
+    argparser.add_argument(
+        "--simple",
+        dest="simple",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Use simple memory for vertex",
+    )
+    argparser.add_argument(
+        "--pt2pt",
+        dest="pt2pt",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Use point-to-point (router-based) topology",
+    )
+    argparser.add_argument(
+        "--sample",
+        dest="sample",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Sample sim stats every 100us",
+    )
+    argparser.add_argument(
+        "--verify",
+        dest="verify",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Print final answer",
+    )
+
+    args = argparser.parse_args()
+
+    return (
+        args.num_gpts,
+        args.num_registers,
+        args.cache_size,
+        args.r_queue_size,
+        args.r_latency,
+        args.gpt_per_gpn,
+        args.graph,
+        args.init_addr,
+        args.init_value,
+        args.sample_time,
+        args.tokens,
+        args.simple,
+        args.pt2pt,
+        args.sample,
+        args.verify,
+    )
+
+
+if __name__ == "__m5_main__":
+    (
+        num_gpts,
+        num_registers,
+        cache_size,
+        r_queue_size,
+        r_latency,
+        gpt_per_gpn,
+        graph,
+        init_addr,
+        init_value,
+        sample_time,
+        tokens,
+        simple,
+        pt2pt,
+        sample,
+        verify,
+    ) = get_inputs()
+
+    if simple:
+        if pt2pt:
+            from sega_simple_pt2pt import SEGA
+            system = SEGA(num_gpts, num_registers, cache_size,
+                r_queue_size, r_latency, gpt_per_gpn, graph,
+                sample_time, tokens)
+        else:
+            from sega_simple import SEGA
+            system = SEGA(num_gpts, num_registers, cache_size,
+                r_queue_size, r_latency, graph)
+    else:
+        from sega import SEGA
+        system = SEGA(num_gpts, num_registers, cache_size, graph)
+    root = Root(full_system=False, system=system)
+
+    m5.instantiate()
+
+    system.set_bsp_mode()
+    system.create_pop_count_directory(64)
+    system.create_bc_workload(init_addr, init_value)
+    if sample:
+        while True:
+            exit_event = m5.simulate(100000000)
+            print(
+                f"Exited simulation at tick {m5.curTick()} "
+                + f"because {exit_event.getCause()}"
+            )
+            m5.stats.dump()
+            m5.stats.reset()
+            if exit_event.getCause() != "simulate() limit reached":
+                break
+    else:
+        iterations = 0
+        while True:
+            exit_event = m5.simulate()
+            print(
+                f"Exited simulation at tick {m5.curTick()} "
+                + f"because {exit_event.getCause()}"
+            )
+            iterations += 1
+            if system.work_count() == 0:
+                break
+        print(f"#iterations: {iterations}")
+    if verify:
+        system.print_answer()
diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py
new file mode 100644
index 0000000000..c2150ce751
--- /dev/null
+++ b/configs/accl/bfs.py
@@ -0,0 +1,202 @@
+# Copyright (c) 2022 The Regents of the University of California
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import m5
+import argparse
+
+from m5.objects import *
+
+
+def get_inputs():
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument("num_gpts", type=int)
+    argparser.add_argument("num_registers", type=int)
+    argparser.add_argument("cache_size", type=str)
+    argparser.add_argument("r_queue_size", type=int)
+    argparser.add_argument("r_latency", type=int)
+    argparser.add_argument("gpt_per_gpn", type=int)
+    argparser.add_argument("graph", type=str)
+    argparser.add_argument("init_addr", type=int)
+    argparser.add_argument("init_value", type=int)
+    argparser.add_argument("sample_time", type=str)
+    argparser.add_argument("tokens", type=int)
+    argparser.add_argument(
+        "--tile",
+        dest="tile",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Whether to use temporal partitioning",
+    )
+    argparser.add_argument(
+        "--best",
+        dest="best",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Whether to use best update value for switching slices",
+    )
+    argparser.add_argument(
+        "--visited",
+        dest="visited",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Use visitation version of BFS",
+    )
+    argparser.add_argument(
+        "--simple",
+        dest="simple",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Use simple memory for vertex",
+    )
+    argparser.add_argument(
+        "--pt2pt",
+        dest="pt2pt",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Use point-to-point (router-based) topology",
+    )
+    argparser.add_argument(
+        "--sample",
+        dest="sample",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Sample sim stats every 50us",
+    )
+    argparser.add_argument(
+        "--verify",
+        dest="verify",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Print final answer",
+    )
+
+    args = argparser.parse_args()
+
+    return (
+        args.num_gpts,
+        args.num_registers,
+        args.cache_size,
+        args.r_queue_size,
+        args.r_latency,
+        args.gpt_per_gpn,
+        args.graph,
+        args.init_addr,
+        args.init_value,
+        args.sample_time,
+        args.tokens,
+        args.tile,
+        args.best,
+        args.visited,
+        args.simple,
+        args.pt2pt,
+        args.sample,
+        args.verify,
+    )
+
+
+if __name__ == "__m5_main__":
+    (
+        num_gpts,
+        num_registers,
+        cache_size,
+        r_queue_size,
+        r_latency,
+        gpt_per_gpn,
+        graph,
+        init_addr,
+        init_value,
+        sample_time,
+        tokens,
+        tile,
+        best,
+        visited,
+        simple,
+        pt2pt,
+        sample,
+        verify,
+    ) = get_inputs()
+
+    if simple:
+        if pt2pt:
+            from sega_simple_pt2pt import SEGA
+            system = SEGA(num_gpts, num_registers, cache_size,
+                r_queue_size, r_latency, gpt_per_gpn, graph,
+                sample_time, tokens)
+        else:
+            from sega_simple import SEGA
+            system = SEGA(num_gpts, num_registers, cache_size,
+                r_queue_size, r_latency, graph)
+    else:
+        from sega import SEGA
+        system = SEGA(num_gpts, num_registers, cache_size, graph)
+
+    # Hook the parsed --best flag up to the controller's choose_best
+    # parameter; it was previously parsed but never used.
+    system.set_choose_best(best)
+    root = Root(full_system=False, system=system)
+    m5.instantiate()
+    if tile:
+        system.set_pg_mode()
+    else:
+        system.set_async_mode()
+
+    system.create_pop_count_directory(64)
+    if visited:
+        system.create_bfs_visited_workload(init_addr, init_value)
+    else:
+        system.create_bfs_workload(init_addr, init_value)
+    if sample:
+        while True:
+            exit_event = m5.simulate(50000000)
+            print(
+                f"Exited simulation at tick {m5.curTick()} "
+                + f"because {exit_event.getCause()}"
+            )
+            if exit_event.getCause() == "simulate() limit reached":
+                m5.stats.dump()
+                m5.stats.reset()
+            elif exit_event.getCause() == "Done with all the slices.":
+                break
+            elif exit_event.getCause() == "no update left to process.":
+                break
+    else:
+        while True:
+            exit_event = m5.simulate()
+            print(
+                f"Exited simulation at tick {m5.curTick()} "
+                + f"because {exit_event.getCause()}"
+            )
+            if exit_event.getCause() == "Done with all the slices.":
+                break
+            if exit_event.getCause() == "no update left to process.":
+                break
+    if verify:
+        system.print_answer()
diff --git a/configs/accl/cc.py b/configs/accl/cc.py
new file mode 100644
index 0000000000..03b3d04d46
--- /dev/null
+++ b/configs/accl/cc.py
@@ -0,0 +1,150 @@
+# Copyright (c) 2022 The Regents of the University of California
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import m5
+import argparse
+
+from m5.objects import *
+
+
+def get_inputs():
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument("num_gpts", type=int)
+    argparser.add_argument("num_registers", type=int)
+    argparser.add_argument("cache_size", type=str)
+    argparser.add_argument("r_queue_size", type=int)
+    argparser.add_argument("r_latency", type=int)
+    argparser.add_argument("gpt_per_gpn", type=int)
+    argparser.add_argument("graph", type=str)
+    argparser.add_argument("sample_time", type=str)
+    argparser.add_argument("tokens", type=int)
+    argparser.add_argument(
+        "--simple",
+        dest="simple",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Use simple memory for vertex",
+    )
+    argparser.add_argument(
+        "--pt2pt",
+        dest="pt2pt",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Use point-to-point (router-based) topology",
+    )
+    argparser.add_argument(
+        "--sample",
+        dest="sample",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Sample sim stats every 100us",
+    )
+    argparser.add_argument(
+        "--verify",
+        dest="verify",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Print final answer",
+    )
+
+    args = argparser.parse_args()
+
+    return (
+        args.num_gpts,
+        args.num_registers,
+        args.cache_size,
+        args.r_queue_size,
+        args.r_latency,
+        args.gpt_per_gpn,
+        args.graph,
+        args.sample_time,
+        args.tokens,
+        args.simple,
+        args.pt2pt,
+        args.sample,
+        args.verify,
+    )
+
+
+if __name__ == "__m5_main__":
+    (
+        num_gpts,
+        num_registers,
+        cache_size,
+        r_queue_size,
+        r_latency,
+        gpt_per_gpn,
+        graph,
+        sample_time,
+        tokens,
+        simple,
+        pt2pt,
+        sample,
+        verify,
+    ) = get_inputs()
+
+    if simple:
+        if pt2pt:
+            from sega_simple_pt2pt import SEGA
+            system = SEGA(num_gpts, num_registers, cache_size,
+                r_queue_size, r_latency, gpt_per_gpn, graph,
+                sample_time, tokens)
+        else:
+            from sega_simple import SEGA
+            system = SEGA(num_gpts, num_registers, cache_size,
+                r_queue_size, r_latency, graph)
+    else:
+        from sega import SEGA
+        system = SEGA(num_gpts, num_registers, cache_size, graph)
+    root = Root(full_system=False, system=system)
+
+    m5.instantiate()
+
+    system.set_async_mode()
+    system.create_pop_count_directory(64)
+    system.create_cc_workload()
+    if sample:
+        while True:
+            exit_event = m5.simulate(100000000)
+            print(
+                f"Exited simulation at tick {m5.curTick()} "
+                + f"because {exit_event.getCause()}"
+            )
+            m5.stats.dump()
+            m5.stats.reset()
+            if exit_event.getCause() != "simulate() limit reached":
+                break
+    else:
+        exit_event = m5.simulate()
+        print(
+            f"Exited simulation at tick {m5.curTick()} "
+            + f"because {exit_event.getCause()}"
+        )
+    if verify:
+        system.print_answer()
diff --git a/configs/accl/pr.py b/configs/accl/pr.py
new file mode 100644
index 0000000000..7ef6587ab3
--- /dev/null
+++ b/configs/accl/pr.py
@@ -0,0 +1,173 @@
+# Copyright (c) 2022 The Regents of the University of California
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import m5
+import argparse
+
+from m5.objects import *
+
+
+def get_inputs():
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument("num_gpts", type=int)
+    argparser.add_argument("num_registers", type=int)
+    argparser.add_argument("cache_size", type=str)
+    argparser.add_argument("r_queue_size", type=int)
+    argparser.add_argument("r_latency", type=int)
+    argparser.add_argument("gpt_per_gpn", type=int)
+    argparser.add_argument("graph", type=str)
+    argparser.add_argument("iterations", type=int)
+    argparser.add_argument("alpha", type=float)
+    argparser.add_argument("sample_time", type=str)
+    argparser.add_argument("tokens", type=int)
+    argparser.add_argument("--num_nodes", type=int, default=1)
+    argparser.add_argument("--error_threshold", type=float, default=0.0)
+    argparser.add_argument(
+        "--simple",
+        dest="simple",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Use simple memory for vertex",
+    )
+    argparser.add_argument(
+        "--pt2pt",
+        dest="pt2pt",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Use point-to-point (router-based) topology",
+    )
+    argparser.add_argument(
+        "--sample",
+        dest="sample",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Sample sim stats every 100us",
+    )
+    argparser.add_argument(
+        "--verify",
+        dest="verify",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Print final answer",
+    )
+
+    args = argparser.parse_args()
+
+    return (
+        args.num_gpts,
+        args.num_registers,
+        args.cache_size,
+        args.r_queue_size,
+        args.r_latency,
+        args.gpt_per_gpn,
+        args.graph,
+        args.iterations,
+        args.alpha,
+        args.num_nodes,
+        args.error_threshold,
+        args.sample_time,
+        args.tokens,
+        args.simple,
+        args.pt2pt,
+        args.sample,
+        args.verify,
+    )
+
+
+if __name__ == "__m5_main__":
+    (
+        num_gpts,
+        num_registers,
+        cache_size,
+        r_queue_size,
+        r_latency,
+        gpt_per_gpn,
+        graph,
+        iterations,
+        alpha,
+        num_nodes,
+        error_threshold,
+        sample_time,
+        tokens,
+        simple,
+        pt2pt,
+        sample,
+        verify,
+    ) = get_inputs()
+
+    print(f"error_threshold: {error_threshold}")
+
+    if simple:
+        if pt2pt:
+            from sega_simple_pt2pt import SEGA
+            system = SEGA(num_gpts, num_registers, cache_size,
+                r_queue_size, r_latency, gpt_per_gpn, graph,
+                sample_time, tokens)
+        else:
+            from sega_simple import SEGA
+            system = SEGA(num_gpts, num_registers, cache_size,
+                r_queue_size, r_latency, graph)
+    else:
+        from sega import SEGA
+        system = SEGA(num_gpts, num_registers, cache_size, graph)
+    root = Root(full_system=False, system=system)
+
+    m5.instantiate()
+
+    system.set_bsp_mode()
+    system.create_pop_count_directory(64)
+    system.create_pr_workload(num_nodes, alpha)
+    if sample:
+        while True:
+            exit_event = m5.simulate(100000000)
+            print(
+                f"Exited simulation at tick {m5.curTick()} "
+                + f"because {exit_event.getCause()}"
+            )
+            m5.stats.dump()
+            m5.stats.reset()
+            if exit_event.getCause() != "simulate() limit reached":
+                break
+    else:
+        iteration = 0
+        while iteration < iterations:
+            exit_event = m5.simulate()
+            print(
+                f"Exited simulation at tick {m5.curTick()} "
+                + f"because {exit_event.getCause()}"
+            )
+            iteration += 1
+            print(f"error: {system.get_pr_error()}")
+            if system.get_pr_error() < error_threshold:
+                break
+            if system.work_count() == 0:
+                break
+        print(f"#iterations: {iteration}")
+    if verify:
+        system.print_answer()
diff --git a/configs/accl/sega.py b/configs/accl/sega.py
new file mode 100644
index 0000000000..17d84bd86c
--- /dev/null
+++ b/configs/accl/sega.py
@@ -0,0 +1,275 @@
+# Copyright (c) 2022 The Regents of the University of California
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +from math import log +from m5.objects import * + + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append( + AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i, + ) + ) + return ret, intlv_low_bit + intlv_bits - 1 + + +class GPT(SubSystem): + def __init__(self, register_file_size: int, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=register_file_size, + examine_window=8, + rd_per_cycle=4, + reduce_per_cycle=32, + wr_per_cycle=4, + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + max_resp_per_cycle=8, + pending_pull_limit=64, + active_buffer_size=80, + post_push_wb_queue_size=64, + transitions_per_cycle=4, + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=1024, + examine_window=12, + max_propagates_per_cycle=8, + update_queue_size=64, + ) + + self.vertex_mem_ctrl = HBMCtrl( + dram=HBM_2000_4H_1x64(), + dram_2=HBM_2000_4H_1x64(), + ) + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine, + ) + + def getRespPort(self): + return self.wl_engine.in_ports + + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + + def setReqPort(self, port): + self.push_engine.out_ports = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + + def set_vertex_range(self, vertex_ranges): + self.vertex_mem_ctrl.dram.range = vertex_ranges[0] + self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] + + def set_vertex_pch_bit(self, pch_bit): + self.vertex_mem_ctrl.pch_bit = pch_bit + + +class EdgeMemory(SubSystem): + def __init__(self, size: str): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + self.clk_domain.voltage_domain = VoltageDomain() + + self.mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) + ) + self.xbar = NoncoherentXBar( + width=64, frontend_latency=1, forward_latency=1, response_latency=1 + ) + self.xbar.mem_side_ports = self.mem_ctrl.port + + def set_image(self, image): + self.mem_ctrl.dram.image_file = image + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port + + +class SEGAController(SubSystem): + def __init__(self, mirror_bw): + super().__init__() + self.map_mem = SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth="1024GiB/s", + range=AddrRange(start=0, size="4GiB"), + in_addr_map=False, + ) + self.controller = CenteralController( + choose_best=False, + mirrors_mem=SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth=mirror_bw, + range=AddrRange(start=0, size="16GiB"), + in_addr_map=False, + ), + ) + self.controller.mem_port = self.controller.mirrors_mem.port + self.controller.mirrors_map_mem = self.map_mem.port + + def set_choose_best(self, choose_best): + self.controller.choose_best = choose_best + + def set_vertices_image(self, vertices): + self.controller.vertex_image_file = vertices + + def 
set_aux_images(self, mirrors, mirrors_map): + self.controller.mirrors_mem.image_file = mirrors + self.map_mem.image_file = mirrors_map + + def set_mpu_vector(self, mpu_vector): + self.controller.mpu_vector = mpu_vector + + +class SEGA(System): + def __init__( + self, + num_gpts, + num_registers, + cache_size, + graph_path, + ): + super(SEGA, self).__init__() + assert num_gpts != 0 + assert num_gpts % 2 == 0 + assert (num_gpts & (num_gpts - 1)) == 0 + + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2GHz" + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + self.ctrl = SEGAController("256GiB/s") + self.ctrl.set_vertices_image(f"{graph_path}/vertices") + + edge_mem = [] + for i in range(int(num_gpts / 2)): + mem = EdgeMemory("4GiB") + mem.set_image(f"{graph_path}/edgelist_{i}") + edge_mem.append(mem) + self.edge_mem = edge_mem + # Building the GPTs + vertex_ranges, pch_bit = interleave_addresses( + AddrRange(start=0, size="4GiB"), 2 * num_gpts, 32 + ) + gpts = [] + for i in range(num_gpts): + gpt = GPT(num_registers, cache_size) + gpt.set_vertex_range( + [vertex_ranges[i], vertex_ranges[i + num_gpts]] + ) + gpt.set_vertex_pch_bit(pch_bit) + gpt.setEdgeMemPort( + self.edge_mem[i % (int(num_gpts / 2))].getPort() + ) + gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) + self.gpts = gpts + + self.ctrl.set_mpu_vector([gpt.mpu for gpt in self.gpts]) + + def work_count(self): + return self.ctrl.controller.workCount() + + def set_async_mode(self): + self.ctrl.controller.setAsyncMode() + + def set_bsp_mode(self): + self.ctrl.controller.setBSPMode() + + def set_pg_mode(self): + self.ctrl.controller.setPGMode() + + def set_aux_images(self, mirrors, mirrors_map): + self.ctrl.set_aux_images(mirrors, mirrors_map) + + def set_choose_best(self, choose_best): + self.ctrl.set_choose_best(choose_best) + + def create_pop_count_directory(self, atoms_per_block): + self.ctrl.controller.createPopCountDirectory(atoms_per_block) + + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.controller.createBFSWorkload(init_addr, init_value) + + def create_bfs_visited_workload(self, init_addr, init_value): + self.ctrl.controller.createBFSVisitedWorkload(init_addr, init_value) + + def create_sssp_workload(self, init_addr, init_value): + self.ctrl.controller.createSSSPWorkload(init_addr, init_value) + + def create_cc_workload(self): + self.ctrl.controller.createCCWorkload() + + def create_async_pr_workload(self, alpha, threshold): + self.ctrl.controller.createAsyncPRWorkload(alpha, threshold) + + def create_pr_workload(self, num_nodes, alpha): + self.ctrl.controller.createPRWorkload(num_nodes, alpha) + + def get_pr_error(self): + return self.ctrl.controller.getPRError() + + def create_bc_workload(self, init_addr, init_value): + self.ctrl.controller.createBCWorkload(init_addr, init_value) + + def print_answer(self): + self.ctrl.controller.printAnswerToHostSimout() diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py new file mode 100644 index 0000000000..08f0f181ba --- /dev/null +++ b/configs/accl/sega_simple.py @@ -0,0 +1,267 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from math import log +from m5.objects import * + + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append( + AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i, + ) + ) + return ret + + +class GPT(SubSystem): + def __init__(self, register_file_size: int, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=register_file_size, + examine_window=8, + rd_per_cycle=4, + reduce_per_cycle=32, + wr_per_cycle=4, + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + max_resp_per_cycle=8, + pending_pull_limit=64, + active_buffer_size=80, + post_push_wb_queue_size=64, + transitions_per_cycle=4, + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=1024, + examine_window=12, + max_propagates_per_cycle=8, + update_queue_size=64, + ) + + self.vertex_mem_ctrl = SimpleMemory( + latency="120ns", bandwidth="256GiB/s" + ) + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine, + ) + + def getRespPort(self): + return self.wl_engine.in_ports + + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + + def setReqPort(self, port): + self.push_engine.out_ports = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + + +class EdgeMemory(SubSystem): + def __init__(self, size: str): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + 
self.clk_domain.clock = "2.4GHz"
+        self.clk_domain.voltage_domain = VoltageDomain()
+
+        self.mem_ctrl = MemCtrl(
+            dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False)
+        )
+        self.xbar = NoncoherentXBar(
+            width=64, frontend_latency=1, forward_latency=1, response_latency=1
+        )
+        self.xbar.mem_side_ports = self.mem_ctrl.port
+
+    def set_image(self, image):
+        self.mem_ctrl.dram.image_file = image
+
+    def getPort(self):
+        return self.xbar.cpu_side_ports
+
+    def setPort(self, port):
+        self.xbar.cpu_side_ports = port
+
+
+class SEGAController(SubSystem):
+    def __init__(self, mirror_bw):
+        super().__init__()
+        self.map_mem = SimpleMemory(
+            latency="0ns",
+            latency_var="0ns",
+            bandwidth="1024GiB/s",
+            range=AddrRange(start=0, size="4GiB"),
+            in_addr_map=False,
+        )
+        self.controller = CenteralController(
+            choose_best=False,
+            mirrors_mem=SimpleMemory(
+                latency="0ns",
+                latency_var="0ns",
+                bandwidth=mirror_bw,
+                range=AddrRange(start=0, size="16GiB"),
+                in_addr_map=False,
+            ),
+        )
+        self.controller.mem_port = self.controller.mirrors_mem.port
+        self.controller.mirrors_map_mem = self.map_mem.port
+
+    def set_choose_best(self, choose_best):
+        self.controller.choose_best = choose_best
+
+    def set_vertices_image(self, vertices):
+        self.controller.vertex_image_file = vertices
+
+    def set_aux_images(self, mirrors, mirrors_map):
+        self.controller.mirrors_mem.image_file = mirrors
+        self.map_mem.image_file = mirrors_map
+
+    def set_mpu_vector(self, mpu_vector):
+        self.controller.mpu_vector = mpu_vector
+
+
+class SEGA(System):
+    def __init__(self, num_gpts, num_registers, cache_size,
+                 r_queue_size, r_latency, graph_path):
+        super(SEGA, self).__init__()
+        assert num_gpts != 0
+        assert num_gpts % 2 == 0
+        assert (num_gpts & (num_gpts - 1)) == 0
+
+        self.clk_domain = SrcClockDomain()
+        self.clk_domain.clock = "2GHz"
+        self.clk_domain.voltage_domain = VoltageDomain()
+        self.cache_line_size = 32
+        self.mem_mode = "timing"
+
+        self.ctrl = SEGAController("256GiB/s")
+        self.ctrl.set_vertices_image(f"{graph_path}/vertices")
+
+        edge_mem = []
+        for i in range(int(num_gpts / 2)):
+            mem = EdgeMemory("4GiB")
+            mem.set_image(f"{graph_path}/edgelist_{i}")
+            edge_mem.append(mem)
+        self.edge_mem = edge_mem
+        # Building the GPTs
+        vertex_ranges = interleave_addresses(
+            AddrRange(start=0, size="4GiB"), num_gpts, 32
+        )
+        gpts = []
+        for i in range(num_gpts):
+            gpt = GPT(num_registers, cache_size)
+            gpt.set_vertex_range(vertex_ranges[i])
+            gpt.setEdgeMemPort(
+                self.edge_mem[i % (int(num_gpts / 2))].getPort()
+            )
+            gpts.append(gpt)
+        # Creating the interconnect among mpus
+        for gpt_0 in gpts:
+            for gpt_1 in gpts:
+                gpt_0.setReqPort(gpt_1.getRespPort())
+        self.gpts = gpts
+
+        # Register the MPUs with the controller through its accessor;
+        # assigning ctrl.mpu_vector directly would only set an attribute
+        # on the SubSystem wrapper, not on the controller itself.
+        self.ctrl.set_mpu_vector([gpt.mpu for gpt in self.gpts])
+
+    def work_count(self):
+        return self.ctrl.controller.workCount()
+
+    def set_async_mode(self):
+        self.ctrl.controller.setAsyncMode()
+
+    def set_bsp_mode(self):
+        self.ctrl.controller.setBSPMode()
+
+    def set_pg_mode(self):
+        self.ctrl.controller.setPGMode()
+
+    def set_aux_images(self, mirrors, mirrors_map):
+        self.ctrl.set_aux_images(mirrors, mirrors_map)
+
+    def set_choose_best(self, choose_best):
+        self.ctrl.set_choose_best(choose_best)
+
+    def create_pop_count_directory(self, atoms_per_block):
+        self.ctrl.controller.createPopCountDirectory(atoms_per_block)
+
+    def create_bfs_workload(self, init_addr, init_value):
+        self.ctrl.controller.createBFSWorkload(init_addr, init_value)
+
+    def create_bfs_visited_workload(self, init_addr, init_value):
+        self.ctrl.controller.createBFSVisitedWorkload(init_addr, init_value)
+
+    def create_sssp_workload(self, init_addr, init_value):
+        self.ctrl.controller.createSSSPWorkload(init_addr, init_value)
+
+    def create_cc_workload(self):
+        self.ctrl.controller.createCCWorkload()
+
+    def create_async_pr_workload(self, alpha, threshold):
+        self.ctrl.controller.createAsyncPRWorkload(alpha, threshold)
+
+    def create_pr_workload(self, num_nodes, alpha):
+        self.ctrl.controller.createPRWorkload(num_nodes, alpha)
+
+    def get_pr_error(self):
+        return self.ctrl.controller.getPRError()
+
+    def create_bc_workload(self, init_addr, init_value):
+        self.ctrl.controller.createBCWorkload(init_addr, init_value)
+
+    def print_answer(self):
+        self.ctrl.controller.printAnswerToHostSimout()
\ No newline at end of file
diff --git a/configs/accl/sega_simple_pt2pt.py b/configs/accl/sega_simple_pt2pt.py
new file mode 100644
index 0000000000..5b7309d44f
--- /dev/null
+++ b/configs/accl/sega_simple_pt2pt.py
@@ -0,0 +1,302 @@
+# Copyright (c) 2022 The Regents of the University of California
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +from math import log +from m5.objects import * + + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append( + AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i, + ) + ) + return ret + + +class GPT(SubSystem): + def __init__(self, register_file_size: int, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=register_file_size, + examine_window=8, + rd_per_cycle=4, + reduce_per_cycle=32, + wr_per_cycle=4, + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + max_resp_per_cycle=8, + pending_pull_limit=64, + active_buffer_size=80, + post_push_wb_queue_size=64, + transitions_per_cycle=4, + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=1024, + examine_window=12, + max_propagates_per_cycle=8, + update_queue_size=64, + ) + + self.vertex_mem_ctrl = SimpleMemory( + latency="120ns", bandwidth="256GiB/s" + ) + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine, + ) + + def getRespPort(self): + return self.wl_engine.in_ports + + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + + def setReqPort(self, port): + self.push_engine.out_ports = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + + +class EdgeMemory(SubSystem): + def __init__(self, size: str): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + self.clk_domain.voltage_domain = VoltageDomain() + + self.mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) + ) + self.xbar = NoncoherentXBar( + width=64, frontend_latency=1, forward_latency=1, response_latency=1 + ) + self.xbar.mem_side_ports = self.mem_ctrl.port + + def set_image(self, image): + self.mem_ctrl.dram.image_file = image + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port + + +class SEGAController(SubSystem): + def __init__(self, mirror_bw): + super().__init__() + self.map_mem = SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth="1024GiB/s", + range=AddrRange(start=0, size="4GiB"), + in_addr_map=False, + ) + self.controller = CenteralController( + choose_best=False, + mirrors_mem=SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth=mirror_bw, + range=AddrRange(start=0, size="16GiB"), + in_addr_map=False, + ), + ) + self.controller.mem_port = self.controller.mirrors_mem.port + self.controller.mirrors_map_mem = self.map_mem.port + + def set_choose_best(self, choose_best): + self.controller.choose_best = choose_best + + def set_vertices_image(self, vertices): + self.controller.vertex_image_file = vertices + + def set_aux_images(self, mirrors, mirrors_map): + self.controller.mirrors_mem.image_file = mirrors + self.map_mem.image_file = mirrors_map + + def set_mpu_vector(self, mpu_vector): + 
self.controller.mpu_vector = mpu_vector
+
+    def set_router_vector(self, router_vector):
+        self.controller.router_vector = router_vector
+
+
+class SEGA(System):
+    def __init__(self, num_gpts, num_registers, cache_size,
+                 r_queue_size, r_latency, gpt_per_gpn, graph_path,
+                 sample_time, tokens):
+        super(SEGA, self).__init__()
+        assert num_gpts != 0
+        assert num_gpts % 2 == 0
+        assert (num_gpts & (num_gpts - 1)) == 0
+
+        self.clk_domain = SrcClockDomain()
+        self.clk_domain.clock = "2GHz"
+        self.clk_domain.voltage_domain = VoltageDomain()
+        self.cache_line_size = 32
+        self.mem_mode = "timing"
+
+        gpts = []
+        routers = []
+
+        self.ctrl = SEGAController("256GiB/s")
+        self.ctrl.set_vertices_image(f"{graph_path}/vertices")
+
+        edge_mem = []
+        for i in range(int(num_gpts / 2)):
+            mem = EdgeMemory("16GiB")
+            mem.set_image(f"{graph_path}/edgelist_{i}")
+            edge_mem.append(mem)
+        self.edge_mem = edge_mem
+        # Building the GPTs
+        vertex_ranges = interleave_addresses(
+            AddrRange(start=0, size="4GiB"), num_gpts, 32
+        )
+
+        for i in range(num_gpts):
+            gpt = GPT(num_registers, cache_size)
+            gpt.set_vertex_range(vertex_ranges[i])
+            gpt.setEdgeMemPort(
+                self.edge_mem[i % (int(num_gpts / 2))].getPort()
+            )
+            gpts.append(gpt)
+
+        # Creating one router per GPN
+        for i in range(int(num_gpts / gpt_per_gpn)):
+            routers.append(RouterEngine(
+                gpn_queue_size=r_queue_size,
+                gpt_queue_size=r_queue_size,
+                router_latency=r_latency))
+        self.routers = routers
+        # Directly connecting GPTs that share a GPN
+        for i in range(len(gpts)):
+            for j in range(len(gpts)):
+                if int(i / gpt_per_gpn) == int(j / gpt_per_gpn):
+                    gpts[i].setReqPort(gpts[j].getRespPort())
+        # Connecting each GPT to its GPN's router
+        for i in range(len(gpts)):
+            for j in range(len(routers)):
+                if int(i / gpt_per_gpn) == j:
+                    gpts[i].setRespPort(routers[j].gpt_req_side)
+                    gpts[i].setReqPort(routers[j].gpt_resp_side)
+        # Connecting the routers point-to-point
+        for r_0 in routers:
+            for r_1 in routers:
+                if r_0 != r_1:
+                    r_0.gpn_resp_side = r_1.gpn_req_side
+        self.gpts = gpts
+
+        self.ctrl.set_mpu_vector([gpt.mpu for gpt in self.gpts])
+        self.ctrl.set_router_vector([r for r in self.routers])
+
+    def work_count(self):
+        return self.ctrl.controller.workCount()
+
+    def set_async_mode(self):
+        self.ctrl.controller.setAsyncMode()
+
+    def set_bsp_mode(self):
+        self.ctrl.controller.setBSPMode()
+
+    def set_pg_mode(self):
+        self.ctrl.controller.setPGMode()
+
+    def set_aux_images(self, mirrors, mirrors_map):
+        self.ctrl.set_aux_images(mirrors, mirrors_map)
+
+    def set_choose_best(self, choose_best):
+        self.ctrl.set_choose_best(choose_best)
+
+    def create_pop_count_directory(self, atoms_per_block):
+        self.ctrl.controller.createPopCountDirectory(atoms_per_block)
+
+    def create_bfs_workload(self, init_addr, init_value):
+        self.ctrl.controller.createBFSWorkload(init_addr, init_value)
+
+    def create_bfs_visited_workload(self, init_addr, init_value):
+        self.ctrl.controller.createBFSVisitedWorkload(init_addr, init_value)
+
+    def create_sssp_workload(self, init_addr, init_value):
+        self.ctrl.controller.createSSSPWorkload(init_addr, init_value)
+
+    def create_cc_workload(self):
+        self.ctrl.controller.createCCWorkload()
+
+    def create_async_pr_workload(self, alpha, threshold):
+        self.ctrl.controller.createAsyncPRWorkload(alpha, threshold)
+
+    def create_pr_workload(self, num_nodes, alpha):
+        self.ctrl.controller.createPRWorkload(num_nodes, alpha)
+
+    def get_pr_error(self):
+        return self.ctrl.controller.getPRError()
+
+    def create_bc_workload(self, init_addr, init_value):
+        self.ctrl.controller.createBCWorkload(init_addr, init_value)
+
+    def print_answer(self):
+        self.ctrl.controller.printAnswerToHostSimout()
diff --git a/configs/accl/sssp.py b/configs/accl/sssp.py
new file mode 100644
index 0000000000..08581bbb81
--- /dev/null
+++ b/configs/accl/sssp.py
@@ -0,0 +1,156 @@
+# Copyright (c) 2022 The Regents of the University of California
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import m5
+import argparse
+
+from m5.objects import *
+
+
+def get_inputs():
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument("num_gpts", type=int)
+    argparser.add_argument("num_registers", type=int)
+    argparser.add_argument("cache_size", type=str)
+    argparser.add_argument("r_queue_size", type=int)
+    argparser.add_argument("r_latency", type=int)
+    argparser.add_argument("gpt_per_gpn", type=int)
+    argparser.add_argument("graph", type=str)
+    argparser.add_argument("init_addr", type=int)
+    argparser.add_argument("init_value", type=int)
+    argparser.add_argument("sample_time", type=str)
+    argparser.add_argument("tokens", type=int)
+    argparser.add_argument(
+        "--simple",
+        dest="simple",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Use simple memory for vertex",
+    )
+    argparser.add_argument(
+        "--pt2pt",
+        dest="pt2pt",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Use point-to-point (router-based) topology",
+    )
+    argparser.add_argument(
+        "--sample",
+        dest="sample",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Sample sim stats every 100us",
+    )
+    argparser.add_argument(
+        "--verify",
+        dest="verify",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Print final answer",
+    )
+
+    args = argparser.parse_args()
+
+    return (
+        args.num_gpts,
+        args.num_registers,
+        args.cache_size,
+        args.r_queue_size,
+        args.r_latency,
+        args.gpt_per_gpn,
+        args.graph,
+        args.init_addr,
+        args.init_value,
+        args.sample_time,
+        args.tokens,
+        args.simple,
+        args.pt2pt,
+        args.sample,
+        args.verify,
+    )
+
+
+if __name__ == "__m5_main__":
+    (
+        num_gpts,
+        num_registers,
+        cache_size,
+        r_queue_size,
+        r_latency,
+        gpt_per_gpn,
+        graph,
+        init_addr,
+        init_value,
+        sample_time,
+        tokens,
+        simple,
+        pt2pt,
+        sample,
+        verify,
+    ) = get_inputs()
+
+    if simple:
+        if pt2pt:
+            from sega_simple_pt2pt import SEGA
+            system = SEGA(num_gpts, num_registers, cache_size,
+                r_queue_size, r_latency, gpt_per_gpn, graph,
+                sample_time, tokens)
+        else:
+            from sega_simple import SEGA
+            system = SEGA(num_gpts, num_registers, cache_size,
+                r_queue_size, r_latency, graph)
+    else:
+        from sega import SEGA
+        system = SEGA(num_gpts, num_registers, cache_size, graph)
+    root = Root(full_system=False, system=system)
+
+    m5.instantiate()
+
+    system.set_async_mode()
+    system.create_pop_count_directory(64)
+    system.create_sssp_workload(init_addr, init_value)
+    if sample:
+        while True:
+            exit_event = m5.simulate(100000000)
+            print(
+                f"Exited simulation at tick {m5.curTick()} "
+                + f"because {exit_event.getCause()}"
+            )
+            m5.stats.dump()
+            m5.stats.reset()
+            if exit_event.getCause() != "simulate() limit reached":
+                break
+    else:
+        exit_event = m5.simulate()
+        print(
+            f"Exited simulation at tick {m5.curTick()} "
+            + f"because {exit_event.getCause()}"
+        )
+    if verify:
+        system.print_answer()
diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md
new file mode 100644
index 0000000000..ebfca7e794
--- /dev/null
+++ b/src/accl/graph/TODO.md
@@ -0,0 +1,8 @@
+# TODO Items
+
+* We might need to revisit the fact that we can insert an item into a queue
+  on the same cycle that another event is consuming an item from that queue.
+* Move the check for `wl.degree == 0` to the coalesce engine.
+* Fix the retry system between the memory queue and the coalesce engine.
+* Update inheritance: there is not enough reason for PushEngine and
+  CoalesceEngine to be of the same type (i.e. delete BaseMemEngine).
diff --git a/src/accl/graph/base/BaseReduceEngine.py b/src/accl/graph/base/BaseReduceEngine.py new file mode 100644 index 0000000000..0585c36e48 --- /dev/null +++ b/src/accl/graph/base/BaseReduceEngine.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseReduceEngine(ClockedObject): + abstract = True + type = 'BaseReduceEngine' + cxx_header = "accl/graph/base/base_reduce_engine.hh" + cxx_class = 'gem5::BaseReduceEngine' + + system = Param.System(Parent.any, 'System this Engine is a part of') diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript new file mode 100644 index 0000000000..35111c34d2 --- /dev/null +++ b/src/accl/graph/base/SConscript @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2016 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Import("*") + +SimObject("BaseReduceEngine.py", sim_objects=["BaseReduceEngine"]) + +Source("base_reduce_engine.cc") +Source("graph_workload.cc") diff --git a/src/accl/graph/base/base_reduce_engine.cc b/src/accl/graph/base/base_reduce_engine.cc new file mode 100644 index 0000000000..ade95800d2 --- /dev/null +++ b/src/accl/graph/base/base_reduce_engine.cc @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/base/base_reduce_engine.hh" + +namespace gem5 +{ + +BaseReduceEngine::BaseReduceEngine(const Params &params): + ClockedObject(params), + system(params.system), + _requestorId(system->getRequestorId(this)) +{} + +BaseReduceEngine::~BaseReduceEngine() +{} + +} diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh new file mode 100644 index 0000000000..268bb60b76 --- /dev/null +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ + +#include "params/BaseReduceEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class BaseReduceEngine : public ClockedObject +{ + private: + System* system; + + protected: + + const RequestorID _requestorId; + + public: + PARAMS(BaseReduceEngine); + BaseReduceEngine(const Params &params); + ~BaseReduceEngine(); + + RequestorID requestorId() { return _requestorId; } +}; + +} + +#endif // __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh new file mode 100644 index 0000000000..f1a26f6ac2 --- /dev/null +++ b/src/accl/graph/base/data_structs.hh @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ +#define __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ + +#include "base/cprintf.hh" +#include "base/intmath.hh" + +#include <cstdint> +#include <cstring> +#include <deque> +#include <string> + +namespace gem5 +{ + +struct __attribute__ ((packed)) WorkListItem +{ + uint32_t tempProp : 32; + uint32_t prop : 32; + uint32_t edgeIndex : 32; + uint32_t degree : 30; + bool activeNow: 1; + bool activeFuture: 1; + + std::string to_string() + { + return csprintf("WorkListItem{tempProp: %u, prop: %u, edgeIndex: %u, " + "degree: %u, activeNow: %s, activeFuture: %s}", + tempProp, prop, edgeIndex, degree, + activeNow ? "true" : "false", + activeFuture ? "true" : "false"); + } + + WorkListItem(): + tempProp(0), + prop(0), + edgeIndex(0), + degree(0), + activeNow(false), + activeFuture(false) + {} + + WorkListItem(uint32_t temp_prop, uint32_t prop, + uint32_t degree, uint32_t edge_index, + bool active_now, bool active_future): + tempProp(temp_prop), prop(prop), edgeIndex(edge_index), degree(degree), + activeNow(active_now), activeFuture(active_future) + {} + +}; + +struct __attribute__ ((packed)) Edge +{ + uint16_t weight : 16; + uint64_t neighbor : 48; + + std::string to_string() + { + return csprintf("Edge{weight: %u, neighbor: %lu}", weight, neighbor); + } + + Edge(): weight(0), neighbor(0) {} + + Edge(uint16_t weight, uint64_t neighbor): + weight(weight), + neighbor(neighbor) + {} +}; + +struct __attribute__ ((packed)) MirrorVertex +{ + uint32_t vertexId : 32; + uint32_t prop : 32; + uint32_t edgeIndex : 32; + uint32_t degree : 30; + bool activeNow: 1; + bool activeNext: 1; + + std::string to_string() + { + return csprintf("MirrorVertex{vertexId: %u, prop: %u, edgeIndex: %u, " + "degree: %u, activeNow: %s, activeNext: %s}", + vertexId, prop, edgeIndex, degree, + activeNow ? "true" : "false", + activeNext ?
"true" : "false"); + } + MirrorVertex(): + vertexId(-1), + prop(-1), + edgeIndex(-1), + degree(-1), + activeNow(false), + activeNext(false) + {} + + MirrorVertex(uint32_t vertex_id, uint32_t prop, uint32_t degree, + uint32_t edge_index, bool active_now, bool active_next): + vertexId(vertex_id), prop(prop), edgeIndex(edge_index), + degree(degree), activeNow(active_now), activeNext(active_next) + {} + +}; + +static_assert(isPowerOf2(sizeof(WorkListItem))); +static_assert(isPowerOf2(sizeof(Edge))); +static_assert(isPowerOf2(sizeof(MirrorVertex))); + +struct MetaEdge { + uint64_t src; + uint64_t dst; + uint32_t weight; + uint32_t value; + + MetaEdge(): src(0), dst(0), weight(0), value(0) + {} + MetaEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value): + src(src), dst(dst), weight(weight), value(value) + {} + + std::string to_string() + { + return csprintf("MetaEdge{src: %lu, dst:%lu, weight: %u, value: %u}", + src, dst, weight, value); + } +}; + +struct Update { + uint64_t src; + uint64_t dst; + uint32_t value; + + Update(): src(0), dst(0), value(0) + {} + Update(uint64_t src, uint64_t dst, uint32_t value): + src(src), dst(dst), value(value) + {} + + std::string to_string() + { + return csprintf("Update{src: %lu, dst:%lu, value: %u}", + src, dst, value); + } +}; + +template +class UniqueFIFO +{ + private: + int cap; + int pop; + + int* added; + int* deleted; + std::deque container; + + public: + UniqueFIFO() { + cap = 0; + pop = 0; + added = nullptr; + deleted = nullptr; + } + + UniqueFIFO(int size) { + cap = size; + pop = 0; + + added = (int*) new int [cap]; + deleted = (int*) new int [cap]; + + for (int i = 0; i < cap; i++) { + added[i] = 0; + deleted[i] = 0; + } + container.clear(); + } + + ~UniqueFIFO() { + delete [] added; + delete [] deleted; + } + + void fix_front() { + while(true) { + T elem = container.front(); + if (deleted[elem] > 0) { + deleted[elem]--; + added[elem]--; + container.pop_front(); + } else { + assert(deleted[elem] == 0); + assert(added[elem] == 1); + break; + } + } + } + + T front() { + fix_front(); + return container.front(); + } + + size_t size() { + return pop; + } + + void clear() { + pop = 0; + for (int i = 0; i < cap; i++) { + added[i] = 0; + deleted[i] = 0; + } + container.clear(); + } + + bool empty() { + return size() == 0; + } + + bool find(T item) { + assert(added[item] >= 0); + assert(deleted[item] >= 0); + int diff = added[item] - deleted[item]; + assert((diff == 0) || (diff == 1)); + return (diff == 1); + } + + void push_back(T item) { + if (!find(item)) { + added[item]++; + pop++; + container.push_back(item); + } + } + + void pop_front() { + T elem = front(); + assert(added[elem] == 1); + added[elem] = 0; + pop--; + container.pop_front(); + } + + void erase(T item) { + assert(find(item)); + deleted[item]++; + pop--; + } + + void operator=(const UniqueFIFO& rhs) { + cap = rhs.cap; + pop = rhs.pop; + container = rhs.container; + added = (int*) new int [cap]; + deleted = (int*) new int [cap]; + std::memcpy(added, rhs.added, cap * sizeof(int)); + std::memcpy(deleted, rhs.deleted, cap * sizeof(int)); + } +}; + +} + +#endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc new file mode 100644 index 0000000000..fd802cf275 --- /dev/null +++ b/src/accl/graph/base/graph_workload.cc @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/base/graph_workload.hh" + +#include <algorithm> +#include <cassert> +#include <cmath> +#include <cstring> + +#include "base/cprintf.hh" +#include "base/intmath.hh" +#include "base/logging.hh" + +namespace gem5 +{ + +template <typename T> +float +writeToFloat(T value) +{ + assert(sizeof(T) == sizeof(float)); + float float_form; + std::memcpy(&float_form, &value, sizeof(float)); + return float_form; +} + +template <typename T> +T +readFromFloat(float value) +{ + assert(sizeof(T) == sizeof(float)); + T float_bits; + std::memcpy(&float_bits, &value, sizeof(float)); + return float_bits; +} + +void +BFSWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + size_t pkt_size = pkt->getSize(); + uint64_t aligned_addr = roundDown(initAddr, pkt_size); + + if (pkt->getAddr() == aligned_addr) { + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); + WorkListItem new_wl = items[index]; + new_wl.tempProp = initValue; + if (activeCondition(new_wl, items[index])) { + new_wl.activeNow = true; + dir->activate(aligned_addr); + } + items[index] = new_wl; + + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); + } +} + +uint32_t +BFSWorkload::reduce(uint32_t update, uint32_t value) +{ + return std::min(update, value); +} + +uint32_t +BFSWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value + 1; +} + +bool +BFSWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) +{ + return (new_wl.tempProp < old_wl.tempProp) && (old_wl.degree > 0); +} + +uint32_t +BFSWorkload::apply(WorkListItem& wl) +{ + wl.prop = wl.tempProp; + return wl.prop; +} + +std::string +BFSWorkload::printWorkListItem(const WorkListItem wl) +{ + return csprintf( + "WorkListItem{tempProp: %u, prop: %u, degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + wl.tempProp, wl.prop, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ?
"true" : "false"); +} + +uint32_t +BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) { + return value; +} + +void +CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + Addr pkt_addr = pkt->getAddr(); + size_t pkt_size = pkt->getSize(); + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + bool atom_active = false; + for (int i = 0; i < num_elements; i++) { + WorkListItem new_wl = items[i]; + new_wl.tempProp = (int) (pkt_addr / sizeof(WorkListItem)) + i; + new_wl.activeNow = activeCondition(new_wl, items[i]); + atom_active |= new_wl.activeNow; + items[i] = new_wl; + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); +} + +uint32_t +SSSPWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value + weight; +} + +void +PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + int num_elements = pkt->getSize() / sizeof(WorkListItem); + WorkListItem items[num_elements]; + pkt->writeDataToBlock((uint8_t*) items, pkt->getSize()); + + bool atom_active = false; + for (int index = 0; index < num_elements; index++) { + WorkListItem new_wl = items[index]; + new_wl.tempProp = readFromFloat(0); + new_wl.prop = readFromFloat(1 - alpha); + new_wl.activeNow = activeCondition(new_wl, items[index]); + atom_active |= new_wl.activeNow; + items[index] = new_wl; + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt->getSize()); +} + +uint32_t +PRWorkload::reduce(uint32_t update, uint32_t value) +{ + float update_float = writeToFloat(update); + float value_float = writeToFloat(value); + return readFromFloat(update_float + value_float); +} + +uint32_t +PRWorkload::propagate(uint32_t value, uint32_t weight) +{ + float value_float = writeToFloat(value); + float weight_float = writeToFloat(weight); + if (weight == 0) { + weight_float = 1.0; + } + return readFromFloat(alpha * value_float * weight_float); +} + +bool +PRWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) +{ + float temp_float = writeToFloat(new_wl.tempProp); + float prop_float = writeToFloat(new_wl.prop); + float dist = std::abs(temp_float - prop_float); + return (dist >= threshold) && (new_wl.degree > 0); +} + +uint32_t +PRWorkload::apply(WorkListItem& wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + float delta = (temp_float - prop_float) / wl.degree; + wl.prop = wl.tempProp; + return readFromFloat(delta); +} + +std::string +PRWorkload::printWorkListItem(const WorkListItem wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + return csprintf( + "WorkListItem{tempProp: %f, prop: %f, degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + temp_float, prop_float, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? 
"true" : "false"); +} + +void +BSPPRWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + size_t pkt_size = pkt->getSize(); + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + bool atom_active = false; + for (int i = 0; i < num_elements; i++) { + WorkListItem new_wl = items[i]; + new_wl.tempProp = readFromFloat((1 - alpha)/numNodes); + new_wl.prop = readFromFloat(1/numNodes); + new_wl.activeNow = activeCondition(new_wl, items[i]); + atom_active |= new_wl.activeNow; + items[i] = new_wl; + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); +} + +uint32_t +BSPPRWorkload::reduce(uint32_t update, uint32_t value) +{ + float update_float = writeToFloat(update); + float value_float = writeToFloat(value); + return readFromFloat(update_float + value_float); +} + +uint32_t +BSPPRWorkload::propagate(uint32_t value, uint32_t weight) +{ + float value_float = writeToFloat(value); + return readFromFloat(alpha * value_float); +} + +bool +BSPPRWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) +{ + return (old_wl.degree > 0); +} + +uint32_t +BSPPRWorkload::apply(WorkListItem& wl) +{ + float prop_float = writeToFloat(wl.prop); + float delta = prop_float / wl.degree; + uint32_t delta_uint = readFromFloat(delta); + return delta_uint; +} + +void +BSPPRWorkload::interIterationInit(WorkListItem& wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + error += std::abs(temp_float - prop_float); + wl.prop = wl.tempProp; + wl.tempProp = readFromFloat((1 - alpha) / numNodes); + wl.activeFuture = (wl.degree > 0); +} + +std::string +BSPPRWorkload::printWorkListItem(const WorkListItem wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + return csprintf( + "WorkListItem{tempProp: %f, prop: %f, degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + temp_float, prop_float, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? "true" : "false"); +} + +void +BSPBCWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + int pkt_size = pkt->getSize(); + int aligned_addr = roundDown(initAddr, pkt_size); + + if (aligned_addr == pkt->getAddr()) { + int num_elements = pkt_size / sizeof(WorkListItem); + WorkListItem items[num_elements]; + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + int index = (initAddr - aligned_addr) / sizeof(WorkListItem); + WorkListItem new_wl = items[index]; + uint32_t prop = 0; + prop |= initValue; + // NOTE: Depth of the initial vertex is 0. 
+ prop &= countMask; + new_wl.tempProp = prop; + new_wl.prop = prop; + if (activeCondition(new_wl, items[index])) { + new_wl.activeNow = true; + dir->activate(aligned_addr); + } + items[index] = new_wl; + + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); + } +} + +uint32_t +BSPBCWorkload::reduce(uint32_t update, uint32_t value) +{ + uint32_t update_depth = (update & depthMask) >> 24; + uint32_t update_count = (update & countMask); + uint32_t value_depth = (value & depthMask) >> 24; + uint32_t value_count = (value & countMask); + if (value_depth == 255) { + value_depth = currentDepth; + value_count = 0; + } + if (value_depth == currentDepth) { + value_count += update_count; + } + uint32_t ret = 0; + ret |= value_count; + warn_if(value_count > 16777215, "value count has grown bigger than 16777215." + " This means the algorithm result might not be correct." + " However, the traversal will not be affected." + " Therefore, performance metrics can still be used."); + // HACK: Make sure to always set the depth correctly even if count + // exceeds the 2^24-1 limit. Here we reset the depth section of ret. + ret &= countMask; + // NOTE: Now that the depth is securely reset we can copy the correct value. + ret |= (value_depth << 24); + return ret; +} + +uint32_t +BSPBCWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value; +} + +uint32_t +BSPBCWorkload::apply(WorkListItem& wl) +{ + return wl.prop; +} + +void +BSPBCWorkload::interIterationInit(WorkListItem& wl) +{ + wl.prop = wl.tempProp; +} + +bool +BSPBCWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) +{ + uint32_t depth = (new_wl.tempProp & depthMask) >> 24; + return (depth == currentDepth) && (new_wl.degree > 0); +} + +std::string +BSPBCWorkload::printWorkListItem(WorkListItem wl) +{ + uint32_t temp_depth = (wl.tempProp & depthMask) >> 24; + uint32_t temp_count = (wl.tempProp & countMask); + uint32_t depth = (wl.prop & depthMask) >> 24; + uint32_t count = (wl.prop & countMask); + return csprintf( + "WorkListItem{tempProp: (depth: %d, count: %d), " + "prop: (depth: %d, count: %d), degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + temp_depth, temp_count, depth, count, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? "true" : "false"); +} + +} // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh new file mode 100644 index 0000000000..481cfc146f --- /dev/null +++ b/src/accl/graph/base/graph_workload.hh @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ +#define __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ + +#include <cstdint> +#include <cstring> +#include <string> + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/work_directory.hh" +#include "mem/packet.hh" + + +namespace gem5 +{ + +class GraphWorkload +{ + public: + GraphWorkload() {} + // Virtual destructor: workloads are owned and deleted polymorphically. + virtual ~GraphWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir) = 0; + virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; + virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; + virtual uint32_t apply(WorkListItem& wl) = 0; + virtual bool betterThan(uint32_t lhs, uint32_t rhs) { return true; } + virtual void iterate() = 0; + virtual void interIterationInit(WorkListItem& wl) = 0; + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl) = 0; + virtual std::string printWorkListItem(const WorkListItem wl) = 0; +}; + +class BFSWorkload : public GraphWorkload +{ + private: + uint64_t initAddr; + uint32_t initValue; + + public: + BFSWorkload(uint64_t init_addr, uint32_t init_value): + initAddr(init_addr), initValue(init_value) + {} + + ~BFSWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual bool betterThan(uint32_t lhs, uint32_t rhs) override { return lhs < rhs; } + virtual void iterate() {} + virtual void interIterationInit(WorkListItem& wl) {} + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + +class BFSVisitedWorkload : public BFSWorkload +{ + public: + BFSVisitedWorkload(Addr init_addr, uint32_t init_value): + BFSWorkload(init_addr, init_value) + {} + virtual uint32_t propagate(uint32_t value, uint32_t weight) override; +}; + +class CCWorkload : public BFSVisitedWorkload +{ + public: + CCWorkload(): BFSVisitedWorkload(0, 0) {} + virtual void init(PacketPtr pkt, WorkDirectory* dir); +}; + +class SSSPWorkload : public BFSWorkload +{ + public: + SSSPWorkload(Addr init_addr, uint32_t init_value): + BFSWorkload(init_addr, init_value) + {} + virtual uint32_t propagate(uint32_t value, uint32_t weight) override; +}; + +class PRWorkload : public GraphWorkload +{ + private: + float alpha; + float threshold; + + public: + PRWorkload(float alpha, float threshold): + alpha(alpha), threshold(threshold) + {} + + ~PRWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); +
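+    // Asynchronous PR has no supersteps: iterate() and interIterationInit()
+    // below are deliberately no-ops; convergence is instead governed by the
+    // |tempProp - prop| >= threshold test in activeCondition().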
virtual void iterate() {} + virtual void interIterationInit(WorkListItem& wl) {}; + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + +class BSPPRWorkload : public GraphWorkload +{ + private: + int numNodes; + float alpha; + float prevError; + float error; + + public: + BSPPRWorkload(int num_nodes, float alpha): + numNodes(num_nodes), alpha(alpha), prevError(0), error(0) + {} + + ~BSPPRWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() { prevError = error; error = 0; } + virtual void interIterationInit(WorkListItem& wl); + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); + + float getError() { return prevError; } +}; + +class BSPBCWorkload : public GraphWorkload +{ + private: + Addr initAddr; + uint32_t initValue; + + int currentDepth; + + uint32_t depthMask; + uint32_t countMask; + public: + BSPBCWorkload(Addr init_addr, uint32_t init_value): + initAddr(init_addr), initValue(init_value), + currentDepth(0), depthMask(4278190080), countMask(16777215) + {} + + ~BSPBCWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() { currentDepth++; } + virtual void interIterationInit(WorkListItem& wl); + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + +} + +#endif // __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ diff --git a/src/accl/graph/sega/BaseMemoryEngine.py b/src/accl/graph/sega/BaseMemoryEngine.py new file mode 100644 index 0000000000..10d8b708f0 --- /dev/null +++ b/src/accl/graph/sega/BaseMemoryEngine.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseMemoryEngine(ClockedObject): + abstract = True + type = 'BaseMemoryEngine' + cxx_header = "accl/graph/sega/base_memory_engine.hh" + cxx_class = 'gem5::BaseMemoryEngine' + + system = Param.System(Parent.any, 'System this Engine is a part of') + mem_port = RequestPort("Port to communicate with the memory") + + attached_memory_atom_size = Param.Int(64, "The atom size of the attached " + "memory.") diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py new file mode 100644 index 0000000000..619e76f1ee --- /dev/null +++ b/src/accl/graph/sega/CenteralController.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
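+
+# The CenteralController orchestrates the whole accelerator: it points at the
+# vertex image to load, owns the vectors of MPUs and RouterEngines, and
+# exposes execution-mode selection (async/BSP/PG) plus per-algorithm
+# workload-creation hooks (BFS, SSSP, CC, PR, BC) to the Python run scripts
+# through the cxx_exports listed below.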
+ +from m5.params import * +from m5.proxy import * +from m5.util.pybind import PyBindMethod +from m5.objects.BaseMemoryEngine import BaseMemoryEngine + +class CenteralController(BaseMemoryEngine): + type = 'CenteralController' + cxx_header = "accl/graph/sega/centeral_controller.hh" + cxx_class = 'gem5::CenteralController' + + mirrors_map_mem = RequestPort("Port to a memory storing mirrors map file.") + + choose_best = Param.Bool("Whether to prefer the best update " + "value for choosing the next slice") + + vertex_image_file = Param.String("Path to the vertex image file.") + + mirrors_mem = Param.SimpleMemory("Memory to store the vertex mirrors.") + + mpu_vector = VectorParam.MPU("All mpus in the system.") + + router_vector = VectorParam.RouterEngine("All Routers in the system.") + + cxx_exports = [ + PyBindMethod("setAsyncMode"), + PyBindMethod("setBSPMode"), + PyBindMethod("setPGMode"), + PyBindMethod("createPopCountDirectory"), + PyBindMethod("createBFSWorkload"), + PyBindMethod("createBFSVisitedWorkload"), + PyBindMethod("createSSSPWorkload"), + PyBindMethod("createCCWorkload"), + PyBindMethod("createAsyncPRWorkload"), + PyBindMethod("createPRWorkload"), + PyBindMethod("createBCWorkload"), + PyBindMethod("workCount"), + PyBindMethod("getPRError"), + PyBindMethod("printAnswerToHostSimout") + ] diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py new file mode 100644 index 0000000000..bb45802c1d --- /dev/null +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -0,0 +1,50 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.BaseMemoryEngine import BaseMemoryEngine + +class CoalesceEngine(BaseMemoryEngine): + type = 'CoalesceEngine' + cxx_header = "accl/graph/sega/coalesce_engine.hh" + cxx_class = 'gem5::CoalesceEngine' + + cache_size = Param.MemorySize("Size of the internal SRAM array.") + + max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " + "requestor in each cycle. 
Used to limit b/w.") + pending_pull_limit = Param.Int("Maximum number of pending pull processes.") + active_buffer_size = Param.Int("Maximum number of active memory " + "atoms ready to send updates. This parameter " + "and post_push_wb_queue_size should be set " + "in tandem. Probably, they should be equal.") + post_push_wb_queue_size = Param.Int("Maximum number of pending write-backs " + "after the apply process for applications " + "that require the apply process to happen " + "exactly before pushing the edgePointer " + "to the PushEngine.") + transitions_per_cycle = Param.Int("Max number of transitions in a cycle") diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py new file mode 100644 index 0000000000..8d2453b01c --- /dev/null +++ b/src/accl/graph/sega/MPU.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.SimObject import SimObject + +class MPU(SimObject): + type = "MPU" + cxx_header = "accl/graph/sega/mpu.hh" + cxx_class = "gem5::MPU" + + system = Param.System(Parent.any, "System this MPU is a part of") + + wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " + "MPU object.") + coalesce_engine = Param.CoalesceEngine(NULL, "Internal CoalesceEngine for " + "each instance of MPU object.") + push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " + "instance of MPU object.") + diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py new file mode 100644 index 0000000000..2174f943f4 --- /dev/null +++ b/src/accl/graph/sega/PushEngine.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved.
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.BaseMemoryEngine import BaseMemoryEngine + +class PushEngine(BaseMemoryEngine): + type = 'PushEngine' + cxx_header = "accl/graph/sega/push_engine.hh" + cxx_class = 'gem5::PushEngine' + + push_req_queue_size = Param.Int("Size of the queue used to " + "store push requests.") + # resp_queue_size should probably be + # significantly bigger than push_req_queue_size + resp_queue_size = Param.Int("Size of the response queue in the " + "push engine where it stores the " + "edges read from memory.") + + examine_window = Param.Int("Number of edges at the front of the edge queue" + " to examine in order to propagate.") + + max_propagates_per_cycle = Param.Int("Maximum number of propagates " + "done per cycle.") + + update_queue_size = Param.Int("Maximum number of entries " + "for each update queue.") + + out_ports = VectorRequestPort("Outgoing ports to all MPUs") diff --git a/src/accl/graph/sega/RouterEngine.py b/src/accl/graph/sega/RouterEngine.py new file mode 100644 index 0000000000..2b895b9323 --- /dev/null +++ b/src/accl/graph/sega/RouterEngine.py @@ -0,0 +1,50 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission.
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class RouterEngine(ClockedObject): + type = "RouterEngine" + cxx_header = "accl/graph/sega/router_engine.hh" + cxx_class = "gem5::RouterEngine" + + system = Param.System(Parent.any, "System this Engine is a part of") + + gpt_req_side = VectorRequestPort("Outgoing ports to local GPTs") + gpt_resp_side = VectorResponsePort("Incoming ports from local GPTs") + + gpn_req_side = VectorRequestPort("Outgoing ports to remote GPNs") + gpn_resp_side = VectorResponsePort("Incoming ports from remote GPNs") + gpt_queue_size = Param.Int(64, "Queue size on the gpt side") + gpn_queue_size = Param.Int(64, "Queue size on the gpn side") + token = Param.Int("Number of tokens sent per time sample.") + router_latency = Param.Cycles(5, "Router latency; " + "SerDes or E-O-E latencies can be added here.") + + sample_time = Param.Latency("50us", "Intervals to sample traffic") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript new file mode 100644 index 0000000000..e0a3f8d28f --- /dev/null +++ b/src/accl/graph/sega/SConscript @@ -0,0 +1,58 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2016 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +Import("*") + +SimObject("BaseMemoryEngine.py", sim_objects=["BaseMemoryEngine"]) +SimObject("CenteralController.py", sim_objects=["CenteralController"]) +SimObject("CoalesceEngine.py", sim_objects=["CoalesceEngine"]) +SimObject("MPU.py", sim_objects=["MPU"]) +SimObject("PushEngine.py", sim_objects=["PushEngine"]) +SimObject("WLEngine.py", sim_objects=["WLEngine"]) +SimObject("RouterEngine.py", sim_objects=["RouterEngine"]) + +Source("base_memory_engine.cc") +Source("centeral_controller.cc") +Source("coalesce_engine.cc") +Source("enums.cc") +Source("mpu.cc") +Source("push_engine.cc") +Source("wl_engine.cc") +Source("router_engine.cc") + +DebugFlag("BaseMemoryEngine") +DebugFlag("CenteralController") +DebugFlag("CacheBlockState") +DebugFlag("CoalesceEngine") +DebugFlag("PushEngine") +DebugFlag("SEGAStructureSize") +DebugFlag("MSDebug") +DebugFlag("WLEngine") +DebugFlag("RouterEngine") + +CompoundFlag("MPU", ["CoalesceEngine", "PushEngine", + "WLEngine", "BaseMemoryEngine", "RouterEngine"]) diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py new file mode 100644 index 0000000000..f9ea4488df --- /dev/null +++ b/src/accl/graph/sega/WLEngine.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.BaseReduceEngine import BaseReduceEngine + +class WLEngine(BaseReduceEngine): + type = 'WLEngine' + cxx_header = "accl/graph/sega/wl_engine.hh" + cxx_class = 'gem5::WLEngine' + + in_ports = VectorResponsePort("Incoming Ports to receive updates from " + "remote outside") + + update_queue_size = Param.Int("Size of the queue WLEngine stores " + "the incoming updates") + + register_file_size = Param.Int("Number of internal registers the " + "WLEngine has. 
It can service as " "many updates as this queue has " "entries at the same time.") + + examine_window = Param.Int("Number of updates at the front of the update " "queue examined for reading.") + rd_per_cycle = Param.Int("Maximum number of reads per cycle.") + reduce_per_cycle = Param.Int("Maximum number of reduces per cycle.") + wr_per_cycle = Param.Int("Maximum number of writes per cycle.") + + diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc new file mode 100644 index 0000000000..9f704f71e9 --- /dev/null +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/base_memory_engine.hh" + +#include "debug/BaseMemoryEngine.hh" +#include "debug/SEGAStructureSize.hh" + +namespace gem5 +{ + +BaseMemoryEngine::BaseMemoryEngine(const BaseMemoryEngineParams &params): + ClockedObject(params), + system(params.system), + _requestorId(system->getRequestorId(this)), + memPort(name() + ".mem_port", this), + peerMemoryAtomSize(params.attached_memory_atom_size) +{} + +BaseMemoryEngine::~BaseMemoryEngine() +{} + +Port& +BaseMemoryEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "mem_port") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +void +BaseMemoryEngine::init() +{ + AddrRangeList memory_ranges = memPort.getAddrRanges(); + + assert(memory_ranges.size() == 1); + + peerMemoryRange = memory_ranges.front(); + + DPRINTF(BaseMemoryEngine, "%s: The range attached to this engine is " + "%s. The range is %s interleaved.\n", __func__, + peerMemoryRange.to_string(), + peerMemoryRange.interleaved() ?
"" : "not"); +} + +void +BaseMemoryEngine::MemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + DPRINTF(BaseMemoryEngine, "%s: Sending pakcet: %s to " + "the memory.\n", __func__, pkt->print()); + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + DPRINTF(BaseMemoryEngine, "%s: MemPort blocked.\n", __func__); + } else { + DPRINTF(BaseMemoryEngine, "%s: Packet sent successfully.\n", __func__); + owner->recvMemRetry(); + } +} + +bool +BaseMemoryEngine::MemPort::recvTimingResp(PacketPtr pkt) +{ + return owner->handleMemResp(pkt); +} + +void +BaseMemoryEngine::MemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), + "Received retry without a blockedPacket"); + + _blocked = false; + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); +} + +PacketPtr +BaseMemoryEngine::createReadPacket(Addr addr, unsigned int size) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} + +PacketPtr +BaseMemoryEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + +} diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh new file mode 100644 index 0000000000..afe7fd0433 --- /dev/null +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__
+#define __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__
+
+#include <functional>
+
+#include "base/addr_range.hh"
+#include "mem/packet.hh"
+#include "mem/port.hh"
+#include "params/BaseMemoryEngine.hh"
+#include "sim/clocked_object.hh"
+#include "sim/system.hh"
+
+namespace gem5
+{
+
+class BaseMemoryEngine : public ClockedObject
+{
+  protected:
+    class MemoryEvent : public EventFunctionWrapper
+    {
+      private:
+        bool _pending;
+        int _prevState;
+
+      public:
+        MemoryEvent(const std::function<void(void)> &callback,
+                    const std::string &name):
+            EventFunctionWrapper(callback, name),
+            _pending(false), _prevState(0)
+        {}
+        bool pending() { return _pending; }
+        void sleep() { _pending = true; }
+        void wake() { _pending = false; }
+        void setPrevState(int state) { _prevState = state; }
+        int getPrevState() { return _prevState; }
+    };
+
+    class MemPort : public RequestPort
+    {
+      private:
+        BaseMemoryEngine* owner;
+        bool _blocked;
+        PacketPtr blockedPacket;
+
+      public:
+        MemPort(const std::string& name, BaseMemoryEngine* owner):
+            RequestPort(name, owner), owner(owner),
+            _blocked(false), blockedPacket(nullptr)
+        {}
+
+        void sendPacket(PacketPtr pkt);
+        bool blocked() { return _blocked; }
+
+      protected:
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual void recvReqRetry();
+    };
+
+    System* system;
+    const RequestorID _requestorId;
+
+    MemPort memPort;
+    AddrRange peerMemoryRange;
+    size_t peerMemoryAtomSize;
+
+    virtual void recvMemRetry() = 0;
+    virtual bool handleMemResp(PacketPtr pkt) = 0;
+
+    PacketPtr createReadPacket(Addr addr, unsigned int size);
+    PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data);
+
+  public:
+    PARAMS(BaseMemoryEngine);
+
+    BaseMemoryEngine(const Params &params);
+    ~BaseMemoryEngine();
+
+    Port& getPort(const std::string &if_name,
+                  PortID idx=InvalidPortID) override;
+
+    AddrRangeList getAddrRanges() { return memPort.getAddrRanges(); }
+
+    virtual void recvFunctional(PacketPtr pkt) = 0;
+
+    virtual void init() override;
+};
+
+}
+
+#endif // __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__
diff --git a/src/accl/graph/sega/busyMaskErr b/src/accl/graph/sega/busyMaskErr
new file mode 100644
index 0000000000..316fcd37d9
--- /dev/null
+++ b/src/accl/graph/sega/busyMaskErr
@@ -0,0 +1,16 @@
+gem5/build/NULL/gem5.opt -re --outdir=debug --debug-flags=CacheBlockState gem5/configs/accl/sega.py 1 1KiB /home/fariborz/SEGA/graphs/test/scale_21/binaries/mpu_1/ 0 0
+
+32964143000: system.gpts.coalesce_engine: handleMemResp: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}.
+32964143000: system.gpts.coalesce_engine: handleMemResp: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 2, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}.
+32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 2, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}.
+32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964145000}.
+32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: true, pendingWB: false, lastChangedTick: 32964145000}.
+32964146000: system.gpts.coalesce_engine: processNextApplyEvent: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: true, pendingWB: false, lastChangedTick: 32964145000}.
+32964146000: system.gpts.coalesce_engine: processNextApplyEvent: cacheBlock[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: true, lastChangedTick: 32964146000}.
+32964146000: system.gpts.coalesce_engine: recvWLRead: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: true, lastChangedTick: 32964146000}.
+32964146000: system.gpts.coalesce_engine: recvWLRead: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 1, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964146000}.
+32964147000: system.gpts.coalesce_engine: processNextWriteBack: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 1, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964146000}.
+
+// This assertion would be hit even though it should not be: the read hit at
+// tick 32964146000 sets busyMask after the write back has already been
+// scheduled on that same tick.
+// It is fixed by a hack in recvWLRead when hit in the cache.
+assert(cacheBlocks[block_index].busyMask == 0);
diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc
new file mode 100644
index 0000000000..09b57b6ff6
--- /dev/null
+++ b/src/accl/graph/sega/centeral_controller.cc
@@ -0,0 +1,521 @@
+/*
+ * Copyright (c) 2021 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "accl/graph/sega/centeral_controller.hh"
+
+#include <cmath>
+#include <iostream>
+
+#include "base/cprintf.hh"
+#include "base/loader/memory_image.hh"
+#include "base/loader/object_file.hh"
+#include "debug/CenteralController.hh"
+#include "mem/packet_access.hh"
+#include "sim/sim_exit.hh"
+
+namespace gem5
+{
+
+CenteralController::CenteralController(const Params& params):
+    BaseMemoryEngine(params),
+    mapPort("map_port", this, 1), mode(ProcessingMode::NOT_SET),
+    mirrorsMem(params.mirrors_mem), currentSliceId(0), totalUpdatesLeft(0),
+    chooseBest(params.choose_best),
+    nextSliceSwitchEvent([this] { processNextSliceSwitchEvent(); }, name()),
+    stats(*this)
+{
+    uint64_t total_cache_size = 0;
+    for (auto mpu : params.mpu_vector) {
+        mpuVector.push_back(mpu);
+        mpu->registerCenteralController(this);
+        total_cache_size += mpu->getCacheSize();
+    }
+
+    // for (auto router : params.router_vector) {
+    //     routerVector.push_back(router);
+    //     router->registerCenteralController(this);
+    // }
+    verticesPerSlice = std::floor(total_cache_size / sizeof(WorkListItem));
+}
+
+Port&
+CenteralController::getPort(const std::string& if_name, PortID idx)
+{
+    if (if_name == "mirrors_map_mem") {
+        return mapPort;
+    } else if (if_name == "mem_port") {
+        return BaseMemoryEngine::getPort("mem_port", idx);
+    } else {
+        return ClockedObject::getPort(if_name, idx);
+    }
+}
+
+void
+CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value)
+{
+    workload = new BFSWorkload(init_addr, init_value);
+}
+
+void
+CenteralController::createBFSVisitedWorkload(Addr init_addr, uint32_t init_value)
+{
+    workload = new BFSVisitedWorkload(init_addr, init_value);
+}
+
+void
+CenteralController::createSSSPWorkload(Addr init_addr, uint32_t init_value)
+{
+    workload = new SSSPWorkload(init_addr, init_value);
+}
+
+void
+CenteralController::createCCWorkload()
+{
+    workload = new CCWorkload();
+}
+
+void
+CenteralController::createAsyncPRWorkload(float alpha, float threshold)
+{
+    workload = new PRWorkload(alpha, threshold);
+}
+
+void
+CenteralController::createPRWorkload(int num_nodes, float alpha)
+{
+    workload = new BSPPRWorkload(num_nodes, alpha);
+}
+
+void
+CenteralController::createBCWorkload(Addr init_addr, uint32_t init_value)
+{
+    workload = new BSPBCWorkload(init_addr, init_value);
+}
+
+void
+CenteralController::createPopCountDirectory(int atoms_per_block)
+{
+    fatal_if(mode == ProcessingMode::NOT_SET, "You should set the processing "
+             "mode by calling setAsyncMode, setBSPMode, or setPGMode.");
+    if (mode == ProcessingMode::ASYNCHRONOUS) {
+        for (auto mpu: mpuVector) {
+            mpu->createAsyncPopCountDirectory(atoms_per_block);
+        }
+    }
+    if (mode == ProcessingMode::BULK_SYNCHRONOUS) {
+        for (auto mpu: mpuVector) {
+            mpu->createBSPPopCountDirectory(atoms_per_block);
+        }
+    }
+    if (mode == ProcessingMode::POLY_GRAPH) {
+        for (auto mpu: mpuVector) {
+            mpu->createAsyncPopCountDirectory(atoms_per_block);
+        }
+    }
+}
+
+void
+CenteralController::startup()
+{
+    unsigned int vertex_atom = mpuVector.front()->vertexAtomSize();
+    for (auto mpu: mpuVector) {
+        for (auto range: mpu->getAddrRanges()) {
+            mpuAddrMap.insert(range, mpu);
+        }
+        mpu->setProcessingMode(mode);
+        mpu->recvWorkload(workload);
+    }
+
+    const auto& vertex_file = params().vertex_image_file;
+    if (vertex_file == "")
+        return;
+
+    auto* object = loader::createObjectFile(vertex_file, true);
+    fatal_if(!object, "%s: Could not load %s.",
name(), vertex_file);
+
+    loader::debugSymbolTable.insert(*object->symtab().globals());
+    loader::MemoryImage vertex_image = object->buildImage();
+    maxVertexAddr = vertex_image.maxAddr();
+
+    int num_total_vertices = (maxVertexAddr / sizeof(WorkListItem));
+    numTotalSlices = std::ceil((double) num_total_vertices / verticesPerSlice);
+
+    numPendingUpdates = new int [numTotalSlices];
+    bestPendingUpdate = new uint32_t [numTotalSlices];
+    for (int i = 0; i < numTotalSlices; i++) {
+        numPendingUpdates[i] = 0;
+        bestPendingUpdate[i] = -1;
+    }
+
+    PortProxy vertex_proxy(
+        [this](PacketPtr pkt) {
+            auto routing_entry = mpuAddrMap.contains(pkt->getAddr());
+            routing_entry->second->recvFunctional(pkt);
+        }, vertex_atom);
+
+    panic_if(!vertex_image.write(vertex_proxy),
+             "%s: Unable to write image.", name());
+
+    for (auto mpu: mpuVector) {
+        mpu->postMemInitSetup();
+        if (!mpu->running() && (mpu->workCount() > 0)) {
+            mpu->start();
+        }
+    }
+    workload->iterate();
+}
+
+void
+CenteralController::ReqPort::sendPacket(PacketPtr pkt)
+{
+    panic_if(blockedPacket != nullptr,
+             "Should never try to send if blocked!");
+    // If we can't send the packet across the port, store it for later.
+    if (!sendTimingReq(pkt)) {
+        DPRINTF(CenteralController, "%s: Port %d: Packet %s "
+                "is blocked.\n", __func__, _id, pkt->print());
+        blockedPacket = pkt;
+    } else {
+        DPRINTF(CenteralController, "%s: Port %d: Packet %s "
+                "sent.\n", __func__, _id, pkt->print());
+    }
+}
+
+bool
+CenteralController::ReqPort::recvTimingResp(PacketPtr pkt)
+{
+    panic("recvTimingResp should not be called at all");
+}
+
+void
+CenteralController::ReqPort::recvReqRetry()
+{
+    panic("recvReqRetry should not be called at all");
+}
+
+void
+CenteralController::recvDoneSignal()
+{
+    bool done = true;
+    for (auto mpu : mpuVector) {
+        done &= mpu->done();
+    }
+
+    // for (auto router : routerVector) {
+    //     done &= router->done();
+    // }
+
+    if (done && mode == ProcessingMode::ASYNCHRONOUS) {
+        exitSimLoopNow("no update left to process.");
+    }
+
+    if (done && mode == ProcessingMode::BULK_SYNCHRONOUS) {
+        for (auto mpu: mpuVector) {
+            mpu->postConsumeProcess();
+            mpu->swapDirectories();
+            if (!mpu->running() && (mpu->workCount() > 0)) {
+                mpu->start();
+            }
+        }
+        workload->iterate();
+        exitSimLoopNow("finished an iteration.");
+    }
+
+    if (done && mode == ProcessingMode::POLY_GRAPH) {
+        DPRINTF(CenteralController, "%s: Received done signal.\n", __func__);
+        exitSimLoopNow("Finished processing a slice.");
+        if (!nextSliceSwitchEvent.scheduled()) {
+            schedule(nextSliceSwitchEvent, nextCycle());
+        }
+    }
+}
+
+int
+CenteralController::chooseNextSlice()
+{
+    int ret_slice_id = -1;
+    int max_pending_count = 0;
+    // TODO: Make this generalizable for all workloads.
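+    // NOTE: Two selection policies are implemented below. When chooseBest
+    // is set, the next slice is the one holding the globally best pending
+    // update (ties broken by pending-update count); otherwise it is the
+    // slice with the most pending updates (ties broken by update value).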
+    uint32_t best_update = -1;
+    for (int i = 0; i < numTotalSlices; i++) {
+        if (numPendingUpdates[i] > max_pending_count) {
+            max_pending_count = numPendingUpdates[i];
+        }
+        if (numPendingUpdates[i] > 0 &&
+            workload->betterThan(bestPendingUpdate[i], best_update)) {
+            best_update = bestPendingUpdate[i];
+        }
+    }
+    if (chooseBest) {
+        int max_count = 0;
+        for (int i = 0; i < numTotalSlices; i++) {
+            if (numPendingUpdates[i] > max_count &&
+                bestPendingUpdate[i] == best_update) {
+                max_count = numPendingUpdates[i];
+                ret_slice_id = i;
+            }
+        }
+    } else {
+        uint32_t best_value = -1;
+        for (int i = 0; i < numTotalSlices; i++) {
+            if (numPendingUpdates[i] == max_pending_count &&
+                workload->betterThan(bestPendingUpdate[i], best_value)) {
+                best_value = bestPendingUpdate[i];
+                ret_slice_id = i;
+            }
+        }
+    }
+    return ret_slice_id;
+}
+
+void
+CenteralController::processNextSliceSwitchEvent()
+{
+    int vertex_atom = mpuVector.front()->vertexAtomSize();
+    int vertices_per_atom = (int) vertex_atom / sizeof(WorkListItem);
+    int bytes_accessed = 0;
+    int updates_generated_total = 0;
+    for (int dst_id = 0; dst_id < numTotalSlices; dst_id++) {
+        if (dst_id == currentSliceId) {
+            continue;
+        }
+        int updates_generated = 0;
+        Addr start_pointer = (currentSliceId * numTotalSlices + dst_id) * sizeof(uint64_t);
+        Addr end_pointer = (currentSliceId * numTotalSlices + dst_id + 1) * sizeof(uint64_t);
+        PacketPtr start = createReadPacket(start_pointer, sizeof(uint64_t));
+        PacketPtr end = createReadPacket(end_pointer, sizeof(uint64_t));
+        mapPort.sendFunctional(start);
+        mapPort.sendFunctional(end);
+        Addr start_addr = start->getLE<uint64_t>();
+        Addr end_addr = end->getLE<uint64_t>();
+        delete start;
+        delete end;
+        DPRINTF(CenteralController, "%s: %d->%d: [%lu, %lu].\n", __func__,
+                currentSliceId, dst_id, start_addr, end_addr);
+
+        int num_bytes = end_addr - start_addr;
+        int num_mirrors = (int) (end_addr - start_addr) / sizeof(MirrorVertex);
+        MirrorVertex* mirrors = new MirrorVertex [num_mirrors];
+
+        PacketPtr read_mirrors = createReadPacket(start_addr, num_bytes);
+        memPort.sendFunctional(read_mirrors);
+        read_mirrors->writeData((uint8_t*) mirrors);
+        delete read_mirrors;
+
+        WorkListItem vertices [vertices_per_atom];
+        for (int i = 0; i < num_mirrors; i++) {
+            Addr org_addr = mirrors[i].vertexId * sizeof(WorkListItem);
+            Addr aligned_org_addr = roundDown(org_addr, vertex_atom);
+            int wl_offset = (int) (org_addr - aligned_org_addr) / sizeof(WorkListItem);
+            PacketPtr read_org = createReadPacket(aligned_org_addr, vertex_atom);
+            auto routing_entry = mpuAddrMap.contains(aligned_org_addr);
+            routing_entry->second->recvFunctional(read_org);
+            read_org->writeDataToBlock((uint8_t*) vertices, vertex_atom);
+            delete read_org;
+            if (vertices[wl_offset].tempProp != vertices[wl_offset].prop) {
+                assert(vertices[wl_offset].degree == 0);
+                vertices[wl_offset].prop = vertices[wl_offset].tempProp;
+            }
+            if (mirrors[i].prop != vertices[wl_offset].prop) {
+                mirrors[i].prop = vertices[wl_offset].prop;
+                if (!mirrors[i].activeNow) {
+                    mirrors[i].activeNow = true;
+                    numPendingUpdates[dst_id]++;
+                    totalUpdatesLeft++;
+                    updates_generated++;
+                }
+                if (workload->betterThan(mirrors[i].prop,
+                                         bestPendingUpdate[dst_id])) {
+                    bestPendingUpdate[dst_id] = mirrors[i].prop;
+                }
+            }
+        }
+        PacketPtr write_mirrors =
+            createWritePacket(start_addr, num_bytes, (uint8_t*) mirrors);
+        memPort.sendFunctional(write_mirrors);
+        delete write_mirrors;
+        delete [] mirrors;
+        DPRINTF(CenteralController, "%s: Done scattering updates from slice "
+                "%d to slice %d.\n", __func__, currentSliceId, dst_id);
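+        // NOTE: Each mirror range is read once and written back once during
+        // the scatter phase, hence the 2x contribution to bytes_accessed.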
+        DPRINTF(CenteralController, "%s: Generated %d updates from slice "
+                "%d to slice %d.\n", __func__,
+                updates_generated, currentSliceId, dst_id);
+        updates_generated_total += updates_generated;
+        bytes_accessed += 2 * num_bytes;
+    }
+    DPRINTF(CenteralController, "%s: Done with slice %d.\n", __func__, currentSliceId);
+    DPRINTF(CenteralController, "%s: Generated a total of %d updates.\n",
+            __func__, updates_generated_total);
+    DPRINTF(CenteralController, "%s: There are a total of %d "
+            "updates left.\n", __func__, totalUpdatesLeft);
+    if (totalUpdatesLeft > 0) {
+        currentSliceId = chooseNextSlice();
+    } else {
+        exitSimLoopNow("Done with all the slices.");
+        return;
+    }
+    DPRINTF(CenteralController, "%s: Chose %d as the "
+            "next slice.\n", __func__, currentSliceId);
+
+    for (int src_id = 0; src_id < numTotalSlices; src_id++) {
+        if (src_id == currentSliceId) {
+            continue;
+        }
+        Addr start_pointer = (src_id * numTotalSlices + currentSliceId) * sizeof(uint64_t);
+        Addr end_pointer = (src_id * numTotalSlices + currentSliceId + 1) * sizeof(uint64_t);
+        PacketPtr start = createReadPacket(start_pointer, sizeof(uint64_t));
+        PacketPtr end = createReadPacket(end_pointer, sizeof(uint64_t));
+        mapPort.sendFunctional(start);
+        mapPort.sendFunctional(end);
+        Addr start_addr = start->getLE<uint64_t>();
+        Addr end_addr = end->getLE<uint64_t>();
+        delete start;
+        delete end;
+
+        int num_bytes = end_addr - start_addr;
+        int num_mirrors = (int) (end_addr - start_addr) / sizeof(MirrorVertex);
+        MirrorVertex* mirrors = new MirrorVertex [num_mirrors];
+
+        PacketPtr read_mirrors = createReadPacket(start_addr, num_bytes);
+        memPort.sendFunctional(read_mirrors);
+        read_mirrors->writeData((uint8_t*) mirrors);
+        delete read_mirrors;
+        for (int i = 0; i < num_mirrors; i++) {
+            if (mirrors[i].activeNow) {
+                Addr org_addr = mirrors[i].vertexId * sizeof(WorkListItem);
+                auto routing_entry = mpuAddrMap.contains(org_addr);
+                routing_entry->second->recvMirrorPush(org_addr, mirrors[i].prop,
+                                    mirrors[i].edgeIndex, mirrors[i].degree);
+                mirrors[i].activeNow = false;
+                numPendingUpdates[currentSliceId]--;
+                totalUpdatesLeft--;
+            }
+        }
+        PacketPtr write_mirrors =
+            createWritePacket(start_addr, num_bytes, (uint8_t*) mirrors);
+        memPort.sendFunctional(write_mirrors);
+        delete write_mirrors;
+        delete [] mirrors;
+        DPRINTF(CenteralController, "%s: Done gathering updates from slice "
+                "%d to slice %d.\n", __func__, src_id, currentSliceId);
+        bytes_accessed += num_bytes;
+    }
+
+    double mirror_mem_bw = mirrorsMem->getBW();
+    Tick time_to_switch = bytes_accessed * mirror_mem_bw;
+    stats.switchTicks += time_to_switch;
+    stats.switchedBytes += bytes_accessed;
+    stats.numSwitches++;
+    for (auto mpu: mpuVector) {
+        mpu->startProcessingMirrors(time_to_switch);
+    }
+    exitSimLoopNow("Done with slice switch.");
+}
+
+bool
+CenteralController::handleMemResp(PacketPtr pkt)
+{
+    panic("handleMemResp should not be called at all");
+}
+
+void
+CenteralController::recvMemRetry()
+{
+    panic("recvMemRetry should not be called at all");
+}
+
+void
+CenteralController::recvFunctional(PacketPtr pkt)
+{
+    panic("recvFunctional should not be called at all");
+}
+
+int
+CenteralController::workCount()
+{
+    int work_count = 0;
+    for (auto mpu: mpuVector) {
+        work_count += mpu->workCount();
+    }
+    return work_count;
+}
+
+float
+CenteralController::getPRError()
+{
+    BSPPRWorkload* pr_workload = dynamic_cast<BSPPRWorkload*>(workload);
+    return pr_workload->getError();
+}
+
+void
+CenteralController::printAnswerToHostSimout()
+{
+    unsigned int vertex_atom =
mpuVector.front()->vertexAtomSize();
+    int num_items = vertex_atom / sizeof(WorkListItem);
+    WorkListItem items[num_items];
+    for (Addr addr = 0; addr < maxVertexAddr; addr += vertex_atom)
+    {
+        PacketPtr pkt = createReadPacket(addr, vertex_atom);
+        auto routing_entry = mpuAddrMap.contains(pkt->getAddr());
+        routing_entry->second->recvFunctional(pkt);
+        pkt->writeDataToBlock((uint8_t*) items, vertex_atom);
+        for (int i = 0; i < num_items; i++) {
+            std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i,
+                                    workload->printWorkListItem(items[i]));
+
+            std::cout << print << std::endl;
+        }
+        delete pkt;
+    }
+}
+
+CenteralController::ControllerStats::ControllerStats(CenteralController& _ctrl):
+    statistics::Group(&_ctrl), ctrl(_ctrl),
+    ADD_STAT(numSwitches, statistics::units::Count::get(),
+             "Number of slice switches completed."),
+    ADD_STAT(switchedBytes, statistics::units::Byte::get(),
+             "Number of bytes accessed during slice switching."),
+    ADD_STAT(switchTicks, statistics::units::Tick::get(),
+             "Number of ticks spent switching slices."),
+    ADD_STAT(switchSeconds, statistics::units::Second::get(),
+             "Number of seconds spent switching slices.")
+{
+}
+
+void
+CenteralController::ControllerStats::regStats()
+{
+    using namespace statistics;
+
+    switchSeconds = switchTicks / simFreq;
+}
+
+}
diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh
new file mode 100644
index 0000000000..ac06b76edc
--- /dev/null
+++ b/src/accl/graph/sega/centeral_controller.hh
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2021 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__
+#define __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__
+
+#include <unordered_map>
+#include <vector>
+
+#include "accl/graph/base/data_structs.hh"
+#include "accl/graph/base/graph_workload.hh"
+#include "accl/graph/sega/base_memory_engine.hh"
+#include "accl/graph/sega/enums.hh"
+#include "accl/graph/sega/mpu.hh"
+#include "accl/graph/sega/router_engine.hh"
+#include "base/addr_range.hh"
+#include "base/intmath.hh"
+#include "mem/simple_mem.hh"
+#include "params/CenteralController.hh"
+
+namespace gem5
+{
+
+class CenteralController : public BaseMemoryEngine
+{
+  private:
+    class ReqPort : public RequestPort
+    {
+      private:
+        CenteralController* owner;
+        PacketPtr blockedPacket;
+        PortID _id;
+
+      public:
+        ReqPort(const std::string& name, CenteralController* owner, PortID id):
+            RequestPort(name, owner),
+            owner(owner), blockedPacket(nullptr), _id(id)
+        {}
+        void sendPacket(PacketPtr pkt);
+        bool blocked() { return (blockedPacket != nullptr); }
+
+      protected:
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual void recvReqRetry();
+    };
+
+    ReqPort mapPort;
+    Addr maxVertexAddr;
+    ProcessingMode mode;
+
+    memory::SimpleMemory* mirrorsMem;
+
+    std::vector<MPU*> mpuVector;
+    AddrRangeMap<MPU*> mpuAddrMap;
+    std::vector<RouterEngine*> routerVector;
+
+    std::unordered_map<MPU*, AddrRangeList> addrRangeListMap;
+
+    int currentSliceId;
+    int numTotalSlices;
+    int verticesPerSlice;
+    int totalUpdatesLeft;
+
+    bool chooseBest;
+    int* numPendingUpdates;
+    uint32_t* bestPendingUpdate;
+    int chooseNextSlice();
+
+    EventFunctionWrapper nextSliceSwitchEvent;
+    void processNextSliceSwitchEvent();
+
+    struct ControllerStats : public statistics::Group
+    {
+        ControllerStats(CenteralController& ctrl);
+
+        void regStats() override;
+
+        CenteralController& ctrl;
+
+        statistics::Scalar numSwitches;
+        statistics::Scalar switchedBytes;
+        statistics::Scalar switchTicks;
+        statistics::Formula switchSeconds;
+    };
+    ControllerStats stats;
+
+  protected:
+    virtual void recvMemRetry() override;
+    virtual bool handleMemResp(PacketPtr pkt) override;
+
+  public:
+    GraphWorkload* workload;
+
+    PARAMS(CenteralController);
+    CenteralController(const Params& params);
+    Port& getPort(const std::string& if_name,
+                  PortID idx = InvalidPortID) override;
+
+    virtual void startup() override;
+
+    virtual void recvFunctional(PacketPtr pkt) override;
+
+    void setAsyncMode() { mode = ProcessingMode::ASYNCHRONOUS; }
+    void setBSPMode() { mode = ProcessingMode::BULK_SYNCHRONOUS; }
+    void setPGMode() { mode = ProcessingMode::POLY_GRAPH; }
+
+    void createPopCountDirectory(int atoms_per_block);
+
+    void createBFSWorkload(Addr init_addr, uint32_t init_value);
+    void createBFSVisitedWorkload(Addr init_addr, uint32_t init_value);
+    void createSSSPWorkload(Addr init_addr, uint32_t init_value);
+    void createCCWorkload();
+    void createAsyncPRWorkload(float alpha, float threshold);
+    void createPRWorkload(int num_nodes, float alpha);
+    void createBCWorkload(Addr init_addr, uint32_t init_value);
+
+    void recvDoneSignal();
+
+    int workCount();
+    float getPRError();
+    void printAnswerToHostSimout();
+};
+
+}
+
+#endif // __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__
diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc
new file mode 100644
index 0000000000..5e0c8c8095
--- /dev/null
+++ b/src/accl/graph/sega/coalesce_engine.cc
@@ -0,0 +1,1322 @@
+/*
+ * Copyright (c) 2020 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "accl/graph/sega/coalesce_engine.hh"
+
+#include <tuple>
+
+#include "accl/graph/sega/mpu.hh"
+#include "base/intmath.hh"
+#include "debug/CacheBlockState.hh"
+#include "debug/CoalesceEngine.hh"
+#include "debug/SEGAStructureSize.hh"
+#include "mem/packet_access.hh"
+#include "sim/sim_exit.hh"
+
+namespace gem5
+{
+
+CoalesceEngine::CoalesceEngine(const Params &params):
+    BaseMemoryEngine(params), mode(ProcessingMode::NOT_SET), lastAtomAddr(0),
+    numLines((int) (params.cache_size / peerMemoryAtomSize)),
+    numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))),
+    lastReadTick(0), onTheFlyReqs(0), maxRespPerCycle(params.max_resp_per_cycle),
+    numReceivedPulls(0), numScheduledPulls(0), pendingPullLimit(params.pending_pull_limit),
+    pendingPullReads(0), activeBufferSize(params.active_buffer_size),
+    postPushWBQueueSize(params.post_push_wb_queue_size),
+    transitionsPerCycle(params.transitions_per_cycle),
+    nextMemoryEvent([this] {
+        processNextMemoryEvent();
+    }, name() + ".nextMemoryEvent"),
+    nextResponseEvent([this] {
+        processNextResponseEvent();
+    }, name() + ".nextResponseEvent"),
+    nextApplyEvent([this] {
+        processNextApplyEvent();
+    }, name() + ".nextApplyEvent"),
+    nextDoneSignalEvent([this] {
+        processNextDoneSignalEvent();
+    }, name() + ".nextDoneSignalEvent"),
+    stats(*this)
+{
+    assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine));
+    cacheBlocks = new Block [numLines];
+    for (int i = 0; i < numLines; i++) {
+        cacheBlocks[i] = Block(numElementsPerLine);
+    }
+    numActiveBlocksNow = UniqueFIFO<int>(numLines);
+    numActiveBlocksNext = UniqueFIFO<int>(numLines);
+
+    activeBuffer.clear();
+    postPushWBQueue.clear();
+    blocksTouchedThisTick.clear();
+}
+
+void
+CoalesceEngine::registerMPU(MPU* mpu)
+{
+    owner = mpu;
+}
+
+
+// NOTE: Used for initializing memory and reading the final answer
+void
+CoalesceEngine::recvFunctional(PacketPtr pkt)
+{
+    if (pkt->isRead()) {
+        assert(pkt->getSize() == peerMemoryAtomSize);
+        Addr addr = pkt->getAddr();
+        int block_index = getBlockIndex(addr);
+
+        if
((cacheBlocks[block_index].addr == addr) &&
+            (cacheBlocks[block_index].valid)) {
+            assert(cacheBlocks[block_index].state == CacheState::IDLE);
+
+            pkt->makeResponse();
+            pkt->setDataFromBlock(
+                (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize);
+        } else {
+            memPort.sendFunctional(pkt);
+        }
+    } else {
+        graphWorkload->init(pkt, currentDirectory);
+        if (pkt->getAddr() > lastAtomAddr) {
+            lastAtomAddr = pkt->getAddr();
+        }
+        memPort.sendFunctional(pkt);
+    }
+}
+
+void
+CoalesceEngine::postMemInitSetup()
+{
+    currentDirectory->setLastAtomAddr(lastAtomAddr);
+}
+
+void
+CoalesceEngine::postConsumeProcess()
+{
+    Addr last_local_atom_addr = peerMemoryRange.removeIntlvBits(lastAtomAddr);
+    for (Addr local_addr = 0; local_addr <= last_local_atom_addr; local_addr += peerMemoryAtomSize) {
+        Addr addr = peerMemoryRange.addIntlvBits(local_addr);
+        int block_index = getBlockIndex(addr);
+        if (cacheBlocks[block_index].addr == addr) {
+            assert(cacheBlocks[block_index].valid);
+            assert(!cacheBlocks[block_index].hasConflict);
+            assert(cacheBlocks[block_index].state == CacheState::IDLE);
+            bool atom_active_future_before = false;
+            bool atom_active_future_after = false;
+            for (int index = 0; index < numElementsPerLine; index++) {
+                assert(!cacheBlocks[block_index].items[index].activeNow);
+                atom_active_future_before |= cacheBlocks[block_index].items[index].activeFuture;
+                graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]);
+                atom_active_future_after |= cacheBlocks[block_index].items[index].activeFuture;
+                if (cacheBlocks[block_index].items[index].activeFuture) {
+                    cacheBlocks[block_index].items[index].activeFuture = false;
+                    cacheBlocks[block_index].items[index].activeNow = true;
+                    cacheBlocks[block_index].dirty = true;
+                }
+            }
+            if (!atom_active_future_before && atom_active_future_after) {
+                numActiveBlocksNext.push_back(block_index);
+            }
+            if (atom_active_future_before && !atom_active_future_after) {
+                numActiveBlocksNext.erase(block_index);
+            }
+        } else {
+            WorkListItem items[numElementsPerLine];
+            PacketPtr read_pkt = createReadPacket(addr, peerMemoryAtomSize);
+            memPort.sendFunctional(read_pkt);
+            read_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize);
+            bool atom_active_future_before = false;
+            bool atom_active_future_after = false;
+            for (int index = 0; index < numElementsPerLine; index++) {
+                assert(!items[index].activeNow);
+                atom_active_future_before |= items[index].activeFuture;
+                graphWorkload->interIterationInit(items[index]);
+                atom_active_future_after |= items[index].activeFuture;
+                if (items[index].activeFuture) {
+                    items[index].activeFuture = false;
+                    items[index].activeNow = true;
+                }
+            }
+            if (!atom_active_future_before && atom_active_future_after) {
+                futureDirectory->activate(addr);
+            }
+            if (atom_active_future_before && !atom_active_future_after) {
+                futureDirectory->deactivate(addr);
+            }
+            PacketPtr write_pkt = createWritePacket(addr, peerMemoryAtomSize, (uint8_t*) items);
+            memPort.sendFunctional(write_pkt);
+            delete read_pkt;
+            delete write_pkt;
+        }
+    }
+}
+
+void
+CoalesceEngine::createAsyncPopCountDirectory(int atoms_per_block)
+{
+    currentDirectory = new PopCountDirectory(
+        peerMemoryRange, atoms_per_block, peerMemoryAtomSize);
+    futureDirectory = nullptr;
+}
+
+void
+CoalesceEngine::createBSPPopCountDirectory(int atoms_per_block)
+{
+    currentDirectory = new PopCountDirectory(
+        peerMemoryRange, atoms_per_block, peerMemoryAtomSize);
+    futureDirectory = new PopCountDirectory(
+        peerMemoryRange, atoms_per_block, peerMemoryAtomSize);
+}
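+
+// NOTE: In BULK_SYNCHRONOUS mode activity is double-buffered: the "Now"
+// structures (numActiveBlocksNow, currentDirectory) track the frontier of
+// the iteration being consumed, while the "Next" structures
+// (numActiveBlocksNext, futureDirectory) accumulate vertices activated for
+// the following iteration; swapDirectories exchanges the two at an
+// iteration boundary.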
+
+void
+CoalesceEngine::swapDirectories()
+{
+    assert(currentDirectory->empty());
+    assert(numActiveBlocksNow.empty());
+    WorkDirectory* temp = currentDirectory;
+    currentDirectory = futureDirectory;
+    futureDirectory = temp;
+
+    numActiveBlocksNow.clear();
+    numActiveBlocksNow = numActiveBlocksNext;
+    numActiveBlocksNext.clear();
+}
+
+bool
+CoalesceEngine::done()
+{
+    return memAccBuffer.empty() && numActiveBlocksNow.empty() &&
+           activeBuffer.empty() && currentDirectory->empty() && (onTheFlyReqs == 0);
+}
+
+bool
+CoalesceEngine::enoughSpace()
+{
+    return (activeBuffer.size() + pendingPullReads + numScheduledPulls) < activeBufferSize;
+}
+
+bool
+CoalesceEngine::pullCondition()
+{
+    bool enough_space = enoughSpace();
+    bool schedule_limit = numScheduledPulls < pendingPullLimit;
+    return enough_space && schedule_limit;
+}
+
+// addr should be aligned to peerMemoryAtomSize
+int
+CoalesceEngine::getBlockIndex(Addr addr)
+{
+    assert((addr % peerMemoryAtomSize) == 0);
+    Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr);
+    return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines;
+}
+
+ReadReturnStatus
+CoalesceEngine::recvWLRead(Addr addr)
+{
+    Addr aligned_addr = roundDown(addr, peerMemoryAtomSize);
+    assert(aligned_addr % peerMemoryAtomSize == 0);
+    int block_index = getBlockIndex(aligned_addr);
+    assert(block_index < numLines);
+    if (lastReadTick < curTick()) {
+        blocksTouchedThisTick.clear();
+        lastReadTick = curTick();
+    }
+    int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem);
+    assert(wl_offset < numElementsPerLine);
+    DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. "
+            "This request maps to cacheBlocks[%d], aligned_addr: "
+            "%lu, and wl_offset: %d.\n", __func__, addr,
+            block_index, aligned_addr, wl_offset);
+    DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__,
+            block_index, cacheBlocks[block_index].to_string());
+
+    if ((cacheBlocks[block_index].addr == aligned_addr) &&
+        (cacheBlocks[block_index].valid)) {
+        // Hit
+        DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr);
+        stats.readHits++;
+        assert(cacheBlocks[block_index].state != CacheState::INVALID);
+        responseQueue.push_back(std::make_tuple(
+            addr, cacheBlocks[block_index].items[wl_offset], curTick()));
+
+        DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) "
+                "to responseQueue. responseQueue.size = %d.\n",
+                __func__, addr,
+                graphWorkload->printWorkListItem(
+                    cacheBlocks[block_index].items[wl_offset]),
+                responseQueue.size());
+        DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) "
+                "to responseQueue. responseQueue.size = %d.\n",
+                __func__, addr,
+                graphWorkload->printWorkListItem(
+                    cacheBlocks[block_index].items[wl_offset]),
+                responseQueue.size());
+        // TODO: Stat to count the number of WLItems that have been touched.
+        cacheBlocks[block_index].busyMask |= (1 << wl_offset);
+        cacheBlocks[block_index].state = CacheState::BUSY;
+        // HACK: If a read happens on the same cycle as another operation such
+        // as apply, set lastChangedTick to half a cycle later so that
+        // operations scheduled by the original operation (apply in this
+        // example) are invalidated.
For more details, refer to
+        // "accl/graph/sega/busyMaskErr".
+        cacheBlocks[block_index].lastChangedTick =
+            curTick() + (Tick) (clockPeriod() / 2);
+        DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__,
+                block_index, cacheBlocks[block_index].to_string());
+
+        blocksTouchedThisTick.insert(block_index);
+        if (!nextResponseEvent.scheduled()) {
+            schedule(nextResponseEvent, nextCycle());
+        }
+
+        stats.numVertexReads++;
+        return ReadReturnStatus::ACCEPT;
+    } else if ((cacheBlocks[block_index].addr == aligned_addr) &&
+               (cacheBlocks[block_index].state == CacheState::PENDING_DATA)) {
+        // Hit under miss
+        DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n",
+                __func__, addr);
+        stats.readHitUnderMisses++;
+        assert(!cacheBlocks[block_index].valid);
+        assert(cacheBlocks[block_index].busyMask == 0);
+        assert(!cacheBlocks[block_index].dirty);
+
+        assert(MSHR.find(block_index) != MSHR.end());
+        MSHR[block_index].push_back(addr);
+        DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to MSHR "
+                "for cacheBlocks[%d].\n", __func__, addr, block_index);
+        DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__,
+                block_index, cacheBlocks[block_index].to_string());
+        blocksTouchedThisTick.insert(block_index);
+
+        stats.numVertexReads++;
+        return ReadReturnStatus::ACCEPT;
+    } else {
+        // miss
+        assert(cacheBlocks[block_index].addr != aligned_addr);
+        DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr);
+        stats.readMisses++;
+        if (blocksTouchedThisTick.find(block_index) != blocksTouchedThisTick.end()) {
+            DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has already been "
+                    "accessed this tick.\n", __func__, block_index);
+            return ReadReturnStatus::REJECT_ROLL;
+        }
+        if (cacheBlocks[block_index].state != CacheState::INVALID) {
+            // conflict miss
+            DPRINTF(CoalesceEngine, "%s: Addr: %lu has conflict with "
+                    "Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr);
+            cacheBlocks[block_index].hasConflict = true;
+            if (cacheBlocks[block_index].state == CacheState::IDLE) {
+                if (cacheBlocks[block_index].dirty) {
+                    DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is dirty.\n",
+                            __func__, block_index);
+                    cacheBlocks[block_index].state = CacheState::PENDING_WB;
+                    cacheBlocks[block_index].lastChangedTick = curTick();
+                    memAccBuffer.emplace_back(
+                        [this] (int block_index, Tick schedule_tick) {
+                            processNextWriteBack(block_index, schedule_tick);
+                        }, block_index, curTick());
+                    if ((!nextMemoryEvent.pending()) &&
+                        (!nextMemoryEvent.scheduled())) {
+                        schedule(nextMemoryEvent, nextCycle());
+                    }
+                    DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is now "
+                            "pending write back.\n", __func__, block_index);
+                } else {
+                    // NOTE: The cache block could still be active but not
+                    // dirty. If it is active, we only have to hand its
+                    // active tracking back to the work directory; the data
+                    // itself can be thrown away.
+                    DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is not dirty.\n",
+                            __func__, block_index);
+                    bool atom_active_now = false;
+                    bool atom_active_future = false;
+                    for (int index = 0; index < numElementsPerLine; index++) {
+                        atom_active_now |= cacheBlocks[block_index].items[index].activeNow;
+                        atom_active_future |= cacheBlocks[block_index].items[index].activeFuture;
+                    }
+                    if (atom_active_now) {
+                        DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is active now.\n",
+                                __func__, block_index);
+                        numActiveBlocksNow.erase(block_index);
+                        int count = currentDirectory->activate(cacheBlocks[block_index].addr);
+                        stats.currentFrontierSize.sample(currentDirectory->workCount());
+                        stats.countActiveBlocksNow.sample(count);
+                    }
+                    if (atom_active_future) {
+                        DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is active next.\n",
+                                __func__, block_index);
+                        numActiveBlocksNext.erase(block_index);
+                        int count = futureDirectory->activate(cacheBlocks[block_index].addr);
+                        stats.futureFrontierSize.sample(futureDirectory->workCount());
+                        stats.countActiveBlocksNext.sample(count);
+                    }
+                    // NOTE: Bring the cache line to invalid state.
+                    // NOTE: The line above where we set hasConflict to true
+                    // does not matter anymore since we reset the cache line.
+                    cacheBlocks[block_index].reset();
+                    DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is reset.\n",
+                            __func__, block_index);
+                }
+                blocksTouchedThisTick.insert(block_index);
+                return ReadReturnStatus::REJECT_NO_ROLL;
+            } else {
+                blocksTouchedThisTick.insert(block_index);
+                stats.numConflicts++;
+                return ReadReturnStatus::REJECT_ROLL;
+            }
+        } else {
+            // cold miss
+            assert(MSHR.find(block_index) == MSHR.end());
+            cacheBlocks[block_index].addr = aligned_addr;
+            cacheBlocks[block_index].busyMask = 0;
+            cacheBlocks[block_index].valid = false;
+            cacheBlocks[block_index].dirty = false;
+            cacheBlocks[block_index].hasConflict = false;
+            cacheBlocks[block_index].state = CacheState::PENDING_DATA;
+            cacheBlocks[block_index].lastChangedTick = curTick();
+
+            MSHR[block_index].push_back(addr);
+            memAccBuffer.emplace_back(
+                [this] (int block_index, Tick schedule_tick) {
+                    processNextRead(block_index, schedule_tick);
+                }, block_index, curTick());
+            if ((!nextMemoryEvent.pending()) &&
+                (!nextMemoryEvent.scheduled())) {
+                schedule(nextMemoryEvent, nextCycle());
+            }
+            blocksTouchedThisTick.insert(block_index);
+            return ReadReturnStatus::ACCEPT;
+        }
+    }
+}
+
+bool
+CoalesceEngine::handleMemResp(PacketPtr pkt)
+{
+    assert(pkt->isResponse());
+    DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n",
+            __func__, pkt->print());
+
+    onTheFlyReqs--;
+    if (pkt->isWrite()) {
+        DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__);
+        delete pkt;
+    } else {
+        assert(pkt->isRead());
+        Addr addr = pkt->getAddr();
+        int block_index = getBlockIndex(addr);
+        ReadPurpose* purpose = pkt->findNextSenderState<ReadPurpose>();
+
+        // NOTE: Regardless of where the pkt will go we have to release the
+        // reserved space for this pkt in the activeBuffer in case
+        // it was read from memory for placement in the activeBuffer.
+        // NOTE: We also have to stop tracking the address in pendingPullAddrs.
+        if (purpose->dest() == ReadDestination::READ_FOR_PUSH) {
+            pendingPullReads--;
+            pendingPullAddrs.erase(addr);
+        }
+        if (cacheBlocks[block_index].addr == addr) {
+            // If it is in the cache, line should be in PENDING_DATA state.
+            // Regardless of the purpose for which it was read, it should
+            // be placed in the cache array.
+            assert(cacheBlocks[block_index].busyMask == 0);
+            assert(!cacheBlocks[block_index].valid);
+            assert(!cacheBlocks[block_index].dirty);
+            assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA);
+
+            // NOTE: Since it is in PENDING_DATA state it
+            // should have an entry in the MSHR.
+            assert(MSHR.find(block_index) != MSHR.end());
+
+            pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items,
+                                  peerMemoryAtomSize);
+
+            cacheBlocks[block_index].valid = true;
+            // HACK: In case the pkt was read for push but it was allocated
+            // for in the cache later on, we should cancel the future
+            // processNextRead for this block. We could set lastChangedTick
+            // to curTick() like usual. However, there is no way to ensure
+            // that processNextRead will not be called on the same tick
+            // as the pkt arrives from the memory. Therefore, we will set
+            // the lastChangedTick to half a cycle before the actual time.
+            // We move that back in time because it would be fine if
+            // processNextRead happened before the pkt arrived. processNextRead
+            // actually will check if there is a pending read for push for
+            // the address it's trying to populate.
+            if (purpose->dest() == ReadDestination::READ_FOR_PUSH) {
+                cacheBlocks[block_index].lastChangedTick =
+                    curTick() - (Tick) (clockPeriod() / 2);
+            } else {
+                cacheBlocks[block_index].lastChangedTick = curTick();
+            }
+
+            // NOTE: If the atom is active we have to deactivate the tracking
+            // of this atom in the memory since it's not in memory anymore.
+            // Since it is going to the cache, cache will be responsible for
+            // tracking this. Push to numActiveBlocksNow for simulator speed
+            // instead of having to search for active blocks in the cache.
+            bool atom_active_now = false;
+            bool atom_active_future = false;
+            for (int index = 0; index < numElementsPerLine; index++) {
+                atom_active_now |= cacheBlocks[block_index].items[index].activeNow;
+                atom_active_future |= cacheBlocks[block_index].items[index].activeFuture;
+            }
+            if (atom_active_now) {
+                int count = currentDirectory->deactivate(addr);
+                numActiveBlocksNow.push_back(block_index);
+                stats.currentFrontierSize.sample(currentDirectory->workCount());
+                stats.countActiveBlocksNow.sample(count);
+            }
+            if (atom_active_future) {
+                int count = futureDirectory->deactivate(addr);
+                numActiveBlocksNext.push_back(block_index);
+                stats.futureFrontierSize.sample(futureDirectory->workCount());
+                stats.countActiveBlocksNext.sample(count);
+            }
+
+            assert(MSHR.find(block_index) != MSHR.end());
+            for (auto it = MSHR[block_index].begin();
+                 it != MSHR[block_index].end();) {
+                Addr miss_addr = *it;
+                Addr aligned_miss_addr =
+                    roundDown(miss_addr, peerMemoryAtomSize);
+
+                assert(aligned_miss_addr == cacheBlocks[block_index].addr);
+                int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem);
+                DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for "
+                        "cacheBlocks[%d] can be serviced with the received "
+                        "packet.\n", __func__, miss_addr, block_index);
+                responseQueue.push_back(std::make_tuple(miss_addr,
+                    cacheBlocks[block_index].items[wl_offset], curTick()));
+                DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) "
+                        "to responseQueue. responseQueue.size = %d.\n",
+                        __func__, miss_addr,
+                        graphWorkload->printWorkListItem(
+                            cacheBlocks[block_index].items[wl_offset]),
+                        responseQueue.size());
+                DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) "
+                        "to responseQueue. responseQueue.size = %d.\n",
+                        __func__, miss_addr,
+                        graphWorkload->printWorkListItem(
+                            cacheBlocks[block_index].items[wl_offset]),
+                        responseQueue.size());
+                cacheBlocks[block_index].busyMask |= (1 << wl_offset);
+                DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__,
+                        block_index, cacheBlocks[block_index].to_string());
+                it = MSHR[block_index].erase(it);
+            }
+            MSHR.erase(block_index);
+
+            cacheBlocks[block_index].state = CacheState::BUSY;
+            if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) {
+                schedule(nextResponseEvent, nextCycle());
+            }
+            delete pkt;
+        } else {
+            assert(purpose->dest() == ReadDestination::READ_FOR_PUSH);
+            // There should be enough room in activeBuffer to place this pkt.
+            // REMEMBER: If dest == READ_FOR_PUSH we release the reserved space.
+            // So at this point in code we should have at least one free entry
+            // in the active buffer which is reserved for this pkt.
+            assert(activeBuffer.size() + pendingPullReads < activeBufferSize);
+
+            WorkListItem items[numElementsPerLine];
+            pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize);
+            bool atom_active_now = false;
+            bool atom_active_future = false;
+            for (int index = 0; index < numElementsPerLine; index++) {
+                atom_active_now |= items[index].activeNow;
+                atom_active_future |= items[index].activeFuture;
+            }
+            if (atom_active_now) {
+                int count = currentDirectory->deactivate(addr);
+                stats.currentFrontierSize.sample(currentDirectory->workCount());
+                stats.countActiveBlocksNow.sample(count);
+                if (atom_active_future) {
+                    int count = futureDirectory->deactivate(addr);
+                    stats.futureFrontierSize.sample(futureDirectory->workCount());
+                    stats.countActiveBlocksNext.sample(count);
+                }
+                activeBuffer.emplace_back(pkt, curTick());
+            } else {
+                stats.wastefulBytesRead += pkt->getSize();
+                delete pkt;
+            }
+
+            if (pullCondition()) {
+                memAccBuffer.emplace_back(
+                    [this] (int ignore, Tick schedule_tick) {
+                        processNextVertexPull(ignore, schedule_tick);
+                    }, -1, curTick());
+                if ((!nextMemoryEvent.pending()) &&
+                    (!nextMemoryEvent.scheduled())) {
+                    schedule(nextMemoryEvent, nextCycle());
+                }
+                numScheduledPulls++;
+            }
+        }
+        delete purpose;
+    }
+
+    if (done() && !nextDoneSignalEvent.scheduled()) {
+        schedule(nextDoneSignalEvent, nextCycle());
+    }
+    return true;
+}
+
+void
+CoalesceEngine::processNextResponseEvent()
+{
+    int num_responses_sent = 0;
+
+    Addr addr_response;
+    WorkListItem worklist_response;
+    Tick response_queueing_tick;
+    while (true) {
+        std::tie(addr_response, worklist_response, response_queueing_tick) =
+            responseQueue.front();
+        Tick waiting_ticks = curTick() - response_queueing_tick;
+        if (ticksToCycles(waiting_ticks) < 1) {
+            break;
+        }
+        owner->handleIncomingWL(addr_response, worklist_response);
+        num_responses_sent++;
+        DPRINTF(CoalesceEngine,
+                "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n",
+                __func__,
+                graphWorkload->printWorkListItem(worklist_response),
+                addr_response);
+
+        responseQueue.pop_front();
+        DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue."
+                " responseQueue.size = %d.\n", __func__,
+                responseQueue.size());
+        DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. "
+                "responseQueue.size = %d.\n", __func__,
+                responseQueue.size());
+        stats.responseQueueLatency.sample(
+            waiting_ticks * 1e9 / getClockFrequency());
+        if (num_responses_sent >= maxRespPerCycle) {
+            // TODO: Add the condition to check that front of queue can be
+            // sent to WLEngine, i.e. it has at least been in the queue for
+            // one cycle.
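+            // NOTE: Responses drain at a rate of at most maxRespPerCycle per
+            // cycle, and only after sitting in responseQueue for at least one
+            // full cycle; a non-empty queue at this cutoff is recorded as a
+            // response port shortage.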
+            if (!responseQueue.empty()) {
+                stats.responsePortShortage++;
+            }
+            break;
+        }
+        if (responseQueue.empty()) {
+            break;
+        }
+    }
+
+    if ((!nextResponseEvent.scheduled()) &&
+        (!responseQueue.empty())) {
+        schedule(nextResponseEvent, nextCycle());
+    }
+}
+
+void
+CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl)
+{
+    Addr aligned_addr = roundDown(addr, peerMemoryAtomSize);
+    int block_index = getBlockIndex(aligned_addr);
+    int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem);
+    DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with "
+            "wl: %s. This request maps to cacheBlocks[%d], "
+            "aligned_addr: %lu, and wl_offset: %d.\n",
+            __func__, addr, graphWorkload->printWorkListItem(wl),
+            block_index, aligned_addr, wl_offset);
+    DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__,
+            block_index, cacheBlocks[block_index].to_string());
+    DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s "
+            "with Addr: %lu.\n", __func__,
+            graphWorkload->printWorkListItem(wl), addr);
+
+    // NOTE: Design does not allow for write misses.
+    assert(cacheBlocks[block_index].addr == aligned_addr);
+    // cache state asserts
+    assert(cacheBlocks[block_index].busyMask != 0);
+    assert(cacheBlocks[block_index].valid);
+    assert(cacheBlocks[block_index].state == CacheState::BUSY);
+
+    // respective bit in busyMask for wl is set.
+    assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) ==
+           (1 << wl_offset));
+
+    if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) {
+        cacheBlocks[block_index].dirty |= true;
+    }
+
+    bool active = graphWorkload->activeCondition(wl, cacheBlocks[block_index].items[wl_offset]);
+    cacheBlocks[block_index].items[wl_offset] = wl;
+    if (mode == ProcessingMode::ASYNCHRONOUS || mode == ProcessingMode::POLY_GRAPH) {
+        cacheBlocks[block_index].items[wl_offset].activeNow |= active;
+        if (active && (!numActiveBlocksNow.find(block_index))) {
+            numActiveBlocksNow.push_back(block_index);
+            if (!owner->running()) {
+                owner->start();
+            }
+        }
+    }
+    if (mode == ProcessingMode::BULK_SYNCHRONOUS) {
+        cacheBlocks[block_index].items[wl_offset].activeFuture |= active;
+        if (active && (!numActiveBlocksNext.find(block_index))) {
+            numActiveBlocksNext.push_back(block_index);
+        }
+    }
+
+    cacheBlocks[block_index].busyMask &= ~(1 << wl_offset);
+    cacheBlocks[block_index].lastChangedTick = curTick();
+    DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n",
+            __func__, block_index, wl_offset,
+            graphWorkload->printWorkListItem(
+                cacheBlocks[block_index].items[wl_offset]));
+    DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__,
+            block_index, cacheBlocks[block_index].to_string());
+
+    if (cacheBlocks[block_index].busyMask == 0) {
+        if (cacheBlocks[block_index].hasConflict) {
+            if (cacheBlocks[block_index].dirty) {
+                cacheBlocks[block_index].state = CacheState::PENDING_WB;
+                cacheBlocks[block_index].lastChangedTick = curTick();
+                memAccBuffer.emplace_back(
+                    [this] (int block_index, Tick schedule_tick) {
+                        processNextWriteBack(block_index, schedule_tick);
+                    }, block_index, curTick());
+                if ((!nextMemoryEvent.pending()) &&
+                    (!nextMemoryEvent.scheduled())) {
+                    schedule(nextMemoryEvent, nextCycle());
+                }
+            } else {
+                bool atom_active_now = false;
+                bool atom_active_future = false;
+                for (int index = 0; index < numElementsPerLine; index++) {
+                    atom_active_now |= cacheBlocks[block_index].items[index].activeNow;
+                    atom_active_future |= cacheBlocks[block_index].items[index].activeFuture;
+                }
+                if (atom_active_now) {
numActiveBlocksNow.erase(block_index);
+                    int count = currentDirectory->activate(cacheBlocks[block_index].addr);
+                    stats.currentFrontierSize.sample(currentDirectory->workCount());
+                    stats.countActiveBlocksNow.sample(count);
+                }
+                if (atom_active_future) {
+                    numActiveBlocksNext.erase(block_index);
+                    int count = futureDirectory->activate(cacheBlocks[block_index].addr);
+                    stats.futureFrontierSize.sample(futureDirectory->workCount());
+                    stats.countActiveBlocksNext.sample(count);
+                }
+                cacheBlocks[block_index].reset();
+            }
+        } else {
+            cacheBlocks[block_index].state = CacheState::IDLE;
+            cacheBlocks[block_index].lastChangedTick = curTick();
+        }
+    }
+    DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__,
+            block_index, cacheBlocks[block_index].to_string());
+    stats.numVertexWrites++;
+
+    if ((cacheBlocks[block_index].state == CacheState::IDLE) &&
+        done() && !nextDoneSignalEvent.scheduled()) {
+        schedule(nextDoneSignalEvent, nextCycle());
+    }
+}
+
+void
+CoalesceEngine::processNextMemoryEvent()
+{
+    int num_transitions = 0;
+    std::unordered_set<int> transitions;
+    MemoryFunctionDeque temp_deque;
+    temp_deque.clear();
+
+    while (true) {
+        if (memPort.blocked()) {
+            while (!temp_deque.empty()) {
+                memAccBuffer.push_front(temp_deque.back());
+                temp_deque.pop_back();
+            }
+            stats.numMemoryBlocks++;
+            nextMemoryEvent.sleep();
+            return;
+        }
+        DPRINTF(CoalesceEngine, "%s: Processing another "
+                "memory function.\n", __func__);
+        std::function<void(int, Tick)> function;
+        int input;
+        Tick tick;
+        std::tie(function, input, tick) = memAccBuffer.front();
+        if ((transitions.find(input) == transitions.end()) || (input == -1)) {
+            function(input, tick);
+            memAccBuffer.pop_front();
+            transitions.insert(input);
+            stats.memAccBufferLat.sample((curTick() - tick) * 1e9 / getClockFrequency());
+            DPRINTF(CoalesceEngine, "%s: Popped a function from memAccBuffer. "
+                    "memAccBuffer.size = %d.\n", __func__, memAccBuffer.size());
+            num_transitions++;
+        } else {
+            temp_deque.emplace_back(function, input, tick);
+            memAccBuffer.pop_front();
+        }
+        if ((num_transitions >= transitionsPerCycle) || memAccBuffer.empty()) {
+            break;
+        }
+    }
+
+    while (!temp_deque.empty()) {
+        memAccBuffer.push_front(temp_deque.back());
+        temp_deque.pop_back();
+    }
+
+    assert(!nextMemoryEvent.pending());
+    assert(!nextMemoryEvent.scheduled());
+    if ((!memAccBuffer.empty())) {
+        schedule(nextMemoryEvent, nextCycle());
+    }
+
+    if (done() && !nextDoneSignalEvent.scheduled()) {
+        schedule(nextDoneSignalEvent, nextCycle());
+    }
+}
+
+void
+CoalesceEngine::processNextRead(int block_index, Tick schedule_tick)
+{
+    DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n",
+            __func__, block_index);
+    DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n",
+            __func__, block_index, cacheBlocks[block_index].to_string());
+    // A cache block should not be touched while it's waiting for data.
+    // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick);
+    // TODO: Figure out if this is still necessary.
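+    // NOTE: Stale-event filtering: every deferred memory function carries
+    // the tick at which it was scheduled, e.g. the lambdas queued in
+    // recvWLRead:
+    //
+    //     memAccBuffer.emplace_back(
+    //         [this] (int block_index, Tick schedule_tick) {
+    //             processNextRead(block_index, schedule_tick);
+    //         }, block_index, curTick());
+    //
+    // If the block's lastChangedTick has moved past schedule_tick, another
+    // transition superseded this function and it drops itself below.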
+    if (cacheBlocks[block_index].lastChangedTick != schedule_tick) {
+        return;
+    }
+
+    assert(cacheBlocks[block_index].busyMask == 0);
+    assert(!cacheBlocks[block_index].valid);
+    assert(!cacheBlocks[block_index].dirty);
+    assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA);
+
+    bool need_send_pkt = true;
+
+    // NOTE: Search postPushWBQueue
+    for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end();)
+    {
+        PacketPtr wb_pkt = std::get<0>(*wb);
+        if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) {
+            wb_pkt->writeDataToBlock(
+                (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize);
+            cacheBlocks[block_index].valid = true;
+            cacheBlocks[block_index].dirty = true;
+            cacheBlocks[block_index].lastChangedTick = curTick();
+            // NOTE: If an atom is in the postPushWBQueue,
+            // then it is definitely not currently active.
+            bool atom_active_future = false;
+            for (int index = 0; index < numElementsPerLine; index++)
+            {
+                assert(!cacheBlocks[block_index].items[index].activeNow);
+                atom_active_future |= cacheBlocks[block_index].items[index].activeFuture;
+            }
+            if (atom_active_future) {
+                numActiveBlocksNext.push_back(block_index);
+            }
+
+            need_send_pkt = false;
+            wb = postPushWBQueue.erase(wb);
+            delete wb_pkt;
+        } else {
+            wb++;
+        }
+    }
+    // NOTE: Search activeBuffer
+    for (auto ab = activeBuffer.begin(); ab != activeBuffer.end();) {
+        PacketPtr ab_pkt = std::get<0>(*ab);
+        if (cacheBlocks[block_index].addr == ab_pkt->getAddr()) {
+            ab_pkt->writeDataToBlock(
+                (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize);
+
+            cacheBlocks[block_index].valid = true;
+            cacheBlocks[block_index].dirty = true;
+            cacheBlocks[block_index].lastChangedTick = curTick();
+            // If an atom is in the activeBuffer,
+            // then it is definitely currently active.
+            numActiveBlocksNow.push_back(block_index);
+            // NOTE: Residence in the activeBuffer does not
+            // signify anything about future activity.
+            bool atom_active_future = false;
+            for (int index = 0; index < numElementsPerLine; index++)
+            {
+                atom_active_future |= cacheBlocks[block_index].items[index].activeFuture;
+            }
+            if (atom_active_future) {
+                numActiveBlocksNext.push_back(block_index);
+            }
+
+            need_send_pkt = false;
+            ab = activeBuffer.erase(ab);
+            delete ab_pkt;
+            if (pullCondition()) {
+                memAccBuffer.emplace_back(
+                    [this] (int ignore, Tick schedule_tick) {
+                        processNextVertexPull(ignore, schedule_tick);
+                    }, -1, curTick());
+                numScheduledPulls++;
+            }
+        } else {
+            ab++;
+        }
+    }
+    if (!need_send_pkt) {
+        for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) {
+            Addr miss_addr = *it;
+            Addr aligned_miss_addr =
+                roundDown(miss_addr, peerMemoryAtomSize);
+            assert(aligned_miss_addr == cacheBlocks[block_index].addr);
+            int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem);
+            DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for "
+                    "cacheBlocks[%d] can be serviced with the received "
+                    "packet.\n", __func__, miss_addr, block_index);
+            // TODO: Make this block of code into a function
+            responseQueue.push_back(std::make_tuple(miss_addr,
+                cacheBlocks[block_index].items[wl_offset], curTick()));
+            DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) "
+                    "to responseQueue. responseQueue.size = %d.\n",
+                    __func__, miss_addr,
+                    graphWorkload->printWorkListItem(
+                        cacheBlocks[block_index].items[wl_offset]),
+                    responseQueue.size());
+            DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) "
+                    "to responseQueue. responseQueue.size = %d.\n",
+                    __func__, miss_addr,
+                    graphWorkload->printWorkListItem(
+                        cacheBlocks[block_index].items[wl_offset]),
+                    responseQueue.size());
+            cacheBlocks[block_index].busyMask |= (1 << wl_offset);
+            DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n",
+                    __func__, block_index,
+                    cacheBlocks[block_index].to_string());
+            it = MSHR[block_index].erase(it);
+        }
+        assert(MSHR[block_index].empty());
+        MSHR.erase(block_index);
+        if ((!nextResponseEvent.scheduled()) &&
+            (!responseQueue.empty())) {
+            schedule(nextResponseEvent, nextCycle());
+        }
+        cacheBlocks[block_index].state = CacheState::BUSY;
+    }
+
+    if (pendingPullAddrs.find(cacheBlocks[block_index].addr) !=
+        pendingPullAddrs.end()) {
+        need_send_pkt = false;
+    }
+
+    if (need_send_pkt) {
+        PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr,
+                                         peerMemoryAtomSize);
+        ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_CACHE);
+        pkt->pushSenderState(purpose);
+        DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, "
+                "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize());
+        memPort.sendPacket(pkt);
+        onTheFlyReqs++;
+    }
+}
+
+void
+CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick)
+{
+    DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n",
+            __func__, block_index);
+    DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__,
+            block_index, cacheBlocks[block_index].to_string());
+
+    if (schedule_tick == cacheBlocks[block_index].lastChangedTick) {
+        assert(cacheBlocks[block_index].busyMask == 0);
+        assert(cacheBlocks[block_index].valid);
+        assert(cacheBlocks[block_index].dirty);
+        assert(cacheBlocks[block_index].hasConflict);
+        assert(cacheBlocks[block_index].state == CacheState::PENDING_WB);
+
+        // NOTE: If the atom we're writing back is active, we have to
+        // stop tracking it in the cache and start tracking it in the memory.
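+        // Illustrative note (not from the original source): the write-back
+        // path below picks one of three destinations. For example, an atom
+        // with activeNow set goes to the activeBuffer when there is space;
+        // otherwise its address is recorded in currentDirectory (and in
+        // futureDirectory if activeFuture is also set) before the write
+        // packet is sent to memory.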
+        bool atom_active_now = false;
+        bool atom_active_future = false;
+        for (int index = 0; index < numElementsPerLine; index++) {
+            atom_active_now |= cacheBlocks[block_index].items[index].activeNow;
+            atom_active_future |= cacheBlocks[block_index].items[index].activeFuture;
+        }
+
+        PacketPtr pkt = createWritePacket(
+            cacheBlocks[block_index].addr, peerMemoryAtomSize,
+            (uint8_t*) cacheBlocks[block_index].items);
+        DPRINTF(CoalesceEngine, "%s: Created a write packet to "
+                "Addr: %lu, size = %d.\n", __func__,
+                pkt->getAddr(), pkt->getSize());
+        if (atom_active_future) {
+            numActiveBlocksNext.erase(block_index);
+        }
+        if (atom_active_now) {
+            numActiveBlocksNow.erase(block_index);
+            if (enoughSpace()) {
+                activeBuffer.emplace_back(pkt, curTick());
+            } else {
+                int count = currentDirectory->activate(cacheBlocks[block_index].addr);
+                stats.currentFrontierSize.sample(currentDirectory->workCount());
+                stats.countActiveBlocksNow.sample(count);
+                if (atom_active_future) {
+                    int count = futureDirectory->activate(cacheBlocks[block_index].addr);
+                    stats.futureFrontierSize.sample(futureDirectory->workCount());
+                    stats.countActiveBlocksNext.sample(count);
+                }
+                memPort.sendPacket(pkt);
+                onTheFlyReqs++;
+            }
+        } else {
+            if (atom_active_future) {
+                int count = futureDirectory->activate(cacheBlocks[block_index].addr);
+                stats.futureFrontierSize.sample(futureDirectory->workCount());
+                stats.countActiveBlocksNext.sample(count);
+            }
+            memPort.sendPacket(pkt);
+            onTheFlyReqs++;
+        }
+        cacheBlocks[block_index].reset();
+        DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__,
+                block_index, cacheBlocks[block_index].to_string());
+    } else {
+        DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been touched since "
+                "this write back was scheduled. Ignoring the write back "
+                "scheduled at tick %lu in favor of the function scheduled "
+                "later.\n", __func__, block_index, schedule_tick);
+    }
+}
+
+void
+CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick)
+{
+    if (!postPushWBQueue.empty()) {
+        PacketPtr wb_pkt;
+        Tick pkt_tick;
+        std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front();
+        if (schedule_tick == pkt_tick) {
+            WorkListItem items[numElementsPerLine];
+            wb_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize);
+            bool atom_active_future = false;
+            for (int index = 0; index < numElementsPerLine; index++) {
+                atom_active_future |= items[index].activeFuture;
+            }
+            if (atom_active_future) {
+                futureDirectory->activate(wb_pkt->getAddr());
+            }
+            memPort.sendPacket(wb_pkt);
+            onTheFlyReqs++;
+            postPushWBQueue.pop_front();
+        }
+    }
+}
+
+void
+CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick)
+{
+    DPRINTF(CoalesceEngine, "%s: processNextVertexPull called.\n", __func__);
+    numScheduledPulls--;
+    if (!currentDirectory->empty()) {
+        Addr addr = currentDirectory->getNextWork();
+        int block_index = getBlockIndex(addr);
+
+        bool in_cache = cacheBlocks[block_index].addr == addr;
+        bool in_active_buffer = false;
+        for (auto ab = activeBuffer.begin(); ab != activeBuffer.end(); ab++) {
+            PacketPtr pkt = std::get<0>(*ab);
+            in_active_buffer |= (pkt->getAddr() == addr);
+        }
+        bool in_write_buffer = false;
+        for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++)
+        {
+            PacketPtr pkt = std::get<0>(*wb);
+            in_write_buffer |= (pkt->getAddr() == addr);
+        }
+        bool repeat_work = pendingPullAddrs.find(addr) != pendingPullAddrs.end();
+
+        if (!in_cache && !in_active_buffer && !in_write_buffer && !repeat_work) {
+            PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize);
+            ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_PUSH);
+            pkt->pushSenderState(purpose);
+            memPort.sendPacket(pkt);
+            onTheFlyReqs++;
+            pendingPullReads++;
+            pendingPullAddrs.insert(addr);
+        }
+    }
+}
+
+void
+CoalesceEngine::recvMemRetry()
+{
+    DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__);
+
+    if (!nextMemoryEvent.pending()) {
+        DPRINTF(CoalesceEngine, "%s: Not pending MemRetry.\n", __func__);
+        return;
+    }
+    assert(!nextMemoryEvent.scheduled());
+    nextMemoryEvent.wake();
+    schedule(nextMemoryEvent, nextCycle());
+}
+
+int
+CoalesceEngine::workCount()
+{
+    return numActiveBlocksNow.size() + currentDirectory->workCount() + activeBuffer.size();
+}
+
+void
+CoalesceEngine::recvVertexPull()
+{
+    numReceivedPulls++;
+    DPRINTF(CoalesceEngine, "%s: Received a vertex pull. numReceivedPulls: %d.\n", __func__, numReceivedPulls);
+
+    stats.verticesPulled++;
+    stats.lastVertexPullTime = curTick() - stats.lastResetTick;
+    if (!nextApplyEvent.scheduled()) {
+        schedule(nextApplyEvent, nextCycle());
+    }
+}
+
+void
+CoalesceEngine::processNextApplyEvent()
+{
+    if ((!activeBuffer.empty()) &&
+        (postPushWBQueue.size() < postPushWBQueueSize)) {
+        PacketPtr pkt;
+        Tick entrance_tick;
+        WorkListItem items[numElementsPerLine];
+
+        std::tie(pkt, entrance_tick) = activeBuffer.front();
+        pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize);
+
+        for (int index = 0; (index < numElementsPerLine) && (numReceivedPulls > 0); index++) {
+            if (items[index].activeNow) {
+                Addr addr = pkt->getAddr() + index * sizeof(WorkListItem);
+                uint32_t delta = graphWorkload->apply(items[index]);
+                items[index].activeNow = false;
+                owner->recvVertexPush(addr, delta, items[index].edgeIndex,
+                                      items[index].degree);
+                numReceivedPulls--;
+                stats.verticesPushed++;
+                stats.lastVertexPushTime = curTick() - stats.lastResetTick;
+            }
+        }
+        pkt->deleteData();
+        pkt->allocate();
+        pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize);
+
+        bool atom_active_now = false;
+        for (int index = 0; index < numElementsPerLine; index++) {
+            atom_active_now |= items[index].activeNow;
+        }
+        // NOTE: If the atom is not active anymore.
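+        // Illustrative note (not from the original source): once every item
+        // in this atom has been applied and pushed, the atom is retired from
+        // the activeBuffer into the postPushWBQueue and a write back of the
+        // updated values is scheduled through memAccBuffer.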
+        if (!atom_active_now) {
+            PacketPtr wb_pkt = createWritePacket(pkt->getAddr(),
+                peerMemoryAtomSize, (uint8_t*) items);
+            postPushWBQueue.emplace_back(wb_pkt, curTick());
+            activeBuffer.pop_front();
+            memAccBuffer.emplace_back(
+                [this] (int ignore, Tick schedule_tick) {
+                    processNextPostPushWB(ignore, schedule_tick);
+                }, -1, curTick());
+            if ((!nextMemoryEvent.pending()) &&
+                (!nextMemoryEvent.scheduled())) {
+                schedule(nextMemoryEvent, nextCycle());
+            }
+            delete pkt;
+        }
+    } else if (!numActiveBlocksNow.empty()) {
+        int num_visited_indices = 0;
+        int initial_fifo_length = numActiveBlocksNow.size();
+        while (true) {
+            int block_index = numActiveBlocksNow.front();
+            if (cacheBlocks[block_index].state == CacheState::IDLE) {
+                for (int index = 0; (index < numElementsPerLine) && (numReceivedPulls > 0); index++) {
+                    if (cacheBlocks[block_index].items[index].activeNow) {
+                        Addr addr = cacheBlocks[block_index].addr + index * sizeof(WorkListItem);
+                        uint32_t delta = graphWorkload->apply(cacheBlocks[block_index].items[index]);
+                        cacheBlocks[block_index].items[index].activeNow = false;
+                        cacheBlocks[block_index].dirty = true;
+                        owner->recvVertexPush(addr, delta,
+                            cacheBlocks[block_index].items[index].edgeIndex,
+                            cacheBlocks[block_index].items[index].degree);
+                        numReceivedPulls--;
+                        stats.verticesPushed++;
+                        stats.lastVertexPushTime = curTick() - stats.lastResetTick;
+                    }
+                }
+
+                bool atom_active_now = false;
+                for (int index = 0; index < numElementsPerLine; index++) {
+                    atom_active_now |= cacheBlocks[block_index].items[index].activeNow;
+                }
+                // NOTE: If no item in this block is active anymore,
+                // stop tracking it in numActiveBlocksNow.
+                if (!atom_active_now) {
+                    numActiveBlocksNow.erase(block_index);
+                }
+                break;
+            }
+            // NOTE: If the block at the front of numActiveBlocksNow
+            // is not in IDLE state, then roll that index to the back.
+            numActiveBlocksNow.pop_front();
+            numActiveBlocksNow.push_back(block_index);
+            // NOTE: If we have visited all the items initially in the FIFO.
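+            // Illustrative note (not from the original source): with
+            // initial_fifo_length = 3 and all three blocks BUSY, the loop
+            // rotates each index to the back once and the counter below
+            // reaches 3, so the loop gives up for this cycle instead of
+            // spinning forever.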
+            num_visited_indices++;
+            if (num_visited_indices == initial_fifo_length) {
+                break;
+            }
+        }
+    } else {
+        DPRINTF(CoalesceEngine, "%s: Could not find work to apply.\n", __func__);
+        stats.worklessCycles++;
+    }
+
+    if (pullCondition()) {
+        memAccBuffer.emplace_back(
+            [this] (int ignore, Tick schedule_tick) {
+                processNextVertexPull(ignore, schedule_tick);
+            }, -1, curTick());
+        if ((!nextMemoryEvent.pending()) &&
+            (!nextMemoryEvent.scheduled())) {
+            schedule(nextMemoryEvent, nextCycle());
+        }
+        numScheduledPulls++;
+    }
+
+    if ((numReceivedPulls > 0) && (!nextApplyEvent.scheduled())) {
+        schedule(nextApplyEvent, nextCycle());
+    }
+}
+
+void
+CoalesceEngine::processNextDoneSignalEvent()
+{
+    if (done()) {
+        owner->recvDoneSignal();
+    }
+}
+
+CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine& _coalesce):
+    statistics::Group(&_coalesce), coalesce(_coalesce), lastResetTick(0),
+    ADD_STAT(numVertexReads, statistics::units::Count::get(),
+             "Number of memory vertices read from cache."),
+    ADD_STAT(numVertexWrites, statistics::units::Count::get(),
+             "Number of memory vertices written to cache."),
+    ADD_STAT(readHits, statistics::units::Count::get(),
+             "Number of cache hits."),
+    ADD_STAT(readMisses, statistics::units::Count::get(),
+             "Number of cache misses."),
+    ADD_STAT(readHitUnderMisses, statistics::units::Count::get(),
+             "Number of cache hit under misses."),
+    ADD_STAT(numConflicts, statistics::units::Count::get(),
+             "Number of conflicts raised by reads in the cache."),
+    ADD_STAT(responsePortShortage, statistics::units::Count::get(),
+             "Number of times a response has been "
+             "delayed because of port shortage."),
+    ADD_STAT(numMemoryBlocks, statistics::units::Count::get(),
+             "Number of times memory bandwidth was not available."),
+    ADD_STAT(wastefulBytesRead, statistics::units::Byte::get(),
+             "Number of bytes read that were not used by the coalesce "
+             "engine."),
+    ADD_STAT(verticesPulled, statistics::units::Count::get(),
+             "Number of times a pull request has been sent by the "
+             "PushEngine."),
+    ADD_STAT(verticesPushed, statistics::units::Count::get(),
+             "Number of times a vertex has been pushed to the PushEngine."),
+    ADD_STAT(lastVertexPullTime, statistics::units::Tick::get(),
+             "Time of the last pull request. (Relative to reset_stats)"),
+    ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(),
+             "Time of the last vertex push. (Relative to reset_stats)"),
+    ADD_STAT(worklessCycles, statistics::units::Count::get(),
+             "Cycles in which the coalesce engine could not find work to "
+             "apply."),
+    ADD_STAT(hitRate, statistics::units::Ratio::get(),
+             "Hit rate in the cache."),
+    ADD_STAT(vertexPullBW, statistics::units::Rate::get(),
+             "Rate at which pull requests arrive."),
+    ADD_STAT(vertexPushBW, statistics::units::Rate::get(),
+             "Rate at which vertices are pushed."),
+    ADD_STAT(currentFrontierSize, statistics::units::Count::get(),
+             "Histogram of the length of the current bitvector."),
+    ADD_STAT(futureFrontierSize, statistics::units::Count::get(),
+             "Histogram of the length of the future bitvector."),
+    ADD_STAT(countActiveBlocksNow, statistics::units::Count::get(),
+             "Histogram of the popCount values in the current directory."),
+    ADD_STAT(countActiveBlocksNext, statistics::units::Count::get(),
+             "Histogram of the popCount values in the future directory."),
+    ADD_STAT(responseQueueLatency, statistics::units::Second::get(),
+             "Histogram of the response latency to WLEngine. (ns)"),
+    ADD_STAT(memAccBufferLat, statistics::units::Second::get(),
+             "Histogram of the latency of processing a memory function.")
+{
+}
+
+void
+CoalesceEngine::CoalesceStats::regStats()
+{
+    using namespace statistics;
+
+    hitRate = (readHits + readHitUnderMisses) /
+        (readHits + readHitUnderMisses + readMisses);
+
+    vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime;
+
+    vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime;
+
+    currentFrontierSize.init(64);
+    futureFrontierSize.init(64);
+    countActiveBlocksNow.init(64);
+    countActiveBlocksNext.init(64);
+    responseQueueLatency.init(64);
+    memAccBufferLat.init(64);
+}
+
+void
+CoalesceEngine::CoalesceStats::resetStats()
+{
+    statistics::Group::resetStats();
+
+    lastResetTick = curTick();
+}
+
+} // namespace gem5
diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh
new file mode 100644
index 0000000000..3a9e463595
--- /dev/null
+++ b/src/accl/graph/sega/coalesce_engine.hh
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2020 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__
+#define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__
+
+#include <deque>
+#include <functional>
+#include <tuple>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "accl/graph/base/data_structs.hh"
+#include "accl/graph/base/graph_workload.hh"
+#include "accl/graph/sega/base_memory_engine.hh"
+#include "accl/graph/sega/enums.hh"
+#include "accl/graph/sega/work_directory.hh"
+#include "base/cprintf.hh"
+#include "base/statistics.hh"
+#include "params/CoalesceEngine.hh"
+
+namespace gem5
+{
+
+typedef std::deque<std::tuple<std::function<void(int, Tick)>, int, Tick>>
+    MemoryFunctionDeque;
+
+class MPU;
+
+class CoalesceEngine : public BaseMemoryEngine
+{
+  private:
+    struct Block
+    {
+        WorkListItem* items;
+        Addr addr;
+        uint64_t busyMask;
+        bool valid;
+        bool dirty;
+        bool hasConflict;
+        CacheState state;
+        Tick lastChangedTick;
+        Block() {}
+        Block(int num_elements):
+            addr(-1),
+            busyMask(0),
+            valid(false),
+            dirty(false),
+            hasConflict(false),
+            state(CacheState::INVALID),
+            lastChangedTick(0)
+        {
+            items = new WorkListItem [num_elements];
+        }
+
+        void reset() {
+            addr = -1;
+            busyMask = 0;
+            valid = false;
+            dirty = false;
+            hasConflict = false;
+            state = CacheState::INVALID;
+            lastChangedTick = 0;
+        }
+
+        std::string to_string() {
+            return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, "
+                "dirty: %s, hasConflict: %s, state: %s, lastChangedTick: %lu}",
+                addr, busyMask, valid ? "true" : "false",
+                dirty ? "true" : "false", hasConflict ? "true" : "false",
+                cacheStateStrings[state], lastChangedTick);
+        }
+    };
+
+    struct ReadPurpose : public Packet::SenderState
+    {
+        ReadDestination _dest;
+        ReadPurpose(ReadDestination dest): _dest(dest) {}
+        ReadDestination dest() { return _dest; }
+    };
+
+    MPU* owner;
+    ProcessingMode mode;
+    WorkDirectory* currentDirectory;
+    WorkDirectory* futureDirectory;
+    GraphWorkload* graphWorkload;
+
+    Addr lastAtomAddr;
+
+    int numLines;
+    int numElementsPerLine;
+    Block* cacheBlocks;
+
+    Tick lastReadTick;
+    std::unordered_set<int> blocksTouchedThisTick;
+
+    int onTheFlyReqs;
+    std::unordered_map<int, std::deque<Addr>> MSHR;
+
+    // Response route to WLEngine
+    int maxRespPerCycle;
+    std::deque<std::tuple<Addr, WorkListItem, Tick>> responseQueue;
+
+    // Tracking work in cache
+    int numReceivedPulls;
+    // NOTE: Remember to erase from these upon eviction from cache
+    UniqueFIFO<int> numActiveBlocksNow;
+    UniqueFIFO<int> numActiveBlocksNext;
+
+    int numScheduledPulls;
+    int pendingPullLimit;
+    int pendingPullReads;
+    // Addresses of atoms with an in-flight READ_FOR_PUSH read. Used to
+    // avoid scheduling repeat pulls for the same atom.
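+    // Illustrative note (not from the original source): an address parked
+    // in pendingPullAddrs is skipped by processNextVertexPull and also
+    // suppresses a redundant cache fill in processNextRead; pullCondition()
+    // presumably throttles against pendingPullLimit so only a bounded
+    // number of READ_FOR_PUSH reads are outstanding at a time.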
+    std::unordered_set<Addr> pendingPullAddrs;
+
+    int activeBufferSize;
+    int postPushWBQueueSize;
+    std::deque<std::tuple<PacketPtr, Tick>> activeBuffer;
+    std::deque<std::tuple<PacketPtr, Tick>> postPushWBQueue;
+
+    bool enoughSpace();
+    bool pullCondition();
+    int getBlockIndex(Addr addr);
+
+    int transitionsPerCycle;
+    MemoryFunctionDeque memAccBuffer;
+
+    MemoryEvent nextMemoryEvent;
+    void processNextMemoryEvent();
+    void processNextRead(int block_index, Tick schedule_tick);
+    void processNextWriteBack(int block_index, Tick schedule_tick);
+    void processNextVertexPull(int ignore, Tick schedule_tick);
+    void processNextPostPushWB(int ignore, Tick schedule_tick);
+
+    EventFunctionWrapper nextResponseEvent;
+    void processNextResponseEvent();
+
+    EventFunctionWrapper nextApplyEvent;
+    void processNextApplyEvent();
+
+    EventFunctionWrapper nextDoneSignalEvent;
+    void processNextDoneSignalEvent();
+
+    struct CoalesceStats : public statistics::Group
+    {
+        CoalesceStats(CoalesceEngine& coalesce);
+
+        virtual void regStats() override;
+
+        virtual void resetStats() override;
+
+        CoalesceEngine &coalesce;
+
+        Tick lastResetTick;
+
+        statistics::Scalar numVertexReads;
+        statistics::Scalar numVertexWrites;
+        statistics::Scalar readHits;
+        statistics::Scalar readMisses;
+        statistics::Scalar readHitUnderMisses;
+        statistics::Scalar numConflicts;
+        statistics::Scalar responsePortShortage;
+        statistics::Scalar numMemoryBlocks;
+        statistics::Scalar wastefulBytesRead;
+        statistics::Scalar verticesPulled;
+        statistics::Scalar verticesPushed;
+        statistics::Scalar lastVertexPullTime;
+        statistics::Scalar lastVertexPushTime;
+        statistics::Scalar worklessCycles;
+
+        statistics::Formula hitRate;
+        statistics::Formula vertexPullBW;
+        statistics::Formula vertexPushBW;
+
+        statistics::Histogram currentFrontierSize;
+        statistics::Histogram futureFrontierSize;
+        statistics::Histogram countActiveBlocksNow;
+        statistics::Histogram countActiveBlocksNext;
+        statistics::Histogram responseQueueLatency;
+        statistics::Histogram memAccBufferLat;
+    };
+
+    CoalesceStats stats;
+
+  protected:
+    virtual void recvMemRetry() override;
+    virtual bool handleMemResp(PacketPtr pkt) override;
+
+  public:
+    PARAMS(CoalesceEngine);
+    CoalesceEngine(const Params &params);
+    void registerMPU(MPU* mpu);
+
+    void setProcessingMode(ProcessingMode _mode) { mode = _mode; }
+    void createAsyncPopCountDirectory(int atoms_per_block);
+    void createBSPPopCountDirectory(int atoms_per_block);
+    void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; }
+
+    virtual void recvFunctional(PacketPtr pkt) override;
+    void postMemInitSetup();
+    void postConsumeProcess();
+    void swapDirectories();
+
+    ReadReturnStatus recvWLRead(Addr addr);
+    void recvWLWrite(Addr addr, WorkListItem wl);
+
+    int workCount();
+    int futureWorkCount();
+    void recvVertexPull();
+
+    bool done();
+};
+
+} // namespace gem5
+
+#endif // __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__
diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc
new file mode 100644
index 0000000000..ba57b387f4
--- /dev/null
+++ b/src/accl/graph/sega/enums.cc
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2020 The Regents of the University of California.
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/enums.hh" + +namespace gem5 +{ + +const char* registerStateStrings[NUM_REGISTER_STATE] = { + "PENDING_READ", + "PENDING_REDUCE", + "PENDING_WRITE" +}; + +const char* cacheStateStrings[NUM_CACHE_STATE] = { + "INVALID", + "PENDING_DATA", + "BUSY", + "IDLE", + "PENDING_WB" +}; + +const char* readReturnStatusStrings[NUM_READ_RETURN_STATUS] = +{ + "ACCEPT", + "REJECT_ROLL", + "REJECT_NO_ROLL" +}; + +const char* readDestinationStrings[NUM_READ_DESTINATION] = +{ + "READ_FOR_CACHE", + "READ_FOR_PUSH" +}; + +} // namespace gem5 diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh new file mode 100644 index 0000000000..0f654c5386 --- /dev/null +++ b/src/accl/graph/sega/enums.hh @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_ENUMS_HH__ +#define __ACCL_GRAPH_SEGA_ENUMS_HH__ + +namespace gem5 +{ + +enum RegisterState +{ + PENDING_READ, + PENDING_REDUCE, + PENDING_WRITE, + NUM_REGISTER_STATE +}; +extern const char* registerStateStrings[NUM_REGISTER_STATE]; + +enum CacheState +{ + INVALID, + PENDING_DATA, + BUSY, + IDLE, + PENDING_WB, + NUM_CACHE_STATE +}; +extern const char* cacheStateStrings[NUM_CACHE_STATE]; + +enum ReadReturnStatus +{ + ACCEPT, + REJECT_ROLL, + REJECT_NO_ROLL, + NUM_READ_RETURN_STATUS +}; +extern const char* readReturnStatusStrings[NUM_READ_RETURN_STATUS]; + +enum ReadDestination +{ + READ_FOR_CACHE, + READ_FOR_PUSH, + NUM_READ_DESTINATION +}; +extern const char* readDestinationStrings[NUM_READ_DESTINATION]; + +enum ProcessingMode +{ + NOT_SET, + ASYNCHRONOUS, + BULK_SYNCHRONOUS, + POLY_GRAPH, + NUM_PROCESSING_MODE +}; +extern const char* processingModeStrings[NUM_PROCESSING_MODE]; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_ENUMS_HH__ diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc new file mode 100644 index 0000000000..a5063cf685 --- /dev/null +++ b/src/accl/graph/sega/mpu.cc @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/mpu.hh" + +#include "accl/graph/sega/centeral_controller.hh" +#include "debug/MPU.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +MPU::MPU(const Params& params): + SimObject(params), + system(params.system), + wlEngine(params.wl_engine), + coalesceEngine(params.coalesce_engine), + pushEngine(params.push_engine) +{ + wlEngine->registerMPU(this); + coalesceEngine->registerMPU(this); + pushEngine->registerMPU(this); +} + +void +MPU::registerCenteralController(CenteralController* centeral_controller) +{ + centeralController = centeral_controller; +} + +bool +MPU::handleIncomingUpdate(PacketPtr pkt) +{ + return wlEngine->handleIncomingUpdate(pkt); +} + +void +MPU::handleIncomingWL(Addr addr, WorkListItem wl) +{ + wlEngine->handleIncomingWL(addr, wl); +} + +void +MPU::recvWLWrite(Addr addr, WorkListItem wl) +{ + coalesceEngine->recvWLWrite(addr, wl); +} + +void +MPU::recvWorkload(GraphWorkload* workload) +{ + coalesceEngine->recvWorkload(workload); + pushEngine->recvWorkload(workload); + wlEngine->recvWorkload(workload); +} + +void +MPU::recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree) +{ + pushEngine->recvVertexPush(addr, delta, edge_index, degree); +} + +void +MPU::recvMirrorPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree) +{ + pushEngine->recvMirrorPush(addr, delta, edge_index, degree); +} + +void +MPU::recvDoneSignal() +{ + if (done()) { + centeralController->recvDoneSignal(); + } +} + +bool +MPU::done() +{ + return wlEngine->done() && coalesceEngine->done() && pushEngine->done(); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh new file mode 100644 index 0000000000..4afb2081ca --- /dev/null +++ b/src/accl/graph/sega/mpu.hh @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_MPU_HH__ +#define __ACCL_GRAPH_SEGA_MPU_HH__ + +#include +#include + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/coalesce_engine.hh" +#include "accl/graph/sega/enums.hh" +#include "accl/graph/sega/push_engine.hh" +#include "accl/graph/sega/wl_engine.hh" +#include "base/addr_range.hh" +#include "mem/packet.hh" +#include "sim/sim_object.hh" +#include "sim/system.hh" +#include "params/MPU.hh" + +namespace gem5 +{ + +class CenteralController; + +class MPU : public SimObject +{ + private: + System* system; + CenteralController* centeralController; + + WLEngine* wlEngine; + CoalesceEngine* coalesceEngine; + PushEngine* pushEngine; + + public: + PARAMS(MPU); + MPU(const Params& params); + void registerCenteralController(CenteralController* centeral_controller); + + void setProcessingMode(ProcessingMode mode) { coalesceEngine->setProcessingMode(mode); } + void createAsyncPopCountDirectory(int atoms_per_block) { coalesceEngine->createAsyncPopCountDirectory(atoms_per_block); } + void createBSPPopCountDirectory(int atoms_per_block) { coalesceEngine->createBSPPopCountDirectory(atoms_per_block); } + + unsigned int vertexAtomSize() { return coalesceEngine->params().attached_memory_atom_size; } + AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } + uint64_t getCacheSize() { return coalesceEngine->params().cache_size; } + void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } + void postMemInitSetup() { coalesceEngine->postMemInitSetup(); } + void postConsumeProcess() { coalesceEngine->postConsumeProcess(); } + void swapDirectories() { coalesceEngine->swapDirectories(); } + + bool handleIncomingUpdate(PacketPtr pkt); + + void handleIncomingWL(Addr addr, WorkListItem wl); + ReadReturnStatus recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } + void recvWLWrite(Addr addr, WorkListItem wl); + void recvWorkload(GraphWorkload* Workload); + + int workCount() { return coalesceEngine->workCount(); } + void recvVertexPull() { return coalesceEngine->recvVertexPull(); } + bool running() { return pushEngine->running(); } + void start() { return pushEngine->start(); } + void recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); + + void recvMirrorPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); + void startProcessingMirrors(Tick time_to_wait) { pushEngine->startProcessingMirrors(time_to_wait); } + + void recvDoneSignal(); + bool done(); +}; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_MPU_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc new file mode 100644 index 0000000000..6040989070 --- /dev/null +++ b/src/accl/graph/sega/push_engine.cc @@ -0,0 +1,567 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/push_engine.hh" + +#include "accl/graph/sega/mpu.hh" +#include "base/intmath.hh" +#include "debug/PushEngine.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +PushEngine::PushEngine(const Params& params): + BaseMemoryEngine(params), + _running(false), + lastIdleEntranceTick(0), + numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), + onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), + examineWindow(params.examine_window), + maxPropagatesPerCycle(params.max_propagates_per_cycle), + updateQueueSize(params.update_queue_size), + nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), + nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), + nextPropagateEvent([this] { processNextPropagateEvent(); }, name()), + nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()), + stats(*this) +{ + destinationQueues.clear(); + for (int i = 0; i < params.port_out_ports_connection_count; ++i) { + outPorts.emplace_back(name() + ".out_ports" + std::to_string(i), this, i); + destinationQueues.emplace_back(); + destinationQueues[i].clear(); + sourceAndValueMaps.emplace_back(); + sourceAndValueMaps[i].clear(); + } +} + +Port& +PushEngine::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "out_ports") { + return outPorts[idx]; + } else if (if_name == "mem_port") { + return BaseMemoryEngine::getPort(if_name, idx); + } else { + return ClockedObject::getPort(if_name, idx); + } +} + +void +PushEngine::init() +{ + localAddrRange = owner->getAddrRanges(); + for (int i = 0; i < outPorts.size(); i++){ + AddrRangeList range_list = outPorts[i].getAddrRanges(); + for (auto range: range_list) { + portAddrMap.insert(range, i); + } + } +} + +void +PushEngine::registerMPU(MPU* mpu) +{ + owner = mpu; +} + +void +PushEngine::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(blockedPacket != nullptr, + "Should never try to send if blocked!"); + // If we can't send the packet across the port, 
+    // store it for later.
+    if (!sendTimingReq(pkt)) {
+        DPRINTF(PushEngine, "%s: Packet is blocked.\n", __func__);
+        blockedPacket = pkt;
+    }
+}
+
+bool
+PushEngine::ReqPort::recvTimingResp(PacketPtr pkt)
+{
+    panic("recvTimingResp called on the request port.");
+}
+
+void
+PushEngine::ReqPort::recvReqRetry()
+{
+    panic_if(blockedPacket == nullptr,
+             "Received retry without a blockedPacket.");
+
+    DPRINTF(PushEngine, "%s: ReqPort %d received a reqRetry. "
+            "blockedPacket: %s.\n", __func__, _id, blockedPacket->print());
+    PacketPtr pkt = blockedPacket;
+    blockedPacket = nullptr;
+    sendPacket(pkt);
+    if (blockedPacket == nullptr) {
+        DPRINTF(PushEngine, "%s: blockedPacket sent successfully.\n", __func__);
+        owner->recvReqRetry();
+    }
+}
+
+void
+PushEngine::recvReqRetry()
+{
+    DPRINTF(PushEngine, "%s: Received a reqRetry.\n", __func__);
+    if (!nextUpdatePushEvent.scheduled()) {
+        schedule(nextUpdatePushEvent, nextCycle());
+    }
+}
+
+bool
+PushEngine::vertexSpace()
+{
+    return (edgePointerQueueSize == 0) ||
+        ((edgePointerQueue.size() + numPendingPulls) < edgePointerQueueSize);
+}
+
+bool
+PushEngine::workLeft()
+{
+    return ((owner->workCount() - numPendingPulls) > 0);
+}
+
+bool
+PushEngine::done()
+{
+    bool empty_update_queues = true;
+    for (int i = 0; i < outPorts.size(); i++) {
+        empty_update_queues &= destinationQueues[i].empty();
+    }
+    return empty_update_queues && metaEdgeQueue.empty() &&
+        (onTheFlyMemReqs == 0) && edgePointerQueue.empty();
+}
+
+void
+PushEngine::start()
+{
+    assert(!_running);
+    // assert(!nextVertexPullEvent.scheduled());
+
+    _running = true;
+    // stats.numIdleCycles += ticksToCycles(curTick() - lastIdleEntranceTick);
+    // NOTE: We might have to check for size availability here.
+    assert(workLeft());
+    if (vertexSpace() && !nextVertexPullEvent.scheduled()) {
+        schedule(nextVertexPullEvent, nextCycle());
+    }
+}
+
+void
+PushEngine::processNextVertexPullEvent()
+{
+    if (workLeft()) {
+        numPendingPulls++;
+        owner->recvVertexPull();
+        if (vertexSpace() && (!nextVertexPullEvent.scheduled())) {
+            schedule(nextVertexPullEvent, nextCycle());
+        }
+    } else {
+        _running = false;
+        lastIdleEntranceTick = curTick();
+        DPRINTF(PushEngine, "%s: In idle state now.\n", __func__);
+    }
+}
+
+void
+PushEngine::recvVertexPush(Addr addr, uint32_t delta,
+                           uint32_t edge_index, uint32_t degree)
+{
+    assert(degree > 0);
+    assert((edgePointerQueueSize == 0) ||
+        ((edgePointerQueue.size() + numPendingPulls) <= edgePointerQueueSize));
+
+    Addr start_addr = edge_index * sizeof(Edge);
+    Addr end_addr = start_addr + (degree * sizeof(Edge));
+    EdgeReadInfoGen info_gen(addr, delta, start_addr, end_addr,
+                             sizeof(Edge), peerMemoryAtomSize);
+
+    edgePointerQueue.emplace_back(info_gen, curTick());
+    stats.edgePointerQueueLength.sample(edgePointerQueue.size());
+    numPendingPulls--;
+
+    if (vertexSpace() && (!nextVertexPullEvent.scheduled())) {
+        schedule(nextVertexPullEvent, nextCycle());
+    }
+
+    if ((!nextMemoryReadEvent.pending()) &&
+        (!nextMemoryReadEvent.scheduled())) {
+        schedule(nextMemoryReadEvent, nextCycle());
+    }
+}
+
+void
+PushEngine::recvMirrorPush(Addr addr, uint32_t delta,
+                           uint32_t edge_index, uint32_t degree)
+{
+    Addr start_addr = edge_index * sizeof(Edge);
+    Addr end_addr = start_addr + (degree * sizeof(Edge));
+    EdgeReadInfoGen info_gen(addr, delta, start_addr, end_addr,
+                             sizeof(Edge), peerMemoryAtomSize);
+
+    edgePointerQueue.emplace_back(info_gen, curTick());
+    stats.edgePointerQueueLength.sample(edgePointerQueue.size());
+}
+
+void
+PushEngine::startProcessingMirrors(Tick time_to_wait)
+{
+    assert(!nextMemoryReadEvent.pending());
+    assert(!nextMemoryReadEvent.scheduled());
+    Cycles wait = ticksToCycles(time_to_wait);
+    if (!edgePointerQueue.empty()) {
+        schedule(nextMemoryReadEvent, clockEdge(wait));
+    }
+}
+
+void
+PushEngine::processNextMemoryReadEvent()
+{
+    if (memPort.blocked()) {
+        nextMemoryReadEvent.sleep();
+        return;
+    }
+    Addr aligned_addr, offset;
+    int num_edges;
+
+    EdgeReadInfoGen& curr_info = std::get<0>(edgePointerQueue.front());
+    Tick entrance_tick = std::get<1>(edgePointerQueue.front());
+    std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo();
+    if (metaEdgeQueue.size() < (edgeQueueSize - (onTheFlyMemReqs + num_edges)))
+    {
+        DPRINTF(PushEngine, "%s: Current packet information generated by "
+                "EdgeReadInfoGen. aligned_addr: %lu, offset: %lu, "
+                "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges);
+
+        PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize);
+        PushInfo push_info = {curr_info.src(), curr_info.delta(), offset, num_edges};
+        reqInfoMap[pkt->req] = push_info;
+        memPort.sendPacket(pkt);
+        onTheFlyMemReqs += num_edges;
+
+        curr_info.iterate();
+        if (curr_info.done()) {
+            DPRINTF(PushEngine, "%s: Current EdgeReadInfoGen is done.\n", __func__);
+            stats.edgePointerQueueLatency.sample(
+                (curTick() - entrance_tick) * 1e9 / getClockFrequency());
+            edgePointerQueue.pop_front();
+            stats.edgePointerQueueLength.sample(edgePointerQueue.size());
+            DPRINTF(PushEngine, "%s: Popped curr_info from edgePointerQueue. "
+                    "edgePointerQueue.size() = %u.\n", __func__, edgePointerQueue.size());
+        }
+    }
+
+    if (vertexSpace() && (!nextVertexPullEvent.scheduled())) {
+        schedule(nextVertexPullEvent, nextCycle());
+    }
+
+    if (!edgePointerQueue.empty()) {
+        assert(!nextMemoryReadEvent.pending());
+        assert(!nextMemoryReadEvent.scheduled());
+        schedule(nextMemoryReadEvent, nextCycle());
+    }
+}
+
+void
+PushEngine::recvMemRetry()
+{
+    if (nextMemoryReadEvent.pending()) {
+        DPRINTF(PushEngine, "%s: Received a memory retry.\n", __func__);
+        nextMemoryReadEvent.wake();
+        schedule(nextMemoryReadEvent, nextCycle());
+    }
+}
+
+bool
+PushEngine::handleMemResp(PacketPtr pkt)
+{
+    // TODO: in case we need to edit edges, get rid of second statement.
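+    // Illustrative example (not from the original source): assuming 64-byte
+    // atoms and sizeof(Edge) == 8, a vertex whose edges start mid-atom with
+    // push_info.offset = 16 and push_info.numElements = 3 yields edges at
+    // pkt_data[16..23], [24..31], and [32..39]; the remaining five slots in
+    // the atom count toward numWastefulEdgesRead below.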
+    assert(pkt->isResponse() && (!pkt->isWrite()));
+
+    uint8_t pkt_data [peerMemoryAtomSize];
+    PushInfo push_info = reqInfoMap[pkt->req];
+    pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize);
+
+    for (int i = 0; i < push_info.numElements; i++) {
+        Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge));
+        Addr edge_dst = edge->neighbor;
+        uint32_t edge_weight = edge->weight;
+        MetaEdge meta_edge(
+            push_info.src, edge_dst, edge_weight, push_info.value);
+        metaEdgeQueue.emplace_back(meta_edge, curTick());
+    }
+    stats.edgeQueueLength.sample(metaEdgeQueue.size());
+    stats.numWastefulEdgesRead +=
+        (peerMemoryAtomSize / sizeof(Edge)) - push_info.numElements;
+
+    onTheFlyMemReqs -= push_info.numElements;
+    reqInfoMap.erase(pkt->req);
+
+    delete pkt;
+
+    if (!nextPropagateEvent.scheduled()) {
+        schedule(nextPropagateEvent, nextCycle());
+    }
+    return true;
+}
+
+void
+PushEngine::processNextPropagateEvent()
+{
+    int num_propagates = 0;
+    int num_tries = 0;
+    int num_reads = 0;
+    std::deque<std::tuple<MetaEdge, Tick>> temp_edge;
+    for (int i = 0; i < examineWindow; i++) {
+        if (metaEdgeQueue.empty()) {
+            break;
+        }
+        temp_edge.push_back(metaEdgeQueue.front());
+        metaEdgeQueue.pop_front();
+    }
+    int max_visits = temp_edge.size();
+
+    while (true) {
+        MetaEdge meta_edge;
+        Tick entrance_tick;
+        std::tie(meta_edge, entrance_tick) = temp_edge.front();
+
+        DPRINTF(PushEngine, "%s: The edge to process is %s.\n",
+                __func__, meta_edge.to_string());
+
+        uint32_t update_value =
+            graphWorkload->propagate(meta_edge.value, meta_edge.weight);
+        temp_edge.pop_front();
+        num_tries++;
+
+        if (enqueueUpdate(meta_edge.src, meta_edge.dst, update_value)) {
+            DPRINTF(PushEngine, "%s: Sent %s to port queues.\n",
+                    __func__, meta_edge.to_string());
+            num_reads++;
+            stats.numPropagates++;
+            stats.edgeQueueLatency.sample(
+                (curTick() - entrance_tick) * 1e9 / getClockFrequency());
+        } else {
+            temp_edge.emplace_back(meta_edge, entrance_tick);
+            stats.updateQueueFull++;
+        }
+        num_propagates++;
+
+        if (temp_edge.empty()) {
+            break;
+        }
+        if (num_tries >= max_visits) {
+            break;
+        }
+    }
+
+    while (!temp_edge.empty()) {
+        metaEdgeQueue.push_front(temp_edge.back());
+        temp_edge.pop_back();
+    }
+
+    stats.numPropagatesHist.sample(num_propagates);
+
+    assert(!nextPropagateEvent.scheduled());
+    if (!metaEdgeQueue.empty()) {
+        schedule(nextPropagateEvent, nextCycle());
+    }
+}
+
+bool
+PushEngine::enqueueUpdate(Addr src, Addr dst, uint32_t value)
+{
+    Addr aligned_dst = roundDown(dst, owner->vertexAtomSize());
+    AddrRange update_range(aligned_dst, aligned_dst + owner->vertexAtomSize());
+    auto entry = portAddrMap.contains(update_range);
+    PortID port_id = entry->second;
+
+    DPRINTF(PushEngine, "%s: Update{src: %lu, dst: %lu, value: %u} "
+            "belongs to port %d.\n",
+            __func__, src, dst, value, port_id);
+    DPRINTF(PushEngine, "%s: There are %d updates already "
+            "in queue for port %d.\n", __func__,
+            destinationQueues[port_id].size(), port_id);
+
+    assert(destinationQueues[port_id].size() == sourceAndValueMaps[port_id].size());
+
+    int num_updates = 0;
+    for (auto queue: destinationQueues) {
+        num_updates += queue.size();
+    }
+
+    if (sourceAndValueMaps[port_id].find(dst) != sourceAndValueMaps[port_id].end()) {
+        DPRINTF(PushEngine, "%s: Found an existing update "
+                "for dst: %lu.\n", __func__, dst);
+        Addr prev_src;
+        uint32_t prev_val;
+        std::tie(prev_src, prev_val) = sourceAndValueMaps[port_id][dst];
+        uint32_t new_val = graphWorkload->reduce(value, prev_val);
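+        // Illustrative example (not from the original source): if the queue
+        // already holds (src: A, value: 5) for this dst and the new value is
+        // 3, a min-style reduce (e.g. BFS/SSSP) stores (A, 3) below, so two
+        // in-flight updates to the same vertex cost one network packet.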
+        sourceAndValueMaps[port_id][dst] = std::make_tuple(prev_src, new_val);
+        DPRINTF(PushEngine, "%s: Coalesced Update{src: %lu, dst: %lu, value: %u} "
+                "with Update{src: %lu, dst: %lu, value: %u} to "
+                "Update{src: %lu, dst: %lu, value: %u}.\n", __func__,
+                src, dst, value, prev_src, dst, prev_val,
+                prev_src, dst, new_val);
+        stats.updateQueueCoalescions++;
+        return true;
+    } else if (num_updates < (updateQueueSize * destinationQueues.size())) {
+        DPRINTF(PushEngine, "%s: There is a free entry available "
+                "in queue for port %d.\n", __func__, port_id);
+        destinationQueues[port_id].emplace_back(dst, curTick());
+        sourceAndValueMaps[port_id][dst] = std::make_tuple(src, value);
+        DPRINTF(PushEngine, "%s: Emplaced Update{src: %lu, dst: %lu, value: %u} "
+                "at the back of queue for port %d. "
+                "Size of queue for port %d is %d.\n", __func__,
+                src, dst, value, port_id, port_id,
+                destinationQueues[port_id].size());
+        stats.updateQueueLength.sample(destinationQueues[port_id].size());
+        if (!nextUpdatePushEvent.scheduled()) {
+            schedule(nextUpdatePushEvent, nextCycle());
+        }
+        return true;
+    }
+    DPRINTF(PushEngine, "%s: Destination queue for port %d is blocked.\n",
+            __func__, port_id);
+    return false;
+}
+
+template<typename T> PacketPtr
+PushEngine::createUpdatePacket(Addr addr, T value)
+{
+    RequestPtr req = std::make_shared<Request>(addr, sizeof(T), 0, 0);
+    // Dummy PC to have PC-based prefetchers latch on; get entropy into higher
+    // bits
+    req->setPC(((Addr) 1) << 2);
+
+    PacketPtr pkt = new Packet(req, MemCmd::UpdateWL);
+
+    pkt->allocate();
+    // pkt->setData(data);
+    pkt->setLE(value);
+
+    return pkt;
+}
+
+void
+PushEngine::processNextUpdatePushEvent()
+{
+    int next_time_send = 0;
+
+    for (int i = 0; i < outPorts.size(); i++) {
+        if (outPorts[i].blocked()) {
+            DPRINTF(PushEngine, "%s: Port %d blocked.\n", __func__, i);
+            continue;
+        }
+        DPRINTF(PushEngine, "%s: Port %d available.\n", __func__, i);
+        if (destinationQueues[i].empty()) {
+            DPRINTF(PushEngine, "%s: Respective queue for "
+                    "port %d is empty.\n", __func__, i);
+            continue;
+        }
+        Addr dst;
+        Tick entrance_tick;
+        std::tie(dst, entrance_tick) = destinationQueues[i].front();
+        Addr src;
+        uint32_t value;
+        std::tie(src, value) = sourceAndValueMaps[i][dst];
+
+        PacketPtr pkt = createUpdatePacket(dst, value);
+        outPorts[i].sendPacket(pkt);
+        destinationQueues[i].pop_front();
+        sourceAndValueMaps[i].erase(dst);
+        DPRINTF(PushEngine, "%s: Sent Update{src: %lu, dst: %lu, value: %u} to "
+                "port %d. Respective queue size is %d.\n", __func__,
+                src, dst, value, i, destinationQueues[i].size());
+        if (destinationQueues[i].size() > 0) {
+            next_time_send += 1;
+        }
+        stats.numUpdates++;
+    }
+
+    assert(!nextUpdatePushEvent.scheduled());
+    if (next_time_send > 0) {
+        schedule(nextUpdatePushEvent, nextCycle());
+    }
+}
+
+PushEngine::PushStats::PushStats(PushEngine& _push):
+    statistics::Group(&_push), push(_push),
+    ADD_STAT(numPropagates, statistics::units::Count::get(),
+             "Number of propagate operations done."),
+    ADD_STAT(updateQueueFull, statistics::units::Count::get(),
+             "Number of times the update queue returns false."),
+    ADD_STAT(numNetBlocks, statistics::units::Count::get(),
+             "Number of updates blocked by network."),
+    // ADD_STAT(numIdleCycles, statistics::units::Count::get(),
+    //          "Number of cycles PushEngine has been idle."),
+    ADD_STAT(updateQueueCoalescions, statistics::units::Count::get(),
+             "Number of coalescions in the update queues."),
+    ADD_STAT(numUpdates, statistics::units::Count::get(),
+             "Number of updates sent to the network."),
+    ADD_STAT(numWastefulEdgesRead, statistics::units::Count::get(),
+             "Number of wasteful edges read from edge memory."),
+    ADD_STAT(TEPS, statistics::units::Rate::get(),
+             "Traversed Edges Per Second."),
+    ADD_STAT(edgePointerQueueLatency, statistics::units::Second::get(),
+             "Histogram of the latency of the edgePointerQueue."),
+    ADD_STAT(edgePointerQueueLength, statistics::units::Count::get(),
+             "Histogram of the size of the edgePointerQueue."),
+    ADD_STAT(edgeQueueLatency, statistics::units::Second::get(),
+             "Histogram of the latency of the metaEdgeQueue."),
+    ADD_STAT(edgeQueueLength, statistics::units::Count::get(),
+             "Histogram of the size of the metaEdgeQueue."),
+    ADD_STAT(updateQueueLength, statistics::units::Count::get(),
+             "Histogram of the length of updateQueues."),
+    ADD_STAT(numPropagatesHist, statistics::units::Count::get(),
+             "Histogram of number of propagates sent.")
+{
+}
+
+void
+PushEngine::PushStats::regStats()
+{
+    using namespace statistics;
+
+    TEPS = numPropagates / simSeconds;
+
+    edgePointerQueueLatency.init(64);
+    edgePointerQueueLength.init(64);
+    edgeQueueLatency.init(64);
+    edgeQueueLength.init(64);
+    updateQueueLength.init(64);
+    numPropagatesHist.init(1 + push.params().max_propagates_per_cycle);
+}
+
+} // namespace gem5
diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh
new file mode 100644
index 0000000000..7170d2d22e
--- /dev/null
+++ b/src/accl/graph/sega/push_engine.hh
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2021 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__
+#define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__
+
+#include <deque>
+#include <tuple>
+#include <unordered_map>
+#include <vector>
+
+#include "accl/graph/base/data_structs.hh"
+#include "accl/graph/base/graph_workload.hh"
+#include "accl/graph/sega/base_memory_engine.hh"
+#include "accl/graph/sega/enums.hh"
+#include "base/addr_range_map.hh"
+#include "base/intmath.hh"
+#include "params/PushEngine.hh"
+
+namespace gem5
+{
+
+class MPU;
+
+class PushEngine : public BaseMemoryEngine
+{
+  private:
+    class ReqPort : public RequestPort
+    {
+      private:
+        PushEngine* owner;
+        PacketPtr blockedPacket;
+        PortID _id;
+
+      public:
+        ReqPort(const std::string& name, PushEngine* owner, PortID id) :
+            RequestPort(name, owner),
+            owner(owner), blockedPacket(nullptr), _id(id)
+        {}
+        void sendPacket(PacketPtr pkt);
+        bool blocked() { return (blockedPacket != nullptr); }
+
+      protected:
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual void recvReqRetry();
+    };
+
+    class EdgeReadInfoGen {
+      private:
+        Addr _src;
+        uint32_t _delta;
+
+        Addr _start;
+        Addr _end;
+        size_t _step;
+        size_t _atom;
+
+      public:
+        EdgeReadInfoGen(Addr src, uint32_t delta, Addr start,
+                        Addr end, size_t step, size_t atom):
+            _src(src), _delta(delta), _start(start),
+            _end(end), _step(step), _atom(atom)
+        {}
+
+        Addr src() { return _src; }
+        uint32_t delta() { return _delta; }
+
+        std::tuple<Addr, Addr, int> nextReadPacketInfo()
+        {
+            panic_if(done(), "Should not call nextPacketInfo when done.\n");
+            Addr aligned_addr = roundDown(_start, _atom);
+            Addr offset = _start - aligned_addr;
+            int num_items = 0;
+
+            if (_end > (aligned_addr + _atom)) {
+                num_items = (_atom - offset) / _step;
+            } else {
+                num_items = (_end - _start) / _step;
+            }
+
+            return std::make_tuple(aligned_addr, offset, num_items);
+        }
+
+        void iterate()
+        {
+            panic_if(done(), "Should not call iterate when done.\n");
+            Addr aligned_addr = roundDown(_start, _atom);
+            _start = aligned_addr + _atom;
+        }
+
+        bool done() { return (_start >= _end); }
+    };
+
+    struct PushInfo {
+        Addr src;
+        uint32_t value;
+        Addr offset;
+        int numElements;
+    };
+
+    MPU* owner;
+    GraphWorkload* graphWorkload;
+
+    bool _running;
+    Tick lastIdleEntranceTick;
+
+    AddrRangeList localAddrRange;
+
+    int numPendingPulls;
+    int edgePointerQueueSize;
+    std::deque<std::tuple<EdgeReadInfoGen, Tick>> edgePointerQueue;
+    std::unordered_map<RequestPtr, PushInfo> reqInfoMap;
+
+    int onTheFlyMemReqs;
+    int edgeQueueSize;
+    int examineWindow;
+    int maxPropagatesPerCycle;
+    std::deque<std::tuple<MetaEdge, Tick>> metaEdgeQueue;
+
+    int updateQueueSize;
+    template<typename T> PacketPtr createUpdatePacket(Addr addr, T value);
+    bool enqueueUpdate(Addr src, Addr dst, uint32_t value);
+    std::vector<std::deque<std::tuple<Addr, Tick>>> destinationQueues;
+    std::vector<std::unordered_map<Addr, std::tuple<Addr, uint32_t>>>
+        sourceAndValueMaps;
+    AddrRangeMap<PortID> portAddrMap;
+    std::vector<ReqPort> outPorts;
+
+    bool vertexSpace();
+    bool workLeft();
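+    // Illustrative note (not from the original source): destinationQueues[i]
+    // and sourceAndValueMaps[i] form one logical update queue per out port:
+    // the deque preserves arrival order of destination addresses, while the
+    // map holds the latest coalesced (src, value) pair for each destination.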
+ + EventFunctionWrapper nextVertexPullEvent; + void processNextVertexPullEvent(); + + MemoryEvent nextMemoryReadEvent; + void processNextMemoryReadEvent(); + + EventFunctionWrapper nextPropagateEvent; + void processNextPropagateEvent(); + + EventFunctionWrapper nextUpdatePushEvent; + void processNextUpdatePushEvent(); + + struct PushStats : public statistics::Group + { + PushStats(PushEngine& push); + + void regStats() override; + + PushEngine &push; + + statistics::Scalar numMemoryBlocks; + statistics::Scalar numPropagates; + statistics::Scalar updateQueueFull; + statistics::Scalar numNetBlocks; + statistics::Scalar updateQueueCoalescions; + statistics::Scalar numUpdates; + statistics::Scalar numWastefulEdgesRead; + + statistics::Formula TEPS; + + statistics::Histogram edgePointerQueueLatency; + statistics::Histogram edgePointerQueueLength; + statistics::Histogram edgeQueueLatency; + statistics::Histogram edgeQueueLength; + statistics::Histogram updateQueueLength; + statistics::Histogram numPropagatesHist; + }; + + PushStats stats; + + protected: + virtual void recvMemRetry(); + virtual bool handleMemResp(PacketPtr pkt); + + public: + PARAMS(PushEngine); + PushEngine(const Params& params); + Port& getPort(const std::string& if_name, + PortID idx = InvalidPortID) override; + virtual void init() override; + void registerMPU(MPU* mpu); + + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } + virtual void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + + void start(); + bool running() { return _running; } + void recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); + void recvMirrorPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); + void startProcessingMirrors(Tick time_to_wait); + + void recvReqRetry(); + + bool done(); +}; + +} + +#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/sega/router_engine.cc b/src/accl/graph/sega/router_engine.cc new file mode 100644 index 0000000000..e26cc06645 --- /dev/null +++ b/src/accl/graph/sega/router_engine.cc @@ -0,0 +1,750 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "accl/graph/sega/router_engine.hh"
+
+#include "accl/graph/sega/centeral_controller.hh"
+#include "base/trace.hh"
+#include "debug/RouterEngine.hh"
+#include "sim/stats.hh"
+
+namespace gem5
+{
+
+RouterEngine::RouterEngine(const Params &params):
+    ClockedObject(params),
+    system(params.system),
+    gptQSize(params.gpt_queue_size),
+    gpnQSize(params.gpn_queue_size),
+    emptyQueues(false),
+    routerLatency(params.router_latency),
+    start(0),
+    sampleTime(params.sample_time),
+    tokens(params.token),
+    nextGPTGPNEvent([this] { processNextGPTGPNEvent(); }, name()),
+    nextInternalRequestEvent(
+        [this] { processNextInternalRequestEvent(); }, name()),
+    nextGPNGPTEvent([this] { processNextGPNGPTEvent(); }, name()),
+    nextExternalRequestEvent(
+        [this] { processNextExternalRequestEvent(); }, name()),
+    nextTrafficTrackEvent(
+        [this] { processNextTrafficTrackEvent(); }, name()),
+    stats(*this)
+{
+    for (int i = 0; i < params.port_gpt_req_side_connection_count; ++i) {
+        gptReqPorts.emplace_back(
+            name() + ".gpt_req_side" + std::to_string(i), this, i);
+    }
+    for (int i = 0; i < params.port_gpt_resp_side_connection_count; ++i) {
+        gptRespPorts.emplace_back(
+            name() + ".gpt_resp_side" + std::to_string(i), this, i);
+    }
+    for (int i = 0; i < params.port_gpn_req_side_connection_count; ++i) {
+        gpnReqPorts.emplace_back(
+            name() + ".gpn_req_side" + std::to_string(i), this, i);
+    }
+    for (int i = 0; i < params.port_gpn_resp_side_connection_count; ++i) {
+        gpnRespPorts.emplace_back(
+            name() + ".gpn_resp_side" + std::to_string(i), this, i);
+    }
+    for (int i = 0; i < gpnReqPorts.size(); ++i) {
+        inFlightTraffic.push_back(0);
+        tokenVector.push_back(tokens);
+        sample.push_back(0);
+    }
+}
+
+AddrRangeList
+RouterEngine::GPTRespPort::getAddrRanges() const
+{
+    return owner->getGPNRanges();
+}
+
+AddrRangeList
+RouterEngine::GPNRespPort::getAddrRanges() const
+{
+    return owner->getGPTRanges();
+}
+
+AddrRangeList
+RouterEngine::getGPNRanges()
+{
+    AddrRangeList ret;
+    for (auto &gpnPort : gpnReqPorts) {
+        for (auto &addr_range : gpnPort.getAddrRanges()) {
+            ret.push_back(addr_range);
+        }
+    }
+    return ret;
+}
+
+AddrRangeList
+RouterEngine::getGPTRanges()
+{
+    AddrRangeList ret;
+    for (auto &gptPort : gptReqPorts) {
+        for (auto &addr_range : gptPort.getAddrRanges()) {
+            ret.push_back(addr_range);
+        }
+    }
+    return ret;
+}
+
+bool
+RouterEngine::GPTReqPort::recvTimingResp(PacketPtr pkt)
+{
+    panic("Not implemented yet!");
+    return 0;
+}
+
+void
+RouterEngine::GPTReqPort::recvReqRetry()
+{
+    // We should have a blocked packet if this function is called.
+    assert(blockedPacket != nullptr);
+    DPRINTF(RouterEngine, "%s: Retrying blocked packet %s.\n",
+            __func__, blockedPacket->print());
+    PacketPtr pkt = blockedPacket;
+    blockedPacket = nullptr;
+    sendPacket(pkt);
+    if (blockedPacket == nullptr) {
+        DPRINTF(RouterEngine, "%s: blockedPacket sent successfully.\n",
+                __func__);
+        owner->recvReqRetry();
+    }
+}
+
+bool
+RouterEngine::GPNReqPort::recvTimingResp(PacketPtr pkt)
+{
+    panic("Not implemented yet!");
+    return 0;
+}
+
+void
+RouterEngine::GPNReqPort::recvReqRetry()
+{
+    // We should have a blocked packet if this function is called.
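+    // Resend it; if the port is still busy, sendPacket() below simply
+    // re-stashes the packet and this callback will fire again later.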
+    assert(blockedPacket != nullptr);
+    PacketPtr pkt = blockedPacket;
+    blockedPacket = nullptr;
+
+    sendPacket(pkt);
+
+    owner->wakeUpInternal();
+}
+
+void
+RouterEngine::GPNReqPort::sendPacket(PacketPtr pkt)
+{
+    panic_if(blocked(), "Should never try to send if blocked!");
+    // If we can't send the packet across the port, store it for later.
+    if (!sendTimingReq(pkt)) {
+        DPRINTF(RouterEngine, "%s: The GPNReq port is blocked.\n", __func__);
+        blockedPacket = pkt;
+    }
+}
+
+void
+RouterEngine::GPTReqPort::sendPacket(PacketPtr pkt)
+{
+    panic_if(blocked(), "Should never try to send if blocked!");
+    // If we can't send the packet across the port, store it for later.
+    if (!sendTimingReq(pkt)) {
+        DPRINTF(RouterEngine, "%s: The GPTReq port is blocked.\n", __func__);
+        blockedPacket = pkt;
+    }
+}
+
+Tick
+RouterEngine::GPTRespPort::recvAtomic(PacketPtr pkt)
+{
+    panic("Not implemented yet!");
+}
+
+void
+RouterEngine::GPTRespPort::checkRetryReq()
+{
+    if (needSendRetryReq) {
+        needSendRetryReq = false;
+        sendRetryReq();
+    }
+}
+
+bool
+RouterEngine::GPTRespPort::recvTimingReq(PacketPtr pkt)
+{
+    if (!owner->handleRequest(id(), pkt)) {
+        DPRINTF(RouterEngine, "%s: Router rejected packet %#x.\n",
+                __func__, pkt->getAddr());
+        needSendRetryReq = true;
+        return false;
+    }
+    return true;
+}
+
+void
+RouterEngine::recvReqRetry()
+{
+    DPRINTF(RouterEngine, "%s: Received a reqRetry.\n", __func__);
+    if (!nextExternalRequestEvent.scheduled()) {
+        schedule(nextExternalRequestEvent, nextCycle());
+    }
+}
+
+bool
+RouterEngine::handleRequest(PortID portId, PacketPtr pkt)
+{
+    auto &queue = gptReqQueues[portId];
+    bool accepted = false;
+    if (queue.size() < gptQSize) {
+        DPRINTF(RouterEngine, "%s: gptReqQueues[%lu] size is: %d.\n",
+                __func__, portId, queue.size());
+        queue.push(pkt);
+        accepted = true;
+    } else {
+        DPRINTF(RouterEngine, "%s: gptReqQueues[%lu] is full.\n",
+                __func__, portId);
+        accepted = false;
+    }
+
+    if (accepted && (!nextGPTGPNEvent.scheduled())) {
+        schedule(nextGPTGPNEvent, nextCycle());
+    }
+    DPRINTF(RouterEngine, "%s: GPT sent req to router: accepted: %d.\n",
+            __func__, accepted);
+    return accepted;
+}
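+
+// Route every queued GPT request to the GPN request port whose address
+// range contains the packet's destination; a packet stays queued while the
+// matching gpnRespQueue is full, and a retry is sent back to the GPTs once
+// something drains.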
+void
+RouterEngine::processNextGPTGPNEvent()
+{
+    bool found = false;
+    bool queues_none_empty = false;
+    DPRINTF(RouterEngine, "%s: Trying to send a request from GPT to GPN.\n",
+            __func__);
+    for (auto &queue: gptReqQueues) {
+        if (!queue.second.empty()) {
+            PacketPtr pkt = queue.second.front();
+            Addr pkt_addr = pkt->getAddr();
+            for (int i = 0; i < gpnReqPorts.size(); i++) {
+                AddrRangeList addr_list = routerAddrMap[gpnReqPorts[i].id()];
+                if ((contains(addr_list, pkt_addr))) {
+                    if (gpnRespQueues[gpnReqPorts[i].id()].size() < gpnQSize) {
+                        gpnRespQueues[gpnReqPorts[i].id()].push(pkt);
+                        DPRINTF(RouterEngine, "%s: Pushing the pkt %#x to "
+                                "gpnRespQueue[%d]. gpnRespQueue size is: %d\n",
+                                __func__, pkt->getAddr(), i,
+                                gpnRespQueues[gpnReqPorts[i].id()].size());
+                        queue.second.pop();
+                        DPRINTF(RouterEngine, "%s: gptReqQueue size is: %d.\n",
+                                __func__, queue.second.size());
+                        found |= true;
+                        if ((!nextInternalRequestEvent.scheduled())) {
+                            schedule(nextInternalRequestEvent, nextCycle());
+                        }
+                    // queue is full
+                    } else {
+                        DPRINTF(RouterEngine, "%s: Packet %#x: "
+                                "gpnRespQueue[%d] is full.\n",
+                                __func__, pkt->getAddr(), i);
+                        found |= false;
+                    }
+                }
+            }
+        }
+        if (found) {
+            checkGPTRetryReq();
+        }
+    }
+
+    for (auto &queue: gptReqQueues) {
+        if (!queue.second.empty()) {
+            queues_none_empty = true;
+        }
+    }
+
+    if (queues_none_empty) {
+        DPRINTF(RouterEngine, "%s: The gptReqQueues is not empty.\n",
+                __func__);
+    } else {
+        DPRINTF(RouterEngine, "%s: The gptReqQueues is empty.\n", __func__);
+    }
+
+    if (queues_none_empty && (!nextGPTGPNEvent.scheduled())) {
+        schedule(nextGPTGPNEvent, nextCycle());
+    }
+}
+
+void
+RouterEngine::processNextInternalRequestEvent()
+{
+    DPRINTF(RouterEngine, "%s: Sending a request between two routers.\n",
+            __func__);
+    bool none_empty_queue = false;
+    int id;
+    for (auto &queue: gpnRespQueues) {
+        if (!queue.second.empty()) {
+            id = gpnReqPorts[queue.first].id();
+            if (!gpnReqPorts[queue.first].blocked() && (tokenVector[id] != 0)) {
+                if ((curCycle() -
+                    internalLatency[gpnReqPorts[queue.first].id()])
+                    < routerLatency) {
+                    continue;
+                }
+                PacketPtr pkt = queue.second.front();
+                DPRINTF(RouterEngine, "%s: Sending packet %#x to "
+                        "router: %d.\n", __func__, pkt->getAddr(),
+                        gpnReqPorts[queue.first].id());
+                gpnReqPorts[queue.first].sendPacket(pkt);
+                inFlightTraffic[queue.first]++;
+                queue.second.pop();
+                internalLatency[gpnReqPorts[queue.first].id()] = curCycle();
+                stats.internalAcceptedTraffic[gpnReqPorts[queue.first].id()]++;
+                stats.totalInternalTraffic[gpnReqPorts[queue.first].id()] +=
+                    pkt->getSize();
+                tokenVector[id]--;
+            } else if (tokenVector[id] == 0) {
+                DPRINTF(RouterEngine, "%s: Ran out of tokens for "
+                        "port id %d.\n", __func__, id);
+                stats.bandwidthBlocked[id]++;
+            } else {
+                DPRINTF(RouterEngine, "%s: port id %d is blocked.\n",
+                        __func__, gpnReqPorts[queue.first].id());
+                stats.internalBlockedTraffic[gpnReqPorts[queue.first].id()]++;
+            }
+        }
+    }
+
+    for (auto &queue: gpnRespQueues) {
+        if (!queue.second.empty()) {
+            none_empty_queue = true;
+            break;
+        }
+    }
+
+    if (none_empty_queue) {
+        DPRINTF(RouterEngine, "%s: The gpnRespQueues is not empty.\n",
+                __func__);
+    } else {
+        DPRINTF(RouterEngine, "%s: The gpnRespQueues is empty.\n", __func__);
+    }
+
+    Tick next_schedule = nextCycle() + cyclesToTicks(routerLatency);
+    for (auto itr = internalLatency.begin();
+         itr != internalLatency.end();
+         itr++)
+    {
+        if (cyclesToTicks(itr->second + routerLatency) < next_schedule) {
+            if ((itr->second + routerLatency) <= curCycle()) {
+                next_schedule = nextCycle();
+                break;
+            } else {
+                next_schedule = std::min(
+                                    cyclesToTicks(itr->second + routerLatency),
+                                    next_schedule);
+            }
+        }
+    }
+
+    if (none_empty_queue && (!nextInternalRequestEvent.scheduled())) {
+        schedule(nextInternalRequestEvent, next_schedule);
+    }
+
+    if (!nextTrafficTrackEvent.scheduled() && (start == 0)) {
+        start = 1;
+        schedule(nextTrafficTrackEvent, next_schedule);
+    }
+}
+
+void
+RouterEngine::processNextTrafficTrackEvent()
+{
+    for (auto &queue: gpnRespQueues) {
+        stats.internalTrafficHist[queue.first]->sample(
+            inFlightTraffic[queue.first]);
+        sample[queue.first]++;
+        inFlightTraffic[queue.first] = 0;
+    }
+
+    for (int i = 0; i < gpnReqPorts.size(); i++) {
+        tokenVector[i] = tokens;
+    }
+
+    if (!nextTrafficTrackEvent.scheduled()) {
+        schedule(nextTrafficTrackEvent, curTick() + sampleTime);
+    }
+}
+
+void
+RouterEngine::GPTRespPort::recvFunctional(PacketPtr pkt)
+{
+    panic("Not implemented yet!");
+}
+
+void
+RouterEngine::GPTRespPort::recvRespRetry()
+{
+    panic("Not implemented yet!");
+}
yet!"); +} + +Tick +RouterEngine::GPNRespPort::recvAtomic(PacketPtr pkt) { + panic("Not implemented yet!"); +} + +void +RouterEngine::GPNRespPort::checkRetryReq() { + if (needSendRetryReq) { + needSendRetryReq = false; + sendRetryReq(); + } +} + +bool +RouterEngine::GPNRespPort::recvTimingReq(PacketPtr pkt) { + if (!owner->handleRemoteRequest(id(), pkt)) { + DPRINTF(RouterEngine, "%s: Router Rejected the packet %s.\n", + __func__, pkt->getAddr()); + needSendRetryReq = true; + return false; + } + return true; +} + +bool +RouterEngine::handleRemoteRequest(PortID id, PacketPtr pkt) { + bool accepted = false; + if (gpnReqQueues[id].size() < gpnQSize) { + gpnReqQueues[id].push(pkt); + accepted = true; + } else { + accepted = false; + } + + if (accepted && (!nextGPNGPTEvent.scheduled())) { + schedule(nextGPNGPTEvent, nextCycle()); + } + + DPRINTF(RouterEngine, "%s: The remote packet: %s is accepted: %d.\n", + __func__, pkt->getAddr(), accepted); + return accepted; +} + +void +RouterEngine::processNextGPNGPTEvent() +{ + bool found = false; + bool queues_none_empty = false; + for (auto &queue: gpnReqQueues) { + if (!queue.second.empty()) { + PacketPtr pkt = queue.second.front(); + Addr pkt_addr = pkt->getAddr(); + for (int i = 0; i < gptReqPorts.size(); i++) { + AddrRangeList addr_list = gptAddrMap[gptReqPorts[i].id()]; + if ((contains(addr_list, pkt_addr))) { + if (gptRespQueues[gptReqPorts[i].id()].size() < gptQSize) { + gptRespQueues[gptReqPorts[i].id()].push(pkt); + DPRINTF(RouterEngine, "%s: The size of " + "gptRespQueues[%d] is %d.\n", __func__, i, + gptRespQueues[gptReqPorts[i].id()].size()); + DPRINTF(RouterEngine, + "%s: Sending pkt %s to GPT %d.\n", + __func__, pkt->getAddr(), i); + queue.second.pop(); + found |= true; + if ((!nextExternalRequestEvent.scheduled())) { + schedule(nextExternalRequestEvent, nextCycle()); + } + } else { + DPRINTF(RouterEngine, + "%s: gptRespQueues[%d] is full.\n", + __func__, pkt->getAddr(), i); + found |= false; + } + } + } + } + if (found) { + checkGPNRetryReq(); + } + } + + for (auto &queue: gpnReqQueues) { + if (!queue.second.empty()) { + queues_none_empty = true; + } + } + + if (queues_none_empty) { + DPRINTF(RouterEngine, "%s: gpnReqQueues is not empty.\n", __func__); + } else { + DPRINTF(RouterEngine, "%s: gpnReqQueues is empty.\n", __func__); + } + + if (queues_none_empty && (!nextGPNGPTEvent.scheduled())) { + schedule(nextGPNGPTEvent, nextCycle()); + } +} + +void +RouterEngine::processNextExternalRequestEvent() +{ + DPRINTF(RouterEngine, "%s: Sending the request to the GPT.\n", __func__); + bool none_empty_queue = false; + for (auto &queue: gptRespQueues) { + if (!queue.second.empty()) { + if (!gptReqPorts[queue.first].blocked()) { + if ((curCycle() - + externalLatency[gptReqPorts[queue.first].id()]) + < routerLatency) { + continue; + } + stats.externalAcceptedTraffic[gptReqPorts[queue.first].id()]++; + PacketPtr pkt = queue.second.front(); + DPRINTF(RouterEngine, "%s: gptRespQueues[%d] is not empty. 
" + "the size is: %d.\n", __func__, + gptReqPorts[queue.first].id() ,queue.second.size()); + DPRINTF(RouterEngine, "%s: Sending packet %s to GPT: %d.\n", + __func__, pkt->getAddr(),gptReqPorts[queue.first].id()); + gptReqPorts[queue.first].sendPacket(pkt); + queue.second.pop(); + externalLatency[gptReqPorts[queue.first].id()] = curCycle(); + } + else { + stats.externalBlockedTraffic[gptReqPorts[queue.first].id()]++; + } + } + } + + for (auto &queue: gptRespQueues) { + DPRINTF(RouterEngine, "%s: gptRespQueues[%d] size is: %d.\n", __func__, + gptReqPorts[queue.first].id() ,queue.second.size()); + if (!queue.second.empty()) { + none_empty_queue = true; + break; + } + } + + if (none_empty_queue) { + DPRINTF(RouterEngine, "%s: The gptRespQueues is not empty.\n", + __func__); + } else { + DPRINTF(RouterEngine, "%s: The gptRespQueues is empty.\n", __func__); + } + + Tick next_schedule = cyclesToTicks(curCycle() + routerLatency); + for (auto itr = externalLatency.begin(); + itr != externalLatency.end(); itr++) + { + if (cyclesToTicks(itr->second + routerLatency) < next_schedule) { + if ((itr->second + routerLatency) <= curCycle()) { + next_schedule = nextCycle(); + break; + } else { + next_schedule = std::min( + cyclesToTicks(itr->second + routerLatency), + next_schedule); + } + } + } + + if (none_empty_queue) { + if (!nextExternalRequestEvent.scheduled()) { + schedule(nextExternalRequestEvent, next_schedule); + } + } +} + +void +RouterEngine::GPNRespPort::recvFunctional(PacketPtr pkt) +{ + panic("Not implemented yet!"); +} + +void +RouterEngine::GPNRespPort::recvRespRetry() +{ + panic("Not implemented yet!"); +} + +void +RouterEngine::wakeUpInternal() +{ + if ((!nextInternalRequestEvent.scheduled())) { + for (auto &queue: gpnRespQueues) { + if (!queue.second.empty()) { + schedule(nextInternalRequestEvent, nextCycle()); + return; + } + } + } +} + +void +RouterEngine::checkGPTRetryReq() +{ + for (int i = 0; i < gptRespPorts.size(); i++) { + gptRespPorts[i].checkRetryReq(); + } +} + +void +RouterEngine::checkGPNRetryReq() +{ + for (int i = 0; i < gpnRespPorts.size(); i++) { + gpnRespPorts[i].checkRetryReq(); + } +} + +RouterEngine::RouterEngineStat::RouterEngineStat(RouterEngine &_router) + : statistics::Group(&_router), + router(_router), + ADD_STAT(internalBlockedTraffic, statistics::units::Count::get(), + "Number of packets blocked between routers."), + ADD_STAT(externalBlockedTraffic, statistics::units::Count::get(), + "Number of external packets blocked."), + ADD_STAT(internalAcceptedTraffic, statistics::units::Count::get(), + "Number of packet passed between routers."), + ADD_STAT(externalAcceptedTraffic, statistics::units::Count::get(), + "Number of external packets passed."), + ADD_STAT(bandwidthBlocked, statistics::units::Count::get(), + "Number of packets blocked due to lack of."), + ADD_STAT(totalInternalTraffic, statistics::units::Count::get(), + "Total traffic sent from the internal port") + // , + // ADD_STAT(internalTrafficVector, statistics::units::Count::get(), + // "Number of requests sent in internal link") +{} + +void +RouterEngine::RouterEngineStat::regStats() +{ + using namespace statistics; + + internalBlockedTraffic.init(router.gpnReqPorts.size()); + externalBlockedTraffic.init(router.gptReqPorts.size()); + internalAcceptedTraffic.init(router.gpnReqPorts.size()); + externalAcceptedTraffic.init(router.gptReqPorts.size()); + bandwidthBlocked.init(router.gpnReqPorts.size()); + totalInternalTraffic.init(router.gpnReqPorts.size()); + // 
+RouterEngine::RouterEngineStat::RouterEngineStat(RouterEngine &_router)
+    : statistics::Group(&_router),
+      router(_router),
+      ADD_STAT(internalBlockedTraffic, statistics::units::Count::get(),
+               "Number of packets blocked between routers."),
+      ADD_STAT(externalBlockedTraffic, statistics::units::Count::get(),
+               "Number of external packets blocked."),
+      ADD_STAT(internalAcceptedTraffic, statistics::units::Count::get(),
+               "Number of packet passed between routers."),
+      ADD_STAT(externalAcceptedTraffic, statistics::units::Count::get(),
+               "Number of external packets passed."),
+      ADD_STAT(bandwidthBlocked, statistics::units::Count::get(),
+               "Number of packets blocked due to lack of tokens."),
+      ADD_STAT(totalInternalTraffic, statistics::units::Count::get(),
+               "Total traffic sent from the internal port")
+{}
+
+void
+RouterEngine::RouterEngineStat::regStats()
+{
+    using namespace statistics;
+
+    internalBlockedTraffic.init(router.gpnReqPorts.size());
+    externalBlockedTraffic.init(router.gptReqPorts.size());
+    internalAcceptedTraffic.init(router.gpnReqPorts.size());
+    externalAcceptedTraffic.init(router.gptReqPorts.size());
+    bandwidthBlocked.init(router.gpnReqPorts.size());
+    totalInternalTraffic.init(router.gpnReqPorts.size());
+
+    for (uint32_t i = 0; i < router.gpnReqPorts.size(); ++i) {
+        internalTrafficHist.push_back(new statistics::Histogram(this));
+        internalTrafficHist[i]
+            ->init(20000)
+            .name(csprintf("internal_traffic_hist_%i", i))
+            .desc("In-flight internal traffic sampled per window.")
+            .flags(nozero);
+
+        internalPortBW.push_back(new statistics::Formula(this,
+            csprintf("average_internal_BW_%d", i).c_str(),
+            "Internal BW (GB/s)"));
+
+        *internalPortBW[i] =
+            totalInternalTraffic[i] / (simSeconds * 1e9);
+    }
+}
+
+} // namespace gem5
diff --git a/src/accl/graph/sega/router_engine.hh b/src/accl/graph/sega/router_engine.hh
new file mode 100644
index 0000000000..5c06ecc862
--- /dev/null
+++ b/src/accl/graph/sega/router_engine.hh
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2021 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ACCL_GRAPH_SEGA_ROUTER_ENGINE_HH__
+#define __ACCL_GRAPH_SEGA_ROUTER_ENGINE_HH__
+
+#include <queue>
+
+#include "base/types.hh"
+#include "mem/packet.hh"
+#include "mem/port.hh"
+#include "params/RouterEngine.hh"
+#include "sim/clocked_object.hh"
+#include "sim/system.hh"
+
+namespace gem5
+{
+
+class CenteralController;
+
+class RouterEngine : public ClockedObject
+{
+  private:
+    class GPTReqPort : public RequestPort
+    {
+      private:
+        RouterEngine* owner;
+        PacketPtr blockedPacket;
+        PortID _id;
+
+      public:
+        GPTReqPort(const std::string& name, RouterEngine* owner, PortID id) :
+            RequestPort(name, owner),
+            owner(owner), blockedPacket(nullptr), _id(id)
+        {}
+        void sendPacket(PacketPtr pkt);
+        bool blocked() { return (blockedPacket != nullptr); }
+        PortID id() { return _id; }
+
+      protected:
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual void recvReqRetry();
+    };
+
+    class GPNReqPort : public RequestPort
+    {
+      private:
+        RouterEngine* owner;
+        PacketPtr blockedPacket;
+        PortID _id;
+
+      public:
+        GPNReqPort(const std::string& name, RouterEngine* owner, PortID id) :
+            RequestPort(name, owner),
+            owner(owner), blockedPacket(nullptr), _id(id)
+        {}
+        void sendPacket(PacketPtr pkt);
+        bool blocked() { return (blockedPacket != nullptr); }
+        PortID id() { return _id; }
+
+      protected:
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual void recvReqRetry();
+    };
+
+    class GPTRespPort : public ResponsePort
+    {
+      private:
+        RouterEngine* owner;
+        bool needSendRetryReq;
+        PortID _id;
+
+      public:
+        GPTRespPort(const std::string& name, RouterEngine* owner, PortID id):
+            ResponsePort(name, owner),
+            owner(owner), needSendRetryReq(false), _id(id)
+        {}
+        virtual AddrRangeList getAddrRanges() const;
+
+        PortID id() { return _id; }
+        void checkRetryReq();
+
+      protected:
+        virtual bool recvTimingReq(PacketPtr pkt);
+        virtual Tick recvAtomic(PacketPtr pkt);
+        virtual void recvFunctional(PacketPtr pkt);
+        virtual void recvRespRetry();
+    };
+
+    class GPNRespPort : public ResponsePort
+    {
+      private:
+        RouterEngine* owner;
+        bool needSendRetryReq;
+        PortID _id;
+
+      public:
+        GPNRespPort(const std::string& name, RouterEngine* owner, PortID id):
+            ResponsePort(name, owner),
+            owner(owner), needSendRetryReq(false), _id(id)
+        {}
+        virtual AddrRangeList getAddrRanges() const;
+
+        PortID id() { return _id; }
+        void checkRetryReq();
+
+      protected:
+        virtual bool recvTimingReq(PacketPtr pkt);
+        virtual Tick recvAtomic(PacketPtr pkt);
+        virtual void recvFunctional(PacketPtr pkt);
+        virtual void recvRespRetry();
+    };
+
+    System* system;
+    CenteralController* centeralController;
+    bool handleRequest(PortID portId, PacketPtr pkt);
+    bool handleRemoteRequest(PortID portId, PacketPtr pkt);
+    void wakeUpInternal();
+    void wakeUpExternal();
+    void checkRetryExternal();
+    void checkRetryInternal();
+    std::vector<GPTReqPort> gptReqPorts;
+    std::vector<GPTRespPort> gptRespPorts;
+
+    std::vector<GPNReqPort> gpnReqPorts;
+    std::vector<GPNRespPort> gpnRespPorts;
+
+    std::unordered_map<PortID, AddrRangeList> gptAddrMap;
+    std::unordered_map<PortID, AddrRangeList> routerAddrMap;
+
+    std::unordered_map<PortID, std::queue<PacketPtr>> gptReqQueues;
+    std::unordered_map<PortID, std::queue<PacketPtr>> gpnRespQueues;
+
+    std::unordered_map<PortID, std::queue<PacketPtr>> gptRespQueues;
+    std::unordered_map<PortID, std::queue<PacketPtr>> gpnReqQueues;
+
+    std::unordered_map<PortID, Cycles> externalLatency;
+    std::unordered_map<PortID, Cycles> internalLatency;
+    std::vector<uint32_t> inFlightTraffic;
+    std::vector<int> tokenVector;
+    std::vector<int> sample;
+
+    const uint32_t gptQSize;
+    const uint32_t gpnQSize;
+    bool emptyQueues;
+    const Cycles routerLatency;
+    int start;
+    Tick sampleTime;
+    int tokens;
+
+    EventFunctionWrapper nextGPTGPNEvent;
+    void processNextGPTGPNEvent();
+
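+    // Flow through the router (see router_engine.cc): local updates land
+    // in gptReqQueues and nextGPTGPNEvent routes them into gpnRespQueues;
+    // nextInternalRequestEvent drains those to peer routers through the
+    // GPN request ports. Remote updates arrive in gpnReqQueues,
+    // nextGPNGPTEvent routes them into gptRespQueues, and
+    // nextExternalRequestEvent delivers them to the local GPTs.
+    // nextTrafficTrackEvent periodically samples traffic and refills the
+    // token buckets.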
+    EventFunctionWrapper nextInternalRequestEvent;
+    void processNextInternalRequestEvent();
+
+    EventFunctionWrapper nextGPNGPTEvent;
+    void processNextGPNGPTEvent();
+
+    EventFunctionWrapper nextExternalRequestEvent;
+    void processNextExternalRequestEvent();
+
+    EventFunctionWrapper nextTrafficTrackEvent;
+    void processNextTrafficTrackEvent();
+
+    struct RouterEngineStat : public statistics::Group
+    {
+        RouterEngineStat(RouterEngine &router);
+
+        void regStats() override;
+
+        RouterEngine &router;
+
+        statistics::Vector internalBlockedTraffic;
+        statistics::Vector externalBlockedTraffic;
+        statistics::Vector internalAcceptedTraffic;
+        statistics::Vector externalAcceptedTraffic;
+        statistics::Vector bandwidthBlocked;
+        statistics::Vector totalInternalTraffic;
+        std::vector<statistics::Histogram *> internalTrafficHist;
+        std::vector<statistics::Formula *> internalPortBW;
+    };
+    RouterEngineStat stats;
+
+  public:
+    PARAMS(RouterEngine);
+    RouterEngine(const Params &params);
+    void registerCenteralController(CenteralController* centeral_controller);
+    virtual void init() override;
+    virtual void startup() override;
+    Port& getPort(const std::string& if_name,
+                  PortID idx = InvalidPortID) override;
+
+    AddrRangeList getGPNRanges();
+    AddrRangeList getGPTRanges();
+    void recvReqRetry();
+
+    void checkGPTRetryReq();
+    void checkGPNRetryReq();
+    bool done();
+};
+
+} // namespace gem5
+
+#endif // __ACCL_GRAPH_SEGA_ROUTER_ENGINE_HH__
diff --git a/src/accl/graph/sega/state_machine.md b/src/accl/graph/sega/state_machine.md
new file mode 100644
index 0000000000..203c47cf02
--- /dev/null
+++ b/src/accl/graph/sega/state_machine.md
@@ -0,0 +1 @@
+# CoalesceEngine Block state machine
\ No newline at end of file
diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc
new file mode 100644
index 0000000000..8e5ccc9ebe
--- /dev/null
+++ b/src/accl/graph/sega/wl_engine.cc
@@ -0,0 +1,499 @@
+/*
+ * Copyright (c) 2020 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "accl/graph/sega/wl_engine.hh"
+
+#include <algorithm>
+#include <random>
+#include <tuple>
+
+#include "accl/graph/sega/mpu.hh"
+#include "debug/SEGAStructureSize.hh"
+#include "debug/WLEngine.hh"
+#include "mem/packet_access.hh"
+#include "sim/sim_exit.hh"
+
+namespace gem5
+{
+
+WLEngine::WLEngine(const WLEngineParams& params):
+    BaseReduceEngine(params),
+    updateQueueSize(params.update_queue_size),
+    examineWindow(params.examine_window),
+    maxReadsPerCycle(params.rd_per_cycle),
+    maxReducesPerCycle(params.reduce_per_cycle),
+    maxWritesPerCycle(params.wr_per_cycle),
+    registerFileSize(params.register_file_size),
+    nextReadEvent([this]{ processNextReadEvent(); }, name()),
+    nextReduceEvent([this]{ processNextReduceEvent(); }, name()),
+    nextWriteEvent([this] { processNextWriteEvent(); }, name()),
+    nextDoneSignalEvent([this] { processNextDoneSignalEvent(); }, name()),
+    stats(*this)
+{
+    for (int i = 0; i < params.port_in_ports_connection_count; ++i) {
+        inPorts.emplace_back(
+            name() + ".in_ports" + std::to_string(i), this, i);
+    }
+}
+
+Port&
+WLEngine::getPort(const std::string& if_name, PortID idx)
+{
+    if (if_name == "in_ports") {
+        return inPorts[idx];
+    } else {
+        return ClockedObject::getPort(if_name, idx);
+    }
+}
+
+void
+WLEngine::init()
+{
+    for (int i = 0; i < inPorts.size(); i++){
+        inPorts[i].sendRangeChange();
+    }
+}
+
+void
+WLEngine::registerMPU(MPU* mpu)
+{
+    owner = mpu;
+}
+
+AddrRangeList
+WLEngine::getAddrRanges()
+{
+    return owner->getAddrRanges();
+}
+
+void
+WLEngine::recvFunctional(PacketPtr pkt)
+{
+    owner->recvFunctional(pkt);
+}
+
+AddrRangeList
+WLEngine::RespPort::getAddrRanges() const
+{
+    return owner->getAddrRanges();
+}
+
+void
+WLEngine::RespPort::checkRetryReq()
+{
+    if (needSendRetryReq) {
+        needSendRetryReq = false;
+        sendRetryReq();
+    }
+}
+
+bool
+WLEngine::RespPort::recvTimingReq(PacketPtr pkt)
+{
+    if (!owner->handleIncomingUpdate(pkt)) {
+        needSendRetryReq = true;
+        return false;
+    }
+
+    return true;
+}
+
+Tick
+WLEngine::RespPort::recvAtomic(PacketPtr pkt)
+{
+    panic("recvAtomic unimpl.");
+}
+
+void
+WLEngine::RespPort::recvFunctional(PacketPtr pkt)
+{
+    owner->recvFunctional(pkt);
+}
+
+void
+WLEngine::RespPort::recvRespRetry()
+{
+    panic("recvRespRetry from response port is called.");
+}
+
+void
+WLEngine::checkRetryReq()
+{
+    std::vector<int> random_shuffle;
+    for (int i = 0; i < inPorts.size(); i++) {
+        random_shuffle.push_back(i);
+    }
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::shuffle(random_shuffle.begin(), random_shuffle.end(), gen);
+
+    for (int i = 0; i < inPorts.size(); i++) {
+        inPorts[random_shuffle[i]].checkRetryReq();
+    }
+}
+
+bool
+WLEngine::done()
+{
+    return registerFile.empty() && updateQueue.empty();
+}
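+
+// Update path through the WLEngine: handleIncomingUpdate() coalesces
+// same-address updates in valueMap and queues new addresses in
+// updateQueue; processNextReadEvent() allocates a register and asks the
+// coalesce engine for the vertex; handleIncomingWL() delivers the
+// WorkListItem and queues the reduction; processNextReduceEvent() and
+// processNextWriteEvent() finish the reduce and write-back.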
", + "Current value is: %u.\n", __func__, + update_addr, valueMap[update_addr]); + valueMap[update_addr] = + graphWorkload->reduce(update_value, valueMap[update_addr]); + stats.numIncomingUpdates++; + stats.updateQueueCoalescions++; + } else { + assert((updateQueueSize == 0) || (updateQueue.size() <= updateQueueSize)); + if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { + return false; + } else { + updateQueue.emplace_back(update_addr, curTick()); + valueMap[update_addr] = update_value; + stats.numIncomingUpdates++; + DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, update_addr, update_value, + updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, update_addr, update_value, + updateQueue.size(), updateQueueSize); + } + } + + // delete the packet since it's not needed anymore. + delete pkt; + + if (!nextReadEvent.scheduled()) { + schedule(nextReadEvent, nextCycle()); + } + return true; +} + +void +WLEngine::processNextReadEvent() +{ + std::deque> temp_queue; + for (int i = 0; i < examineWindow; i++) { + if (updateQueue.empty()) { + break; + } + temp_queue.push_back(updateQueue.front()); + updateQueue.pop_front(); + } + + int num_reads = 0; + int num_popped = 0; + int num_tries = 0; + int max_visits = temp_queue.size(); + while (true) { + Addr update_addr; + Tick enter_tick; + std::tie(update_addr, enter_tick) = temp_queue.front(); + + uint32_t update_value = valueMap[update_addr]; + DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. " + "(addr: %lu, value: %u).\n", __func__, update_addr, update_value); + if ((registerFile.find(update_addr) == registerFile.end())) { + DPRINTF(WLEngine, "%s: No register already allocated for addr: %lu " + "in registerFile.\n", __func__, update_addr); + if (registerFile.size() < registerFileSize) { + DPRINTF(WLEngine, "%s: There are free registers available in the " + "registerFile.\n", __func__); + ReadReturnStatus read_status = owner->recvWLRead(update_addr); + if (read_status == ReadReturnStatus::ACCEPT) { + DPRINTF(WLEngine, "%s: CoalesceEngine returned true for read " + "request to addr: %lu.\n", __func__, update_addr); + registerFile[update_addr] = std::make_tuple(RegisterState::PENDING_READ, update_value); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, value: %u) " + "to registerFile. registerFile.size = %d, " + "registerFileSize = %d.\n", __func__, update_addr, + update_value, registerFile.size(), registerFileSize); + DPRINTF(WLEngine, "%s: Added (addr: %lu, value: %u) " + "to registerFile. registerFile.size = %d, " + "registerFileSize = %d.\n", __func__, update_addr, + update_value, registerFile.size(), registerFileSize); + temp_queue.pop_front(); + valueMap.erase(update_addr); + num_reads++; + num_popped++; + stats.updateQueueLatency.sample( + (curTick() - enter_tick) * 1e9 / getClockFrequency()); + DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, temp_queue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. 
" + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + vertexReadTime[update_addr] = curTick(); + } else { + if (read_status == ReadReturnStatus::REJECT_ROLL) { + temp_queue.pop_front(); + temp_queue.emplace_back(update_addr, enter_tick); + DPRINTF(WLEngine, "%s: Received a reject from cache. " + "Rolling the update.\n", __func__); + stats.numUpdateRolls++; + } else { + temp_queue.pop_front(); + temp_queue.emplace_back(update_addr, enter_tick); + DPRINTF(WLEngine, "%s: Received a reject with no roll " + "from cache. Rolling the update anyway.\n", __func__); + } + } + } else { + DPRINTF(WLEngine, "%s: There are no free registers " + "available in the registerFile.\n", __func__); + temp_queue.pop_front(); + temp_queue.emplace_back(update_addr, enter_tick); + stats.registerShortage++; + } + } else { + DPRINTF(WLEngine, "%s: A register has already been allocated for " + "addr: %lu in registerFile. registerFile[%lu] = %u.\n", __func__, + update_addr, update_addr, std::get<1>(registerFile[update_addr])); + RegisterState state = std::get<0>(registerFile[update_addr]); + if (state == RegisterState::PENDING_WRITE) { + // NOTE: If it's pending write, let it be written. + DPRINTF(WLEngine, "%s: Respective register for addr: " + "%lu is pending a write to the cache. Rolling " + "the update.\n", __func__, update_addr); + temp_queue.pop_front(); + temp_queue.emplace_back(update_addr, enter_tick); + } else { + uint32_t curr_value = std::get<1>(registerFile[update_addr]); + uint32_t new_value = graphWorkload->reduce(update_value, curr_value); + registerFile[update_addr] = std::make_tuple(state, new_value); + DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" + " registerFile. registerFile[%lu] = %u.\n", __func__, + update_value, update_addr, std::get<1>(registerFile[update_addr])); + stats.registerFileCoalescions++; + temp_queue.pop_front(); + valueMap.erase(update_addr); + num_popped++; + stats.updateQueueLatency.sample( + (curTick() - enter_tick) * 1e9 / getClockFrequency()); + DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + } + } + + num_tries++; + if (num_reads >= maxReadsPerCycle) { + if (!temp_queue.empty()) { + stats.numReadPortShortage++; + } + break; + } + if (num_tries >= max_visits) { + break; + } + if (temp_queue.empty()) { + break; + } + } + + while (!temp_queue.empty()) { + updateQueue.push_front(temp_queue.back()); + temp_queue.pop_back(); + } + if (num_popped > 0) { + checkRetryReq(); + } + if (!updateQueue.empty() && !nextReadEvent.scheduled()) { + schedule(nextReadEvent, nextCycle()); + } +} + +void +WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) +{ + assert(workListFile.size() <= registerFileSize); + assert(std::get<0>(registerFile[addr]) == RegisterState::PENDING_READ); + + workListFile[addr] = wl; + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) to " + "workListFile. workListFile.size = %d.\n", __func__, addr, + graphWorkload->printWorkListItem(wl), workListFile.size()); + DPRINTF(WLEngine, "%s: Added (addr: %lu, wl: %s) to " + "workListFile. 
workListFile.size = %d.\n", __func__, addr,
+            graphWorkload->printWorkListItem(wl), workListFile.size());
+
+    uint32_t value = std::get<1>(registerFile[addr]);
+    registerFile[addr] =
+        std::make_tuple(RegisterState::PENDING_REDUCE, value);
+    toReduce.push_back(addr);
+
+    stats.vertexReadLatency.sample(
+        ((curTick() - vertexReadTime[addr]) * 1e9) / getClockFrequency());
+    vertexReadTime.erase(addr);
+
+    if (!nextReduceEvent.scheduled()) {
+        schedule(nextReduceEvent, nextCycle());
+    }
+}
+
+void
+WLEngine::processNextReduceEvent()
+{
+    int num_reduces = 0;
+    while (true) {
+        Addr addr = toReduce.front();
+        assert(std::get<0>(registerFile[addr]) ==
+               RegisterState::PENDING_REDUCE);
+        uint32_t update_value = std::get<1>(registerFile[addr]);
+        DPRINTF(WLEngine, "%s: Reducing for addr: %lu, update_value: %u, "
+                "temp_prop: %s.\n", __func__, addr,
+                update_value, workListFile[addr].tempProp);
+        workListFile[addr].tempProp =
+            graphWorkload->reduce(update_value, workListFile[addr].tempProp);
+        DPRINTF(WLEngine, "%s: Reduction result: %s.\n", __func__,
+                graphWorkload->printWorkListItem(workListFile[addr]));
+        registerFile[addr] =
+            std::make_tuple(RegisterState::PENDING_WRITE, update_value);
+        num_reduces++;
+        stats.numReductions++;
+        toReduce.pop_front();
+        toWrite.push_back(addr);
+
+        if (num_reduces >= maxReducesPerCycle) {
+            if (!toReduce.empty()) {
+                stats.numReducerShortage++;
+            }
+            break;
+        }
+        if (toReduce.empty()) {
+            break;
+        }
+    }
+
+    if (!toWrite.empty() && !nextWriteEvent.scheduled()) {
+        schedule(nextWriteEvent, nextCycle());
+    }
+
+    if (!toReduce.empty() && !nextReduceEvent.scheduled()) {
+        schedule(nextReduceEvent, nextCycle());
+    }
+}
+
+void
+WLEngine::processNextWriteEvent()
+{
+    int num_writes = 0;
+    while (true) {
+        Addr addr = toWrite.front();
+        assert(std::get<0>(registerFile[addr]) ==
+               RegisterState::PENDING_WRITE);
+        owner->recvWLWrite(addr, workListFile[addr]);
+        registerFile.erase(addr);
+        workListFile.erase(addr);
+        toWrite.pop_front();
+        num_writes++;
+        if (num_writes >= maxWritesPerCycle) {
+            if (!toWrite.empty()) {
+                stats.numWritePortShortage++;
+            }
+            break;
+        }
+        if (toWrite.empty()) {
+            break;
+        }
+    }
+
+    if (done() && !nextDoneSignalEvent.scheduled()) {
+        schedule(nextDoneSignalEvent, nextCycle());
+    }
+
+    if (!toWrite.empty() && !nextWriteEvent.scheduled()) {
+        schedule(nextWriteEvent, nextCycle());
+    }
+}
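+
+// done() is re-checked when the event actually fires: a new update can
+// arrive in the cycle between scheduling the done signal and delivering
+// it, in which case the signal is silently dropped.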
cycle."), + ADD_STAT(numIncomingUpdates, statistics::units::Count::get(), + "Number of inocoming updates for each GPT."), + ADD_STAT(vertexReadLatency, statistics::units::Second::get(), + "Histogram of the latency of reading a vertex (ns)."), + ADD_STAT(updateQueueLatency, statistics::units::Second::get(), + "Histogram of the latency of dequeuing an update (ns).") +{ +} + +void +WLEngine::WorkListStats::regStats() +{ + using namespace statistics; + + vertexReadLatency.init(64); + updateQueueLatency.init(64); + +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh new file mode 100644 index 0000000000..ad67f19cb5 --- /dev/null +++ b/src/accl/graph/sega/wl_engine.hh @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef __ACCL_GRAPH_SEGA_WL_ENGINE_HH__
+#define __ACCL_GRAPH_SEGA_WL_ENGINE_HH__
+
+#include <deque>
+#include <unordered_map>
+
+#include "accl/graph/base/base_reduce_engine.hh"
+#include "accl/graph/base/data_structs.hh"
+#include "accl/graph/base/graph_workload.hh"
+#include "accl/graph/sega/enums.hh"
+#include "base/statistics.hh"
+#include "params/WLEngine.hh"
+
+namespace gem5
+{
+
+class MPU;
+
+class WLEngine : public BaseReduceEngine
+{
+  private:
+    class RespPort : public ResponsePort
+    {
+      private:
+        WLEngine* owner;
+        bool needSendRetryReq;
+        PortID _id;
+
+      public:
+        RespPort(const std::string& name, WLEngine* owner, PortID id):
+            ResponsePort(name, owner),
+            owner(owner), needSendRetryReq(false), _id(id)
+        {}
+        virtual AddrRangeList getAddrRanges() const;
+
+        PortID id() { return _id; }
+        void checkRetryReq();
+
+      protected:
+        virtual bool recvTimingReq(PacketPtr pkt);
+        virtual Tick recvAtomic(PacketPtr pkt);
+        virtual void recvFunctional(PacketPtr pkt);
+        virtual void recvRespRetry();
+    };
+
+    MPU* owner;
+    GraphWorkload* graphWorkload;
+
+    std::vector<RespPort> inPorts;
+
+    int updateQueueSize;
+    std::deque<std::tuple<Addr, Tick>> updateQueue;
+    std::unordered_map<Addr, uint32_t> valueMap;
+
+    int examineWindow;
+    int maxReadsPerCycle;
+    int maxReducesPerCycle;
+    int maxWritesPerCycle;
+
+    int registerFileSize;
+    std::unordered_map<Addr, std::tuple<RegisterState, uint32_t>>
+                                                            registerFile;
+    std::unordered_map<Addr, WorkListItem> workListFile;
+    std::deque<Addr> toReduce;
+    std::deque<Addr> toWrite;
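+
+    // Register lifecycle, as driven by wl_engine.cc: an address enters
+    // registerFile as PENDING_READ when its vertex read is issued, becomes
+    // PENDING_REDUCE when handleIncomingWL() delivers the WorkListItem,
+    // becomes PENDING_WRITE after the reduction, and is erased once the
+    // result is written back through the MPU.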
+
+    std::unordered_map<Addr, Tick> vertexReadTime;
+
+    EventFunctionWrapper nextReadEvent;
+    void processNextReadEvent();
+
+    EventFunctionWrapper nextReduceEvent;
+    void processNextReduceEvent();
+
+    EventFunctionWrapper nextWriteEvent;
+    void processNextWriteEvent();
+
+    EventFunctionWrapper nextDoneSignalEvent;
+    void processNextDoneSignalEvent();
+
+    struct WorkListStats : public statistics::Group
+    {
+        WorkListStats(WLEngine& worklist);
+
+        void regStats() override;
+
+        WLEngine &wl;
+        statistics::Scalar updateQueueCoalescions;
+        statistics::Scalar registerShortage;
+        statistics::Scalar numUpdateRolls;
+        statistics::Scalar numReadPortShortage;
+        statistics::Scalar registerFileCoalescions;
+        statistics::Scalar numReductions;
+        statistics::Scalar numReducerShortage;
+        statistics::Scalar numWritePortShortage;
+        statistics::Scalar numIncomingUpdates;
+
+        statistics::Histogram vertexReadLatency;
+        statistics::Histogram updateQueueLatency;
+    };
+
+    WorkListStats stats;
+
+  public:
+    PARAMS(WLEngine);
+    WLEngine(const Params& params);
+    Port& getPort(const std::string& if_name,
+                  PortID idx = InvalidPortID) override;
+    virtual void init() override;
+    void registerMPU(MPU* mpu);
+
+    AddrRangeList getAddrRanges();
+    void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; }
+    void recvFunctional(PacketPtr pkt);
+
+    bool handleIncomingUpdate(PacketPtr pkt);
+    void handleIncomingWL(Addr addr, WorkListItem wl);
+
+    void checkRetryReq();
+
+    bool done();
+};
+
+} // namespace gem5
+
+#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__
diff --git a/src/accl/graph/sega/work_directory.hh b/src/accl/graph/sega/work_directory.hh
new file mode 100644
index 0000000000..620e97f654
--- /dev/null
+++ b/src/accl/graph/sega/work_directory.hh
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2020 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__
+#define __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__
+
+#include <cassert>
+
+#include "accl/graph/base/data_structs.hh"
+#include "base/addr_range.hh"
+#include "base/types.hh"
+
+namespace gem5
+{
+
+class WorkDirectory
+{
+  public:
+    virtual int activate(Addr atom_addr) = 0;
+    virtual int deactivate(Addr atom_addr) = 0;
+    virtual Addr getNextWork() = 0;
+
+    virtual int workCount() = 0;
+    bool empty() { return workCount() == 0; }
+
+    virtual void setLastAtomAddr(Addr atom_addr) = 0;
+};
+
+class PopCountDirectory: public WorkDirectory
+{
+  private:
+    AddrRange memoryRange;
+
+    int numAtomsPerBlock;
+    int memoryAtomSize;
+    int blockSize;
+
+    uint32_t _workCount;
+
+    int numCounters;
+    int lastCounterIndex;
+    uint32_t* popCount;
+
+    int prevIndex;
+    uint32_t currentCounter;
+
+    UniqueFIFO<int> activeBlockIndices;
+
+    int getIndexFromAtomAddr(Addr atom_addr)
+    {
+        assert((atom_addr % memoryAtomSize) == 0);
+        Addr trimmed_addr = memoryRange.removeIntlvBits(atom_addr);
+        int index = (int) (trimmed_addr / blockSize);
+        return index;
+    }
+
+    Addr getAtomAddrFromIndex(int block_index, int atom_index)
+    {
+        Addr block_addr = block_index * blockSize;
+        Addr trimmed_addr = block_addr + atom_index * memoryAtomSize;
+        return memoryRange.addIntlvBits(trimmed_addr);
+    }
+
+  public:
+    PopCountDirectory(AddrRange mem_range, int atoms_per_block,
+                      int atom_size):
+        WorkDirectory(),
+        memoryRange(mem_range), numAtomsPerBlock(atoms_per_block),
+        memoryAtomSize(atom_size), _workCount(0),
+        prevIndex(-1), currentCounter(0)
+    {
+        blockSize = numAtomsPerBlock * memoryAtomSize;
+        numCounters = (int) (memoryRange.size() / blockSize);
+        lastCounterIndex = numCounters - 1;
+        popCount = new uint32_t [numCounters];
+        for (int index = 0; index < numCounters; index++) {
+            popCount[index] = 0;
+        }
+        activeBlockIndices = UniqueFIFO<int>(numCounters);
+    }
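+
+    // Worked example (illustrative values): with memoryAtomSize = 64 and
+    // numAtomsPerBlock = 8, blockSize is 512, so block 0 covers the
+    // de-interleaved atoms at 0, 64, ..., 448. activate(0x40) bumps
+    // popCount[0] to 1 and queues block 0; each getNextWork() call then
+    // returns the next atom of the front block, and after numAtomsPerBlock
+    // consecutive reads the block rotates to the back of the FIFO so other
+    // blocks make progress.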
+
+    // CAUTION: This should only be called when the work
+    // directory **is not** tracking the atom with atom_addr
+    virtual int activate(Addr atom_addr)
+    {
+        int index = getIndexFromAtomAddr(atom_addr);
+        uint32_t prev_count = popCount[index];
+        popCount[index]++;
+        _workCount++;
+        activeBlockIndices.push_back(index);
+        assert(popCount[index] > prev_count);
+        assert(popCount[index] <= numAtomsPerBlock);
+        return popCount[index];
+    }
+
+    // CAUTION: This should only be called when the work
+    // directory **is** tracking the atom with atom_addr
+    virtual int deactivate(Addr atom_addr)
+    {
+        int index = getIndexFromAtomAddr(atom_addr);
+        uint32_t prev_count = popCount[index];
+        popCount[index]--;
+        _workCount--;
+        if (popCount[index] == 0) {
+            activeBlockIndices.erase(index);
+        }
+        assert(popCount[index] < prev_count);
+        assert(popCount[index] <= numAtomsPerBlock);
+        return popCount[index];
+    }
+
+    virtual int workCount() { return _workCount; }
+
+    void setLastAtomAddr(Addr atom_addr)
+    {
+        lastCounterIndex = getIndexFromAtomAddr(atom_addr);
+    }
+
+    // CAUTION: This directory only tracks active vertices in the memory
+    // and it does not have any information on the state of the cache
+    // and/or the active buffer or the write buffer. Therefore, it might
+    // generate a read request to an address that might be in any of those.
+    // In that case, the generated address should be ignored.
+    virtual Addr getNextWork()
+    {
+        // Why ask directory if it's empty?
+        assert(!activeBlockIndices.empty());
+        int front_index = activeBlockIndices.front();
+        assert(popCount[front_index] > 0);
+        if ((prevIndex != -1) && (prevIndex != front_index)) {
+            currentCounter = 0;
+        }
+        if (currentCounter == numAtomsPerBlock) {
+            currentCounter = 0;
+            activeBlockIndices.pop_front();
+            activeBlockIndices.push_back(front_index);
+        }
+        int current_index = activeBlockIndices.front();
+        Addr ret_addr = getAtomAddrFromIndex(current_index, currentCounter);
+        prevIndex = current_index;
+        currentCounter++;
+        return ret_addr;
+    }
+};
+
+} // namespace gem5
+
+#endif // __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__
diff --git a/src/base/addr_range.hh b/src/base/addr_range.hh
index 07bd255d26..3c5c150b29 100644
--- a/src/base/addr_range.hh
+++ b/src/base/addr_range.hh
@@ -48,6 +48,7 @@
 
 #include "base/bitfield.hh"
 #include "base/cprintf.hh"
+#include "base/intmath.hh"
 #include "base/logging.hh"
 #include "base/types.hh"
 
@@ -732,6 +733,40 @@ class AddrRange
     {
         return !(*this == r);
     }
+
+    friend AddrRange
+    mergePseudoChannelRanges(AddrRange left, AddrRange right, int pch_bit)
+    {
+        assert(left.interleaved());
+        assert(right.interleaved());
+        assert(left.mergesWith(right));
+
+        uint8_t old_left_match = left.intlvMatch;
+        uint8_t new_left_match = 0;
+        uint8_t old_right_match = right.intlvMatch;
+        uint8_t new_right_match = 0;
+        int new_bits = left.masks.size() - 1;
+
+        // assumption: masks is sorted in ascending order
+        std::vector<Addr> new_masks;
+        for (auto mask: left.masks) {
+            uint64_t lsb_mask = (mask ^ (mask - 1)) + 1;
+            if ((lsb_mask >> 1) != (1ULL << pch_bit)) {
+                new_masks.push_back(mask);
+                new_left_match |= ((old_left_match & 1) << new_bits);
+                new_left_match >>= 1;
+                new_right_match |= ((old_right_match & 1) << new_bits);
+                new_right_match >>= 1;
+            }
+            old_left_match >>= 1;
+            old_right_match >>= 1;
+        }
+        panic_if(new_left_match != new_right_match,
+                 "The two ranges can not be a pseudo channel pair "
+                 "given the pseudochannel bit position of params.pch_bit.");
+
+        return AddrRange(left._start, left._end, new_masks, new_left_match);
+    }
 };
 
 static inline AddrRangeList
@@ -817,6 +852,16 @@ RangeSize(Addr start, Addr size)
     return AddrRange(start, start + size);
 }
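+
+// Convenience helper: true if any range in the list contains addr.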
+inline bool
+contains(const AddrRangeList &range_list, Addr addr)
+{
+    for (const auto &range: range_list) {
+        if (range.contains(addr)) {
+            return true;
+        }
+    }
+    return false;
+}
+
 } // namespace gem5
 
 #endif // __BASE_ADDR_RANGE_HH__
diff --git a/src/base/statistics.hh b/src/base/statistics.hh
index 24cbf714f5..15aeff892e 100644
--- a/src/base/statistics.hh
+++ b/src/base/statistics.hh
@@ -1052,7 +1052,7 @@ class VectorBase : public DataWrapVec
     Proxy
     operator[](off_type index)
     {
-        assert (index < size());
+        // assert (index < size());
         return Proxy(this->self(), index);
     }
 };
diff --git a/src/mem/HBMCtrl.py b/src/mem/HBMCtrl.py
index 0c7c1ea919..f32ffe6f0a 100644
--- a/src/mem/HBMCtrl.py
+++ b/src/mem/HBMCtrl.py
@@ -42,9 +42,9 @@ class HBMCtrl(MemCtrl):
     # HBMCtrl has been tested with two HBM_2000_4H_1x64 interfaces
     dram_2 = Param.DRAMInterface("DRAM memory interface")
 
+    pch_bit = Param.Int("Position of PseudoChannel bit in addresses.")
+
     # For mixed traffic, HBMCtrl with HBM_2000_4H_1x64 interfaaces
     # gives the best results with following min_r/w_per_switch
     min_reads_per_switch = 64
     min_writes_per_switch = 64
-
-    partitioned_q = Param.Bool(False, "split queues for pseudo channels")
diff --git a/src/mem/dram_interface.cc b/src/mem/dram_interface.cc
index d745fe5a29..d8c6da0a2d 100644
--- a/src/mem/dram_interface.cc
+++ b/src/mem/dram_interface.cc
@@ -1068,13 +1068,14 @@ DRAMInterface::minBankPrep(const MemPacketQueue& queue,
 
             // latest Tick for which ACT can occur without
             // incurring additoinal delay on the data bus
-            const Tick tRCD = ctrl->inReadBusState(false) ?
-                              tRCD_RD : tRCD_WR;
+            const Tick tRCD = ctrl->inReadBusState(false,
+                              (MemInterface*)(this)) ? tRCD_RD : tRCD_WR;
 
             const Tick hidden_act_max =
                         std::max(min_col_at - tRCD, curTick());
 
             // When is the earliest the R/W burst can issue?
-            const Tick col_allowed_at = ctrl->inReadBusState(false) ?
+            const Tick col_allowed_at = ctrl->inReadBusState(false,
+                              (MemInterface*)(this)) ? 
diff --git a/src/mem/dram_interface.cc b/src/mem/dram_interface.cc index d745fe5a29..d8c6da0a2d 100644 --- a/src/mem/dram_interface.cc +++ b/src/mem/dram_interface.cc @@ -1068,13 +1068,14 @@ DRAMInterface::minBankPrep(const MemPacketQueue& queue, // latest Tick for which ACT can occur without // incurring additional delay on the data bus - const Tick tRCD = ctrl->inReadBusState(false) ? - tRCD_RD : tRCD_WR; + const Tick tRCD = ctrl->inReadBusState(false, + (MemInterface*)(this)) ? tRCD_RD : tRCD_WR; const Tick hidden_act_max = std::max(min_col_at - tRCD, curTick()); // When is the earliest the R/W burst can issue? - const Tick col_allowed_at = ctrl->inReadBusState(false) ? + const Tick col_allowed_at = ctrl->inReadBusState(false, + (MemInterface*)(this)) ? ranks[i]->banks[j].rdAllowedAt : ranks[i]->banks[j].wrAllowedAt; Tick col_at = std::max(col_allowed_at, act_at + tRCD); @@ -1180,10 +1181,10 @@ bool DRAMInterface::Rank::isQueueEmpty() const { // check commands in Q based on current bus direction - bool no_queued_cmds = (dram.ctrl->inReadBusState(true) && - (readEntries == 0)) - || (dram.ctrl->inWriteBusState(true) && - (writeEntries == 0)); + bool no_queued_cmds = (dram.ctrl->inReadBusState(true, + (MemInterface*)(this)) && (readEntries == 0)) || + (dram.ctrl->inWriteBusState(true, + (MemInterface*)(this)) && (writeEntries == 0)); return no_queued_cmds; } @@ -1669,7 +1670,7 @@ DRAMInterface::Rank::processPowerEvent() // completed refresh event, ensure next request is scheduled if (!(dram.ctrl->requestEventScheduled(dram.pseudoChannel))) { DPRINTF(DRAM, "Scheduling next request after refreshing" - " rank %d\n", rank); + " rank %d, PC %d\n", rank, dram.pseudoChannel); dram.ctrl->restartScheduler(curTick(), dram.pseudoChannel); } } @@ -1831,7 +1832,8 @@ DRAMInterface::Rank::resetStats() { bool DRAMInterface::Rank::forceSelfRefreshExit() const { return (readEntries != 0) || - (dram.ctrl->inWriteBusState(true) && (writeEntries != 0)); + (dram.ctrl->inWriteBusState(true, (MemInterface*)(this)) + && (writeEntries != 0)); } void diff --git a/src/mem/hbm_ctrl.cc b/src/mem/hbm_ctrl.cc index 99618c4b5f..e0d0922333 100644 --- a/src/mem/hbm_ctrl.cc +++ b/src/mem/hbm_ctrl.cc @@ -45,14 +45,14 @@ namespace memory HBMCtrl::HBMCtrl(const HBMCtrlParams &p) : MemCtrl(p), + pchBit(p.pch_bit), retryRdReqPC1(false), retryWrReqPC1(false), nextReqEventPC1([this] {processNextReqEvent(pc1Int, respQueuePC1, respondEventPC1, nextReqEventPC1, retryWrReqPC1);}, name()), respondEventPC1([this] {processRespondEvent(pc1Int, respQueuePC1, respondEventPC1, retryRdReqPC1); }, name()), - pc1Int(p.dram_2), - partitionedQ(p.partitioned_q) + pc1Int(p.dram_2) { DPRINTF(MemCtrl, "Setting up HBM controller\n"); @@ -69,17 +69,8 @@ HBMCtrl::HBMCtrl(const HBMCtrlParams &p) : pc0Int->setCtrl(this, commandWindow, 0); pc1Int->setCtrl(this, commandWindow, 1); - if (partitionedQ) { - writeHighThreshold = (writeBufferSize * (p.write_high_thresh_perc/2) - / 100.0); - writeLowThreshold = (writeBufferSize * (p.write_low_thresh_perc/2) - / 100.0); - } else { - writeHighThreshold = (writeBufferSize * p.write_high_thresh_perc - / 100.0); - writeLowThreshold = (writeBufferSize * p.write_low_thresh_perc - / 100.0); - } + writeHighThreshold = (writeBufferSize/2 * p.write_high_thresh_perc)/100.0; + writeLowThreshold = (writeBufferSize/2 * p.write_low_thresh_perc)/100.0; } void @@ -155,9 +146,9 @@ HBMCtrl::writeQueueFullPC0(unsigned int neededEntries) const { DPRINTF(MemCtrl, "Write queue limit %d, PC0 size %d, entries needed %d\n", - writeBufferSize, writeQueueSizePC0, neededEntries); + writeBufferSize/2, pc0Int->writeQueueSize, neededEntries); - unsigned int wrsize_new = (writeQueueSizePC0 + neededEntries); + unsigned int wrsize_new = (pc0Int->writeQueueSize + neededEntries); return wrsize_new > (writeBufferSize/2); } @@ -166,9 +157,9 @@ HBMCtrl::writeQueueFullPC1(unsigned int neededEntries) const { DPRINTF(MemCtrl, "Write queue limit %d, PC1 size %d, entries needed %d\n", - writeBufferSize, writeQueueSizePC1, neededEntries); + writeBufferSize/2, pc1Int->writeQueueSize, neededEntries); - unsigned int wrsize_new = (writeQueueSizePC1 + neededEntries); + unsigned int wrsize_new = (pc1Int->writeQueueSize + neededEntries); return wrsize_new > (writeBufferSize/2); }
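The writeQueueFullPC0/PC1 checks above (and the read-side checks that follow) all implement the same halved-capacity rule, since each pseudo channel now owns half of the controller's buffers. A minimal sketch of that rule (names are illustrative, not gem5's):

    #include <cassert>

    // Each pseudo channel owns half of the controller's write buffer;
    // a request is refused when its bursts would overflow that half.
    bool pcWriteQueueFull(unsigned pc_queue_size, unsigned needed_entries,
                          unsigned write_buffer_size)
    {
        return pc_queue_size + needed_entries > write_buffer_size / 2;
    }

    int main()
    {
        // With a 64-entry buffer, each pseudo channel may hold 32 entries.
        assert(!pcWriteQueueFull(30, 2, 64));
        assert(pcWriteQueueFull(31, 2, 64));
        return 0;
    }

The read-side checks follow the same pattern, but also count the per-channel response queue against the half-sized limit.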
@@ -177,10 +168,10 @@ HBMCtrl::readQueueFullPC0(unsigned int neededEntries) const { DPRINTF(MemCtrl, "Read queue limit %d, PC0 size %d, entries needed %d\n", - readBufferSize, readQueueSizePC0 + respQueue.size(), + readBufferSize/2, pc0Int->readQueueSize + respQueue.size(), neededEntries); - unsigned int rdsize_new = readQueueSizePC0 + respQueue.size() + unsigned int rdsize_new = pc0Int->readQueueSize + respQueue.size() + neededEntries; return rdsize_new > (readBufferSize/2); } @@ -190,26 +181,14 @@ HBMCtrl::readQueueFullPC1(unsigned int neededEntries) const { DPRINTF(MemCtrl, "Read queue limit %d, PC1 size %d, entries needed %d\n", - readBufferSize, readQueueSizePC1 + respQueuePC1.size(), + readBufferSize/2, pc1Int->readQueueSize + respQueuePC1.size(), neededEntries); - unsigned int rdsize_new = readQueueSizePC1 + respQueuePC1.size() + unsigned int rdsize_new = pc1Int->readQueueSize + respQueuePC1.size() + neededEntries; return rdsize_new > (readBufferSize/2); } -bool -HBMCtrl::readQueueFull(unsigned int neededEntries) const -{ - DPRINTF(MemCtrl, - "HBMCtrl: Read queue limit %d, entries needed %d\n", - readBufferSize, neededEntries); - - unsigned int rdsize_new = totalReadQueueSize + respQueue.size() + - respQueuePC1.size() + neededEntries; - return rdsize_new > readBufferSize; -} - bool HBMCtrl::recvTimingReq(PacketPtr pkt) { @@ -233,7 +212,7 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) bool is_pc0; - // TODO: make the interleaving bit across pseudo channels a parameter - if (bits(pkt->getAddr(), 6) == 0) { + // The interleaving bit across pseudo channels is the pch_bit parameter. + if (bits(pkt->getAddr(), pchBit) == 0) { is_pc0 = true; } else { is_pc0 = false; @@ -254,9 +233,7 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) // check local buffers and do not accept if full if (pkt->isWrite()) { if (is_pc0) { - if (partitionedQ ? writeQueueFullPC0(pkt_count) : - writeQueueFull(pkt_count)) - { + if (writeQueueFullPC0(pkt_count)) { DPRINTF(MemCtrl, "Write queue full, not accepting\n"); // remember that we have to retry this port MemCtrl::retryWrReq = true; @@ -264,13 +241,15 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) return false; } else { addToWriteQueue(pkt, pkt_count, pc0Int); + if (!nextReqEvent.scheduled()) { + DPRINTF(MemCtrl, "Request scheduled immediately\n"); + schedule(nextReqEvent, curTick()); + } stats.writeReqs++; stats.bytesWrittenSys += size; } } else { - if (partitionedQ ? writeQueueFullPC1(pkt_count) : - writeQueueFull(pkt_count)) - { + if (writeQueueFullPC1(pkt_count)) { DPRINTF(MemCtrl, "Write queue full, not accepting\n"); // remember that we have to retry this port retryWrReqPC1 = true; @@ -278,6 +257,10 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) return false; } else { addToWriteQueue(pkt, pkt_count, pc1Int); + if (!nextReqEventPC1.scheduled()) { + DPRINTF(MemCtrl, "Request scheduled immediately\n"); + schedule(nextReqEventPC1, curTick()); + } stats.writeReqs++; stats.bytesWrittenSys += size; } @@ -288,11 +271,10 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) assert(size != 0); if (is_pc0) { - if (partitionedQ ? readQueueFullPC0(pkt_count) : - HBMCtrl::readQueueFull(pkt_count)) { + if (readQueueFullPC0(pkt_count)) { DPRINTF(MemCtrl, "Read queue full, not accepting\n"); // remember that we have to retry this port - retryRdReqPC1 = true; + MemCtrl::retryRdReq = true; stats.numRdRetry++; return false; } else { @@ -307,8 +289,7 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) stats.bytesReadSys += size; } } else { - if (partitionedQ ?
readQueueFullPC1(pkt_count) : - HBMCtrl::readQueueFull(pkt_count)) { + if (readQueueFullPC1(pkt_count)) { DPRINTF(MemCtrl, "Read queue full, not accepting\n"); // remember that we have to retry this port retryRdReqPC1 = true; @@ -492,8 +473,11 @@ AddrRangeList HBMCtrl::getAddrRanges() { AddrRangeList ranges; - ranges.push_back(pc0Int->getAddrRange()); - ranges.push_back(pc1Int->getAddrRange()); + AddrRange pc0Int_range = pc0Int->getAddrRange(); + AddrRange pc1Int_range = pc1Int->getAddrRange(); + ranges.push_back( + mergePseudoChannelRanges(pc0Int_range, pc1Int_range, pchBit) + ); return ranges; } diff --git a/src/mem/hbm_ctrl.hh b/src/mem/hbm_ctrl.hh index c9045f0ae7..58cbd57c3b 100644 --- a/src/mem/hbm_ctrl.hh +++ b/src/mem/hbm_ctrl.hh @@ -72,7 +72,8 @@ class HBMCtrl : public MemCtrl } private: - + // Position of the pseudochannel bit in addresses. + int pchBit; /** * Remember if we have to retry a request for second pseudo channel. */ @@ -144,7 +145,6 @@ class HBMCtrl : public MemCtrl */ bool readQueueFullPC0(unsigned int pkt_count) const; bool readQueueFullPC1(unsigned int pkt_count) const; - bool readQueueFull(unsigned int pkt_count) const; /** * Check if the write queue partition of both pseudo diff --git a/src/mem/mem_ctrl.cc b/src/mem/mem_ctrl.cc index c65d68a5a7..731ce7be39 100644 --- a/src/mem/mem_ctrl.cc +++ b/src/mem/mem_ctrl.cc @@ -72,7 +72,6 @@ MemCtrl::MemCtrl(const MemCtrlParams &p) : writeLowThreshold(writeBufferSize * p.write_low_thresh_perc / 100.0), minWritesPerSwitch(p.min_writes_per_switch), minReadsPerSwitch(p.min_reads_per_switch), - writesThisTime(0), readsThisTime(0), memSchedPolicy(p.mem_sched_policy), frontendLatency(p.static_frontend_latency), backendLatency(p.static_backend_latency), @@ -212,7 +211,7 @@ MemCtrl::addToReadQueue(PacketPtr pkt, for (int cnt = 0; cnt < pkt_count; ++cnt) { unsigned size = std::min((addr | (burst_size - 1)) + 1, base_addr + pkt->getSize()) - addr; - stats.readPktSize[ceilLog2(size)]++; + // stats.readPktSize[ceilLog2(size)]++; stats.readBursts++; stats.requestorReadAccesses[pkt->requestorId()]++; @@ -277,6 +276,8 @@ MemCtrl::addToReadQueue(PacketPtr pkt, logRequest(MemCtrl::READ, pkt->requestorId(), pkt->qosValue(), mem_pkt->addr, 1); + mem_intr->readQueueSize++; + // Update stats stats.avgRdQLen = totalReadQueueSize + respQueue.size(); } @@ -349,6 +350,8 @@ MemCtrl::addToWriteQueue(PacketPtr pkt, unsigned int pkt_count, logRequest(MemCtrl::WRITE, pkt->requestorId(), pkt->qosValue(), mem_pkt->addr, 1); + mem_intr->writeQueueSize++; + assert(totalWriteQueueSize == isInWriteQueue.size()); // Update stats @@ -575,6 +578,9 @@ MemCtrl::chooseNext(MemPacketQueue& queue, Tick extra_col_delay, // check if there is a packet going to a free rank for (auto i = queue.begin(); i != queue.end(); ++i) { MemPacket* mem_pkt = *i; + if (mem_pkt->pseudoChannel != mem_intr->pseudoChannel) { + continue; + } if (packetReady(mem_pkt, mem_intr)) { ret = i; break; @@ -761,28 +767,28 @@ MemCtrl::verifyMultiCmd(Tick cmd_tick, Tick max_cmds_per_burst, } bool -MemCtrl::inReadBusState(bool next_state) const +MemCtrl::inReadBusState(bool next_state, MemInterface* mem_intr) const { // check the bus state if (next_state) { // use busStateNext to get the state that will be used // for the next burst - return (busStateNext == MemCtrl::READ); + return (mem_intr->busStateNext == MemCtrl::READ); } else { - return (busState == MemCtrl::READ); + return (mem_intr->busState == MemCtrl::READ); } } bool -MemCtrl::inWriteBusState(bool next_state) const 
+MemCtrl::inWriteBusState(bool next_state, MemInterface* mem_intr) const { // check the bus state if (next_state) { // use busStateNext to get the state that will be used // for the next burst - return (busStateNext == MemCtrl::WRITE); + return (mem_intr->busStateNext == MemCtrl::WRITE); } else { - return (busState == MemCtrl::WRITE); + return (mem_intr->busState == MemCtrl::WRITE); } } @@ -813,13 +819,13 @@ MemCtrl::doBurstAccess(MemPacket* mem_pkt, MemInterface* mem_intr) // Update the common bus stats if (mem_pkt->isRead()) { - ++readsThisTime; + ++(mem_intr->readsThisTime); // Update latency stats stats.requestorReadTotalLat[mem_pkt->requestorId()] += mem_pkt->readyTime - mem_pkt->entryTime; stats.requestorReadBytes[mem_pkt->requestorId()] += mem_pkt->size; } else { - ++writesThisTime; + ++(mem_intr->writesThisTime); stats.requestorWriteBytes[mem_pkt->requestorId()] += mem_pkt->size; stats.requestorWriteTotalLat[mem_pkt->requestorId()] += mem_pkt->readyTime - mem_pkt->entryTime; @@ -836,8 +842,8 @@ MemCtrl::memBusy(MemInterface* mem_intr) { // Default to busy status and update based on interface specifics // Default state of unused interface is 'true' bool mem_busy = true; - bool all_writes_nvm = mem_intr->numWritesQueued == totalWriteQueueSize; - bool read_queue_empty = totalReadQueueSize == 0; + bool all_writes_nvm = mem_intr->numWritesQueued == mem_intr->writeQueueSize; + bool read_queue_empty = mem_intr->readQueueSize == 0; mem_busy = mem_intr->isBusy(read_queue_empty, all_writes_nvm); if (mem_busy) { // if all ranks are refreshing wait for them to finish @@ -884,32 +890,32 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, } // detect bus state change - bool switched_cmd_type = (busState != busStateNext); + bool switched_cmd_type = (mem_intr->busState != mem_intr->busStateNext); // record stats recordTurnaroundStats(); DPRINTF(MemCtrl, "QoS Turnarounds selected state %s %s\n", - (busState==MemCtrl::READ)?"READ":"WRITE", + (mem_intr->busState==MemCtrl::READ)?"READ":"WRITE", switched_cmd_type?"[turnaround triggered]":""); if (switched_cmd_type) { - if (busState == MemCtrl::READ) { + if (mem_intr->busState == MemCtrl::READ) { DPRINTF(MemCtrl, "Switching to writes after %d reads with %d reads " - "waiting\n", readsThisTime, totalReadQueueSize); - stats.rdPerTurnAround.sample(readsThisTime); - readsThisTime = 0; + "waiting\n", mem_intr->readsThisTime, mem_intr->readQueueSize); + stats.rdPerTurnAround.sample(mem_intr->readsThisTime); + mem_intr->readsThisTime = 0; } else { DPRINTF(MemCtrl, "Switching to reads after %d writes with %d writes " - "waiting\n", writesThisTime, totalWriteQueueSize); - stats.wrPerTurnAround.sample(writesThisTime); - writesThisTime = 0; + "waiting\n", mem_intr->writesThisTime, mem_intr->writeQueueSize); + stats.wrPerTurnAround.sample(mem_intr->writesThisTime); + mem_intr->writesThisTime = 0; } } // updates current state - busState = busStateNext; + mem_intr->busState = mem_intr->busStateNext; nonDetermReads(mem_intr); @@ -918,18 +924,18 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, } // when we get here it is either a read or a write - if (busState == READ) { + if (mem_intr->busState == READ) { // track if we should switch or not bool switch_to_writes = false; - if (totalReadQueueSize == 0) { + if (mem_intr->readQueueSize == 0) { // In the case there is no read request to go next, // trigger writes if we have passed the low threshold (or // if we are draining) - if (!(totalWriteQueueSize == 0) && + if (!(mem_intr->writeQueueSize == 0) && 
(drainState() == DrainState::Draining || - totalWriteQueueSize > writeLowThreshold)) { + mem_intr->writeQueueSize > writeLowThreshold)) { DPRINTF(MemCtrl, "Switching to writes due to read queue empty\n"); @@ -1004,6 +1010,7 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, mem_pkt->qosValue(), mem_pkt->getAddr(), 1, mem_pkt->readyTime - mem_pkt->entryTime); + mem_intr->readQueueSize--; // Insert into response queue. It will be sent back to the // requestor at its readyTime @@ -1022,8 +1029,8 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, // there are no other writes that can issue // Also ensure that we've issued a minimum defined number // of reads before switching, or have emptied the readQ - if ((totalWriteQueueSize > writeHighThreshold) && - (readsThisTime >= minReadsPerSwitch || totalReadQueueSize == 0) + if ((mem_intr->writeQueueSize > writeHighThreshold) && + (mem_intr->readsThisTime >= minReadsPerSwitch || mem_intr->readQueueSize == 0) && !(nvmWriteBlock(mem_intr))) { switch_to_writes = true; } @@ -1038,7 +1045,7 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, // draining), or because the writes hit the high threshold if (switch_to_writes) { // transition to writing - busStateNext = WRITE; + mem_intr->busStateNext = WRITE; } } else { @@ -1092,6 +1099,7 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, mem_pkt->qosValue(), mem_pkt->getAddr(), 1, mem_pkt->readyTime - mem_pkt->entryTime); + mem_intr->writeQueueSize--; // remove the request from the queue - the iterator is no longer valid writeQueue[mem_pkt->qosValue()].erase(to_write); @@ -1105,15 +1113,15 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, // If we are interfacing to NVM and have filled the writeRespQueue, // with only NVM writes in Q, then switch to reads bool below_threshold = - totalWriteQueueSize + minWritesPerSwitch < writeLowThreshold; + mem_intr->writeQueueSize + minWritesPerSwitch < writeLowThreshold; - if (totalWriteQueueSize == 0 || + if (mem_intr->writeQueueSize == 0 || (below_threshold && drainState() != DrainState::Draining) || - (totalReadQueueSize && writesThisTime >= minWritesPerSwitch) || - (totalReadQueueSize && (nvmWriteBlock(mem_intr)))) { + (mem_intr->readQueueSize && mem_intr->writesThisTime >= minWritesPerSwitch) || + (mem_intr->readQueueSize && (nvmWriteBlock(mem_intr)))) { // turn the bus back around for reads again - busStateNext = MemCtrl::READ; + mem_intr->busStateNext = MemCtrl::READ; // note that we switch back to reads also in the idle // case, which eventually will check for any draining and @@ -1126,7 +1134,7 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, if (!next_req_event.scheduled()) schedule(next_req_event, std::max(mem_intr->nextReqTime, curTick())); - if (retry_wr_req && totalWriteQueueSize < writeBufferSize) { + if (retry_wr_req && mem_intr->writeQueueSize < writeBufferSize) { retry_wr_req = false; port.sendRetryReq(); } @@ -1400,7 +1408,7 @@ MemCtrl::drain() { // if there is anything in any of our internal queues, keep track // of that as well - if (!(!totalWriteQueueSize && !totalReadQueueSize && respQueue.empty() && + if (!(!totalWriteQueueSize && !totalReadQueueSize && respQEmpty() && allIntfDrained())) { DPRINTF(Drain, "Memory controller not drained, write: %d, read: %d," diff --git a/src/mem/mem_ctrl.hh b/src/mem/mem_ctrl.hh index fe5d478280..fffd05405e 100644 --- a/src/mem/mem_ctrl.hh +++ b/src/mem/mem_ctrl.hh @@ -515,8 +515,6 @@ class MemCtrl : public qos::MemCtrl uint32_t writeLowThreshold; const uint32_t
minWritesPerSwitch; const uint32_t minReadsPerSwitch; - uint32_t writesThisTime; - uint32_t readsThisTime; /** * Memory controller configuration initialized based on parameter @@ -762,7 +760,7 @@ class MemCtrl : public qos::MemCtrl * @param next_state Check either the current or next bus state * @return True when bus is currently in a read state */ - bool inReadBusState(bool next_state) const; + bool inReadBusState(bool next_state, MemInterface* mem_intr) const; /** * Check the current direction of the memory channel @@ -770,7 +768,7 @@ * @param next_state Check either the current or next bus state * @return True when bus is currently in a write state */ - bool inWriteBusState(bool next_state) const; + bool inWriteBusState(bool next_state, MemInterface* mem_intr) const; Port &getPort(const std::string &if_name, PortID idx=InvalidPortID) override; diff --git a/src/mem/mem_interface.hh b/src/mem/mem_interface.hh index 8d6f4fe52b..b0f762fc80 100644 --- a/src/mem/mem_interface.hh +++ b/src/mem/mem_interface.hh @@ -189,6 +189,28 @@ class MemInterface : public AbstractMemory Tick nextBurstAt = 0; Tick nextReqTime = 0; + /** + * Reads/writes performed by the controller for this interface before + * bus direction is switched + */ + uint32_t readsThisTime = 0; + uint32_t writesThisTime = 0; + + /** + * Read/write packets in the read/write queue for this interface. + * qos/mem_ctrl.hh has similar counters, but they track all packets + * in the controller for all memory interfaces connected to the + * controller. + */ + uint32_t readQueueSize = 0; + uint32_t writeQueueSize = 0; + + /** Current bus state of this interface */ + MemCtrl::BusState busState = MemCtrl::READ; + + /** Bus state to be used for the next request event on this interface */ + MemCtrl::BusState busStateNext = MemCtrl::READ; + /** * pseudo channel number used for HBM modeling */ diff --git a/src/mem/nvm_interface.cc b/src/mem/nvm_interface.cc index b2c4073cd9..e77cf59202 100644 --- a/src/mem/nvm_interface.cc +++ b/src/mem/nvm_interface.cc @@ -402,9 +402,11 @@ NVMInterface::processReadReadyEvent() bool NVMInterface::burstReady(MemPacket* pkt) const { - bool read_rdy = pkt->isRead() && (ctrl->inReadBusState(true)) && - (pkt->readyTime <= curTick()) && (numReadDataReady > 0); - bool write_rdy = !pkt->isRead() && !ctrl->inReadBusState(true) && + bool read_rdy = pkt->isRead() && (ctrl->inReadBusState(true, + (MemInterface*)(this))) && + (pkt->readyTime <= curTick()) && (numReadDataReady > 0); + bool write_rdy = !pkt->isRead() && !ctrl->inReadBusState(true, + (MemInterface*)(this)) && !writeRespQueueFull(); return (read_rdy || write_rdy); } @@ -613,7 +615,7 @@ NVMInterface::isBusy(bool read_queue_empty, bool all_writes_nvm) // Only assert busy for the write case when there are also // no reads in Q and the write queue only contains NVM commands // This allows the bus state to switch and service reads - return (ctrl->inReadBusState(true) ? + return (ctrl->inReadBusState(true, (MemInterface*)(this)) ? (numReadDataReady == 0) && !read_queue_empty : writeRespQueueFull() && read_queue_empty && all_writes_nvm);
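The recurring inReadBusState(true, (MemInterface*)(this)) pattern above reflects the central refactor of this patch: bus direction is now tracked per memory interface rather than once per controller, so the two pseudo channels can point in different directions at the same time. A reduced model of that relationship (types simplified; these are not the actual gem5 classes):

    #include <cassert>

    // Reduced model of the refactor: each interface carries its own bus
    // direction, and the controller consults whichever interface asks.
    enum class BusState { READ, WRITE };

    struct MemInterfaceModel
    {
        BusState busState = BusState::READ;
        BusState busStateNext = BusState::READ;
    };

    struct MemCtrlModel
    {
        bool
        inReadBusState(bool next_state, const MemInterfaceModel *intr) const
        {
            return (next_state ? intr->busStateNext : intr->busState)
                == BusState::READ;
        }
    };

    int main()
    {
        MemCtrlModel ctrl;
        MemInterfaceModel pc0, pc1;
        pc1.busStateNext = BusState::WRITE;  // pseudo channels can diverge
        assert(ctrl.inReadBusState(true, &pc0));
        assert(!ctrl.inReadBusState(true, &pc1));
        return 0;
    }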
diff --git a/src/mem/packet.cc b/src/mem/packet.cc index 31dc330cab..daf9d18e88 100644 --- a/src/mem/packet.cc +++ b/src/mem/packet.cc @@ -237,6 +237,7 @@ MemCmd::commandInfo[] = { {IsRead, IsResponse}, InvalidCmd, "HTMReqResp" }, { {IsRead, IsRequest}, InvalidCmd, "HTMAbort" }, { {IsRequest}, InvalidCmd, "TlbiExtSync" }, + { {IsRequest, HasData}, InvalidCmd, "UpdateWL" } }; AddrRange diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 9238dbec00..5332ee32a2 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -148,6 +148,8 @@ class MemCmd HTMAbort, // Tlb shootdown TlbiExtSync, + // MPU Accelerator + UpdateWL, NUM_MEM_CMDS }; diff --git a/src/mem/port_proxy.cc b/src/mem/port_proxy.cc index 19e1a53e84..55145ab7d7 100644 --- a/src/mem/port_proxy.cc +++ b/src/mem/port_proxy.cc @@ -56,7 +56,7 @@ PortProxy::PortProxy(const RequestPort &port, unsigned int cache_line_size) : void PortProxy::readBlobPhys(Addr addr, Request::Flags flags, - void *p, int size) const + void *p, Addr size) const { for (ChunkGenerator gen(addr, size, _cacheLineSize); !gen.done(); gen.next()) { @@ -73,7 +73,7 @@ PortProxy::readBlobPhys(Addr addr, Request::Flags flags, void PortProxy::writeBlobPhys(Addr addr, Request::Flags flags, - const void *p, int size) const + const void *p, Addr size) const { for (ChunkGenerator gen(addr, size, _cacheLineSize); !gen.done(); gen.next()) { @@ -90,7 +90,7 @@ PortProxy::writeBlobPhys(Addr addr, Request::Flags flags, void PortProxy::memsetBlobPhys(Addr addr, Request::Flags flags, - uint8_t v, int size) const + uint8_t v, Addr size) const { // quick and dirty... uint8_t *buf = new uint8_t[size]; diff --git a/src/mem/port_proxy.hh b/src/mem/port_proxy.hh index 29f6ba60a4..8cd21322ea 100644 --- a/src/mem/port_proxy.hh +++ b/src/mem/port_proxy.hh @@ -120,19 +120,19 @@ class PortProxy : FunctionalRequestProtocol * Read size bytes of memory at physical address and store in p. */ void readBlobPhys(Addr addr, Request::Flags flags, - void *p, int size) const; + void *p, Addr size) const; /** * Write size bytes from p to physical address. */ void writeBlobPhys(Addr addr, Request::Flags flags, - const void *p, int size) const; + const void *p, Addr size) const; /** * Fill size bytes starting at physical addr with byte value val. */ void memsetBlobPhys(Addr addr, Request::Flags flags, - uint8_t v, int size) const; + uint8_t v, Addr size) const; @@ -143,7 +143,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. */ virtual bool - tryReadBlob(Addr addr, void *p, int size) const + tryReadBlob(Addr addr, void *p, Addr size) const { readBlobPhys(addr, 0, p, size); return true; } @@ -154,7 +154,7 @@ * Returns true on success and false on failure. */ virtual bool - tryWriteBlob(Addr addr, const void *p, int size) const + tryWriteBlob(Addr addr, const void *p, Addr size) const { writeBlobPhys(addr, 0, p, size); return true; } @@ -165,7 +165,7 @@ * Returns true on success and false on failure. */ virtual bool - tryMemsetBlob(Addr addr, uint8_t val, int size) const + tryMemsetBlob(Addr addr, uint8_t val, Addr size) const { memsetBlobPhys(addr, 0, val, size); return true; } @@ -179,7 +179,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryReadBlob, but insists on success.
*/ void - readBlob(Addr addr, void *p, int size) const + readBlob(Addr addr, void *p, Addr size) const { if (!tryReadBlob(addr, p, size)) fatal("readBlob(%#x, ...) failed", addr); @@ -189,7 +189,7 @@ * Same as tryWriteBlob, but insists on success. */ void - writeBlob(Addr addr, const void *p, int size) const + writeBlob(Addr addr, const void *p, Addr size) const { if (!tryWriteBlob(addr, p, size)) fatal("writeBlob(%#x, ...) failed", addr); @@ -199,7 +199,7 @@ * Same as tryMemsetBlob, but insists on success. */ void - memsetBlob(Addr addr, uint8_t v, int size) const + memsetBlob(Addr addr, uint8_t v, Addr size) const { if (!tryMemsetBlob(addr, v, size)) fatal("memsetBlob(%#x, ...) failed", addr); diff --git a/src/mem/simple_mem.hh b/src/mem/simple_mem.hh index fc6d6849d5..f57ef33629 100644 --- a/src/mem/simple_mem.hh +++ b/src/mem/simple_mem.hh @@ -178,7 +178,6 @@ class SimpleMemory : public AbstractMemory std::unique_ptr<Packet> pendingDelete; public: - SimpleMemory(const SimpleMemoryParams &p); DrainState drain() override; @@ -187,6 +186,8 @@ class SimpleMemory : public AbstractMemory PortID idx=InvalidPortID) override; void init() override; + double getBW() { return bandwidth; } + protected: Tick recvAtomic(PacketPtr pkt); Tick recvAtomicBackdoor(PacketPtr pkt, MemBackdoorPtr &_backdoor); diff --git a/src/mem/translating_port_proxy.cc b/src/mem/translating_port_proxy.cc index 8ab859f40d..bc698c1a07 100644 --- a/src/mem/translating_port_proxy.cc +++ b/src/mem/translating_port_proxy.cc @@ -86,7 +86,7 @@ TranslatingPortProxy::tryOnBlob(BaseMMU::Mode mode, TranslationGenPtr gen, } bool -TranslatingPortProxy::tryReadBlob(Addr addr, void *p, int size) const +TranslatingPortProxy::tryReadBlob(Addr addr, void *p, Addr size) const { constexpr auto mode = BaseMMU::Read; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( @@ -99,7 +99,7 @@ TranslatingPortProxy::tryReadBlob(Addr addr, void *p, int size) const bool TranslatingPortProxy::tryWriteBlob( - Addr addr, const void *p, int size) const + Addr addr, const void *p, Addr size) const { constexpr auto mode = BaseMMU::Write; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( @@ -111,7 +111,7 @@ TranslatingPortProxy::tryWriteBlob( } bool -TranslatingPortProxy::tryMemsetBlob(Addr addr, uint8_t v, int size) const +TranslatingPortProxy::tryMemsetBlob(Addr addr, uint8_t v, Addr size) const { constexpr auto mode = BaseMMU::Write; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( diff --git a/src/mem/translating_port_proxy.hh b/src/mem/translating_port_proxy.hh index bedb57a3ce..7e619784b1 100644 --- a/src/mem/translating_port_proxy.hh +++ b/src/mem/translating_port_proxy.hh @@ -77,16 +77,16 @@ class TranslatingPortProxy : public PortProxy /** Version of tryReadBlob that translates virt->phys and deals * with page boundaries. */ - bool tryReadBlob(Addr addr, void *p, int size) const override; + bool tryReadBlob(Addr addr, void *p, Addr size) const override; /** Version of tryWriteBlob that translates virt->phys and deals * with page boundaries. */ - bool tryWriteBlob(Addr addr, const void *p, int size) const override; + bool tryWriteBlob(Addr addr, const void *p, Addr size) const override; /** * Fill size bytes starting at addr with byte value val. */ - bool tryMemsetBlob(Addr address, uint8_t v, int size) const override; + bool tryMemsetBlob(Addr address, uint8_t v, Addr size) const override; }; } // namespace gem5
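The int-to-Addr widening across the port proxy API matters once a single blob can exceed 2 GiB: a signed 32-bit size wraps (and goes negative) where a 64-bit Addr-sized count stays exact. A small standalone illustration of the failure mode (plain C++, no gem5 dependencies):

    #include <cassert>
    #include <cstdint>

    int main()
    {
        // A 3 GiB blob: representable as a 64-bit size, but truncated
        // when forced through a 32-bit int (negative on the common
        // two's-complement platforms gem5 targets).
        uint64_t size = 3ULL * 1024 * 1024 * 1024;
        int narrow = static_cast<int>(size);
        assert(size == 0xC0000000ULL);
        assert(narrow < 0);
        return 0;
    }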
diff --git a/src/python/gem5/components/memory/hbm.py b/src/python/gem5/components/memory/hbm.py index 35497c2f89..75db1f9fde 100644 --- a/src/python/gem5/components/memory/hbm.py +++ b/src/python/gem5/components/memory/hbm.py @@ -122,7 +122,6 @@ def _interleave_addresses(self): # for interleaving across pseudo channels (at 64B currently) mask_list.insert(0, 1 << 6) for i, ctrl in enumerate(self.mem_ctrl): - ctrl.partitioned_q = False ctrl.dram.range = AddrRange( start=self._mem_range.start, size=self._mem_range.size(),
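The mask_list.insert(0, 1 << 6) above fixes pseudo channel interleaving at address bit 6, which is also the value HBMCtrl's new pch_bit parameter must be given for recvTimingReq to steer packets consistently. A sketch of the steering test itself (a single-bit version of the decode in HBMCtrl::recvTimingReq; the addresses are arbitrary):

    #include <cassert>
    #include <cstdint>

    // A packet goes to pseudo channel 0 when bit pch_bit of its
    // address is clear, and to pseudo channel 1 otherwise.
    bool isPC0(uint64_t addr, int pch_bit)
    {
        return ((addr >> pch_bit) & 1) == 0;
    }

    int main()
    {
        const int pch_bit = 6;  // must match the 1 << 6 interleaving mask
        assert(isPC0(0x000, pch_bit));   // bit 6 clear -> PC0
        assert(!isPC0(0x040, pch_bit));  // bit 6 set   -> PC1
        return 0;
    }

If the configuration script and the controller ever disagree on this bit, requests are routed to a pseudo channel whose interface does not own the address, so keeping the two values tied to one parameter is the safer design.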