diff --git a/compiler/AST/build.cpp b/compiler/AST/build.cpp index 6bacf2f15e..b9412aba95 100644 --- a/compiler/AST/build.cpp +++ b/compiler/AST/build.cpp @@ -1609,12 +1609,21 @@ BlockStmt* buildCoforallLoopStmt(Expr* indices, beginBlk->blockInfoSet(new CallExpr(PRIM_BLOCK_COFORALL)); addByrefVars(beginBlk, byref_vars); beginBlk->insertAtHead(body); +#ifndef TARGET_HSA beginBlk->insertAtTail(new CallExpr("_downEndCount", coforallCount)); +#else + beginBlk->insertAtTail(new CallExpr("_completeTaskGroup", coforallCount)); +#endif BlockStmt* block = ForLoop::buildForLoop(indices, new SymExpr(tmpIter), beginBlk, true, zippered); block->insertAtHead(new CallExpr(PRIM_MOVE, coforallCount, new CallExpr("_endCountAlloc", /*forceLocalTypes=*/gTrue))); block->insertAtHead(new DefExpr(coforallCount)); +#ifndef TARGET_HSA beginBlk->insertBefore(new CallExpr("_upEndCount", coforallCount)); block->insertAtTail(new CallExpr("_waitEndCount", coforallCount)); +#else + beginBlk->insertBefore(new CallExpr("_initTaskGroup", coforallCount)); + block->insertAtTail(new CallExpr("_finalizeTaskGroup", coforallCount)); +#endif block->insertAtTail(new CallExpr("_endCountFree", coforallCount)); nonVectorCoforallBlk->insertAtTail(block); } @@ -1624,15 +1633,27 @@ BlockStmt* buildCoforallLoopStmt(Expr* indices, beginBlk->blockInfoSet(new CallExpr(PRIM_BLOCK_COFORALL)); addByrefVars(beginBlk, byref_vars); beginBlk->insertAtHead(body->copy()); +#ifndef TARGET_HSA beginBlk->insertAtTail(new CallExpr("_downEndCount", coforallCount)); +#else + beginBlk->insertAtTail(new CallExpr("_completeTaskGroup", coforallCount)); +#endif VarSymbol* numTasks = newTemp("numTasks"); vectorCoforallBlk->insertAtTail(new DefExpr(numTasks)); vectorCoforallBlk->insertAtTail(new CallExpr(PRIM_MOVE, numTasks, new CallExpr(".", tmpIter, new_CStringSymbol("size")))); +#ifndef TARGET_HSA vectorCoforallBlk->insertAtTail(new CallExpr("_upEndCount", coforallCount, /*countRunningTasks=*/gTrue, numTasks)); +#else + 
vectorCoforallBlk->insertAtTail(new CallExpr("_initTaskGroup", coforallCount, /*countRunningTasks=*/gTrue, numTasks)); +#endif BlockStmt* block = ForLoop::buildForLoop(indices, new SymExpr(tmpIter), beginBlk, true, zippered); vectorCoforallBlk->insertAtHead(new CallExpr(PRIM_MOVE, coforallCount, new CallExpr("_endCountAlloc", /*forceLocalTypes=*/gTrue))); vectorCoforallBlk->insertAtHead(new DefExpr(coforallCount)); +#ifndef TARGET_HSA block->insertAtTail(new CallExpr("_waitEndCount", coforallCount, /*countRunningTasks=*/gTrue, numTasks)); +#else + block->insertAtTail(new CallExpr("_finalizeTaskGroup", coforallCount, /*countRunningTasks=*/gTrue, numTasks)); +#endif block->insertAtTail(new CallExpr("_endCountFree", coforallCount)); vectorCoforallBlk->insertAtTail(block); } @@ -2890,12 +2911,20 @@ buildBeginStmt(CallExpr* byref_vars, Expr* stmt) { return body; } else { BlockStmt* block = buildChapelStmt(); +#ifndef TARGET_HSA block->insertAtTail(new CallExpr("_upEndCount")); +#else + block->insertAtTail(new CallExpr("_addToTaskGroup")); +#endif BlockStmt* beginBlock = new BlockStmt(); beginBlock->blockInfoSet(new CallExpr(PRIM_BLOCK_BEGIN)); addByrefVars(beginBlock, byref_vars); beginBlock->insertAtHead(stmt); +#ifndef TARGET_HSA beginBlock->insertAtTail(new CallExpr("_downEndCount")); +#else + beginBlock->insertAtTail(new CallExpr("_completeTaskGroup")); +#endif block->insertAtTail(beginBlock); return block; } @@ -2911,7 +2940,11 @@ buildSyncStmt(Expr* stmt) { block->insertAtTail(new CallExpr(PRIM_MOVE, endCountSave, new CallExpr(PRIM_GET_END_COUNT))); block->insertAtTail(new CallExpr(PRIM_SET_END_COUNT, new CallExpr("_endCountAlloc", /* forceLocalTypes= */gFalse))); block->insertAtTail(stmt); +#ifndef TARGET_HSA block->insertAtTail(new CallExpr("_waitEndCount")); +#else + block->insertAtTail(new CallExpr("_finalizeTaskGroup")); +#endif block->insertAtTail(new CallExpr("_endCountFree", new CallExpr(PRIM_GET_END_COUNT))); block->insertAtTail(new 
CallExpr(PRIM_SET_END_COUNT, endCountSave)); return block; @@ -2947,13 +2980,24 @@ buildCobeginStmt(CallExpr* byref_vars, BlockStmt* block) { addByrefVars(beginBlk, byref_vars ? byref_vars->copy() : NULL); stmt->insertBefore(beginBlk); beginBlk->insertAtHead(stmt->remove()); +#ifndef TARGET_HSA beginBlk->insertAtTail(new CallExpr("_downEndCount", cobeginCount)); block->insertAtHead(new CallExpr("_upEndCount", cobeginCount)); +#else + beginBlk->insertAtTail(new CallExpr("_completeTaskGroup", cobeginCount)); +#endif } +#ifdef TARGET_HSA + block->insertAtHead(new CallExpr("_initTaskGroup", cobeginCount)); +#endif block->insertAtHead(new CallExpr(PRIM_MOVE, cobeginCount, new CallExpr("_endCountAlloc", /* forceLocalTypes= */gTrue))); block->insertAtHead(new DefExpr(cobeginCount)); +#ifndef TARGET_HSA block->insertAtTail(new CallExpr("_waitEndCount", cobeginCount)); +#else + block->insertAtTail(new CallExpr("_finalizeTaskGroup", cobeginCount)); +#endif block->insertAtTail(new CallExpr("_endCountFree", cobeginCount)); block->astloc = cobeginCount->astloc; // grab the location of 'cobegin' kw diff --git a/compiler/AST/expr.cpp b/compiler/AST/expr.cpp index a651651883..feb9b73209 100644 --- a/compiler/AST/expr.cpp +++ b/compiler/AST/expr.cpp @@ -793,7 +793,8 @@ CallExpr::CallExpr(BaseAST* base, BaseAST* arg2, BaseAST* arg3, BaseAST* arg4, - BaseAST* arg5) : + BaseAST* arg5, + BaseAST* arg6) : Expr(E_CallExpr), primitive(NULL), baseExpr(NULL), @@ -815,6 +816,7 @@ CallExpr::CallExpr(BaseAST* base, callExprHelper(this, arg3); callExprHelper(this, arg4); callExprHelper(this, arg5); + callExprHelper(this, arg6); argList.parent = this; @@ -827,7 +829,8 @@ CallExpr::CallExpr(PrimitiveOp* prim, BaseAST* arg2, BaseAST* arg3, BaseAST* arg4, - BaseAST* arg5) : + BaseAST* arg5, + BaseAST* arg6) : Expr(E_CallExpr), primitive(prim), baseExpr(NULL), @@ -840,6 +843,7 @@ CallExpr::CallExpr(PrimitiveOp* prim, callExprHelper(this, arg3); callExprHelper(this, arg4); callExprHelper(this, arg5); 
+ callExprHelper(this, arg6); argList.parent = this; @@ -851,7 +855,8 @@ CallExpr::CallExpr(PrimitiveTag prim, BaseAST* arg2, BaseAST* arg3, BaseAST* arg4, - BaseAST* arg5) : + BaseAST* arg5, + BaseAST* arg6) : Expr(E_CallExpr), primitive(primitives[prim]), baseExpr(NULL), @@ -864,6 +869,7 @@ CallExpr::CallExpr(PrimitiveTag prim, callExprHelper(this, arg3); callExprHelper(this, arg4); callExprHelper(this, arg5); + callExprHelper(this, arg6); argList.parent = this; @@ -876,7 +882,8 @@ CallExpr::CallExpr(const char* name, BaseAST* arg2, BaseAST* arg3, BaseAST* arg4, - BaseAST* arg5) : + BaseAST* arg5, + BaseAST* arg6) : Expr(E_CallExpr), primitive(NULL), baseExpr(new UnresolvedSymExpr(name)), @@ -889,6 +896,7 @@ CallExpr::CallExpr(const char* name, callExprHelper(this, arg3); callExprHelper(this, arg4); callExprHelper(this, arg5); + callExprHelper(this, arg6); argList.parent = this; diff --git a/compiler/codegen/expr.cpp b/compiler/codegen/expr.cpp index c3f0fd583a..1fee3d395e 100644 --- a/compiler/codegen/expr.cpp +++ b/compiler/codegen/expr.cpp @@ -5350,11 +5350,12 @@ void CallExpr::codegenInvokeOnFun() { void CallExpr::codegenInvokeTaskFun(const char* name) { FnSymbol* fn = isResolved(); GenRet taskList = codegenValue(get(1)); + GenRet taskGroup = codegenValue(get(6)); GenRet taskListNode; GenRet taskBundle; GenRet bundleSize; - std::vector args(8); + std::vector args(9); // get(1) is a ref/wide ref to a task list value // get(2) is the node ID owning the task list @@ -5374,9 +5375,10 @@ void CallExpr::codegenInvokeTaskFun(const char* name) { args[2] = codegenCast("chpl_task_bundle_p", taskBundle); args[3] = bundleSize; args[4] = taskList; - args[5] = codegenValue(taskListNode); - args[6] = fn->linenum(); - args[7] = new_IntSymbol(gFilenameLookupCache[fn->fname()], INT_SIZE_32); + args[5] = taskGroup; + args[6] = codegenValue(taskListNode); + args[7] = fn->linenum(); + args[8] = new_IntSymbol(gFilenameLookupCache[fn->fname()], INT_SIZE_32); genComment(fn->cname, 
true); diff --git a/compiler/include/expr.h b/compiler/include/expr.h index a26acaafed..5233984569 100644 --- a/compiler/include/expr.h +++ b/compiler/include/expr.h @@ -199,28 +199,32 @@ class CallExpr : public Expr { BaseAST* arg2 = NULL, BaseAST* arg3 = NULL, BaseAST* arg4 = NULL, - BaseAST* arg5 = NULL); + BaseAST* arg5 = NULL, + BaseAST* arg6 = NULL); CallExpr(PrimitiveOp* prim, BaseAST* arg1 = NULL, BaseAST* arg2 = NULL, BaseAST* arg3 = NULL, BaseAST* arg4 = NULL, - BaseAST* arg5 = NULL); + BaseAST* arg5 = NULL, + BaseAST* arg6 = NULL); CallExpr(PrimitiveTag prim, BaseAST* arg1 = NULL, BaseAST* arg2 = NULL, BaseAST* arg3 = NULL, BaseAST* arg4 = NULL, - BaseAST* arg5 = NULL); + BaseAST* arg5 = NULL, + BaseAST* arg6 = NULL); CallExpr(const char* name, BaseAST* arg1 = NULL, BaseAST* arg2 = NULL, BaseAST* arg3 = NULL, BaseAST* arg4 = NULL, - BaseAST* arg5 = NULL); + BaseAST* arg5 = NULL, + BaseAST* arg6 = NULL); ~CallExpr(); diff --git a/compiler/passes/buildDefaultFunctions.cpp b/compiler/passes/buildDefaultFunctions.cpp index 6ffa649baf..d8e6cff7c3 100644 --- a/compiler/passes/buildDefaultFunctions.cpp +++ b/compiler/passes/buildDefaultFunctions.cpp @@ -581,7 +581,11 @@ static void build_chpl_entry_points() { // endcount (see comment above) // if (fMinimalModules == false) { +#ifndef TARGET_HSA chpl_gen_main->insertAtTail(new CallExpr("_waitEndCount")); +#else + chpl_gen_main->insertAtTail(new CallExpr("_finalizeTaskGroup")); +#endif } chpl_gen_main->insertAtTail(new CallExpr(PRIM_RETURN, main_ret)); diff --git a/compiler/passes/insertLineNumbers.cpp b/compiler/passes/insertLineNumbers.cpp index 1c4d34beae..a9e302366d 100644 --- a/compiler/passes/insertLineNumbers.cpp +++ b/compiler/passes/insertLineNumbers.cpp @@ -325,8 +325,8 @@ static void moveLinenoInsideArgBundle() // than the expected number. Both block types below expect an // argument bundle, and the on-block expects an additional argument // that is the locale on which it should be executed. 
- if ((fn->numFormals() > 4 && fn->hasFlag(FLAG_ON_BLOCK)) || - (fn->numFormals() > 5 && !fn->hasFlag(FLAG_ON_BLOCK) && + if ((fn->numFormals() > 5 && fn->hasFlag(FLAG_ON_BLOCK)) || + (fn->numFormals() > 6 && !fn->hasFlag(FLAG_ON_BLOCK) && (fn->hasFlag(FLAG_BEGIN_BLOCK) || fn->hasFlag(FLAG_COBEGIN_OR_COFORALL_BLOCK)))) { diff --git a/compiler/passes/parallel.cpp b/compiler/passes/parallel.cpp index 78ef16120e..e88151bc20 100644 --- a/compiler/passes/parallel.cpp +++ b/compiler/passes/parallel.cpp @@ -99,7 +99,7 @@ static BundleArgsFnData bundleArgsFnDataInit = { true, NULL, NULL }; static void insertEndCounts(); static void passArgsToNestedFns(); static void create_block_fn_wrapper(FnSymbol* fn, CallExpr* fcall, BundleArgsFnData &baData); -static void call_block_fn_wrapper(FnSymbol* fn, CallExpr* fcall, VarSymbol* args_buf, VarSymbol* args_buf_len, VarSymbol* tempc, FnSymbol *wrap_fn, Symbol* taskList, Symbol* taskListNode); +static void call_block_fn_wrapper(FnSymbol* fn, CallExpr* fcall, VarSymbol* args_buf, VarSymbol* args_buf_len, VarSymbol* tempc, FnSymbol *wrap_fn, Symbol* taskList, Symbol* taskListNode, Symbol *taskGroup); static void findBlockRefActuals(Vec& refSet, Vec& refVec); static void findHeapVarsAndRefs(Map*>& defMap, Vec& refSet, Vec& refVec, @@ -410,6 +410,7 @@ bundleArgs(CallExpr* fcall, BundleArgsFnData &baData) { // first argument to a task launch function. Symbol* endCount = NULL; VarSymbol *taskList = NULL; + VarSymbol *taskGroup = NULL; VarSymbol *taskListNode = NULL; if (!fn->hasFlag(FLAG_ON)) { @@ -466,6 +467,16 @@ bundleArgs(CallExpr* fcall, BundleArgsFnData &baData) { endCount, endCount->typeInfo()->getField("taskList")))); + // Now get the taskGroup field out of the end count. 
+ + taskGroup = newTemp(astr("_taskGroup", fn->name), dtCVoidPtr); + + fcall->insertBefore(new DefExpr(taskGroup)); + fcall->insertBefore(new CallExpr(PRIM_MOVE, taskGroup, + new CallExpr(PRIM_GET_MEMBER, + endCount, + endCount->typeInfo()->getField("taskGroup")))); + // Now get the node ID field for the end count, // which is where the task list is stored. @@ -481,7 +492,7 @@ bundleArgs(CallExpr* fcall, BundleArgsFnData &baData) { create_block_fn_wrapper(fn, fcall, baData); // call wrapper-function - call_block_fn_wrapper(fn, fcall, allocated_args, tmpsz, tempc, baData.wrap_fn, taskList, taskListNode); + call_block_fn_wrapper(fn, fcall, allocated_args, tmpsz, tempc, baData.wrap_fn, taskList, taskListNode, taskGroup); baData.firstCall = false; } @@ -492,7 +503,8 @@ static CallExpr* helpFindDownEndCount(BlockStmt* block) while (cur && (isCallExpr(cur) || isDefExpr(cur) || isBlockStmt(cur))) { if (CallExpr* call = toCallExpr(cur)) { if (call->isResolved()) - if (strcmp(call->resolvedFunction()->name, "_downEndCount") == 0) + if (strcmp(call->resolvedFunction()->name, "_downEndCount") == 0 || + strcmp(call->resolvedFunction()->name, "_completeTaskGroup") == 0) return call; } else if (BlockStmt* inner = toBlockStmt(cur)) { // Need to handle local blocks since the compiler @@ -646,6 +658,10 @@ static void create_block_fn_wrapper(FnSymbol* fn, CallExpr* fcall, BundleArgsFnD dtCVoidPtr->refType ); taskListArg->addFlag(FLAG_NO_CODEGEN); wrap_fn->insertFormalAtTail(taskListArg); + ArgSymbol *taskGroupArg = new ArgSymbol( INTENT_IN, "dummy_taskGroup", + dtCVoidPtr->refType ); + taskGroupArg->addFlag(FLAG_NO_CODEGEN); + wrap_fn->insertFormalAtTail(taskGroupArg); ArgSymbol *taskListNode = new ArgSymbol( INTENT_IN, "dummy_taskListNode", dtInt[INT_SIZE_DEFAULT]); taskListNode->addFlag(FLAG_NO_CODEGEN); @@ -749,20 +765,20 @@ static void create_block_fn_wrapper(FnSymbol* fn, CallExpr* fcall, BundleArgsFnD static void call_block_fn_wrapper(FnSymbol* fn, CallExpr* fcall, VarSymbol* 
args_buf, VarSymbol* args_buf_len, VarSymbol* tempc, FnSymbol *wrap_fn, - Symbol* taskList, Symbol* taskListNode) + Symbol* taskList, Symbol* taskListNode, Symbol *taskGroup) { // The wrapper function is called with the bundled argument list. if (fn->hasFlag(FLAG_ON)) { // For an on block, the first argument is also passed directly // to the wrapper function. // The forking function uses this to fork a task on the target locale. - fcall->insertBefore(new CallExpr(wrap_fn, fcall->get(1)->remove(), args_buf, args_buf_len, tempc)); + fcall->insertBefore(new CallExpr(wrap_fn, fcall->get(1)->remove(), args_buf, args_buf_len, tempc));//, new SymExpr(taskGroup))); } else { // For non-on blocks, the task list is passed directly to the function // (so that codegen can find it). // We need the taskList. INT_ASSERT(taskList); - fcall->insertBefore(new CallExpr(wrap_fn, new SymExpr(taskList), new SymExpr(taskListNode), args_buf, args_buf_len, tempc)); + fcall->insertBefore(new CallExpr(wrap_fn, new SymExpr(taskList), new SymExpr(taskListNode), args_buf, args_buf_len, tempc, new SymExpr(taskGroup))); } fcall->remove(); // rm orig. call diff --git a/make/compiler/Makefile.hsa b/make/compiler/Makefile.hsa index fa5115fa79..0fc1c3d234 100644 --- a/make/compiler/Makefile.hsa +++ b/make/compiler/Makefile.hsa @@ -17,11 +17,11 @@ include $(CHPL_MAKE_HOME)/make/compiler/Makefile.gnu ifdef CHPL_ROCM # ROCm locations CLOC=/opt/rocm/cloc/bin/cloc.sh -LIBS+=-lhsa-runtime64 -lhsakmt -lm +LIBS+=-latmi_runtime -lm # TODO: move these in third-party directory? 
-GEN_LFLAGS+=-L/opt/rocm/lib -L/opt/rocm/hsa/lib -HSA_INCLUDES=-I/opt/rocm/hsa/include +GEN_LFLAGS+=-L/opt/rocm/lib -L/opt/rocm/hsa/lib -L/opt/rocm/atmi/lib +HSA_INCLUDES=-I/opt/rocm/atmi/include else # HSA locations CLOC=$(THIRD_PARTY_DIR)/hsa/cloc/bin/cloc.sh diff --git a/make/tasks/Makefile.atmi b/make/tasks/Makefile.atmi new file mode 100644 index 0000000000..00ca53fb62 --- /dev/null +++ b/make/tasks/Makefile.atmi @@ -0,0 +1,19 @@ +# Copyright 2004-2015 Cray Inc. +# Other additional copyright holders may be indicated within. +# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. +# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +RUNTIME_INCLS += -I$(QTHREAD_INCLUDE_DIR) +CHPL_MAKE_THREADS=none diff --git a/modules/internal/ChapelBase.chpl b/modules/internal/ChapelBase.chpl index fcdb87690c..ba6210d01b 100644 --- a/modules/internal/ChapelBase.chpl +++ b/modules/internal/ChapelBase.chpl @@ -763,7 +763,8 @@ module ChapelBase { type taskType; var i: iType, taskCnt: taskType, - taskList: c_void_ptr = _defaultOf(c_void_ptr); + taskList: c_void_ptr = _defaultOf(c_void_ptr), + taskGroup: c_void_ptr = _defaultOf(c_void_ptr); } // This function is called once by the initiating task. No on @@ -786,10 +787,17 @@ module ChapelBase { } } + // Compiler looks for this variable to determine the return type of // the "get end count" primitive. 
type _remoteEndCountType = _endCountAlloc(false).type; + pragma "insert line file info" + extern proc chpl_taskGroupInit(): c_void_ptr; + extern proc chpl_taskGroupGet(): c_void_ptr; + extern proc chpl_taskGroupComplete(); + extern proc chpl_taskGroupFinalize(task_group: c_void_ptr); + // This function is called once by the initiating task. As above, no // on statement needed. pragma "dont disable remote value forwarding" @@ -797,6 +805,36 @@ module ChapelBase { delete e; } + // This function is called by the initiating task once for each new + // task *before* any of the tasks are started. As above, no on + // statement needed. + pragma "dont disable remote value forwarding" + pragma "no remote memory fence" + proc _addToTaskGroup(e: _EndCount, param countRunningTasks=true) { + e.taskGroup = chpl_taskGroupGet(); + } + + pragma "dont disable remote value forwarding" + pragma "no remote memory fence" + proc _addToTaskGroup(e: _EndCount, param countRunningTasks=true, numTasks) { + e.taskGroup = chpl_taskGroupGet(); + } + + // This function is called by the initiating task once for each new + // task *before* any of the tasks are started. As above, no on + // statement needed. + pragma "dont disable remote value forwarding" + pragma "no remote memory fence" + proc _initTaskGroup(e: _EndCount, param countRunningTasks=true) { + e.taskGroup = chpl_taskGroupInit(); + } + + pragma "dont disable remote value forwarding" + pragma "no remote memory fence" + proc _initTaskGroup(e: _EndCount, param countRunningTasks=true, numTasks) { + e.taskGroup = chpl_taskGroupInit(); + } + // This function is called by the initiating task once for each new // task *before* any of the tasks are started. As above, no on // statement needed. @@ -838,6 +876,26 @@ module ChapelBase { e.i.sub(1, memory_order_release); } + // This function is called once by each newly initiated task. No on + // statement is needed because the call to sub() will do a remote + // fork (on) if needed. 
+ pragma "dont disable remote value forwarding" + proc _completeTaskGroup(e: _EndCount) { + chpl_taskGroupComplete(); + } + + // This function is called once by the initiating task. As above, no + // on statement needed. + pragma "dont disable remote value forwarding" + proc _finalizeTaskGroup(e: _EndCount, param countRunningTasks=true) { + chpl_taskGroupFinalize(e.taskGroup); + } + + pragma "dont disable remote value forwarding" + proc _finalizeTaskGroup(e: _EndCount, param countRunningTasks=true, numTasks) { + chpl_taskGroupFinalize(e.taskGroup); + } + // This function is called once by the initiating task. As above, no // on statement needed. pragma "dont disable remote value forwarding" @@ -867,7 +925,7 @@ module ChapelBase { proc _waitEndCount(e: _EndCount, param countRunningTasks=true, numTasks) { // See if we can help with any of the started tasks chpl_taskListExecute(e.taskList); - + // Wait for all tasks to finish e.i.waitFor(0, memory_order_acquire); @@ -876,6 +934,26 @@ module ChapelBase { } } + proc _addToTaskGroup(param countRunningTasks=true) { + var e = __primitive("get end count"); + _addToTaskGroup(e, countRunningTasks); + } + + proc _initTaskGroup(param countRunningTasks=true) { + var e = __primitive("get end count"); + _initTaskGroup(e, countRunningTasks); + } + + proc _completeTaskGroup() { + var e = __primitive("get end count"); + _completeTaskGroup(e); + } + + proc _finalizeTaskGroup(param countRunningTasks=true) { + var e = __primitive("get end count"); + _finalizeTaskGroup(e, countRunningTasks); + } + proc _upEndCount(param countRunningTasks=true) { var e = __primitive("get end count"); _upEndCount(e, countRunningTasks); diff --git a/modules/internal/LocaleModelHelpRuntime.chpl b/modules/internal/LocaleModelHelpRuntime.chpl index 1eca09e9a9..33550f1ae2 100644 --- a/modules/internal/LocaleModelHelpRuntime.chpl +++ b/modules/internal/LocaleModelHelpRuntime.chpl @@ -118,7 +118,7 @@ module LocaleModelHelpRuntime { args: chpl_task_bundle_p, 
args_size: size_t, subloc_id: int, ref tlist: c_void_ptr, tlist_node_id: int, - is_begin: bool); + is_begin: bool, tgroup: c_void_ptr); extern proc chpl_task_executeTasksInList(ref tlist: c_void_ptr); // @@ -131,10 +131,12 @@ module LocaleModelHelpRuntime { args: chpl_task_bundle_p, // function args args_size: size_t, // args size ref tlist: c_void_ptr, // task list + tgroup: c_void_ptr, // task group tlist_node_id: int // task list owner node ) { + // (debug trace removed: do not emit a warning on every 'begin' spawn) chpl_task_addToTaskList(fn, args, args_size, - subloc_id, tlist, tlist_node_id, true); + subloc_id, tlist, tlist_node_id, true, tgroup); } // @@ -148,10 +150,12 @@ module LocaleModelHelpRuntime { args: chpl_task_bundle_p, // function args args_size: size_t, // args size ref tlist: c_void_ptr, // task list + tgroup: c_void_ptr, // task group tlist_node_id: int // task list owner node ) { + // (debug trace removed: do not emit a warning on every 'cobegin' spawn) chpl_task_addToTaskList(fn, args, args_size, - subloc_id, tlist, tlist_node_id, false); + subloc_id, tlist, tlist_node_id, false, tgroup); } // diff --git a/modules/internal/LocaleModelHelpSetup.chpl b/modules/internal/LocaleModelHelpSetup.chpl index 69c3562ec1..93ab89f26d 100644 --- a/modules/internal/LocaleModelHelpSetup.chpl +++ b/modules/internal/LocaleModelHelpSetup.chpl @@ -172,9 +172,6 @@ module LocaleModelHelpSetup { helpSetupLocaleFlat(dst, local_name); - extern proc chpl_task_getNumSublocales(): int(32); - numSublocales = chpl_task_getNumSublocales(); - extern proc chpl_task_getMaxPar(): uint(32); @@ -191,7 +188,8 @@ module LocaleModelHelpSetup { then local_name = chpl_nodeName():string + "-" + _node_id : string; else local_name = chpl_nodeName():string; - numSublocales = 2; + extern proc chpl_task_getNumSublocales(): int(32); + numSublocales = chpl_task_getNumSublocales(); const origSubloc = chpl_task_getRequestedSubloc(); diff --git a/modules/internal/localeModels/hsa/LocaleModel.chpl b/modules/internal/localeModels/hsa/LocaleModel.chpl index 8c54136b3e..627cfd0d19
100644 --- a/modules/internal/localeModels/hsa/LocaleModel.chpl +++ b/modules/internal/localeModels/hsa/LocaleModel.chpl @@ -157,7 +157,7 @@ module LocaleModel { return {0..#numSublocales}; } - proc getChildCount() return 0; + proc getChildCount() return numSublocales; iter getChildIndices() : int { for idx in {0..#numSublocales} do // chpl_emptyLocaleSpace do diff --git a/runtime/Makefile.help b/runtime/Makefile.help index 00e96f04c4..a6d84c1ff5 100644 --- a/runtime/Makefile.help +++ b/runtime/Makefile.help @@ -168,4 +168,7 @@ include $(RUNTIME_ROOT)/make/Makefile.runtime.foot src/tasks/qthreads/Makefile.include: cd src/tasks/qthreads && $(MAKE) copy_qthread_files +src/tasks/atmi/Makefile.include: + cd src/tasks/atmi && $(MAKE) copy_qthread_files + .NOTPARALLEL: diff --git a/runtime/etc/Makefile.tasks-atmi b/runtime/etc/Makefile.tasks-atmi new file mode 100644 index 0000000000..ce328fd06b --- /dev/null +++ b/runtime/etc/Makefile.tasks-atmi @@ -0,0 +1,19 @@ +# Copyright 2004-2015 Cray Inc. +# Other additional copyright holders may be indicated within. +# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. +# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +GEN_CFLAGS += -I$(QTHREAD_INCLUDE_DIR) +GEN_LFLAGS += -L$(QTHREAD_LIB_DIR) -Wl,-rpath,$(QTHREAD_LIB_DIR) diff --git a/runtime/include/chpl-atmi.h b/runtime/include/chpl-atmi.h new file mode 100644 index 0000000000..82c8a54780 --- /dev/null +++ b/runtime/include/chpl-atmi.h @@ -0,0 +1,34 @@ +#ifndef _chpl_atmi_h_ +#define _chpl_atmi_h_ + +#include +#include /* size_t */ +#include /* uintXX_t */ +#ifndef __cplusplus +#include +#endif /* __cplusplus */ + +#include "chpltypes.h" +#include "chpl-hsa-kernelparams.h" + +atmi_kernel_t reduction_kernel; +atmi_kernel_t *gpu_kernels; +atmi_kernel_t main_kernel; +int g_num_cpu_kernels; + +atmi_machine_t *g_machine; + +enum { + GPU_KERNEL_IMPL = 10565, + REDUCTION_GPU_IMPL = 42, + CPU_FUNCTION_IMPL = 43 +}; + +int chpl_hsa_initialize(void); + +int32_t hsa_reduce_int32(const char *op, int32_t *src, size_t count); +int64_t hsa_reduce_int64(const char *op, int64_t *src, size_t count); + +void hsa_enqueue_kernel(int kernel_idx, uint32_t wkgrp_size_x, + uint32_t wkitem_count_x, void *bundled_args); +#endif //_chpl_atmi_h_ diff --git a/runtime/include/chpl-gen-includes.h b/runtime/include/chpl-gen-includes.h index 4527659c5d..2f414baa78 100644 --- a/runtime/include/chpl-gen-includes.h +++ b/runtime/include/chpl-gen-includes.h @@ -31,7 +31,7 @@ #include "chpl-tasks.h" #include "chpltypes.h" #ifdef TARGET_HSA -#include "chpl-hsa.h" +#include "chpl-atmi.h" #endif // diff --git a/runtime/include/chpl-tasks.h b/runtime/include/chpl-tasks.h index 6eac21fb8c..d9fb916627 100644 --- a/runtime/include/chpl-tasks.h +++ b/runtime/include/chpl-tasks.h @@ -158,10 +158,15 @@ void chpl_task_addToTaskList( void**, // task list c_nodeid_t, // locale (node) where task list resides chpl_bool, // is begin{} stmt? (vs. 
cobegin or coforall) + void *, // task group int, // line at which function begins int32_t); // name of file containing function void chpl_task_executeTasksInList(void**); +void *chpl_taskGroupInit(int lineno, int32_t filename); +void *chpl_taskGroupGet(void); +void chpl_taskGroupFinalize(void *task_group); +void chpl_taskGroupComplete(void); // // Call a chpl_ftable[] function in a task. // diff --git a/runtime/include/chpltypes.h b/runtime/include/chpltypes.h index 0e6b4644df..439bef780b 100644 --- a/runtime/include/chpltypes.h +++ b/runtime/include/chpltypes.h @@ -189,6 +189,7 @@ typedef int32_t chpl_bool32; typedef int64_t chpl_bool64; typedef void (*chpl_fn_p)(void*); // function pointer for runtime ftable +typedef const char *chpl_fn_name; typedef int16_t chpl_fn_int_t; // int type for ftable indexing // Function table names and information, for VisualDebug use diff --git a/runtime/include/tasks/atmi/tasks-atmi.h b/runtime/include/tasks/atmi/tasks-atmi.h new file mode 100644 index 0000000000..3f1497b61d --- /dev/null +++ b/runtime/include/tasks/atmi/tasks-atmi.h @@ -0,0 +1,218 @@ +/************************************************************************** +* Copyright 2011 Sandia Corporation. Under the terms of Contract +* DE-AC04-94AL85000, there is a non-exclusive license for use of this work by +* or on behalf of the U.S. Government. Export of this program may require a +* license from the United States Government +**************************************************************************/ + +/* + * Copyright 2004-2017 Cray Inc. + * Other additional copyright holders may be indicated within. + * + * The entirety of this work is licensed under the Apache License, + * Version 2.0 (the "License"); you may not use this file except + * in compliance with the License.
+ * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _tasks_atmi_h_ +#define _tasks_atmi_h_ + +#include "chpl-atmi.h" +#include "chpl-tasks-prvdata.h" +#include "chpltypes.h" +#include "chpl-mem.h" +#include "qthread.h" +#include "qthread-chapel.h" + +#include +#include +#include +#include + +#ifdef __cplusplus
extern "C" { +#endif + +#define CHPL_COMM_YIELD_TASK_WHILE_POLLING +void chpl_task_yield(void); + +// +// Type (and default value) used to communicate task identifiers
// between C code and Chapel code in the runtime. +// +typedef uint64_t chpl_taskID_t; +#define chpl_nullTaskID ATMI_TASK_HANDLE(0xFFFFFFFFFFFFFFFF) + +// +// Sync variables
// +typedef struct { + aligned_t lockers_in; + aligned_t lockers_out; + uint_fast32_t uncontested_locks; + int is_full; + syncvar_t signal_full; + syncvar_t signal_empty; +} chpl_sync_aux_t; + +// +// Task private data
// + +extern +#ifdef __cplusplus
"C" +#endif + +volatile int chpl_qthread_done_initializing; + +// +// Task layer private area argument bundle header
// +typedef struct { + chpl_bool serial_state; + chpl_bool countRunning; + chpl_bool is_executeOn; + int lineno; + int filename; + c_sublocid_t requestedSubloc; + chpl_fn_int_t requested_fid; + chpl_fn_p requested_fn; + chpl_fn_name requested_fn_name; + chpl_taskID_t id; +} chpl_task_bundle_t; + +// Structure of task-local storage
+typedef struct chpl_atmi_tls_s { + chpl_task_bundle_t *bundle; + // The below fields could move to chpl_task_bundleData_t
+ // That would reduce the size of the task local storage, + // but increase the size of executeOn bundles.
+ chpl_task_prvData_t prvdata; + /* Reports */ + int lock_filename; + int lock_lineno; +} chpl_atmi_tls_t; + +extern pthread_key_t tls_cache; + +extern pthread_t chpl_qthread_process_pthread; +extern pthread_t chpl_qthread_comm_pthread; + +extern chpl_atmi_tls_t chpl_qthread_process_tls; +extern chpl_atmi_tls_t chpl_qthread_comm_task_tls; + +#define CHPL_TASK_STD_MODULES_INITIALIZED chpl_task_stdModulesInitialized +void chpl_task_stdModulesInitialized(void); + +extern pthread_t null_thread; + +// Wrap qthread_get_tasklocal() and assert that it is always available. +extern chpl_atmi_tls_t* chpl_atmi_get_tasklocal(void); + +#ifdef CHPL_TASK_GET_PRVDATA_IMPL_DECL +#error "CHPL_TASK_GET_PRVDATA_IMPL_DECL is already defined!" +#else +#define CHPL_TASK_GET_PRVDATA_IMPL_DECL 1 +#endif +static inline chpl_task_prvData_t* chpl_task_getPrvData(void) +{ + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); + if (data) { + return &data->prvdata; + } + assert(data); + return NULL; +} + +// +// Sublocale support +// +#ifdef CHPL_TASK_GETSUBLOC_IMPL_DECL +#error "CHPL_TASK_GETSUBLOC_IMPL_DECL is already defined!" +#else +#define CHPL_TASK_GETSUBLOC_IMPL_DECL 1 +#endif +static inline +c_sublocid_t chpl_task_getSubloc(void) +{ + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); + if (data) + return data->bundle->requestedSubloc; + else + return c_sublocid_any; +} + +#ifdef CHPL_TASK_SETSUBLOC_IMPL_DECL +#error "CHPL_TASK_SETSUBLOC_IMPL_DECL is already defined!" +#else +#define CHPL_TASK_SETSUBLOC_IMPL_DECL 1 +#endif +static inline +void chpl_task_setSubloc(c_sublocid_t subloc) +{ + assert(subloc != c_sublocid_none); + + // Only change sublocales if the caller asked for a particular one, + // which is not the current one, and we're a (movable) task. + // + // Note: It's likely that this won't work in all cases where we need + // it. 
In particular, we envision needing to move execution + // from sublocale to sublocale while initializing the memory + // layer, in order to get the NUMA domain affinity right for + // the subparts of the heap. But this will be happening well + // before tasking init and in any case would be done from the + // main thread of execution, which doesn't have a shepherd. + // The code below wouldn't work in that situation. + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); + if (data) { + data->bundle->requestedSubloc = subloc; + /* printf("Setting ATMI requested subloc to %d\n", subloc); */ /* debug leftover */ + } +} + +#ifdef CHPL_TASK_GETREQUESTEDSUBLOC_IMPL_DECL +#error "CHPL_TASK_GETREQUESTEDSUBLOC_IMPL_DECL is already defined!" +#else +#define CHPL_TASK_GETREQUESTEDSUBLOC_IMPL_DECL 1 +#endif +static inline +c_sublocid_t chpl_task_getRequestedSubloc(void) +{ + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); + if (data) { + return data->bundle->requestedSubloc; + } + + return c_sublocid_any; +} + +// +// Can we support remote caching? +// +#ifdef CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL +#error "CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL is already defined!" 
+#else +#define CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL 1 +#endif +static inline +int chpl_task_supportsRemoteCache(void) { + return CHPL_QTHREAD_SUPPORTS_REMOTE_CACHE; +} + +#ifdef __cplusplus +} // end extern "C" +#endif + +#endif // ifndef _tasks_qthreads_h_ +/* vim:set expandtab: */ diff --git a/runtime/src/Makefile.share b/runtime/src/Makefile.share index 732e06ad1f..8d93b13d3a 100644 --- a/runtime/src/Makefile.share +++ b/runtime/src/Makefile.share @@ -18,9 +18,9 @@ ifeq ($(strip $(CHPL_MAKE_TARGET_COMPILER)),hsa) HSA_SRCS = \ - chpl-hsa.c \ + chpl-atmi.c \ chpl-hsa-reducekernels.cl \ - chpl-hsa-reducehost.c + chpl-atmi-reducehost.c endif COMMON_LAUNCHER_SRCS = \ diff --git a/runtime/src/chpl-atmi-reducehost.c b/runtime/src/chpl-atmi-reducehost.c new file mode 100644 index 0000000000..3d642a7f22 --- /dev/null +++ b/runtime/src/chpl-atmi-reducehost.c @@ -0,0 +1,136 @@ + +#include "chpl-atmi.h" +#include "chplrt.h" +#include "chplexit.h" +#include "chpl-mem.h" + +/*enum ReduceOp { + MAX, + MIN, + SUM, + PROD, + BITAND, + BITOR, + BITXOR, + LOGAND, + LOGOR + }; + */ + +/* + * Estimate and schedule the required number of GPU kernels + */ + static inline +void atmi_sched_reducekernels(size_t count, + void *darray[2], size_t *iter_ct, + size_t *items_left) +{ + size_t incount, outcount, i, iter, in, out; + uint32_t max_num_wkgrps, num_wkgroups, grid_size_x; + + const int num_args = 3; + atmi_task_group_t task_group = {1, ATMI_TRUE}; + ATMI_LPARM(lparm); + lparm->group = &task_group; + lparm->kernel_id = REDUCTION_GPU_IMPL; + lparm->synchronous = ATMI_FALSE; + lparm->place = (atmi_place_t)ATMI_PLACE_GPU(0, 0); + + incount = count; + max_num_wkgrps = incount / WKGRP_SIZE; + num_wkgroups = (max_num_wkgrps + SEQ_CHUNK_SIZE - 1) / SEQ_CHUNK_SIZE; + grid_size_x = num_wkgroups * WKGRP_SIZE; + outcount = num_wkgroups; + iter = 0; + while (grid_size_x > WKGRP_SIZE) { + in = (iter & 1); + out = (iter & 1) ^ 1; + + void *args[] = {&darray[in], &darray[out], &incount}; + 
lparm->gridDim[0] = grid_size_x; + lparm->groupDim[0] = WKGRP_SIZE; + atmi_task_launch(lparm, reduction_kernel, args); + + iter += 1; + incount = outcount; + max_num_wkgrps = incount / WKGRP_SIZE; + num_wkgroups = (max_num_wkgrps + SEQ_CHUNK_SIZE - 1) / SEQ_CHUNK_SIZE; + grid_size_x = num_wkgroups * WKGRP_SIZE; + outcount = num_wkgroups; + } + + if (iter > 0) { + atmi_task_group_sync(&task_group); + } + + (*items_left) = incount; + (*iter_ct) = iter; +} + +/*int32_t hsa_reduce_int32(const char *op, int32_t *src, size_t count) + { + int32_t res; + size_t iter, items_left, out, i; + int32_t * darray[2]; + hsa_symbol_info_t * symbol_info; + symbol_info = &kernel.symbol_info[0]; //TODO: Remove hardcoded 0 index + darray[0] = src; + if (0 != chpl_posix_memalign((void **) &darray[1], 64, + count * sizeof(int32_t))) { + chpl_exit_any(1); + } + + hsa_sched_reducekernels(count, symbol_info, (void**)darray, + &iter, &items_left); + + res = 0; + out = (iter & 1); + chpl_msg(2, "HSA: Using CPU to reduce %lu items\n", items_left); + for (i = 0; i < items_left; ++i) res += darray[out][i]; + + chpl_free (darray[1]); + return res; + }*/ + +int64_t hsa_reduce_int64(const char *op, int64_t *src, size_t count) +{ + int64_t res; + size_t iter, items_left, out, i; + int64_t * darray[2]; + darray[0] = src; + if (0 != chpl_posix_memalign((void **) &darray[1], 64, + count * sizeof(int64_t))) { + chpl_exit_any(1); + } + + atmi_sched_reducekernels(count, (void**)darray, + &iter, &items_left); + + res = 0; + out = (iter & 1); + chpl_msg(2, "HSA: Using CPU to reduce %lu items\n", items_left); + for (i = 0; i < items_left; ++i) res += darray[out][i]; + + chpl_free (darray[1]); + return res; +} + +//FIXME: use the op argument like this to extend this to different ops +/*if (!strcasecmp(op, "Max")) + opType = MAX; + else if (!strcasecmp(op, "Min")) + opType = MIN; + else if (!strcasecmp(op, "Sum")) + opType = SUM; + else if (!strcasecmp(op, "Product")) + opType = PROD; + else if (!strcasecmp(op, 
"LogicalAnd")) + opType = LOGAND; + else if (!strcasecmp(op, "LogicalOr")) + opType = LOGOR; + else if (!strcasecmp(op, "BitwiseAnd")) + opType = BITAND; + else if (!strcasecmp(op, "BitwiseOr")) + opType = BITOR; + else if (!strcasecmp(op, "BitwiseXor")) + opType = BITXOR; */ diff --git a/runtime/src/chpl-atmi.c b/runtime/src/chpl-atmi.c new file mode 100644 index 0000000000..543db24ec1 --- /dev/null +++ b/runtime/src/chpl-atmi.c @@ -0,0 +1,113 @@ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include "chpl-atmi.h" +#include "chplrt.h" +#include "chpl-mem.h" +#include "chplcgfns.h" + +#define OUTPUT_ATMI_STATUS(status, msg) \ +{ \ + if (ATMI_STATUS_SUCCESS != (status)) { \ + fprintf(stderr, "ATMI support: %s failed, error code: 0x%x\n", \ +#msg, status); \ + atmi_finalize(); \ + return status; \ + } \ +} + +/** + * Initialize the ATMI/HSA runtime + */ +int chpl_hsa_initialize(void) +{ + char reduce_kernel_filename[1024]; + char gen_kernel_filename[1024]; + int arglen = strlen(chpl_executionCommand)+1; + char* argCopy = chpl_mem_allocMany(arglen, sizeof(char), + CHPL_RT_MD_CFG_ARG_COPY_DATA, 0, 0); + char *binName; + int cx; + + cx = snprintf(reduce_kernel_filename, 1024, +#ifdef ROCM + "%s/runtime/src/%s/chpl-hsa-reducekernels.hsaco", CHPL_HOME, +#else + "%s/runtime/src/%s/chpl-hsa-reducekernels.o", CHPL_HOME, +#endif + CHPL_RUNTIME_OBJDIR); + if (cx < 0 || cx >= 1024) { + OUTPUT_ATMI_STATUS(ATMI_STATUS_ERROR, Creating reduce kernel filename); + } + strcpy(argCopy, chpl_executionCommand); + binName = strtok(argCopy, " "); +#ifdef ROCM + cx = snprintf(gen_kernel_filename, 1024, "%s_gpu.hsaco", binName); +#else + cx = snprintf(gen_kernel_filename, 1024, "%s_gpu.o", binName); +#endif + if (cx < 0 || cx >= 1024) { + OUTPUT_ATMI_STATUS(ATMI_STATUS_ERROR, Creating generated kernel filename); + } + chpl_mem_free(argCopy, 0, 0); + +#ifdef ROCM + atmi_platform_type_t module_type = AMDGCN; +#else + atmi_platform_type_t module_type = BRIG; 
+#endif + + /* FIXME: Create all reduction kernels, not just the int64-sum kernel */ + const char *modules[2] = {reduce_kernel_filename, gen_kernel_filename}; + atmi_platform_type_t module_types[2] = {module_type, module_type}; + atmi_status_t st = atmi_module_register(modules, module_types, 2); + OUTPUT_ATMI_STATUS(st, Registering all modules); + + size_t reduction_arg_sizes[] = {sizeof(uint64_t), sizeof(uint64_t), sizeof(uint32_t)}; + const unsigned int num_reduction_args = sizeof(reduction_arg_sizes)/sizeof(reduction_arg_sizes[0]); + atmi_kernel_create_empty(&reduction_kernel, num_reduction_args, reduction_arg_sizes); + atmi_kernel_add_gpu_impl(reduction_kernel, "reduce_int64_sum", REDUCTION_GPU_IMPL); + + size_t kernel_arg_sizes[] = {sizeof(uint64_t)}; + const unsigned int num_kernel_args = sizeof(kernel_arg_sizes)/sizeof(kernel_arg_sizes[0]); + gpu_kernels = (atmi_kernel_t *)chpl_malloc(sizeof(atmi_kernel_t) * chpl_num_gpu_kernels); + for (int64_t i = 0; i < chpl_num_gpu_kernels; ++i) { + //FIXME: get the actual kernel name + const char *kernel_name = chpl_gpu_kernels[i]; + atmi_kernel_create_empty(&gpu_kernels[i], num_kernel_args, kernel_arg_sizes); + atmi_kernel_add_gpu_impl(gpu_kernels[i], kernel_name, GPU_KERNEL_IMPL); + } + + return ATMI_STATUS_SUCCESS; +} + +/** + * Release resources used by the base kernels and tear down the HSA structures + */ +int hsa_shutdown(void) +{ + chpl_free(gpu_kernels); + return atmi_finalize(); +} + +/* + * Enqueue/execute a kernel + */ +void hsa_enqueue_kernel(int kernel_idx, uint32_t wkgrp_size_x, + uint32_t wkitem_count_x, void *bundled_args) +{ + void *args[] = {&bundled_args}; + ATMI_LPARM_1D(lparm, wkitem_count_x); + lparm->groupDim[0] = wkgrp_size_x; + lparm->synchronous = ATMI_TRUE; + + lparm->kernel_id = GPU_KERNEL_IMPL; + lparm->place = (atmi_place_t)ATMI_PLACE_GPU(0, 0); + atmi_task_launch(lparm, gpu_kernels[kernel_idx], args); +} + diff --git a/runtime/src/tasks/atmi/Makefile b/runtime/src/tasks/atmi/Makefile new file 
mode 100644 index 0000000000..c78c8f6377 --- /dev/null +++ b/runtime/src/tasks/atmi/Makefile @@ -0,0 +1,42 @@ +# Copyright 2004-2017 Cray Inc. +# Other additional copyright holders may be indicated within. +# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. +# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +RUNTIME_ROOT = ../../.. +RUNTIME_SUBDIR = src/tasks/$(CHPL_MAKE_TASKS) + +ifndef CHPL_MAKE_HOME +export CHPL_MAKE_HOME=$(shell pwd)/$(RUNTIME_ROOT)/.. +endif + +# +# standard header +# +include $(RUNTIME_ROOT)/make/Makefile.runtime.head + +TASKS_OBJDIR = $(RUNTIME_OBJDIR) +include Makefile.share + +TARGETS = $(TASKS_OBJS) + +include $(RUNTIME_ROOT)/make/Makefile.runtime.subdirrules + +FORCE: + +# +# standard footer +# +include $(RUNTIME_ROOT)/make/Makefile.runtime.foot diff --git a/runtime/src/tasks/atmi/Makefile.include b/runtime/src/tasks/atmi/Makefile.include new file mode 100644 index 0000000000..35b1225bf7 --- /dev/null +++ b/runtime/src/tasks/atmi/Makefile.include @@ -0,0 +1,25 @@ +# Copyright 2004-2017 Cray Inc. +# Other additional copyright holders may be indicated within. +# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. 
+# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +TASKS_SUBDIR = src/tasks/$(CHPL_MAKE_TASKS) + +TASKS_OBJDIR = $(RUNTIME_BUILD)/$(TASKS_SUBDIR) + +ALL_SRCS += $(CURDIR)/$(TASKS_SUBDIR)/*.c \ + $(RUNTIME_INCLUDE_ROOT)/tasks/$(CHPL_MAKE_TASKS)/*.h + +include $(RUNTIME_ROOT)/$(TASKS_SUBDIR)/Makefile.share diff --git a/runtime/src/tasks/atmi/Makefile.share b/runtime/src/tasks/atmi/Makefile.share new file mode 100644 index 0000000000..74c39c92a2 --- /dev/null +++ b/runtime/src/tasks/atmi/Makefile.share @@ -0,0 +1,23 @@ +# Copyright 2004-2017 Cray Inc. +# Other additional copyright holders may be indicated within. +# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. +# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +TASKS_SRCS = tasks-$(CHPL_MAKE_TASKS).c + +SVN_SRCS = $(TASKS_SRCS) +SRCS = $(SVN_SRCS) + +TASKS_OBJS = $(TASKS_SRCS:%.c=$(TASKS_OBJDIR)/%.o) diff --git a/runtime/src/tasks/atmi/tasks-atmi.c b/runtime/src/tasks/atmi/tasks-atmi.c new file mode 100644 index 0000000000..9194494b18 --- /dev/null +++ b/runtime/src/tasks/atmi/tasks-atmi.c @@ -0,0 +1,1316 @@ +// +// Qthreads implementation of Chapel tasking interface +// +// Copyright 2011 Sandia Corporation. Under the terms of Contract +// DE-AC04-94AL85000, there is a non-exclusive license for use of this work by +// or on behalf of the U.S. Government. Export of this program may require a +// license from the United States Government +// + +/* + * Copyright 2004-2017 Cray Inc. + * Other additional copyright holders may be indicated within. + * + * The entirety of this work is licensed under the Apache License, + * Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// For SVID definitions (setenv) +#define _SVID_SOURCE + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "chplrt.h" + +#include "arg.h" +#include "error.h" +#include "chplcgfns.h" +#include "chpl-comm.h" +#include "chplexit.h" +#include "chpl-locale-model.h" +#include "chpl-mem.h" +#include "chplsys.h" +#include "chpl-linefile-support.h" +#include "chpl-tasks.h" +#include "chpl-atomics.h" +#include "chpl-tasks-callbacks-internal.h" +//#include "tasks-qthreads.h" +#include "tasks-atmi.h" +#include + +#include "qthread.h" +#include "qthread/qtimer.h" +#include "qthread-chapel.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// FIXME: Good idea to recycle the task groups, so this will limit +// the depth of nested coforall s and cobegins. Fix to use an ATMI +// environment variable. +#define max_num_task_groups 8 +#define max_num_cpu_kernels 4096 +int cpu_kernels_initialized[max_num_cpu_kernels] = {0}; +atmi_kernel_t cpu_kernels[max_num_cpu_kernels]; +atmi_kernel_t dummy_kernel; +static uint_least64_t atmi_tg_id = 0; + + +#define OUTPUT_ATMI_STATUS(status, msg) \ +{ \ + if (ATMI_STATUS_SUCCESS != (status)) { \ + fprintf(stderr, "ATMI support: %s failed, error code: 0x%x\n", \ +#msg, status); \ + atmi_finalize(); \ + return status; \ + } \ +} + +//#define SUPPORT_BLOCKREPORT +//#define SUPPORT_TASKREPORT + +#ifdef CHAPEL_PROFILE +# define PROFILE_INCR(counter,count) do { (void)qthread_incr(&counter,count); } while (0) + +/* Tasks */ +static aligned_t profile_task_yield = 0; +static aligned_t profile_task_addToTaskList = 0; +static aligned_t profile_task_executeTasksInList = 0; +static aligned_t profile_task_taskCallFTable = 0; +static aligned_t profile_task_startMovedTask = 0; +static aligned_t profile_task_getId = 0; +static aligned_t profile_task_sleep = 0; +static aligned_t profile_task_getSerial = 0; +static aligned_t profile_task_setSerial = 0; +static aligned_t 
profile_task_getCallStackSize = 0; +/* Sync */ +static aligned_t profile_sync_lock= 0; +static aligned_t profile_sync_unlock= 0; +static aligned_t profile_sync_waitFullAndLock= 0; +static aligned_t profile_sync_waitEmptyAndLock= 0; +static aligned_t profile_sync_markAndSignalFull= 0; +static aligned_t profile_sync_markAndSignalEmpty= 0; +static aligned_t profile_sync_isFull= 0; +static aligned_t profile_sync_initAux= 0; +static aligned_t profile_sync_destroyAux= 0; + +static void profile_print(void) +{ + /* Tasks */ + fprintf(stderr, "task yield: %lu\n", (unsigned long)profile_task_yield); + fprintf(stderr, "task addToTaskList: %lu\n", (unsigned long)profile_task_addToTaskList); + fprintf(stderr, "task executeTasksInList: %lu\n", (unsigned long)profile_task_executeTasksInList); + fprintf(stderr, "task taskCallFTable: %lu\n", (unsigned long)profile_task_taskCallFTable); + fprintf(stderr, "task startMovedTask: %lu\n", (unsigned long)profile_task_startMovedTask); + fprintf(stderr, "task getId: %lu\n", (unsigned long)profile_task_getId); + fprintf(stderr, "task sleep: %lu\n", (unsigned long)profile_task_sleep); + fprintf(stderr, "task getSerial: %lu\n", (unsigned long)profile_task_getSerial); + fprintf(stderr, "task setSerial: %lu\n", (unsigned long)profile_task_setSerial); + fprintf(stderr, "task getCallStackSize: %lu\n", (unsigned long)profile_task_getCallStackSize); + /* Sync */ + fprintf(stderr, "sync lock: %lu\n", (unsigned long)profile_sync_lock); + fprintf(stderr, "sync unlock: %lu\n", (unsigned long)profile_sync_unlock); + fprintf(stderr, "sync waitFullAndLock: %lu\n", (unsigned long)profile_sync_waitFullAndLock); + fprintf(stderr, "sync waitEmptyAndLock: %lu\n", (unsigned long)profile_sync_waitEmptyAndLock); + fprintf(stderr, "sync markAndSignalFull: %lu\n", (unsigned long)profile_sync_markAndSignalFull); + fprintf(stderr, "sync markAndSignalEmpty: %lu\n", (unsigned long)profile_sync_markAndSignalEmpty); + fprintf(stderr, "sync isFull: %lu\n", (unsigned 
long)profile_sync_isFull); + fprintf(stderr, "sync initAux: %lu\n", (unsigned long)profile_sync_initAux); + fprintf(stderr, "sync destroyAux: %lu\n", (unsigned long)profile_sync_destroyAux); +} +#else +# define PROFILE_INCR(counter,count) +#endif /* CHAPEL_PROFILE */ + +// +// Startup and shutdown control. The mutex is used just for the side +// effect of its (very portable) memory fence. +// +volatile int chpl_qthread_done_initializing; +static syncvar_t canexit = SYNCVAR_STATIC_EMPTY_INITIALIZER; +static volatile int done_finalizing; +static pthread_mutex_t done_init_final_mux = PTHREAD_MUTEX_INITIALIZER; + +// Make qt env sizes uniform. Same as qt, but they use the literal everywhere +#define QT_ENV_S 100 + +// aka chpl_task_list_p +struct chpl_task_list { + chpl_fn_p fun; + void *arg; + int32_t filename; + int lineno; + chpl_task_list_p next; +}; + +static aligned_t next_task_id = 1; + +pthread_t chpl_qthread_process_pthread; +pthread_t chpl_qthread_comm_pthread; + +chpl_task_bundle_t chpl_qthread_process_bundle = { + .serial_state = false, + .countRunning = false, + .is_executeOn = false, + .lineno = 0, + .filename = CHPL_FILE_IDX_MAIN_TASK, + .requestedSubloc = c_sublocid_any_val, + .requested_fid = FID_NONE, + .requested_fn = NULL, + .id = chpl_nullTaskID }; + +chpl_task_bundle_t chpl_qthread_comm_task_bundle = { + .serial_state = false, + .countRunning = false, + .is_executeOn = false, + .lineno = 0, + .filename = CHPL_FILE_IDX_COMM_TASK, + .requestedSubloc = c_sublocid_any_val, + .requested_fid = FID_NONE, + .requested_fn = NULL, + .id = chpl_nullTaskID }; + +chpl_atmi_tls_t chpl_qthread_process_tls = { + .bundle = &chpl_qthread_process_bundle, + .lock_filename = 0, + .lock_lineno = 0 }; + +chpl_atmi_tls_t chpl_qthread_comm_task_tls = { + .bundle = &chpl_qthread_comm_task_bundle, + .lock_filename = 0, + .lock_lineno = 0 }; + +// +// chpl_atmi_get_tasklocal() is in tasks-qthreads.h +// + +pthread_key_t tls_cache; + +static syncvar_t exit_ret = 
SYNCVAR_STATIC_EMPTY_INITIALIZER; + +static volatile chpl_bool canCountRunningTasks = false; + +void chpl_task_yield(void) +{ + PROFILE_INCR(profile_task_yield,1); + /*if (qthread_shep() == NO_SHEPHERD) { + sched_yield(); + } else { + qthread_yield(); + }*/ +} + +// Sync variables +void chpl_sync_lock(chpl_sync_aux_t *s) +{ + aligned_t l; + chpl_bool uncontested_lock = true; + + PROFILE_INCR(profile_sync_lock, 1); + + // + // To prevent starvation due to never switching away from a task that is + // spinning while doing readXX() on a sync variable, yield if this sync var + // has a "lot" of uncontested locks. Note that the uncontested locks do not + // have to be consecutive. Also note that the number of uncontested locks + // is a lossy counter. Currently a "lot" is defined as ~100 uncontested + // locks, with care taken to not yield on the first uncontested lock. + // + // If real qthreads sync vars were used, it's possible this wouldn't be + // needed. + // + + l = qthread_incr(&s->lockers_in, 1); + + while (l != s->lockers_out) { + uncontested_lock = false; + qthread_yield(); + } + + if (uncontested_lock) { + if ((++s->uncontested_locks & 0x5F) == 0) { + qthread_yield(); + } + } +} + +void chpl_sync_unlock(chpl_sync_aux_t *s) +{ + PROFILE_INCR(profile_sync_unlock, 1); + + qthread_incr(&s->lockers_out, 1); +} + +inline chpl_atmi_tls_t* chpl_atmi_get_tasklocal(void) +{ + chpl_atmi_tls_t* tls = NULL; + + pthread_t me = pthread_self(); + if (pthread_equal(me, chpl_qthread_comm_pthread)) + tls = &chpl_qthread_comm_task_tls; + else if (pthread_equal(me, chpl_qthread_process_pthread)) + tls = &chpl_qthread_process_tls; + else { + atmi_task_handle_t t = get_atmi_task_handle(); + if(t != ATMI_NULL_TASK_HANDLE) { + tls = (chpl_atmi_tls_t *)pthread_getspecific(tls_cache); + if(tls == NULL) { + // FIXME: when to free? 
+ tls = (chpl_atmi_tls_t *)chpl_mem_alloc(sizeof(chpl_atmi_tls_t), + CHPL_RT_MD_THREAD_PRV_DATA, + 0, 0); + pthread_setspecific(tls_cache, tls); + } + } + assert(tls != NULL); + } + + return tls; +} + +static inline void about_to_block(int32_t lineno, + int32_t filename) +{ + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); + assert(data); + + data->lock_lineno = lineno; + data->lock_filename = filename; +} + +void chpl_sync_waitFullAndLock(chpl_sync_aux_t *s, + int32_t lineno, + int32_t filename) +{ + PROFILE_INCR(profile_sync_waitFullAndLock, 1); + + if (blockreport) { about_to_block(lineno, filename); } + chpl_sync_lock(s); + while (s->is_full == 0) { + chpl_sync_unlock(s); + qthread_syncvar_readFE(NULL, &(s->signal_full)); + chpl_sync_lock(s); + } +} + +void chpl_sync_waitEmptyAndLock(chpl_sync_aux_t *s, + int32_t lineno, + int32_t filename) +{ + PROFILE_INCR(profile_sync_waitEmptyAndLock, 1); + + if (blockreport) { about_to_block(lineno, filename); } + chpl_sync_lock(s); + while (s->is_full != 0) { + chpl_sync_unlock(s); + qthread_syncvar_readFE(NULL, &(s->signal_empty)); + chpl_sync_lock(s); + } +} + +void chpl_sync_markAndSignalFull(chpl_sync_aux_t *s) // and unlock +{ + PROFILE_INCR(profile_sync_markAndSignalFull, 1); + + qthread_syncvar_fill(&(s->signal_full)); + s->is_full = 1; + chpl_sync_unlock(s); +} + +void chpl_sync_markAndSignalEmpty(chpl_sync_aux_t *s) // and unlock +{ + PROFILE_INCR(profile_sync_markAndSignalEmpty, 1); + + qthread_syncvar_fill(&(s->signal_empty)); + s->is_full = 0; + chpl_sync_unlock(s); +} + +chpl_bool chpl_sync_isFull(void *val_ptr, + chpl_sync_aux_t *s) +{ + PROFILE_INCR(profile_sync_isFull, 1); + + return s->is_full; +} + +void chpl_sync_initAux(chpl_sync_aux_t *s) +{ + PROFILE_INCR(profile_sync_initAux, 1); + + s->lockers_in = 0; + s->lockers_out = 0; + s->is_full = 0; + s->signal_empty = SYNCVAR_EMPTY_INITIALIZER; + s->signal_full = SYNCVAR_EMPTY_INITIALIZER; +} + +void chpl_sync_destroyAux(chpl_sync_aux_t *s) +{ + 
PROFILE_INCR(profile_sync_destroyAux, 1); +} + +#ifdef SUPPORT_BLOCKREPORT +static void chapel_display_thread(qt_key_t addr, + qthread_f f, + void *arg, + void *retloc, + unsigned int thread_id, + void *tls, + void *callarg) +{ + chpl_atmi_tls_t *rep = (chpl_atmi_tls_t *)tls; + + if (rep) { + if ((rep->lock_lineno > 0) && rep->lock_filename) { + fprintf(stderr, "Waiting at: %s:%zu (task %s:%zu)\n", + chpl_lookupFilename(rep->lock_filename), rep->lock_lineno, + chpl_lookupFilename(rep->chpl_data.task_filename), + rep->chpl_data.task_lineno); + } else if (rep->lock_lineno == 0 && rep->lock_filename) { + fprintf(stderr, + "Waiting for more work (line 0? file:%s) (task %s:%zu)\n", + chpl_lookupFilename(rep->lock_filename), + chpl_lookupFilename(rep->chpl_data.task_filename), + rep->chpl_data.task_lineno); + } else if (rep->lock_lineno == 0) { + fprintf(stderr, + "Waiting for dependencies (uninitialized task %s:%zu)\n", + chpl_lookupFilename(rep->chpl_data.task_filename), + rep->chpl_data.task_lineno); + } + fflush(stderr); + } +} + +static void report_locked_threads(void) +{ + qthread_feb_callback(chapel_display_thread, NULL); + qthread_syncvar_callback(chapel_display_thread, NULL); +} +#endif // SUPPORT_BLOCKREPORT + +static void SIGINT_handler(int sig) +{ + signal(sig, SIG_IGN); + + if (blockreport) { +#ifdef SUPPORT_BLOCKREPORT + report_locked_threads(); +#else + fprintf(stderr, + "Blockreport is currently unsupported by the qthreads " + "tasking layer.\n"); +#endif + } + + if (taskreport) { +#ifdef SUPPORT_TASKREPORT + report_all_tasks(); +#else + fprintf(stderr, "Taskreport is currently unsupported by the qthreads tasking layer.\n"); +#endif + } + + chpl_exit_any(1); +} + +// Tasks + +static void *initializer(void *junk) +{ + qthread_initialize(); + (void) pthread_mutex_lock(&done_init_final_mux); // implicit memory fence + chpl_qthread_done_initializing = 1; + (void) pthread_mutex_unlock(&done_init_final_mux); + + qthread_syncvar_readFF(NULL, &canexit); + + 
qthread_finalize(); + (void) pthread_mutex_lock(&done_init_final_mux); // implicit memory fence + done_finalizing = 1; + (void) pthread_mutex_unlock(&done_init_final_mux); + + return NULL; +} + +// +// Qthreads environment helper functions. +// + +static char* chpl_qt_getenv_str(const char* var) { + char name[100]; + char* ev; + + snprintf(name, sizeof(name), "QT_%s", var); + if ((ev = getenv(name)) == NULL) { + snprintf(name, sizeof(name), "QTHREAD_%s", var); + ev = getenv(name); + } + + return ev; +} + +static unsigned long int chpl_qt_getenv_num(const char* var, + unsigned long int default_val) { + char* ev; + unsigned long int ret_val = default_val; + + if ((ev = chpl_qt_getenv_str(var)) != NULL) { + unsigned long int val; + if (sscanf(ev, "%lu", &val) == 1) + ret_val = val; + } + + return ret_val; +} + +// Helper function to set a qthreads env var. This is meant to mirror setenv +// functionality, but qthreads has two environment variables for every setting: +// a QT_ and a QTHREAD_ version. We often forget to think about both so this +// wraps the overriding logic. In verbose mode it prints out if we overrode +// values, or if we were prevented from setting values because they existed +// (and override was 0.) +static void chpl_qt_setenv(const char* var, const char* val, + int32_t override) { + char qt_env[QT_ENV_S] = { 0 }; + char qthread_env[QT_ENV_S] = { 0 }; + char *qt_val; + char *qthread_val; + chpl_bool eitherSet = false; + + strncpy(qt_env, "QT_", sizeof(qt_env)); + strncat(qt_env, var, sizeof(qt_env) - 1); + + strncpy(qthread_env, "QTHREAD_", sizeof(qthread_env)); + strncat(qthread_env, var, sizeof(qthread_env) - 1); + + qt_val = getenv(qt_env); + qthread_val = getenv(qthread_env); + eitherSet = (qt_val != NULL || qthread_val != NULL); + + if (override || !eitherSet) { + if (verbosity >= 2 + && override + && eitherSet + && strcmp(val, (qt_val != NULL) ? 
qt_val : qthread_val) != 0) { + printf("QTHREADS: Overriding the value of %s and %s " + "with %s\n", qt_env, qthread_env, val); + } + (void) setenv(qt_env, val, 1); + (void) setenv(qthread_env, val, 1); + } else if (verbosity >= 2) { + char* set_env = NULL; + char* set_val = NULL; + if (qt_val != NULL) { + set_env = qt_env; + set_val = qt_val; + } else { + set_env = qthread_env; + set_val = qthread_val; + } + if (strcmp(val, set_val) != 0) + printf("QTHREADS: Not setting %s to %s because %s is set to %s and " + "overriding was not requested\n", qt_env, val, set_env, set_val); + } +} + +static void chpl_qt_unsetenv(const char* var) { + char name[100]; + + snprintf(name, sizeof(name), "QT_%s", var); + (void) unsetenv(name); + + snprintf(name, sizeof(name), "QTHREAD_%s", var); + (void) unsetenv(name); +} + +// Determine the number of workers based on environment settings. If a user set +// HWPAR, they are saying they want to use HWPAR many workers, but let the +// runtime figure out the details. If they explicitly set NUM_SHEPHERDS and/or +// NUM_WORKERS_PER_SHEPHERD then they must have specific reasons for doing so. +// Returns 0 if no Qthreads env vars related to the number of threads were set, +// what HWPAR was set to if it was set, or -1 if NUM_SHEP and/or NUM_WPS were +// set since we can't figure out before Qthreads init what this will actually +// turn into without duplicating Qthreads logic (-1 is a sentinel for don't +// adjust the values, and give them as is to Qthreads.) 
+static int32_t chpl_qt_getenv_num_workers(void) { + int32_t hwpar; + int32_t num_wps; + int32_t num_sheps; + + hwpar = (int32_t) chpl_qt_getenv_num("HWPAR", 0); + num_wps = (int32_t) chpl_qt_getenv_num("NUM_WORKERS_PER_SHEPHERD", 0); + num_sheps = (int32_t) chpl_qt_getenv_num("NUM_SHEPHERDS", 0); + + if (hwpar) { + return hwpar; + } else if (num_wps || num_sheps) { + return -1; + } + + return 0; +} + + +// Sets up and returns the amount of hardware parallelism to use, limited to +// maxThreads. Returns -1 if we did not setup parallelism because a user +// explicitly requested a specific layout from qthreads. +static int32_t setupAvailableParallelism(int32_t maxThreads) { + int32_t numThreadsPerLocale; + int32_t qtEnvThreads; + int32_t hwpar; + char newenv_workers[QT_ENV_S] = { 0 }; + + // Experience has shown that Qthreads generally performs best with + // num_workers = numCores (and thus worker_unit = core) but if the user has + // explicitly requested more threads through the chapel or Qthread env + // vars, we override the default. + numThreadsPerLocale = chpl_task_getenvNumThreadsPerLocale(); + qtEnvThreads = chpl_qt_getenv_num_workers(); + hwpar = 0; + + // User set chapel level env var (CHPL_RT_NUM_THREADS_PER_LOCALE) + // This is limited to the number of logical CPUs on the node. + if (numThreadsPerLocale != 0) { + int32_t numPUsPerLocale; + + hwpar = numThreadsPerLocale; + + numPUsPerLocale = chpl_getNumLogicalCpus(true); + if (0 < numPUsPerLocale && numPUsPerLocale < hwpar) { + if (verbosity > 0) { + printf("QTHREADS: Reduced numThreadsPerLocale=%d to %d " + "to prevent oversubscription of the system.\n", + hwpar, numPUsPerLocale); + } + + // Do not oversubscribe the system, use all available resources. 
+ hwpar = numPUsPerLocale; + } + } + // User set qthreads level env var + // (HWPAR or (NUM_SHEPHERDS and NUM_WORKERS_PER_SHEPHERD)) + else if (qtEnvThreads != 0) { + hwpar = qtEnvThreads; + } + // User did not set chapel or qthreads vars -- our default + else { + hwpar = chpl_getNumPhysicalCpus(true); + } + + // hwpar will only be <= 0 if the user set QT_NUM_SHEPHERDS and/or + // QT_NUM_WORKERS_PER_SHEPHERD in which case we assume as "expert" user and + // don't impose any thread limits or set worker_unit. + if (hwpar > 0) { + // Limit the parallelism to the maximum imposed by the comm layer. + if (0 < maxThreads && maxThreads < hwpar) { + hwpar = maxThreads; + } + + // If there is more parallelism requested than the number of cores, set the + // worker unit to pu, otherwise core. + if (hwpar > chpl_getNumPhysicalCpus(true)) { + chpl_qt_setenv("WORKER_UNIT", "pu", 0); + } else { + chpl_qt_setenv("WORKER_UNIT", "core", 0); + } + + // Unset relevant Qthreads environment variables. + chpl_qt_unsetenv("HWPAR"); + chpl_qt_unsetenv("NUM_SHEPHERDS"); + chpl_qt_unsetenv("NUM_WORKERS_PER_SHEPHERD"); + + snprintf(newenv_workers, sizeof(newenv_workers), "%i", (int)hwpar); + if (CHPL_QTHREAD_SCHEDULER_ONE_WORKER_PER_SHEPHERD) { + chpl_qt_setenv("NUM_SHEPHERDS", newenv_workers, 1); + chpl_qt_setenv("NUM_WORKERS_PER_SHEPHERD", "1", 1); + } else { + chpl_qt_setenv("HWPAR", newenv_workers, 1); + } + } + return hwpar; +} + +static void setupCallStacks(int32_t hwpar) { + size_t callStackSize; + + // If the user compiled with no stack checks (either explicitly or + // implicitly) turn off qthread guard pages. TODO there should also be a + // chpl level env var backing this at runtime (can be the same var.) + // Also turn off guard pages if the heap page size isn't the same as + // the system page size, because when that's the case we can reliably + // make the guard pages un-referenceable. 
(This typically arises when
+ // the heap is on hugepages, as is often the case on Cray systems.)
+ //
+ // Note that we won't override an explicit setting of QT_GUARD_PAGES
+ // in the former case, but we do in the latter case.
+ if (CHPL_STACK_CHECKS == 0) {
+ chpl_qt_setenv("GUARD_PAGES", "false", 0);
+ }
+ else if (chpl_getHeapPageSize() != chpl_getSysPageSize()) {
+ chpl_qt_setenv("GUARD_PAGES", "false", 1);
+ }
+
+ // Precedence (high-to-low):
+ // 1) Chapel environment (CHPL_RT_CALL_STACK_SIZE)
+ // 2) QTHREAD_STACK_SIZE
+ // 3) Chapel default
+ if ((callStackSize = chpl_task_getEnvCallStackSize()) > 0 ||
+ (chpl_qt_getenv_num("STACK_SIZE", 0) == 0 &&
+ (callStackSize = chpl_task_getDefaultCallStackSize()) > 0)) {
+ char newenv_stack[QT_ENV_S];
+ snprintf(newenv_stack, sizeof(newenv_stack), "%zu", callStackSize);
+ chpl_qt_setenv("STACK_SIZE", newenv_stack, 1);
+
+ // Qthreads sets up memory pools expecting the item_size to be small.
+ // Stacks are allocated in this manner too, but our default stack size
+ // is quite large, so we limit the max memory allocated for a pool. We
+ // default to a multiple of callStackSize and hwpar, with the thought
+ // that available memory is generally proportional to the amount of
+ // parallelism. For some architectures, this isn't true so we set a max
+ // upper bound. And if the callStackSize is small, we don't want to
+ // limit all qthreads pool allocations to a small value, so we have a
+ // lower bound as well. Note that qthread stacks are slightly larger
+ // than specified to store a book keeping structure and possibly guard
+ // pages, so we throw in an extra MB. 
+ if (hwpar > 0) { + const size_t oneMB = 1024 * 1024; + const size_t allocSizeLowerBound = 33 * oneMB; + const size_t allocSizeUpperBound = 513 * oneMB; + size_t maxPoolAllocSize; + char newenv_alloc[QT_ENV_S]; + + maxPoolAllocSize = 2 * hwpar * callStackSize + oneMB; + if (maxPoolAllocSize < allocSizeLowerBound) { + maxPoolAllocSize = allocSizeLowerBound; + } else if (maxPoolAllocSize > allocSizeUpperBound) { + maxPoolAllocSize = allocSizeUpperBound; + } + snprintf(newenv_alloc, sizeof(newenv_alloc), "%zu", maxPoolAllocSize); + chpl_qt_setenv("MAX_POOL_ALLOC_SIZE", newenv_alloc, 0); + } + } +} + +static void setupTasklocalStorage(void) { + unsigned long int tasklocal_size; + char newenv[QT_ENV_S]; + + // Make sure Qthreads knows how much space we need for per-task + // local storage. + tasklocal_size = chpl_qt_getenv_num("TASKLOCAL_SIZE", 0); + if (tasklocal_size < sizeof(chpl_atmi_tls_t)) { + snprintf(newenv, sizeof(newenv), "%zu", sizeof(chpl_atmi_tls_t)); + chpl_qt_setenv("TASKLOCAL_SIZE", newenv, 1); + } +} + +static void setupWorkStealing(void) { + // In our experience the current work stealing implementation hurts + // performance, so disable it. Note that we don't override, so a user could + // try working stealing out by setting {QT,QTHREAD}_STEAL_RATIO. Also note + // that not all schedulers support work stealing, but it doesn't hurt to + // set this env var for those configs anyways. + chpl_qt_setenv("STEAL_RATIO", "0", 0); +} + +static aligned_t chapel_wrapper(void *arg); +static aligned_t main_wrapper(void *arg); +static aligned_t dummy_wrapper(); +// If we stored chpl_taskID_t in chpl_task_bundleData_t, +// this struct and the following function may not be necessary. 
+typedef void (*main_ptr_t)(void);
+typedef struct {
+ chpl_task_bundle_t arg;
+ main_ptr_t chpl_main;
+} main_wrapper_bundle_t;
+
+void chpl_task_init(void)
+{
+ atmi_status_t st = atmi_init(ATMI_DEVTYPE_ALL);
+ if(st != ATMI_STATUS_SUCCESS) return;
+
+ g_machine = atmi_machine_get_info();
+
+ pthread_key_create(&tls_cache, NULL);
+ //g_num_cpu_kernels = sizeof(chpl_ftable)/sizeof(chpl_ftable[0]);
+ size_t main_arg_size = sizeof(main_wrapper_bundle_t);
+ atmi_kernel_create_empty(&main_kernel, 1, &main_arg_size);
+ atmi_kernel_add_cpu_impl(main_kernel, (atmi_generic_fp)main_wrapper, CPU_FUNCTION_IMPL);
+ atmi_kernel_create_empty(&dummy_kernel, 0, NULL);
+ atmi_kernel_add_cpu_impl(dummy_kernel, (atmi_generic_fp)dummy_wrapper, CPU_FUNCTION_IMPL);
+
+ // this increment is needed to let the main task not be treated as an ATMI task
+ // because we directly launch the main task with the main thread and don't treat
+ // it as an ATMI task. This increment fools the rest of the runtime into thinking that
+ // the main task is an ATMI task, but it is in fact not an ATMI task. 
+ int next_id = atomic_fetch_add_explicit_uint_least64_t(&atmi_tg_id, 1, memory_order_relaxed); + atmi_task_handle_t dummy_handle = atmi_task_create(dummy_kernel); + int32_t commMaxThreads; + int32_t hwpar; + pthread_t initer; + pthread_attr_t pAttr; + + chpl_qthread_process_pthread = pthread_self(); + chpl_qthread_process_bundle.id = qthread_incr(&next_task_id, 1); + + commMaxThreads = chpl_comm_getMaxThreads(); + + // Set up hardware parallelism, the stack size and stack guards, + // tasklocal storage, and work stealing + hwpar = setupAvailableParallelism(commMaxThreads); + setupCallStacks(hwpar); + setupTasklocalStorage(); + setupWorkStealing(); + + if (verbosity >= 2) { chpl_qt_setenv("INFO", "1", 0); } + + // Initialize qthreads + pthread_attr_init(&pAttr); + pthread_attr_setdetachstate(&pAttr, PTHREAD_CREATE_DETACHED); + pthread_create(&initer, &pAttr, initializer, NULL); + while (chpl_qthread_done_initializing == 0) + sched_yield(); + + // Now that Qthreads is up and running, do a sanity check and make sure + // that the number of workers is less than any comm layer limit. 
This is + // mainly need for the case where a user set QT_NUM_SHEPHERDS and/or + // QT_NUM_WORKERS_PER_SHEPHERD in which case we don't impose any limits on + // the number of threads qthreads creates beforehand + assert(0 == commMaxThreads || qthread_num_workers() < commMaxThreads); + + if (blockreport || taskreport) { + if (signal(SIGINT, SIGINT_handler) == SIG_ERR) { + perror("Could not register SIGINT handler"); + } + } +} + +void chpl_task_exit(void) +{ + //for(int i = 0; i < g_num_cpu_kernels; i++) { + for(int i = 0; i < max_num_cpu_kernels; i++) { + if(cpu_kernels_initialized[i]) + atmi_kernel_release(cpu_kernels[i]); + } + atmi_kernel_release(dummy_kernel); + atmi_kernel_release(main_kernel); + atmi_finalize(); +#ifdef CHAPEL_PROFILE + profile_print(); +#endif /* CHAPEL_PROFILE */ + + pthread_key_delete(tls_cache); + if (qthread_shep() == NO_SHEPHERD) { + /* sometimes, tasking is told to shutdown even though it hasn't been + * told to start yet */ + if (chpl_qthread_done_initializing == 1) { + qthread_syncvar_fill(&canexit); + while (done_finalizing == 0) + sched_yield(); + } + } else { + qthread_syncvar_fill(&exit_ret); + } +} + +static inline void wrap_callbacks(chpl_task_cb_event_kind_t event_kind, + chpl_task_bundle_t* bundle) { + if (chpl_task_have_callbacks(event_kind)) { + if (bundle->id == chpl_nullTaskID) + bundle->id = qthread_incr(&next_task_id, 1); + chpl_task_do_callbacks(event_kind, + bundle->requested_fid, + bundle->filename, + bundle->lineno, + bundle->id, + bundle->is_executeOn); + } +} + + +static aligned_t dummy_wrapper() {} + +static aligned_t main_wrapper(void *arg) +{ + chpl_atmi_tls_t *tls = chpl_atmi_get_tasklocal(); + main_wrapper_bundle_t *m_bundle = (main_wrapper_bundle_t*) arg; + chpl_task_bundle_t *bundle = &m_bundle->arg; + chpl_atmi_tls_t pv = {.bundle = bundle}; + + *tls = pv; + + // main call doesn't need countRunning / chpl_taskRunningCntInc + + wrap_callbacks(chpl_task_cb_event_kind_begin, bundle); + + 
(m_bundle->chpl_main)(); + + wrap_callbacks(chpl_task_cb_event_kind_end, bundle); + + return 0; +} + +static aligned_t chapel_wrapper(void *arg) +{ + chpl_atmi_tls_t *tls = chpl_atmi_get_tasklocal(); + chpl_task_bundle_t *bundle = (chpl_task_bundle_t*) arg; + chpl_atmi_tls_t pv = {.bundle = bundle}; + + *tls = pv; + + if (bundle->countRunning) + chpl_taskRunningCntInc(0, 0); + + wrap_callbacks(chpl_task_cb_event_kind_begin, bundle); + + // launch GPU kernel here? + (bundle->requested_fn)(arg); + + wrap_callbacks(chpl_task_cb_event_kind_end, bundle); + + if (bundle->countRunning) + chpl_taskRunningCntDec(0, 0); + + return 0; +} + +typedef struct { + chpl_fn_p fn; + void *arg; +} comm_task_wrapper_info_t; + +static void *comm_task_wrapper(void *arg) +{ + comm_task_wrapper_info_t *rarg = arg; + chpl_moveToLastCPU(); + (*(chpl_fn_p)(rarg->fn))(rarg->arg); + return 0; +} + +// Start the main task. +// +// Warning: this method is not called within a Qthread task context. Do +// not use methods that require task context (e.g., task-local storage). 
+void chpl_task_callMain(void (*chpl_main)(void)) +{ + main_wrapper_bundle_t arg; + + arg.arg.serial_state = false; + arg.arg.countRunning = false; + arg.arg.is_executeOn = false; + arg.arg.requestedSubloc = c_sublocid_any_val; + arg.arg.requested_fid = FID_NONE; + arg.arg.requested_fn = NULL; + arg.arg.requested_fn_name = NULL; + arg.arg.lineno = 0; + arg.arg.filename = CHPL_FILE_IDX_MAIN_TASK; + arg.arg.id = chpl_qthread_process_bundle.id; + arg.chpl_main = chpl_main; + + wrap_callbacks(chpl_task_cb_event_kind_create, &arg.arg); + + int cpu_id = 0;//subloc; + ATMI_LPARM_CPU(lparm, cpu_id); + lparm->kernel_id = CPU_FUNCTION_IMPL; + lparm->synchronous = ATMI_TRUE; + void *kernargs[1]; + //void *arg1 = (void *)&arg; + kernargs[0] = (void *)&arg; + //atmi_task_launch(lparm, main_kernel, kernargs); + + main_wrapper(&arg); + //qthread_fork_syncvar_copyargs(main_wrapper, &arg, sizeof(arg), &exit_ret); + //qthread_syncvar_readFF(NULL, &exit_ret); +} + +void chpl_task_stdModulesInitialized(void) +{ + // + // It's not safe to call the module code to count the main task as + // running until after the modules have been initialized. That's + // when this function is called, so now count the main task. + // + canCountRunningTasks = true; + chpl_taskRunningCntInc(0, 0); +} + +int chpl_task_createCommTask(chpl_fn_p fn, + void *arg) +{ + // + // The wrapper info must be static because it won't be referred to + // until the new pthread calls comm_task_wrapper(). And, it is + // safe for it to be static because we will be called at most once + // on each node. 
+ // + static + comm_task_wrapper_info_t wrapper_info; + wrapper_info.fn = fn; + wrapper_info.arg = arg; + return pthread_create(&chpl_qthread_comm_pthread, + NULL, comm_task_wrapper, &wrapper_info); +} + +void *chpl_taskGroupGet() { + void *ret = (void *)get_atmi_task_group(); + return ret; +} + +void *chpl_taskGroupInit(int lineno, int32_t filename) { + atmi_task_group_t *tg = (atmi_task_group_t *)chpl_malloc(sizeof(atmi_task_group_t)); + // TODO: add to a list of task groups and free all of them at the very end. + int next_id = atomic_fetch_add_explicit_uint_least64_t(&atmi_tg_id, 1, memory_order_relaxed); + // loop around the task groups. + // FIXME: how will this affect the main task that is not an ATMI task? Incr by 1 after this? + next_id %= max_num_task_groups; + tg->id = next_id; + tg->ordered = ATMI_FALSE; + return tg; +} + +void chpl_taskGroupComplete() { + // no-op in ATMI. down count in other tasking layers +} + +void chpl_taskGroupFinalize(void *tg) { + int cpu_id = 0; // which subloc? 
+ atmi_task_group_t *cur_tg = get_atmi_task_group(); + + if(cur_tg) { + // if I am within a task, provide a chain dependency to the incoming group + ATMI_LPARM_CPU(lparm, cpu_id); + lparm->kernel_id = CPU_FUNCTION_IMPL; + lparm->groupable = ATMI_TRUE; + lparm->group = cur_tg; + lparm->num_required_groups = 1; + lparm->required_groups = (atmi_task_group_t **)&tg; + atmi_task_launch(lparm, dummy_kernel, NULL); + } + else { + // if I am not within a task (main task), simply sync + atmi_task_group_sync((atmi_task_group_t *)tg); + } + //chpl_free(tg); +} + +void chpl_task_addToTaskList(chpl_fn_int_t fid, + chpl_task_bundle_t *arg, + size_t arg_size, + c_sublocid_t subloc, + void **task_list, + int32_t task_list_locale, + chpl_bool is_begin_stmt, + void *task_group, + int lineno, + int32_t filename) +{ + //printf("Adding %d fn to task list\n", fid); + chpl_bool serial_state = chpl_task_getSerial(); + chpl_fn_p requested_fn = chpl_ftable[fid]; + + assert(subloc != c_sublocid_none); + + PROFILE_INCR(profile_task_addToTaskList,1); + + if (serial_state) { + // call the function directly. 
+ requested_fn(arg); + } else { + arg->serial_state = false; + arg->countRunning = false; + arg->is_executeOn = false; + arg->requestedSubloc = subloc; + arg->requested_fid = fid; + arg->requested_fn = requested_fn; + arg->requested_fn_name = NULL; + arg->lineno = lineno; + arg->filename = filename; + arg->id = chpl_nullTaskID; + + wrap_callbacks(chpl_task_cb_event_kind_create, arg); + + int cpu_id = 0;//subloc; + ATMI_LPARM_CPU(lparm, cpu_id); + lparm->kernel_id = CPU_FUNCTION_IMPL; + //lparm->synchronous = ATMI_TRUE; + lparm->groupable = ATMI_TRUE; + if(task_group) { + lparm->group = (atmi_task_group_t *)task_group; + } + + void *kernargs[1]; + if(!cpu_kernels_initialized[fid]) { + atmi_kernel_create_empty(&cpu_kernels[fid], 1, &arg_size); + atmi_kernel_add_cpu_impl(cpu_kernels[fid], (atmi_generic_fp)chapel_wrapper, CPU_FUNCTION_IMPL); + cpu_kernels_initialized[fid] = 1; + } + kernargs[0] = (void *)arg; + atmi_task_launch(lparm, cpu_kernels[fid], kernargs); + /*if (subloc == c_sublocid_any) { + qthread_fork_copyargs(chapel_wrapper, arg, arg_size, NULL); + } else { + qthread_fork_copyargs_to(chapel_wrapper, arg, arg_size, + NULL, (qthread_shepherd_id_t) subloc); + }*/ + } +} + +void chpl_task_executeTasksInList(void **task_list) +{ + PROFILE_INCR(profile_task_executeTasksInList,1); +} + +static inline void taskCallBody(chpl_fn_int_t fid, chpl_fn_name fname, chpl_fn_p fp, + void *arg, size_t arg_size, + c_sublocid_t subloc, chpl_bool serial_state, + int lineno, int32_t filename) +{ + chpl_task_bundle_t *bundle = (chpl_task_bundle_t*) arg; + + bundle->serial_state = serial_state; + bundle->countRunning = canCountRunningTasks; + bundle->is_executeOn = true; + bundle->requestedSubloc = subloc; + bundle->requested_fid = fid; + bundle->requested_fn = fp; + bundle->requested_fn_name = fname; + bundle->lineno = lineno; + bundle->filename = filename; + bundle->id = chpl_nullTaskID; + + wrap_callbacks(chpl_task_cb_event_kind_create, bundle); + + int cpu_id = 0;//subloc; + 
ATMI_LPARM_CPU(lparm, cpu_id); + lparm->kernel_id = CPU_FUNCTION_IMPL; + //lparm->synchronous = ATMI_TRUE; + void *kernargs[1]; + if(!cpu_kernels_initialized[fid]) { + atmi_kernel_create_empty(&cpu_kernels[fid], 1, &arg_size); + atmi_kernel_add_cpu_impl(cpu_kernels[fid], (atmi_generic_fp)chapel_wrapper, CPU_FUNCTION_IMPL); + cpu_kernels_initialized[fid] = 1; + } + kernargs[0] = (void *)arg; + atmi_task_launch(lparm, cpu_kernels[fid], kernargs); + /*if (subloc < 0) { + qthread_fork_copyargs(chapel_wrapper, arg, arg_size, NULL); + } else { + qthread_fork_copyargs_to(chapel_wrapper, arg, arg_size, + NULL, (qthread_shepherd_id_t) 0); + }*/ +} + +void chpl_task_taskCallFTable(chpl_fn_int_t fid, + chpl_task_bundle_t *arg, size_t arg_size, + c_sublocid_t subloc, + int lineno, int32_t filename) +{ + PROFILE_INCR(profile_task_taskCall,1); + + taskCallBody(fid, NULL, chpl_ftable[fid], arg, arg_size, subloc, false, lineno, filename); +} + +void chpl_task_startMovedTask(chpl_fn_int_t fid, + chpl_fn_p fp, + chpl_task_bundle_t *arg, + size_t arg_size, + c_sublocid_t subloc, + chpl_taskID_t id, + chpl_bool serial_state) +{ + // + // For now the incoming task ID is simply dropped, though we check + // to make sure the caller wasn't expecting us to do otherwise. If + // we someday make task IDs global we will need to be able to set + // the ID of this moved task. + // + assert(id == chpl_nullTaskID); + + PROFILE_INCR(profile_task_startMovedTask,1); + + taskCallBody(fid, NULL, fp, arg, arg_size, subloc, serial_state, 0, CHPL_FILE_IDX_UNKNOWN); +} + +// +// chpl_task_getSubloc() is in tasks-qthreads.h +// + +// +// chpl_task_setSubloc() is in tasks-qthreads.h +// + +// +// chpl_task_getRequestedSubloc() is in tasks-qthreads.h +// + + +// Returns '(unsigned int)-1' if called outside of the tasking layer. 
+chpl_taskID_t chpl_task_getId(void) +{ + PROFILE_INCR(profile_task_getId,1); + + atmi_task_handle_t t = get_atmi_task_handle(); + // Rationale to return 0 instead of chpl_nullTaskID: ATMI returns -1 as the NULL + // task, but we maintain 0 as the null task in chpl-atmi. + // Impl: chpl-atmi does not create a new task for the main wrapper. Instead it + // creates a dummy ATMI task in the beginning but does not launch/activate + // it. This helps us recognize 0 as the main/dummy task and not -1 that is + // expected by ATMI as the NULL task. + // FIXME: this could go against the chpl design of having *every* task being + // launched, including the main wrapper task. Any problems with that? + if(t == ATMI_NULL_TASK_HANDLE) + return (chpl_taskID_t) 0; + + return t; +} + +void chpl_task_sleep(double secs) +{ + if (qthread_shep() == NO_SHEPHERD) { + struct timeval deadline; + struct timeval now; + + // + // Figure out when this task can proceed again, and until then, keep + // yielding. + // + gettimeofday(&deadline, NULL); + deadline.tv_usec += (suseconds_t) lround((secs - trunc(secs)) * 1.0e6); + if (deadline.tv_usec > 1000000) { + deadline.tv_sec++; + deadline.tv_usec -= 1000000; + } + deadline.tv_sec += (time_t) trunc(secs); + + do { + chpl_task_yield(); + gettimeofday(&now, NULL); + } while (now.tv_sec < deadline.tv_sec + || (now.tv_sec == deadline.tv_sec + && now.tv_usec < deadline.tv_usec)); + } else { + qtimer_t t = qtimer_create(); + qtimer_start(t); + do { + qthread_yield(); + qtimer_stop(t); + } while (qtimer_secs(t) < secs); + qtimer_destroy(t); + } +} + +/* The get- and setSerial() methods assume the beginning of the task-local + * data segment holds a chpl_bool denoting the serial state. 
*/ +#if 0 +chpl_bool get_serial(chpl_taskID_t id) { + if(g_task_map.find(id) == g_task_map.end()) + return true; + else + return g_task_map[id]; +} +#endif + +chpl_bool chpl_task_getSerial(void) +{ + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); + + PROFILE_INCR(profile_task_getSerial,1); + + return data->bundle->serial_state; +} + +void chpl_task_setSerial(chpl_bool state) +{ + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); + data->bundle->serial_state = state; + + PROFILE_INCR(profile_task_setSerial,1); +} + +uint32_t chpl_task_getMaxPar(void) { + //chpl_internal_error("Qthreads max tasks par asked\n"); + //printf("Qthreads max task par asked\n"); + // + // We assume here that the caller (in the LocaleModel module code) + // is interested in the number of workers on the whole node, and + // will decide itself how much parallelism to create across and + // within sublocales, if there are any. + // + return qthread_num_workers(); + //return (uint32_t) g_machine->devices_by_type[ATMI_DEVTYPE_CPU][0].core_count; +} + +c_sublocid_t chpl_task_getNumSublocales(void) +{ + // FIXME: What we really want here is the number of NUMA + // sublocales we are supporting. For now we use the number of + // shepherds as a proxy for that. + // return (c_sublocid_t) qthread_num_shepherds(); + return (c_sublocid_t) g_machine->device_count_by_type[ATMI_DEVTYPE_ALL]; +} + +size_t chpl_task_getCallStackSize(void) +{ + PROFILE_INCR(profile_task_getCallStackSize,1); + + return qthread_readstate(STACK_SIZE); +} + +// XXX: Should probably reflect all shepherds +uint32_t chpl_task_getNumQueuedTasks(void) +{ + return qthread_readstate(NODE_BUSYNESS); +} + +uint32_t chpl_task_getNumRunningTasks(void) +{ + chpl_internal_error("chpl_task_getNumRunningTasks() called"); + return 1; +} + +int32_t chpl_task_getNumBlockedTasks(void) +{ + // This isn't accurate, but in the absence of better information + // it's the best we can do. 
+ return 0; +} + +// Threads + +uint32_t chpl_task_getNumThreads(void) +{ + return qthread_num_workers(); + //return (uint32_t) g_machine->devices_by_type[ATMI_DEVTYPE_CPU][0].core_count; +} + +// Ew. Talk about excessive bookkeeping. +uint32_t chpl_task_getNumIdleThreads(void) +{ + return 0; +} + +/* vim:set expandtab: */ diff --git a/util/printchplenv b/util/printchplenv index e66b1d2694..5ad79dff86 100755 --- a/util/printchplenv +++ b/util/printchplenv @@ -175,6 +175,8 @@ def print_mode(mode='list', anonymize=False): mode, filters=('runtime',)) if tasks == 'qthreads': link_args_3p.extend(chpl_3p_qthreads_configs.get_link_args()) + if tasks == 'atmi': + link_args_3p.extend(chpl_3p_qthreads_configs.get_link_args()) print_var(' CHPL_RE2_UNIQ_CFG_PATH', chpl_3p_re2_configs.get_uniq_cfg_path(), diff --git a/util/setchplenv_hsa.bash b/util/setchplenv_hsa.bash index cfd76b1c41..0dee283cc9 100644 --- a/util/setchplenv_hsa.bash +++ b/util/setchplenv_hsa.bash @@ -61,8 +61,8 @@ export CHPL_COMM=none echo " to none" echo -n "Setting CHPL_TASKS" -export CHPL_TASKS=qthreads -echo " to qthreads" +export CHPL_TASKS=atmi +echo " to atmi" echo -n "Setting CHPL_ATOMICS" export CHPL_ATOMICS=intrinsics @@ -92,6 +92,10 @@ echo -n "Disabling NUMA" export CHPL_HWLOC_CFG_OPTIONS=" --disable-libnuma" echo " done" +echo -n "Setting CHPL_HWLOC" +export CHPL_HWLOC=hwloc +echo " to hwloc" + echo -n "Disabling LLVM support" export CHPL_LLVM=none echo " done" @@ -105,6 +109,10 @@ if [ "$1" == "debug" ]; then export CHPL_DEBUG=1 fi +echo -n "Setting CHPL_ROCM" +export CHPL_ROCM=1 +echo " to 1" + echo -n "Setting CHPL_LOCALE_MODEL" export CHPL_LOCALE_MODEL=hsa echo " to hsa"