diff --git a/compiler/AST/build.cpp b/compiler/AST/build.cpp index 6bacf2f15e..b9412aba95 100644 --- a/compiler/AST/build.cpp +++ b/compiler/AST/build.cpp @@ -1609,12 +1609,21 @@ BlockStmt* buildCoforallLoopStmt(Expr* indices, beginBlk->blockInfoSet(new CallExpr(PRIM_BLOCK_COFORALL)); addByrefVars(beginBlk, byref_vars); beginBlk->insertAtHead(body); +#ifndef TARGET_HSA beginBlk->insertAtTail(new CallExpr("_downEndCount", coforallCount)); +#else + beginBlk->insertAtTail(new CallExpr("_completeTaskGroup", coforallCount)); +#endif BlockStmt* block = ForLoop::buildForLoop(indices, new SymExpr(tmpIter), beginBlk, true, zippered); block->insertAtHead(new CallExpr(PRIM_MOVE, coforallCount, new CallExpr("_endCountAlloc", /*forceLocalTypes=*/gTrue))); block->insertAtHead(new DefExpr(coforallCount)); +#ifndef TARGET_HSA beginBlk->insertBefore(new CallExpr("_upEndCount", coforallCount)); block->insertAtTail(new CallExpr("_waitEndCount", coforallCount)); +#else + beginBlk->insertBefore(new CallExpr("_initTaskGroup", coforallCount)); + block->insertAtTail(new CallExpr("_finalizeTaskGroup", coforallCount)); +#endif block->insertAtTail(new CallExpr("_endCountFree", coforallCount)); nonVectorCoforallBlk->insertAtTail(block); } @@ -1624,15 +1633,27 @@ BlockStmt* buildCoforallLoopStmt(Expr* indices, beginBlk->blockInfoSet(new CallExpr(PRIM_BLOCK_COFORALL)); addByrefVars(beginBlk, byref_vars); beginBlk->insertAtHead(body->copy()); +#ifndef TARGET_HSA beginBlk->insertAtTail(new CallExpr("_downEndCount", coforallCount)); +#else + beginBlk->insertAtTail(new CallExpr("_completeTaskGroup", coforallCount)); +#endif VarSymbol* numTasks = newTemp("numTasks"); vectorCoforallBlk->insertAtTail(new DefExpr(numTasks)); vectorCoforallBlk->insertAtTail(new CallExpr(PRIM_MOVE, numTasks, new CallExpr(".", tmpIter, new_CStringSymbol("size")))); +#ifndef TARGET_HSA vectorCoforallBlk->insertAtTail(new CallExpr("_upEndCount", coforallCount, /*countRunningTasks=*/gTrue, numTasks)); +#else + 
vectorCoforallBlk->insertAtTail(new CallExpr("_initTaskGroup", coforallCount, /*countRunningTasks=*/gTrue, numTasks)); +#endif BlockStmt* block = ForLoop::buildForLoop(indices, new SymExpr(tmpIter), beginBlk, true, zippered); vectorCoforallBlk->insertAtHead(new CallExpr(PRIM_MOVE, coforallCount, new CallExpr("_endCountAlloc", /*forceLocalTypes=*/gTrue))); vectorCoforallBlk->insertAtHead(new DefExpr(coforallCount)); +#ifndef TARGET_HSA block->insertAtTail(new CallExpr("_waitEndCount", coforallCount, /*countRunningTasks=*/gTrue, numTasks)); +#else + block->insertAtTail(new CallExpr("_finalizeTaskGroup", coforallCount, /*countRunningTasks=*/gTrue, numTasks)); +#endif block->insertAtTail(new CallExpr("_endCountFree", coforallCount)); vectorCoforallBlk->insertAtTail(block); } @@ -2890,12 +2911,20 @@ buildBeginStmt(CallExpr* byref_vars, Expr* stmt) { return body; } else { BlockStmt* block = buildChapelStmt(); +#ifndef TARGET_HSA block->insertAtTail(new CallExpr("_upEndCount")); +#else + block->insertAtTail(new CallExpr("_addToTaskGroup")); +#endif BlockStmt* beginBlock = new BlockStmt(); beginBlock->blockInfoSet(new CallExpr(PRIM_BLOCK_BEGIN)); addByrefVars(beginBlock, byref_vars); beginBlock->insertAtHead(stmt); +#ifndef TARGET_HSA beginBlock->insertAtTail(new CallExpr("_downEndCount")); +#else + beginBlock->insertAtTail(new CallExpr("_completeTaskGroup")); +#endif block->insertAtTail(beginBlock); return block; } @@ -2911,7 +2940,11 @@ buildSyncStmt(Expr* stmt) { block->insertAtTail(new CallExpr(PRIM_MOVE, endCountSave, new CallExpr(PRIM_GET_END_COUNT))); block->insertAtTail(new CallExpr(PRIM_SET_END_COUNT, new CallExpr("_endCountAlloc", /* forceLocalTypes= */gFalse))); block->insertAtTail(stmt); +#ifndef TARGET_HSA block->insertAtTail(new CallExpr("_waitEndCount")); +#else + block->insertAtTail(new CallExpr("_finalizeTaskGroup")); +#endif block->insertAtTail(new CallExpr("_endCountFree", new CallExpr(PRIM_GET_END_COUNT))); block->insertAtTail(new 
CallExpr(PRIM_SET_END_COUNT, endCountSave)); return block; @@ -2947,13 +2980,24 @@ buildCobeginStmt(CallExpr* byref_vars, BlockStmt* block) { addByrefVars(beginBlk, byref_vars ? byref_vars->copy() : NULL); stmt->insertBefore(beginBlk); beginBlk->insertAtHead(stmt->remove()); +#ifndef TARGET_HSA beginBlk->insertAtTail(new CallExpr("_downEndCount", cobeginCount)); block->insertAtHead(new CallExpr("_upEndCount", cobeginCount)); +#else + beginBlk->insertAtTail(new CallExpr("_completeTaskGroup", cobeginCount)); +#endif } +#ifdef TARGET_HSA + block->insertAtHead(new CallExpr("_initTaskGroup", cobeginCount)); +#endif block->insertAtHead(new CallExpr(PRIM_MOVE, cobeginCount, new CallExpr("_endCountAlloc", /* forceLocalTypes= */gTrue))); block->insertAtHead(new DefExpr(cobeginCount)); +#ifndef TARGET_HSA block->insertAtTail(new CallExpr("_waitEndCount", cobeginCount)); +#else + block->insertAtTail(new CallExpr("_finalizeTaskGroup", cobeginCount)); +#endif block->insertAtTail(new CallExpr("_endCountFree", cobeginCount)); block->astloc = cobeginCount->astloc; // grab the location of 'cobegin' kw diff --git a/compiler/AST/expr.cpp b/compiler/AST/expr.cpp index a651651883..feb9b73209 100644 --- a/compiler/AST/expr.cpp +++ b/compiler/AST/expr.cpp @@ -793,7 +793,8 @@ CallExpr::CallExpr(BaseAST* base, BaseAST* arg2, BaseAST* arg3, BaseAST* arg4, - BaseAST* arg5) : + BaseAST* arg5, + BaseAST* arg6) : Expr(E_CallExpr), primitive(NULL), baseExpr(NULL), @@ -815,6 +816,7 @@ CallExpr::CallExpr(BaseAST* base, callExprHelper(this, arg3); callExprHelper(this, arg4); callExprHelper(this, arg5); + callExprHelper(this, arg6); argList.parent = this; @@ -827,7 +829,8 @@ CallExpr::CallExpr(PrimitiveOp* prim, BaseAST* arg2, BaseAST* arg3, BaseAST* arg4, - BaseAST* arg5) : + BaseAST* arg5, + BaseAST* arg6) : Expr(E_CallExpr), primitive(prim), baseExpr(NULL), @@ -840,6 +843,7 @@ CallExpr::CallExpr(PrimitiveOp* prim, callExprHelper(this, arg3); callExprHelper(this, arg4); callExprHelper(this, arg5); 
+ callExprHelper(this, arg6); argList.parent = this; @@ -851,7 +855,8 @@ CallExpr::CallExpr(PrimitiveTag prim, BaseAST* arg2, BaseAST* arg3, BaseAST* arg4, - BaseAST* arg5) : + BaseAST* arg5, + BaseAST* arg6) : Expr(E_CallExpr), primitive(primitives[prim]), baseExpr(NULL), @@ -864,6 +869,7 @@ CallExpr::CallExpr(PrimitiveTag prim, callExprHelper(this, arg3); callExprHelper(this, arg4); callExprHelper(this, arg5); + callExprHelper(this, arg6); argList.parent = this; @@ -876,7 +882,8 @@ CallExpr::CallExpr(const char* name, BaseAST* arg2, BaseAST* arg3, BaseAST* arg4, - BaseAST* arg5) : + BaseAST* arg5, + BaseAST* arg6) : Expr(E_CallExpr), primitive(NULL), baseExpr(new UnresolvedSymExpr(name)), @@ -889,6 +896,7 @@ CallExpr::CallExpr(const char* name, callExprHelper(this, arg3); callExprHelper(this, arg4); callExprHelper(this, arg5); + callExprHelper(this, arg6); argList.parent = this; diff --git a/compiler/codegen/expr.cpp b/compiler/codegen/expr.cpp index c3f0fd583a..1fee3d395e 100644 --- a/compiler/codegen/expr.cpp +++ b/compiler/codegen/expr.cpp @@ -5350,11 +5350,12 @@ void CallExpr::codegenInvokeOnFun() { void CallExpr::codegenInvokeTaskFun(const char* name) { FnSymbol* fn = isResolved(); GenRet taskList = codegenValue(get(1)); + GenRet taskGroup = codegenValue(get(6)); GenRet taskListNode; GenRet taskBundle; GenRet bundleSize; - std::vector args(8); + std::vector args(9); // get(1) is a ref/wide ref to a task list value // get(2) is the node ID owning the task list @@ -5374,9 +5375,10 @@ void CallExpr::codegenInvokeTaskFun(const char* name) { args[2] = codegenCast("chpl_task_bundle_p", taskBundle); args[3] = bundleSize; args[4] = taskList; - args[5] = codegenValue(taskListNode); - args[6] = fn->linenum(); - args[7] = new_IntSymbol(gFilenameLookupCache[fn->fname()], INT_SIZE_32); + args[5] = taskGroup; + args[6] = codegenValue(taskListNode); + args[7] = fn->linenum(); + args[8] = new_IntSymbol(gFilenameLookupCache[fn->fname()], INT_SIZE_32); genComment(fn->cname, 
true); diff --git a/compiler/include/expr.h b/compiler/include/expr.h index a26acaafed..5233984569 100644 --- a/compiler/include/expr.h +++ b/compiler/include/expr.h @@ -199,28 +199,32 @@ class CallExpr : public Expr { BaseAST* arg2 = NULL, BaseAST* arg3 = NULL, BaseAST* arg4 = NULL, - BaseAST* arg5 = NULL); + BaseAST* arg5 = NULL, + BaseAST* arg6 = NULL); CallExpr(PrimitiveOp* prim, BaseAST* arg1 = NULL, BaseAST* arg2 = NULL, BaseAST* arg3 = NULL, BaseAST* arg4 = NULL, - BaseAST* arg5 = NULL); + BaseAST* arg5 = NULL, + BaseAST* arg6 = NULL); CallExpr(PrimitiveTag prim, BaseAST* arg1 = NULL, BaseAST* arg2 = NULL, BaseAST* arg3 = NULL, BaseAST* arg4 = NULL, - BaseAST* arg5 = NULL); + BaseAST* arg5 = NULL, + BaseAST* arg6 = NULL); CallExpr(const char* name, BaseAST* arg1 = NULL, BaseAST* arg2 = NULL, BaseAST* arg3 = NULL, BaseAST* arg4 = NULL, - BaseAST* arg5 = NULL); + BaseAST* arg5 = NULL, + BaseAST* arg6 = NULL); ~CallExpr(); diff --git a/compiler/passes/buildDefaultFunctions.cpp b/compiler/passes/buildDefaultFunctions.cpp index 6ffa649baf..d8e6cff7c3 100644 --- a/compiler/passes/buildDefaultFunctions.cpp +++ b/compiler/passes/buildDefaultFunctions.cpp @@ -581,7 +581,11 @@ static void build_chpl_entry_points() { // endcount (see comment above) // if (fMinimalModules == false) { +#ifndef TARGET_HSA chpl_gen_main->insertAtTail(new CallExpr("_waitEndCount")); +#else + chpl_gen_main->insertAtTail(new CallExpr("_finalizeTaskGroup")); +#endif } chpl_gen_main->insertAtTail(new CallExpr(PRIM_RETURN, main_ret)); diff --git a/compiler/passes/insertLineNumbers.cpp b/compiler/passes/insertLineNumbers.cpp index 1c4d34beae..a9e302366d 100644 --- a/compiler/passes/insertLineNumbers.cpp +++ b/compiler/passes/insertLineNumbers.cpp @@ -325,8 +325,8 @@ static void moveLinenoInsideArgBundle() // than the expected number. Both block types below expect an // argument bundle, and the on-block expects an additional argument // that is the locale on which it should be executed. 
- if ((fn->numFormals() > 4 && fn->hasFlag(FLAG_ON_BLOCK)) || - (fn->numFormals() > 5 && !fn->hasFlag(FLAG_ON_BLOCK) && + if ((fn->numFormals() > 5 && fn->hasFlag(FLAG_ON_BLOCK)) || + (fn->numFormals() > 6 && !fn->hasFlag(FLAG_ON_BLOCK) && (fn->hasFlag(FLAG_BEGIN_BLOCK) || fn->hasFlag(FLAG_COBEGIN_OR_COFORALL_BLOCK)))) { diff --git a/compiler/passes/parallel.cpp b/compiler/passes/parallel.cpp index 78ef16120e..e88151bc20 100644 --- a/compiler/passes/parallel.cpp +++ b/compiler/passes/parallel.cpp @@ -99,7 +99,7 @@ static BundleArgsFnData bundleArgsFnDataInit = { true, NULL, NULL }; static void insertEndCounts(); static void passArgsToNestedFns(); static void create_block_fn_wrapper(FnSymbol* fn, CallExpr* fcall, BundleArgsFnData &baData); -static void call_block_fn_wrapper(FnSymbol* fn, CallExpr* fcall, VarSymbol* args_buf, VarSymbol* args_buf_len, VarSymbol* tempc, FnSymbol *wrap_fn, Symbol* taskList, Symbol* taskListNode); +static void call_block_fn_wrapper(FnSymbol* fn, CallExpr* fcall, VarSymbol* args_buf, VarSymbol* args_buf_len, VarSymbol* tempc, FnSymbol *wrap_fn, Symbol* taskList, Symbol* taskListNode, Symbol *taskGroup); static void findBlockRefActuals(Vec& refSet, Vec& refVec); static void findHeapVarsAndRefs(Map*>& defMap, Vec& refSet, Vec& refVec, @@ -410,6 +410,7 @@ bundleArgs(CallExpr* fcall, BundleArgsFnData &baData) { // first argument to a task launch function. Symbol* endCount = NULL; VarSymbol *taskList = NULL; + VarSymbol *taskGroup = NULL; VarSymbol *taskListNode = NULL; if (!fn->hasFlag(FLAG_ON)) { @@ -466,6 +467,16 @@ bundleArgs(CallExpr* fcall, BundleArgsFnData &baData) { endCount, endCount->typeInfo()->getField("taskList")))); + // Now get the taskGroup field out of the end count. 
+ + taskGroup = newTemp(astr("_taskGroup", fn->name), dtCVoidPtr); + + fcall->insertBefore(new DefExpr(taskGroup)); + fcall->insertBefore(new CallExpr(PRIM_MOVE, taskGroup, + new CallExpr(PRIM_GET_MEMBER, + endCount, + endCount->typeInfo()->getField("taskGroup")))); + // Now get the node ID field for the end count, // which is where the task list is stored. @@ -481,7 +492,7 @@ bundleArgs(CallExpr* fcall, BundleArgsFnData &baData) { create_block_fn_wrapper(fn, fcall, baData); // call wrapper-function - call_block_fn_wrapper(fn, fcall, allocated_args, tmpsz, tempc, baData.wrap_fn, taskList, taskListNode); + call_block_fn_wrapper(fn, fcall, allocated_args, tmpsz, tempc, baData.wrap_fn, taskList, taskListNode, taskGroup); baData.firstCall = false; } @@ -492,7 +503,8 @@ static CallExpr* helpFindDownEndCount(BlockStmt* block) while (cur && (isCallExpr(cur) || isDefExpr(cur) || isBlockStmt(cur))) { if (CallExpr* call = toCallExpr(cur)) { if (call->isResolved()) - if (strcmp(call->resolvedFunction()->name, "_downEndCount") == 0) + if (strcmp(call->resolvedFunction()->name, "_downEndCount") == 0 || + strcmp(call->resolvedFunction()->name, "_completeTaskGroup") == 0) return call; } else if (BlockStmt* inner = toBlockStmt(cur)) { // Need to handle local blocks since the compiler @@ -646,6 +658,10 @@ static void create_block_fn_wrapper(FnSymbol* fn, CallExpr* fcall, BundleArgsFnD dtCVoidPtr->refType ); taskListArg->addFlag(FLAG_NO_CODEGEN); wrap_fn->insertFormalAtTail(taskListArg); + ArgSymbol *taskGroupArg = new ArgSymbol( INTENT_IN, "dummy_taskGroup", + dtCVoidPtr->refType ); + taskGroupArg->addFlag(FLAG_NO_CODEGEN); + wrap_fn->insertFormalAtTail(taskGroupArg); ArgSymbol *taskListNode = new ArgSymbol( INTENT_IN, "dummy_taskListNode", dtInt[INT_SIZE_DEFAULT]); taskListNode->addFlag(FLAG_NO_CODEGEN); @@ -749,20 +765,20 @@ static void create_block_fn_wrapper(FnSymbol* fn, CallExpr* fcall, BundleArgsFnD static void call_block_fn_wrapper(FnSymbol* fn, CallExpr* fcall, VarSymbol* 
args_buf, VarSymbol* args_buf_len, VarSymbol* tempc, FnSymbol *wrap_fn, - Symbol* taskList, Symbol* taskListNode) + Symbol* taskList, Symbol* taskListNode, Symbol *taskGroup) { // The wrapper function is called with the bundled argument list. if (fn->hasFlag(FLAG_ON)) { // For an on block, the first argument is also passed directly // to the wrapper function. // The forking function uses this to fork a task on the target locale. - fcall->insertBefore(new CallExpr(wrap_fn, fcall->get(1)->remove(), args_buf, args_buf_len, tempc)); + fcall->insertBefore(new CallExpr(wrap_fn, fcall->get(1)->remove(), args_buf, args_buf_len, tempc));//, new SymExpr(taskGroup))); } else { // For non-on blocks, the task list is passed directly to the function // (so that codegen can find it). // We need the taskList. INT_ASSERT(taskList); - fcall->insertBefore(new CallExpr(wrap_fn, new SymExpr(taskList), new SymExpr(taskListNode), args_buf, args_buf_len, tempc)); + fcall->insertBefore(new CallExpr(wrap_fn, new SymExpr(taskList), new SymExpr(taskListNode), args_buf, args_buf_len, tempc, new SymExpr(taskGroup))); } fcall->remove(); // rm orig. call diff --git a/make/compiler/Makefile.hsa b/make/compiler/Makefile.hsa index fa5115fa79..0fc1c3d234 100644 --- a/make/compiler/Makefile.hsa +++ b/make/compiler/Makefile.hsa @@ -17,11 +17,11 @@ include $(CHPL_MAKE_HOME)/make/compiler/Makefile.gnu ifdef CHPL_ROCM # ROCm locations CLOC=/opt/rocm/cloc/bin/cloc.sh -LIBS+=-lhsa-runtime64 -lhsakmt -lm +LIBS+=-latmi_runtime -lm # TODO: move these in third-party directory? 
-GEN_LFLAGS+=-L/opt/rocm/lib -L/opt/rocm/hsa/lib -HSA_INCLUDES=-I/opt/rocm/hsa/include +GEN_LFLAGS+=-L/opt/rocm/lib -L/opt/rocm/hsa/lib -L/opt/rocm/atmi/lib +HSA_INCLUDES=-I/opt/rocm/atmi/include else # HSA locations CLOC=$(THIRD_PARTY_DIR)/hsa/cloc/bin/cloc.sh diff --git a/make/tasks/Makefile.atmi b/make/tasks/Makefile.atmi new file mode 100644 index 0000000000..00ca53fb62 --- /dev/null +++ b/make/tasks/Makefile.atmi @@ -0,0 +1,19 @@ +# Copyright 2004-2015 Cray Inc. +# Other additional copyright holders may be indicated within. +# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. +# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +RUNTIME_INCLS += -I$(QTHREAD_INCLUDE_DIR) +CHPL_MAKE_THREADS=none diff --git a/modules/internal/ChapelBase.chpl b/modules/internal/ChapelBase.chpl index fcdb87690c..ba6210d01b 100644 --- a/modules/internal/ChapelBase.chpl +++ b/modules/internal/ChapelBase.chpl @@ -763,7 +763,8 @@ module ChapelBase { type taskType; var i: iType, taskCnt: taskType, - taskList: c_void_ptr = _defaultOf(c_void_ptr); + taskList: c_void_ptr = _defaultOf(c_void_ptr), + taskGroup: c_void_ptr = _defaultOf(c_void_ptr); } // This function is called once by the initiating task. No on @@ -786,10 +787,17 @@ module ChapelBase { } } + // Compiler looks for this variable to determine the return type of // the "get end count" primitive. 
type _remoteEndCountType = _endCountAlloc(false).type; + pragma "insert line file info" + extern proc chpl_taskGroupInit(): c_void_ptr; + extern proc chpl_taskGroupGet(): c_void_ptr; + extern proc chpl_taskGroupComplete(); + extern proc chpl_taskGroupFinalize(task_group: c_void_ptr); + // This function is called once by the initiating task. As above, no // on statement needed. pragma "dont disable remote value forwarding" @@ -797,6 +805,36 @@ module ChapelBase { delete e; } + // This function is called by the initiating task once for each new + // task *before* any of the tasks are started. As above, no on + // statement needed. + pragma "dont disable remote value forwarding" + pragma "no remote memory fence" + proc _addToTaskGroup(e: _EndCount, param countRunningTasks=true) { + e.taskGroup = chpl_taskGroupGet(); + } + + pragma "dont disable remote value forwarding" + pragma "no remote memory fence" + proc _addToTaskGroup(e: _EndCount, param countRunningTasks=true, numTasks) { + e.taskGroup = chpl_taskGroupGet(); + } + + // This function is called by the initiating task once for each new + // task *before* any of the tasks are started. As above, no on + // statement needed. + pragma "dont disable remote value forwarding" + pragma "no remote memory fence" + proc _initTaskGroup(e: _EndCount, param countRunningTasks=true) { + e.taskGroup = chpl_taskGroupInit(); + } + + pragma "dont disable remote value forwarding" + pragma "no remote memory fence" + proc _initTaskGroup(e: _EndCount, param countRunningTasks=true, numTasks) { + e.taskGroup = chpl_taskGroupInit(); + } + // This function is called by the initiating task once for each new // task *before* any of the tasks are started. As above, no on // statement needed. @@ -838,6 +876,26 @@ module ChapelBase { e.i.sub(1, memory_order_release); } + // This function is called once by each newly initiated task. No on + // statement is needed because the call to sub() will do a remote + // fork (on) if needed. 
+ pragma "dont disable remote value forwarding" + proc _completeTaskGroup(e: _EndCount) { + chpl_taskGroupComplete(); + } + + // This function is called once by the initiating task. As above, no + // on statement needed. + pragma "dont disable remote value forwarding" + proc _finalizeTaskGroup(e: _EndCount, param countRunningTasks=true) { + chpl_taskGroupFinalize(e.taskGroup); + } + + pragma "dont disable remote value forwarding" + proc _finalizeTaskGroup(e: _EndCount, param countRunningTasks=true, numTasks) { + chpl_taskGroupFinalize(e.taskGroup); + } + // This function is called once by the initiating task. As above, no // on statement needed. pragma "dont disable remote value forwarding" @@ -867,7 +925,7 @@ module ChapelBase { proc _waitEndCount(e: _EndCount, param countRunningTasks=true, numTasks) { // See if we can help with any of the started tasks chpl_taskListExecute(e.taskList); - + // Wait for all tasks to finish e.i.waitFor(0, memory_order_acquire); @@ -876,6 +934,26 @@ module ChapelBase { } } + proc _addToTaskGroup(param countRunningTasks=true) { + var e = __primitive("get end count"); + _addToTaskGroup(e, countRunningTasks); + } + + proc _initTaskGroup(param countRunningTasks=true) { + var e = __primitive("get end count"); + _initTaskGroup(e, countRunningTasks); + } + + proc _completeTaskGroup() { + var e = __primitive("get end count"); + _completeTaskGroup(e); + } + + proc _finalizeTaskGroup(param countRunningTasks=true) { + var e = __primitive("get end count"); + _finalizeTaskGroup(e, countRunningTasks); + } + proc _upEndCount(param countRunningTasks=true) { var e = __primitive("get end count"); _upEndCount(e, countRunningTasks); diff --git a/modules/internal/LocaleModelHelpRuntime.chpl b/modules/internal/LocaleModelHelpRuntime.chpl index 1eca09e9a9..33550f1ae2 100644 --- a/modules/internal/LocaleModelHelpRuntime.chpl +++ b/modules/internal/LocaleModelHelpRuntime.chpl @@ -118,7 +118,7 @@ module LocaleModelHelpRuntime { args: chpl_task_bundle_p, 
args_size: size_t, subloc_id: int, ref tlist: c_void_ptr, tlist_node_id: int, - is_begin: bool); + is_begin: bool, tgroup: c_void_ptr); extern proc chpl_task_executeTasksInList(ref tlist: c_void_ptr); // @@ -131,10 +131,12 @@ module LocaleModelHelpRuntime { args: chpl_task_bundle_p, // function args args_size: size_t, // args size ref tlist: c_void_ptr, // task list + tgroup: c_void_ptr, // task group tlist_node_id: int // task list owner node ) { + // (debug trace removed: do not emit a warning on every 'begin' spawn) chpl_task_addToTaskList(fn, args, args_size, - subloc_id, tlist, tlist_node_id, true); + subloc_id, tlist, tlist_node_id, true, tgroup); } // @@ -148,10 +150,12 @@ module LocaleModelHelpRuntime { args: chpl_task_bundle_p, // function args args_size: size_t, // args size ref tlist: c_void_ptr, // task list + tgroup: c_void_ptr, // task group tlist_node_id: int // task list owner node ) { + // (debug trace removed: do not emit a warning on every 'cobegin' spawn) chpl_task_addToTaskList(fn, args, args_size, - subloc_id, tlist, tlist_node_id, false); + subloc_id, tlist, tlist_node_id, false, tgroup); } // diff --git a/modules/internal/LocaleModelHelpSetup.chpl b/modules/internal/LocaleModelHelpSetup.chpl index 69c3562ec1..93ab89f26d 100644 --- a/modules/internal/LocaleModelHelpSetup.chpl +++ b/modules/internal/LocaleModelHelpSetup.chpl @@ -172,9 +172,6 @@ module LocaleModelHelpSetup { helpSetupLocaleFlat(dst, local_name); - extern proc chpl_task_getNumSublocales(): int(32); - numSublocales = chpl_task_getNumSublocales(); - extern proc chpl_task_getMaxPar(): uint(32); @@ -191,7 +188,8 @@ module LocaleModelHelpSetup { then local_name = chpl_nodeName():string + "-" + _node_id : string; else local_name = chpl_nodeName():string; - numSublocales = 2; + extern proc chpl_task_getNumSublocales(): int(32); + numSublocales = chpl_task_getNumSublocales(); const origSubloc = chpl_task_getRequestedSubloc(); diff --git a/modules/internal/localeModels/hsa/LocaleModel.chpl b/modules/internal/localeModels/hsa/LocaleModel.chpl index 8c54136b3e..627cfd0d19
100644 --- a/modules/internal/localeModels/hsa/LocaleModel.chpl +++ b/modules/internal/localeModels/hsa/LocaleModel.chpl @@ -157,7 +157,7 @@ module LocaleModel { return {0..#numSublocales}; } - proc getChildCount() return 0; + proc getChildCount() return numSublocales; iter getChildIndices() : int { for idx in {0..#numSublocales} do // chpl_emptyLocaleSpace do diff --git a/runtime/Makefile.help b/runtime/Makefile.help index 00e96f04c4..a6d84c1ff5 100644 --- a/runtime/Makefile.help +++ b/runtime/Makefile.help @@ -168,4 +168,7 @@ include $(RUNTIME_ROOT)/make/Makefile.runtime.foot src/tasks/qthreads/Makefile.include: cd src/tasks/qthreads && $(MAKE) copy_qthread_files +src/tasks/atmi/Makefile.include: + cd src/tasks/atmi && $(MAKE) copy_qthread_files + .NOTPARALLEL: diff --git a/runtime/etc/Makefile.tasks-atmi b/runtime/etc/Makefile.tasks-atmi new file mode 100644 index 0000000000..ce328fd06b --- /dev/null +++ b/runtime/etc/Makefile.tasks-atmi @@ -0,0 +1,19 @@ +# Copyright 2004-2015 Cray Inc. +# Other additional copyright holders may be indicated within. +# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. +# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +GEN_CFLAGS += -I$(QTHREAD_INCLUDE_DIR) +GEN_LFLAGS += -L$(QTHREAD_LIB_DIR) -Wl,-rpath,$(QTHREAD_LIB_DIR) diff --git a/runtime/include/chpl-atmi.h b/runtime/include/chpl-atmi.h new file mode 100644 index 0000000000..82c8a54780 --- /dev/null +++ b/runtime/include/chpl-atmi.h @@ -0,0 +1,34 @@ +#ifndef _chpl_atmi_h_ +#define _chpl_atmi_h_ + +#include +#include /* size_t */ +#include /* uintXX_t */ +#ifndef __cplusplus +#include +#endif /* __cplusplus */ + +#include "chpltypes.h" +#include "chpl-hsa-kernelparams.h" + +atmi_kernel_t reduction_kernel; +atmi_kernel_t *gpu_kernels; +atmi_kernel_t main_kernel; +int g_num_cpu_kernels; + +atmi_machine_t *g_machine; + +enum { + GPU_KERNEL_IMPL = 10565, + REDUCTION_GPU_IMPL = 42, + CPU_FUNCTION_IMPL = 43 +}; + +int chpl_hsa_initialize(void); + +int32_t hsa_reduce_int32(const char *op, int32_t *src, size_t count); +int64_t hsa_reduce_int64(const char *op, int64_t *src, size_t count); + +void hsa_enqueue_kernel(int kernel_idx, uint32_t wkgrp_size_x, + uint32_t wkitem_count_x, void *bundled_args); +#endif //_chpl_atmi_h_ diff --git a/runtime/include/chpl-gen-includes.h b/runtime/include/chpl-gen-includes.h index 4527659c5d..2f414baa78 100644 --- a/runtime/include/chpl-gen-includes.h +++ b/runtime/include/chpl-gen-includes.h @@ -31,7 +31,7 @@ #include "chpl-tasks.h" #include "chpltypes.h" #ifdef TARGET_HSA -#include "chpl-hsa.h" +#include "chpl-atmi.h" #endif // diff --git a/runtime/include/chpl-tasks.h b/runtime/include/chpl-tasks.h index 6eac21fb8c..d9fb916627 100644 --- a/runtime/include/chpl-tasks.h +++ b/runtime/include/chpl-tasks.h @@ -158,10 +158,15 @@ void chpl_task_addToTaskList( void**, // task list c_nodeid_t, // locale (node) where task list resides chpl_bool, // is begin{} stmt? (vs. 
cobegin or coforall) + void *, // task group int, // line at which function begins int32_t); // name of file containing function void chpl_task_executeTasksInList(void**); +void *chpl_taskGroupInit(int lineno, int32_t filename); +void *chpl_taskGroupGet(void); +void chpl_taskGroupFinalize(void *task_group); +void chpl_taskGroupComplete(void); // // Call a chpl_ftable[] function in a task. // diff --git a/runtime/include/chpltypes.h b/runtime/include/chpltypes.h index 0e6b4644df..439bef780b 100644 --- a/runtime/include/chpltypes.h +++ b/runtime/include/chpltypes.h @@ -189,6 +189,7 @@ typedef int32_t chpl_bool32; typedef int64_t chpl_bool64; typedef void (*chpl_fn_p)(void*); // function pointer for runtime ftable +typedef const char *chpl_fn_name; typedef int16_t chpl_fn_int_t; // int type for ftable indexing // Function table names and information, for VisualDebug use diff --git a/runtime/include/tasks/atmi/tasks-atmi.h b/runtime/include/tasks/atmi/tasks-atmi.h new file mode 100644 index 0000000000..3f1497b61d --- /dev/null +++ b/runtime/include/tasks/atmi/tasks-atmi.h @@ -0,0 +1,218 @@ +/************************************************************************** +* Copyright 2011 Sandia Corporation. Under the terms of Contract +* DE-AC04-94AL85000, there is a non-exclusive license for use of this work by +* or on behalf of the U.S. Government. Export of this program may require a +* license from the United States Government +**************************************************************************/ + +/* + * Copyright 2004-2017 Cray Inc. + * Other additional copyright holders may be indicated within. + * + * The entirety of this work is licensed under the Apache License, + * Version 2.0 (the "License"); you may not use this file except + * in compliance with the License.
+ * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _tasks_atmi_h_ +#define _tasks_atmi_h_ + +#include "chpl-atmi.h" +#include "chpl-tasks-prvdata.h" +#include "chpltypes.h" +#include "chpl-mem.h" +#include "qthread.h" +#include "qthread-chapel.h" + +#include +#include +#include +#include + +#ifdef __cplusplus
extern "C" { +#endif + +#define CHPL_COMM_YIELD_TASK_WHILE_POLLING +void chpl_task_yield(void); + +// +// Type (and default value) used to communicate task identifiers
// between C code and Chapel code in the runtime. +// +typedef uint64_t chpl_taskID_t; +#define chpl_nullTaskID ATMI_TASK_HANDLE(0xFFFFFFFFFFFFFFFF) + +// +// Sync variables
// +typedef struct { + aligned_t lockers_in; + aligned_t lockers_out; + uint_fast32_t uncontested_locks; + int is_full; + syncvar_t signal_full; + syncvar_t signal_empty; +} chpl_sync_aux_t; + +// +// Task private data
// + +extern +#ifdef __cplusplus
"C" +#endif + +volatile int chpl_qthread_done_initializing; + +// +// Task layer private area argument bundle header
// +typedef struct { + chpl_bool serial_state; + chpl_bool countRunning; + chpl_bool is_executeOn; + int lineno; + int filename; + c_sublocid_t requestedSubloc; + chpl_fn_int_t requested_fid; + chpl_fn_p requested_fn; + chpl_fn_name requested_fn_name; + chpl_taskID_t id; +} chpl_task_bundle_t; + +// Structure of task-local storage
+typedef struct chpl_atmi_tls_s { + chpl_task_bundle_t *bundle; + // The below fields could move to chpl_task_bundleData_t
+ // That would reduce the size of the task local storage, + // but increase the size of executeOn bundles.
+ chpl_task_prvData_t prvdata; + /* Reports */ + int lock_filename; + int lock_lineno; +} chpl_atmi_tls_t; + +extern pthread_key_t tls_cache; + +extern pthread_t chpl_qthread_process_pthread; +extern pthread_t chpl_qthread_comm_pthread; + +extern chpl_atmi_tls_t chpl_qthread_process_tls; +extern chpl_atmi_tls_t chpl_qthread_comm_task_tls; + +#define CHPL_TASK_STD_MODULES_INITIALIZED chpl_task_stdModulesInitialized +void chpl_task_stdModulesInitialized(void); + +extern pthread_t null_thread; + +// Wrap qthread_get_tasklocal() and assert that it is always available. +extern chpl_atmi_tls_t* chpl_atmi_get_tasklocal(void); + +#ifdef CHPL_TASK_GET_PRVDATA_IMPL_DECL +#error "CHPL_TASK_GET_PRVDATA_IMPL_DECL is already defined!" +#else +#define CHPL_TASK_GET_PRVDATA_IMPL_DECL 1 +#endif +static inline chpl_task_prvData_t* chpl_task_getPrvData(void) +{ + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); + if (data) { + return &data->prvdata; + } + assert(data); + return NULL; +} + +// +// Sublocale support +// +#ifdef CHPL_TASK_GETSUBLOC_IMPL_DECL +#error "CHPL_TASK_GETSUBLOC_IMPL_DECL is already defined!" +#else +#define CHPL_TASK_GETSUBLOC_IMPL_DECL 1 +#endif +static inline +c_sublocid_t chpl_task_getSubloc(void) +{ + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); + if (data) + return data->bundle->requestedSubloc; + else + return c_sublocid_any; +} + +#ifdef CHPL_TASK_SETSUBLOC_IMPL_DECL +#error "CHPL_TASK_SETSUBLOC_IMPL_DECL is already defined!" +#else +#define CHPL_TASK_SETSUBLOC_IMPL_DECL 1 +#endif +static inline +void chpl_task_setSubloc(c_sublocid_t subloc) +{ + assert(subloc != c_sublocid_none); + + // Only change sublocales if the caller asked for a particular one, + // which is not the current one, and we're a (movable) task. + // + // Note: It's likely that this won't work in all cases where we need + // it. 
In particular, we envision needing to move execution + // from sublocale to sublocale while initializing the memory + // layer, in order to get the NUMA domain affinity right for + // the subparts of the heap. But this will be happening well + // before tasking init and in any case would be done from the + // main thread of execution, which doesn't have a shepherd. + // The code below wouldn't work in that situation. + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); + if (data) { + data->bundle->requestedSubloc = subloc; + /* printf("Setting ATMI requested subloc to %d\n", subloc); */ /* debug leftover */ + } +} + +#ifdef CHPL_TASK_GETREQUESTEDSUBLOC_IMPL_DECL +#error "CHPL_TASK_GETREQUESTEDSUBLOC_IMPL_DECL is already defined!" +#else +#define CHPL_TASK_GETREQUESTEDSUBLOC_IMPL_DECL 1 +#endif +static inline +c_sublocid_t chpl_task_getRequestedSubloc(void) +{ + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); + if (data) { + return data->bundle->requestedSubloc; + } + + return c_sublocid_any; +} + +// +// Can we support remote caching? +// +#ifdef CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL +#error "CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL is already defined!" 
+#else +#define CHPL_TASK_SUPPORTS_REMOTE_CACHE_IMPL_DECL 1 +#endif +static inline +int chpl_task_supportsRemoteCache(void) { + return CHPL_QTHREAD_SUPPORTS_REMOTE_CACHE; +} + +#ifdef __cplusplus +} // end extern "C" +#endif + +#endif // ifndef _tasks_qthreads_h_ +/* vim:set expandtab: */ diff --git a/runtime/src/Makefile.share b/runtime/src/Makefile.share index 732e06ad1f..8d93b13d3a 100644 --- a/runtime/src/Makefile.share +++ b/runtime/src/Makefile.share @@ -18,9 +18,9 @@ ifeq ($(strip $(CHPL_MAKE_TARGET_COMPILER)),hsa) HSA_SRCS = \ - chpl-hsa.c \ + chpl-atmi.c \ chpl-hsa-reducekernels.cl \ - chpl-hsa-reducehost.c + chpl-atmi-reducehost.c endif COMMON_LAUNCHER_SRCS = \ diff --git a/runtime/src/chpl-atmi-reducehost.c b/runtime/src/chpl-atmi-reducehost.c new file mode 100644 index 0000000000..3d642a7f22 --- /dev/null +++ b/runtime/src/chpl-atmi-reducehost.c @@ -0,0 +1,136 @@ + +#include "chpl-atmi.h" +#include "chplrt.h" +#include "chplexit.h" +#include "chpl-mem.h" + +/*enum ReduceOp { + MAX, + MIN, + SUM, + PROD, + BITAND, + BITOR, + BITXOR, + LOGAND, + LOGOR + }; + */ + +/* + * Estimate and schedule the required number of GPU kernels + */ + static inline +void atmi_sched_reducekernels(size_t count, + void *darray[2], size_t *iter_ct, + size_t *items_left) +{ + size_t incount, outcount, i, iter, in, out; + uint32_t max_num_wkgrps, num_wkgroups, grid_size_x; + + const int num_args = 3; + atmi_task_group_t task_group = {1, ATMI_TRUE}; + ATMI_LPARM(lparm); + lparm->group = &task_group; + lparm->kernel_id = REDUCTION_GPU_IMPL; + lparm->synchronous = ATMI_FALSE; + lparm->place = (atmi_place_t)ATMI_PLACE_GPU(0, 0); + + incount = count; + max_num_wkgrps = incount / WKGRP_SIZE; + num_wkgroups = (max_num_wkgrps + SEQ_CHUNK_SIZE - 1) / SEQ_CHUNK_SIZE; + grid_size_x = num_wkgroups * WKGRP_SIZE; + outcount = num_wkgroups; + iter = 0; + while (grid_size_x > WKGRP_SIZE) { + in = (iter & 1); + out = (iter & 1) ^ 1; + + void *args[] = {&darray[in], &darray[out], &incount}; + 
lparm->gridDim[0] = grid_size_x; + lparm->groupDim[0] = WKGRP_SIZE; + atmi_task_launch(lparm, reduction_kernel, args); + + iter += 1; + incount = outcount; + max_num_wkgrps = incount / WKGRP_SIZE; + num_wkgroups = (max_num_wkgrps + SEQ_CHUNK_SIZE - 1) / SEQ_CHUNK_SIZE; + grid_size_x = num_wkgroups * WKGRP_SIZE; + outcount = num_wkgroups; + } + + if (iter > 0) { + atmi_task_group_sync(&task_group); + } + + (*items_left) = incount; + (*iter_ct) = iter; +} + +/*int32_t hsa_reduce_int32(const char *op, int32_t *src, size_t count) + { + int32_t res; + size_t iter, items_left, out, i; + int32_t * darray[2]; + hsa_symbol_info_t * symbol_info; + symbol_info = &kernel.symbol_info[0]; //TODO: Remove hardcoded 0 index + darray[0] = src; + if (0 != chpl_posix_memalign((void **) &darray[1], 64, + count * sizeof(int32_t))) { + chpl_exit_any(1); + } + + hsa_sched_reducekernels(count, symbol_info, (void**)darray, + &iter, &items_left); + + res = 0; + out = (iter & 1); + chpl_msg(2, "HSA: Using CPU to reduce %lu items\n", items_left); + for (i = 0; i < items_left; ++i) res += darray[out][i]; + + chpl_free (darray[1]); + return res; + }*/ + +int64_t hsa_reduce_int64(const char *op, int64_t *src, size_t count) +{ + int64_t res; + size_t iter, items_left, out, i; + int64_t * darray[2]; + darray[0] = src; + if (0 != chpl_posix_memalign((void **) &darray[1], 64, + count * sizeof(int64_t))) { + chpl_exit_any(1); + } + + atmi_sched_reducekernels(count, (void**)darray, + &iter, &items_left); + + res = 0; + out = (iter & 1); + chpl_msg(2, "HSA: Using CPU to reduce %lu items\n", items_left); + for (i = 0; i < items_left; ++i) res += darray[out][i]; + + chpl_free (darray[1]); + return res; +} + +//FIXME: use the op argument like this to extend this to different ops +/*if (!strcasecmp(op, "Max")) + opType = MAX; + else if (!strcasecmp(op, "Min")) + opType = MIN; + else if (!strcasecmp(op, "Sum")) + opType = SUM; + else if (!strcasecmp(op, "Product")) + opType = PROD; + else if (!strcasecmp(op, 
"LogicalAnd")) + opType = LOGAND; + else if (!strcasecmp(op, "LogicalOr")) + opType = LOGOR; + else if (!strcasecmp(op, "BitwiseAnd")) + opType = BITAND; + else if (!strcasecmp(op, "BitwiseOr")) + opType = BITOR; + else if (!strcasecmp(op, "BitwiseXor")) + opType = BITXOR; */ diff --git a/runtime/src/chpl-atmi.c b/runtime/src/chpl-atmi.c new file mode 100644 index 0000000000..543db24ec1 --- /dev/null +++ b/runtime/src/chpl-atmi.c @@ -0,0 +1,113 @@ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include "chpl-atmi.h" +#include "chplrt.h" +#include "chpl-mem.h" +#include "chplcgfns.h" + +#define OUTPUT_ATMI_STATUS(status, msg) \ +{ \ + if (ATMI_STATUS_SUCCESS != (status)) { \ + fprintf(stderr, "ATMI support: %s failed, error code: 0x%x\n", \ +#msg, status); \ + atmi_finalize(); \ + return status; \ + } \ +} + +/** + * Initialize the ATMI/HSA runtime + */ +int chpl_hsa_initialize(void) +{ + char reduce_kernel_filename[1024]; + char gen_kernel_filename[1024]; + int arglen = strlen(chpl_executionCommand)+1; + char* argCopy = chpl_mem_allocMany(arglen, sizeof(char), + CHPL_RT_MD_CFG_ARG_COPY_DATA, 0, 0); + char *binName; + int cx; + + cx = snprintf(reduce_kernel_filename, 1024, +#ifdef ROCM + "%s/runtime/src/%s/chpl-hsa-reducekernels.hsaco", CHPL_HOME, +#else + "%s/runtime/src/%s/chpl-hsa-reducekernels.o", CHPL_HOME, +#endif + CHPL_RUNTIME_OBJDIR); + if (cx < 0 || cx >= 1024) { + OUTPUT_ATMI_STATUS(ATMI_STATUS_ERROR, Creating reduce kernel filename); + } + strcpy(argCopy, chpl_executionCommand); + binName = strtok(argCopy, " "); +#ifdef ROCM + cx = snprintf(gen_kernel_filename, 1024, "%s_gpu.hsaco", binName); +#else + cx = snprintf(gen_kernel_filename, 1024, "%s_gpu.o", binName); +#endif + if (cx < 0 || cx >= 1024) { + OUTPUT_ATMI_STATUS(ATMI_STATUS_ERROR, Creating generated kernel filename); + } + chpl_mem_free(argCopy, 0, 0); + +#ifdef ROCM + atmi_platform_type_t module_type = AMDGCN; +#else + atmi_platform_type_t module_type = BRIG; 
+#endif + + /* FIXME: Create all reduction kernels, not just the int64-sum kernel */ + const char *modules[2] = {reduce_kernel_filename, gen_kernel_filename}; + atmi_platform_type_t module_types[2] = {module_type, module_type}; + atmi_status_t st = atmi_module_register(modules, module_types, 2); + OUTPUT_ATMI_STATUS(st, Registering all modules); + + size_t reduction_arg_sizes[] = {sizeof(uint64_t), sizeof(uint64_t), sizeof(uint32_t)}; + const unsigned int num_reduction_args = sizeof(reduction_arg_sizes)/sizeof(reduction_arg_sizes[0]); + atmi_kernel_create_empty(&reduction_kernel, num_reduction_args, reduction_arg_sizes); + atmi_kernel_add_gpu_impl(reduction_kernel, "reduce_int64_sum", REDUCTION_GPU_IMPL); + + size_t kernel_arg_sizes[] = {sizeof(uint64_t)}; + const unsigned int num_kernel_args = sizeof(kernel_arg_sizes)/sizeof(kernel_arg_sizes[0]); + gpu_kernels = (atmi_kernel_t *)chpl_malloc(sizeof(atmi_kernel_t) * chpl_num_gpu_kernels); + for (int64_t i = 0; i < chpl_num_gpu_kernels; ++i) { + //FIXME: get the actual kernel name + const char *kernel_name = chpl_gpu_kernels[i]; + atmi_kernel_create_empty(&gpu_kernels[i], num_kernel_args, kernel_arg_sizes); + atmi_kernel_add_gpu_impl(gpu_kernels[i], kernel_name, GPU_KERNEL_IMPL); + } + + return ATMI_STATUS_SUCCESS; +} + +/** + * Release resources used by the base kernels and tear down the HSA structures + */ +int hsa_shutdown(void) +{ + chpl_free(gpu_kernels); + return atmi_finalize(); +} + +/* + * Enqueue/execute a kernel + */ +void hsa_enqueue_kernel(int kernel_idx, uint32_t wkgrp_size_x, + uint32_t wkitem_count_x, void *bundled_args) +{ + void *args[] = {&bundled_args}; + ATMI_LPARM_1D(lparm, wkitem_count_x); + lparm->groupDim[0] = wkgrp_size_x; + lparm->synchronous = ATMI_TRUE; + + lparm->kernel_id = GPU_KERNEL_IMPL; + lparm->place = (atmi_place_t)ATMI_PLACE_GPU(0, 0); + atmi_task_launch(lparm, gpu_kernels[kernel_idx], args); +} + diff --git a/runtime/src/tasks/atmi/Makefile b/runtime/src/tasks/atmi/Makefile new file 
mode 100644 index 0000000000..c78c8f6377 --- /dev/null +++ b/runtime/src/tasks/atmi/Makefile @@ -0,0 +1,42 @@ +# Copyright 2004-2017 Cray Inc. +# Other additional copyright holders may be indicated within. +# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. +# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +RUNTIME_ROOT = ../../.. +RUNTIME_SUBDIR = src/tasks/$(CHPL_MAKE_TASKS) + +ifndef CHPL_MAKE_HOME +export CHPL_MAKE_HOME=$(shell pwd)/$(RUNTIME_ROOT)/.. +endif + +# +# standard header +# +include $(RUNTIME_ROOT)/make/Makefile.runtime.head + +TASKS_OBJDIR = $(RUNTIME_OBJDIR) +include Makefile.share + +TARGETS = $(TASKS_OBJS) + +include $(RUNTIME_ROOT)/make/Makefile.runtime.subdirrules + +FORCE: + +# +# standard footer +# +include $(RUNTIME_ROOT)/make/Makefile.runtime.foot diff --git a/runtime/src/tasks/atmi/Makefile.include b/runtime/src/tasks/atmi/Makefile.include new file mode 100644 index 0000000000..35b1225bf7 --- /dev/null +++ b/runtime/src/tasks/atmi/Makefile.include @@ -0,0 +1,25 @@ +# Copyright 2004-2017 Cray Inc. +# Other additional copyright holders may be indicated within. +# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. 
+# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +TASKS_SUBDIR = src/tasks/$(CHPL_MAKE_TASKS) + +TASKS_OBJDIR = $(RUNTIME_BUILD)/$(TASKS_SUBDIR) + +ALL_SRCS += $(CURDIR)/$(TASKS_SUBDIR)/*.c \ + $(RUNTIME_INCLUDE_ROOT)/tasks/$(CHPL_MAKE_TASKS)/*.h + +include $(RUNTIME_ROOT)/$(TASKS_SUBDIR)/Makefile.share diff --git a/runtime/src/tasks/atmi/Makefile.share b/runtime/src/tasks/atmi/Makefile.share new file mode 100644 index 0000000000..74c39c92a2 --- /dev/null +++ b/runtime/src/tasks/atmi/Makefile.share @@ -0,0 +1,23 @@ +# Copyright 2004-2017 Cray Inc. +# Other additional copyright holders may be indicated within. +# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. +# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +TASKS_SRCS = tasks-$(CHPL_MAKE_TASKS).c + +SVN_SRCS = $(TASKS_SRCS) +SRCS = $(SVN_SRCS) + +TASKS_OBJS = $(TASKS_SRCS:%.c=$(TASKS_OBJDIR)/%.o) diff --git a/runtime/src/tasks/atmi/tasks-atmi.c b/runtime/src/tasks/atmi/tasks-atmi.c new file mode 100644 index 0000000000..9194494b18 --- /dev/null +++ b/runtime/src/tasks/atmi/tasks-atmi.c @@ -0,0 +1,1316 @@ +// +// Qthreads implementation of Chapel tasking interface +// +// Copyright 2011 Sandia Corporation. Under the terms of Contract +// DE-AC04-94AL85000, there is a non-exclusive license for use of this work by +// or on behalf of the U.S. Government. Export of this program may require a +// license from the United States Government +// + +/* + * Copyright 2004-2017 Cray Inc. + * Other additional copyright holders may be indicated within. + * + * The entirety of this work is licensed under the Apache License, + * Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// For SVID definitions (setenv) +#define _SVID_SOURCE + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "chplrt.h" + +#include "arg.h" +#include "error.h" +#include "chplcgfns.h" +#include "chpl-comm.h" +#include "chplexit.h" +#include "chpl-locale-model.h" +#include "chpl-mem.h" +#include "chplsys.h" +#include "chpl-linefile-support.h" +#include "chpl-tasks.h" +#include "chpl-atomics.h" +#include "chpl-tasks-callbacks-internal.h" +//#include "tasks-qthreads.h" +#include "tasks-atmi.h" +#include + +#include "qthread.h" +#include "qthread/qtimer.h" +#include "qthread-chapel.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// FIXME: Good idea to recycle the task groups, so this will limit +// the depth of nested coforall s and cobegins. Fix to use an ATMI +// environment variable. +#define max_num_task_groups 8 +#define max_num_cpu_kernels 4096 +int cpu_kernels_initialized[max_num_cpu_kernels] = {0}; +atmi_kernel_t cpu_kernels[max_num_cpu_kernels]; +atmi_kernel_t dummy_kernel; +static uint_least64_t atmi_tg_id = 0; + + +#define OUTPUT_ATMI_STATUS(status, msg) \ +{ \ + if (ATMI_STATUS_SUCCESS != (status)) { \ + fprintf(stderr, "ATMI support: %s failed, error code: 0x%x\n", \ +#msg, status); \ + atmi_finalize(); \ + return status; \ + } \ +} + +//#define SUPPORT_BLOCKREPORT +//#define SUPPORT_TASKREPORT + +#ifdef CHAPEL_PROFILE +# define PROFILE_INCR(counter,count) do { (void)qthread_incr(&counter,count); } while (0) + +/* Tasks */ +static aligned_t profile_task_yield = 0; +static aligned_t profile_task_addToTaskList = 0; +static aligned_t profile_task_executeTasksInList = 0; +static aligned_t profile_task_taskCallFTable = 0; +static aligned_t profile_task_startMovedTask = 0; +static aligned_t profile_task_getId = 0; +static aligned_t profile_task_sleep = 0; +static aligned_t profile_task_getSerial = 0; +static aligned_t profile_task_setSerial = 0; +static aligned_t 
profile_task_getCallStackSize = 0; +/* Sync */ +static aligned_t profile_sync_lock= 0; +static aligned_t profile_sync_unlock= 0; +static aligned_t profile_sync_waitFullAndLock= 0; +static aligned_t profile_sync_waitEmptyAndLock= 0; +static aligned_t profile_sync_markAndSignalFull= 0; +static aligned_t profile_sync_markAndSignalEmpty= 0; +static aligned_t profile_sync_isFull= 0; +static aligned_t profile_sync_initAux= 0; +static aligned_t profile_sync_destroyAux= 0; + +static void profile_print(void) +{ + /* Tasks */ + fprintf(stderr, "task yield: %lu\n", (unsigned long)profile_task_yield); + fprintf(stderr, "task addToTaskList: %lu\n", (unsigned long)profile_task_addToTaskList); + fprintf(stderr, "task executeTasksInList: %lu\n", (unsigned long)profile_task_executeTasksInList); + fprintf(stderr, "task taskCallFTable: %lu\n", (unsigned long)profile_task_taskCallFTable); + fprintf(stderr, "task startMovedTask: %lu\n", (unsigned long)profile_task_startMovedTask); + fprintf(stderr, "task getId: %lu\n", (unsigned long)profile_task_getId); + fprintf(stderr, "task sleep: %lu\n", (unsigned long)profile_task_sleep); + fprintf(stderr, "task getSerial: %lu\n", (unsigned long)profile_task_getSerial); + fprintf(stderr, "task setSerial: %lu\n", (unsigned long)profile_task_setSerial); + fprintf(stderr, "task getCallStackSize: %lu\n", (unsigned long)profile_task_getCallStackSize); + /* Sync */ + fprintf(stderr, "sync lock: %lu\n", (unsigned long)profile_sync_lock); + fprintf(stderr, "sync unlock: %lu\n", (unsigned long)profile_sync_unlock); + fprintf(stderr, "sync waitFullAndLock: %lu\n", (unsigned long)profile_sync_waitFullAndLock); + fprintf(stderr, "sync waitEmptyAndLock: %lu\n", (unsigned long)profile_sync_waitEmptyAndLock); + fprintf(stderr, "sync markAndSignalFull: %lu\n", (unsigned long)profile_sync_markAndSignalFull); + fprintf(stderr, "sync markAndSignalEmpty: %lu\n", (unsigned long)profile_sync_markAndSignalEmpty); + fprintf(stderr, "sync isFull: %lu\n", (unsigned 
long)profile_sync_isFull); + fprintf(stderr, "sync initAux: %lu\n", (unsigned long)profile_sync_initAux); + fprintf(stderr, "sync destroyAux: %lu\n", (unsigned long)profile_sync_destroyAux); +} +#else +# define PROFILE_INCR(counter,count) +#endif /* CHAPEL_PROFILE */ + +// +// Startup and shutdown control. The mutex is used just for the side +// effect of its (very portable) memory fence. +// +volatile int chpl_qthread_done_initializing; +static syncvar_t canexit = SYNCVAR_STATIC_EMPTY_INITIALIZER; +static volatile int done_finalizing; +static pthread_mutex_t done_init_final_mux = PTHREAD_MUTEX_INITIALIZER; + +// Make qt env sizes uniform. Same as qt, but they use the literal everywhere +#define QT_ENV_S 100 + +// aka chpl_task_list_p +struct chpl_task_list { + chpl_fn_p fun; + void *arg; + int32_t filename; + int lineno; + chpl_task_list_p next; +}; + +static aligned_t next_task_id = 1; + +pthread_t chpl_qthread_process_pthread; +pthread_t chpl_qthread_comm_pthread; + +chpl_task_bundle_t chpl_qthread_process_bundle = { + .serial_state = false, + .countRunning = false, + .is_executeOn = false, + .lineno = 0, + .filename = CHPL_FILE_IDX_MAIN_TASK, + .requestedSubloc = c_sublocid_any_val, + .requested_fid = FID_NONE, + .requested_fn = NULL, + .id = chpl_nullTaskID }; + +chpl_task_bundle_t chpl_qthread_comm_task_bundle = { + .serial_state = false, + .countRunning = false, + .is_executeOn = false, + .lineno = 0, + .filename = CHPL_FILE_IDX_COMM_TASK, + .requestedSubloc = c_sublocid_any_val, + .requested_fid = FID_NONE, + .requested_fn = NULL, + .id = chpl_nullTaskID }; + +chpl_atmi_tls_t chpl_qthread_process_tls = { + .bundle = &chpl_qthread_process_bundle, + .lock_filename = 0, + .lock_lineno = 0 }; + +chpl_atmi_tls_t chpl_qthread_comm_task_tls = { + .bundle = &chpl_qthread_comm_task_bundle, + .lock_filename = 0, + .lock_lineno = 0 }; + +// +// chpl_atmi_get_tasklocal() is in tasks-qthreads.h +// + +pthread_key_t tls_cache; + +static syncvar_t exit_ret = 
SYNCVAR_STATIC_EMPTY_INITIALIZER; + +static volatile chpl_bool canCountRunningTasks = false; + +void chpl_task_yield(void) +{ + PROFILE_INCR(profile_task_yield,1); + /*if (qthread_shep() == NO_SHEPHERD) { + sched_yield(); + } else { + qthread_yield(); + }*/ +} + +// Sync variables +void chpl_sync_lock(chpl_sync_aux_t *s) +{ + aligned_t l; + chpl_bool uncontested_lock = true; + + PROFILE_INCR(profile_sync_lock, 1); + + // + // To prevent starvation due to never switching away from a task that is + // spinning while doing readXX() on a sync variable, yield if this sync var + // has a "lot" of uncontested locks. Note that the uncontested locks do not + // have to be consecutive. Also note that the number of uncontested locks + // is a lossy counter. Currently a "lot" is defined as ~100 uncontested + // locks, with care taken to not yield on the first uncontested lock. + // + // If real qthreads sync vars were used, it's possible this wouldn't be + // needed. + // + + l = qthread_incr(&s->lockers_in, 1); + + while (l != s->lockers_out) { + uncontested_lock = false; + qthread_yield(); + } + + if (uncontested_lock) { + if ((++s->uncontested_locks & 0x5F) == 0) { + qthread_yield(); + } + } +} + +void chpl_sync_unlock(chpl_sync_aux_t *s) +{ + PROFILE_INCR(profile_sync_unlock, 1); + + qthread_incr(&s->lockers_out, 1); +} + +inline chpl_atmi_tls_t* chpl_atmi_get_tasklocal(void) +{ + chpl_atmi_tls_t* tls = NULL; + + pthread_t me = pthread_self(); + if (pthread_equal(me, chpl_qthread_comm_pthread)) + tls = &chpl_qthread_comm_task_tls; + else if (pthread_equal(me, chpl_qthread_process_pthread)) + tls = &chpl_qthread_process_tls; + else { + atmi_task_handle_t t = get_atmi_task_handle(); + if(t != ATMI_NULL_TASK_HANDLE) { + tls = (chpl_atmi_tls_t *)pthread_getspecific(tls_cache); + if(tls == NULL) { + // FIXME: when to free? 
+ tls = (chpl_atmi_tls_t *)chpl_mem_alloc(sizeof(chpl_atmi_tls_t), + CHPL_RT_MD_THREAD_PRV_DATA, + 0, 0); + pthread_setspecific(tls_cache, tls); + } + } + assert(tls != NULL); + } + + return tls; +} + +static inline void about_to_block(int32_t lineno, + int32_t filename) +{ + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); + assert(data); + + data->lock_lineno = lineno; + data->lock_filename = filename; +} + +void chpl_sync_waitFullAndLock(chpl_sync_aux_t *s, + int32_t lineno, + int32_t filename) +{ + PROFILE_INCR(profile_sync_waitFullAndLock, 1); + + if (blockreport) { about_to_block(lineno, filename); } + chpl_sync_lock(s); + while (s->is_full == 0) { + chpl_sync_unlock(s); + qthread_syncvar_readFE(NULL, &(s->signal_full)); + chpl_sync_lock(s); + } +} + +void chpl_sync_waitEmptyAndLock(chpl_sync_aux_t *s, + int32_t lineno, + int32_t filename) +{ + PROFILE_INCR(profile_sync_waitEmptyAndLock, 1); + + if (blockreport) { about_to_block(lineno, filename); } + chpl_sync_lock(s); + while (s->is_full != 0) { + chpl_sync_unlock(s); + qthread_syncvar_readFE(NULL, &(s->signal_empty)); + chpl_sync_lock(s); + } +} + +void chpl_sync_markAndSignalFull(chpl_sync_aux_t *s) // and unlock +{ + PROFILE_INCR(profile_sync_markAndSignalFull, 1); + + qthread_syncvar_fill(&(s->signal_full)); + s->is_full = 1; + chpl_sync_unlock(s); +} + +void chpl_sync_markAndSignalEmpty(chpl_sync_aux_t *s) // and unlock +{ + PROFILE_INCR(profile_sync_markAndSignalEmpty, 1); + + qthread_syncvar_fill(&(s->signal_empty)); + s->is_full = 0; + chpl_sync_unlock(s); +} + +chpl_bool chpl_sync_isFull(void *val_ptr, + chpl_sync_aux_t *s) +{ + PROFILE_INCR(profile_sync_isFull, 1); + + return s->is_full; +} + +void chpl_sync_initAux(chpl_sync_aux_t *s) +{ + PROFILE_INCR(profile_sync_initAux, 1); + + s->lockers_in = 0; + s->lockers_out = 0; + s->is_full = 0; + s->signal_empty = SYNCVAR_EMPTY_INITIALIZER; + s->signal_full = SYNCVAR_EMPTY_INITIALIZER; +} + +void chpl_sync_destroyAux(chpl_sync_aux_t *s) +{ + 
PROFILE_INCR(profile_sync_destroyAux, 1); +} + +#ifdef SUPPORT_BLOCKREPORT +static void chapel_display_thread(qt_key_t addr, + qthread_f f, + void *arg, + void *retloc, + unsigned int thread_id, + void *tls, + void *callarg) +{ + chpl_atmi_tls_t *rep = (chpl_atmi_tls_t *)tls; + + if (rep) { + if ((rep->lock_lineno > 0) && rep->lock_filename) { + fprintf(stderr, "Waiting at: %s:%zu (task %s:%zu)\n", + chpl_lookupFilename(rep->lock_filename), rep->lock_lineno, + chpl_lookupFilename(rep->chpl_data.task_filename), + rep->chpl_data.task_lineno); + } else if (rep->lock_lineno == 0 && rep->lock_filename) { + fprintf(stderr, + "Waiting for more work (line 0? file:%s) (task %s:%zu)\n", + chpl_lookupFilename(rep->lock_filename), + chpl_lookupFilename(rep->chpl_data.task_filename), + rep->chpl_data.task_lineno); + } else if (rep->lock_lineno == 0) { + fprintf(stderr, + "Waiting for dependencies (uninitialized task %s:%zu)\n", + chpl_lookupFilename(rep->chpl_data.task_filename), + rep->chpl_data.task_lineno); + } + fflush(stderr); + } +} + +static void report_locked_threads(void) +{ + qthread_feb_callback(chapel_display_thread, NULL); + qthread_syncvar_callback(chapel_display_thread, NULL); +} +#endif // SUPPORT_BLOCKREPORT + +static void SIGINT_handler(int sig) +{ + signal(sig, SIG_IGN); + + if (blockreport) { +#ifdef SUPPORT_BLOCKREPORT + report_locked_threads(); +#else + fprintf(stderr, + "Blockreport is currently unsupported by the qthreads " + "tasking layer.\n"); +#endif + } + + if (taskreport) { +#ifdef SUPPORT_TASKREPORT + report_all_tasks(); +#else + fprintf(stderr, "Taskreport is currently unsupported by the qthreads tasking layer.\n"); +#endif + } + + chpl_exit_any(1); +} + +// Tasks + +static void *initializer(void *junk) +{ + qthread_initialize(); + (void) pthread_mutex_lock(&done_init_final_mux); // implicit memory fence + chpl_qthread_done_initializing = 1; + (void) pthread_mutex_unlock(&done_init_final_mux); + + qthread_syncvar_readFF(NULL, &canexit); + + 
qthread_finalize(); + (void) pthread_mutex_lock(&done_init_final_mux); // implicit memory fence + done_finalizing = 1; + (void) pthread_mutex_unlock(&done_init_final_mux); + + return NULL; +} + +// +// Qthreads environment helper functions. +// + +static char* chpl_qt_getenv_str(const char* var) { + char name[100]; + char* ev; + + snprintf(name, sizeof(name), "QT_%s", var); + if ((ev = getenv(name)) == NULL) { + snprintf(name, sizeof(name), "QTHREAD_%s", var); + ev = getenv(name); + } + + return ev; +} + +static unsigned long int chpl_qt_getenv_num(const char* var, + unsigned long int default_val) { + char* ev; + unsigned long int ret_val = default_val; + + if ((ev = chpl_qt_getenv_str(var)) != NULL) { + unsigned long int val; + if (sscanf(ev, "%lu", &val) == 1) + ret_val = val; + } + + return ret_val; +} + +// Helper function to set a qthreads env var. This is meant to mirror setenv +// functionality, but qthreads has two environment variables for every setting: +// a QT_ and a QTHREAD_ version. We often forget to think about both so this +// wraps the overriding logic. In verbose mode it prints out if we overrode +// values, or if we were prevented from setting values because they existed +// (and override was 0.) +static void chpl_qt_setenv(const char* var, const char* val, + int32_t override) { + char qt_env[QT_ENV_S] = { 0 }; + char qthread_env[QT_ENV_S] = { 0 }; + char *qt_val; + char *qthread_val; + chpl_bool eitherSet = false; + + strncpy(qt_env, "QT_", sizeof(qt_env)); + strncat(qt_env, var, sizeof(qt_env) - 1); + + strncpy(qthread_env, "QTHREAD_", sizeof(qthread_env)); + strncat(qthread_env, var, sizeof(qthread_env) - 1); + + qt_val = getenv(qt_env); + qthread_val = getenv(qthread_env); + eitherSet = (qt_val != NULL || qthread_val != NULL); + + if (override || !eitherSet) { + if (verbosity >= 2 + && override + && eitherSet + && strcmp(val, (qt_val != NULL) ? 
qt_val : qthread_val) != 0) { + printf("QTHREADS: Overriding the value of %s and %s " + "with %s\n", qt_env, qthread_env, val); + } + (void) setenv(qt_env, val, 1); + (void) setenv(qthread_env, val, 1); + } else if (verbosity >= 2) { + char* set_env = NULL; + char* set_val = NULL; + if (qt_val != NULL) { + set_env = qt_env; + set_val = qt_val; + } else { + set_env = qthread_env; + set_val = qthread_val; + } + if (strcmp(val, set_val) != 0) + printf("QTHREADS: Not setting %s to %s because %s is set to %s and " + "overriding was not requested\n", qt_env, val, set_env, set_val); + } +} + +static void chpl_qt_unsetenv(const char* var) { + char name[100]; + + snprintf(name, sizeof(name), "QT_%s", var); + (void) unsetenv(name); + + snprintf(name, sizeof(name), "QTHREAD_%s", var); + (void) unsetenv(name); +} + +// Determine the number of workers based on environment settings. If a user set +// HWPAR, they are saying they want to use HWPAR many workers, but let the +// runtime figure out the details. If they explicitly set NUM_SHEPHERDS and/or +// NUM_WORKERS_PER_SHEPHERD then they must have specific reasons for doing so. +// Returns 0 if no Qthreads env vars related to the number of threads were set, +// what HWPAR was set to if it was set, or -1 if NUM_SHEP and/or NUM_WPS were +// set since we can't figure out before Qthreads init what this will actually +// turn into without duplicating Qthreads logic (-1 is a sentinel for don't +// adjust the values, and give them as is to Qthreads.) 
+static int32_t chpl_qt_getenv_num_workers(void) { + int32_t hwpar; + int32_t num_wps; + int32_t num_sheps; + + hwpar = (int32_t) chpl_qt_getenv_num("HWPAR", 0); + num_wps = (int32_t) chpl_qt_getenv_num("NUM_WORKERS_PER_SHEPHERD", 0); + num_sheps = (int32_t) chpl_qt_getenv_num("NUM_SHEPHERDS", 0); + + if (hwpar) { + return hwpar; + } else if (num_wps || num_sheps) { + return -1; + } + + return 0; +} + + +// Sets up and returns the amount of hardware parallelism to use, limited to +// maxThreads. Returns -1 if we did not setup parallelism because a user +// explicitly requested a specific layout from qthreads. +static int32_t setupAvailableParallelism(int32_t maxThreads) { + int32_t numThreadsPerLocale; + int32_t qtEnvThreads; + int32_t hwpar; + char newenv_workers[QT_ENV_S] = { 0 }; + + // Experience has shown that Qthreads generally performs best with + // num_workers = numCores (and thus worker_unit = core) but if the user has + // explicitly requested more threads through the chapel or Qthread env + // vars, we override the default. + numThreadsPerLocale = chpl_task_getenvNumThreadsPerLocale(); + qtEnvThreads = chpl_qt_getenv_num_workers(); + hwpar = 0; + + // User set chapel level env var (CHPL_RT_NUM_THREADS_PER_LOCALE) + // This is limited to the number of logical CPUs on the node. + if (numThreadsPerLocale != 0) { + int32_t numPUsPerLocale; + + hwpar = numThreadsPerLocale; + + numPUsPerLocale = chpl_getNumLogicalCpus(true); + if (0 < numPUsPerLocale && numPUsPerLocale < hwpar) { + if (verbosity > 0) { + printf("QTHREADS: Reduced numThreadsPerLocale=%d to %d " + "to prevent oversubscription of the system.\n", + hwpar, numPUsPerLocale); + } + + // Do not oversubscribe the system, use all available resources. 
+ hwpar = numPUsPerLocale; + } + } + // User set qthreads level env var + // (HWPAR or (NUM_SHEPHERDS and NUM_WORKERS_PER_SHEPHERD)) + else if (qtEnvThreads != 0) { + hwpar = qtEnvThreads; + } + // User did not set chapel or qthreads vars -- our default + else { + hwpar = chpl_getNumPhysicalCpus(true); + } + + // hwpar will only be <= 0 if the user set QT_NUM_SHEPHERDS and/or + // QT_NUM_WORKERS_PER_SHEPHERD in which case we assume as "expert" user and + // don't impose any thread limits or set worker_unit. + if (hwpar > 0) { + // Limit the parallelism to the maximum imposed by the comm layer. + if (0 < maxThreads && maxThreads < hwpar) { + hwpar = maxThreads; + } + + // If there is more parallelism requested than the number of cores, set the + // worker unit to pu, otherwise core. + if (hwpar > chpl_getNumPhysicalCpus(true)) { + chpl_qt_setenv("WORKER_UNIT", "pu", 0); + } else { + chpl_qt_setenv("WORKER_UNIT", "core", 0); + } + + // Unset relevant Qthreads environment variables. + chpl_qt_unsetenv("HWPAR"); + chpl_qt_unsetenv("NUM_SHEPHERDS"); + chpl_qt_unsetenv("NUM_WORKERS_PER_SHEPHERD"); + + snprintf(newenv_workers, sizeof(newenv_workers), "%i", (int)hwpar); + if (CHPL_QTHREAD_SCHEDULER_ONE_WORKER_PER_SHEPHERD) { + chpl_qt_setenv("NUM_SHEPHERDS", newenv_workers, 1); + chpl_qt_setenv("NUM_WORKERS_PER_SHEPHERD", "1", 1); + } else { + chpl_qt_setenv("HWPAR", newenv_workers, 1); + } + } + return hwpar; +} + +static void setupCallStacks(int32_t hwpar) { + size_t callStackSize; + + // If the user compiled with no stack checks (either explicitly or + // implicitly) turn off qthread guard pages. TODO there should also be a + // chpl level env var backing this at runtime (can be the same var.) + // Also turn off guard pages if the heap page size isn't the same as + // the system page size, because when that's the case we can reliably + // make the guard pages un-referenceable. 
(This typically arises when
+ // the heap is on hugepages, as is often the case on Cray systems.)
+ //
+ // Note that we won't override an explicit setting of QT_GUARD_PAGES
+ // in the former case, but we do in the latter case.
+ if (CHPL_STACK_CHECKS == 0) {
+ chpl_qt_setenv("GUARD_PAGES", "false", 0);
+ }
+ else if (chpl_getHeapPageSize() != chpl_getSysPageSize()) {
+ chpl_qt_setenv("GUARD_PAGES", "false", 1);
+ }
+
+ // Precedence (high-to-low):
+ // 1) Chapel environment (CHPL_RT_CALL_STACK_SIZE)
+ // 2) QTHREAD_STACK_SIZE
+ // 3) Chapel default
+ if ((callStackSize = chpl_task_getEnvCallStackSize()) > 0 ||
+ (chpl_qt_getenv_num("STACK_SIZE", 0) == 0 &&
+ (callStackSize = chpl_task_getDefaultCallStackSize()) > 0)) {
+ char newenv_stack[QT_ENV_S];
+ snprintf(newenv_stack, sizeof(newenv_stack), "%zu", callStackSize);
+ chpl_qt_setenv("STACK_SIZE", newenv_stack, 1);
+
+ // Qthreads sets up memory pools expecting the item_size to be small.
+ // Stacks are allocated in this manner too, but our default stack size
+ // is quite large, so we limit the max memory allocated for a pool. We
+ // default to a multiple of callStackSize and hwpar, with the thought
+ // that available memory is generally proportional to the amount of
+ // parallelism. For some architectures, this isn't true so we set a max
+ // upper bound. And if the callStackSize is small, we don't want to
+ // limit all qthreads pool allocations to a small value, so we have a
+ // lower bound as well. Note that qthread stacks are slightly larger
+ // than specified to store a book keeping structure and possibly guard
+ // pages, so we throw in an extra MB. 
+ if (hwpar > 0) { + const size_t oneMB = 1024 * 1024; + const size_t allocSizeLowerBound = 33 * oneMB; + const size_t allocSizeUpperBound = 513 * oneMB; + size_t maxPoolAllocSize; + char newenv_alloc[QT_ENV_S]; + + maxPoolAllocSize = 2 * hwpar * callStackSize + oneMB; + if (maxPoolAllocSize < allocSizeLowerBound) { + maxPoolAllocSize = allocSizeLowerBound; + } else if (maxPoolAllocSize > allocSizeUpperBound) { + maxPoolAllocSize = allocSizeUpperBound; + } + snprintf(newenv_alloc, sizeof(newenv_alloc), "%zu", maxPoolAllocSize); + chpl_qt_setenv("MAX_POOL_ALLOC_SIZE", newenv_alloc, 0); + } + } +} + +static void setupTasklocalStorage(void) { + unsigned long int tasklocal_size; + char newenv[QT_ENV_S]; + + // Make sure Qthreads knows how much space we need for per-task + // local storage. + tasklocal_size = chpl_qt_getenv_num("TASKLOCAL_SIZE", 0); + if (tasklocal_size < sizeof(chpl_atmi_tls_t)) { + snprintf(newenv, sizeof(newenv), "%zu", sizeof(chpl_atmi_tls_t)); + chpl_qt_setenv("TASKLOCAL_SIZE", newenv, 1); + } +} + +static void setupWorkStealing(void) { + // In our experience the current work stealing implementation hurts + // performance, so disable it. Note that we don't override, so a user could + // try working stealing out by setting {QT,QTHREAD}_STEAL_RATIO. Also note + // that not all schedulers support work stealing, but it doesn't hurt to + // set this env var for those configs anyways. + chpl_qt_setenv("STEAL_RATIO", "0", 0); +} + +static aligned_t chapel_wrapper(void *arg); +static aligned_t main_wrapper(void *arg); +static aligned_t dummy_wrapper(); +// If we stored chpl_taskID_t in chpl_task_bundleData_t, +// this struct and the following function may not be necessary. 
+typedef void (*main_ptr_t)(void);
+typedef struct {
+ chpl_task_bundle_t arg;
+ main_ptr_t chpl_main;
+} main_wrapper_bundle_t;
+
+void chpl_task_init(void)
+{
+ atmi_status_t st = atmi_init(ATMI_DEVTYPE_ALL);
+ if(st != ATMI_STATUS_SUCCESS) return;
+
+ g_machine = atmi_machine_get_info();
+
+ pthread_key_create(&tls_cache, NULL);
+ //g_num_cpu_kernels = sizeof(chpl_ftable)/sizeof(chpl_ftable[0]);
+ size_t main_arg_size = sizeof(main_wrapper_bundle_t);
+ atmi_kernel_create_empty(&main_kernel, 1, &main_arg_size);
+ atmi_kernel_add_cpu_impl(main_kernel, (atmi_generic_fp)main_wrapper, CPU_FUNCTION_IMPL);
+ atmi_kernel_create_empty(&dummy_kernel, 0, NULL);
+ atmi_kernel_add_cpu_impl(dummy_kernel, (atmi_generic_fp)dummy_wrapper, CPU_FUNCTION_IMPL);
+
+ // this increment is needed to let the main task not be treated as an ATMI task
+ // because we directly launch the main task with the main thread and don't treat
+ // it as an ATMI task. This increment fools the rest of the runtime into thinking that
+ // the main task is an ATMI task, but it is in fact not an ATMI task. 
+ int next_id = atomic_fetch_add_explicit_uint_least64_t(&atmi_tg_id, 1, memory_order_relaxed); + atmi_task_handle_t dummy_handle = atmi_task_create(dummy_kernel); + int32_t commMaxThreads; + int32_t hwpar; + pthread_t initer; + pthread_attr_t pAttr; + + chpl_qthread_process_pthread = pthread_self(); + chpl_qthread_process_bundle.id = qthread_incr(&next_task_id, 1); + + commMaxThreads = chpl_comm_getMaxThreads(); + + // Set up hardware parallelism, the stack size and stack guards, + // tasklocal storage, and work stealing + hwpar = setupAvailableParallelism(commMaxThreads); + setupCallStacks(hwpar); + setupTasklocalStorage(); + setupWorkStealing(); + + if (verbosity >= 2) { chpl_qt_setenv("INFO", "1", 0); } + + // Initialize qthreads + pthread_attr_init(&pAttr); + pthread_attr_setdetachstate(&pAttr, PTHREAD_CREATE_DETACHED); + pthread_create(&initer, &pAttr, initializer, NULL); + while (chpl_qthread_done_initializing == 0) + sched_yield(); + + // Now that Qthreads is up and running, do a sanity check and make sure + // that the number of workers is less than any comm layer limit. 
This is + // mainly need for the case where a user set QT_NUM_SHEPHERDS and/or + // QT_NUM_WORKERS_PER_SHEPHERD in which case we don't impose any limits on + // the number of threads qthreads creates beforehand + assert(0 == commMaxThreads || qthread_num_workers() < commMaxThreads); + + if (blockreport || taskreport) { + if (signal(SIGINT, SIGINT_handler) == SIG_ERR) { + perror("Could not register SIGINT handler"); + } + } +} + +void chpl_task_exit(void) +{ + //for(int i = 0; i < g_num_cpu_kernels; i++) { + for(int i = 0; i < max_num_cpu_kernels; i++) { + if(cpu_kernels_initialized[i]) + atmi_kernel_release(cpu_kernels[i]); + } + atmi_kernel_release(dummy_kernel); + atmi_kernel_release(main_kernel); + atmi_finalize(); +#ifdef CHAPEL_PROFILE + profile_print(); +#endif /* CHAPEL_PROFILE */ + + pthread_key_delete(tls_cache); + if (qthread_shep() == NO_SHEPHERD) { + /* sometimes, tasking is told to shutdown even though it hasn't been + * told to start yet */ + if (chpl_qthread_done_initializing == 1) { + qthread_syncvar_fill(&canexit); + while (done_finalizing == 0) + sched_yield(); + } + } else { + qthread_syncvar_fill(&exit_ret); + } +} + +static inline void wrap_callbacks(chpl_task_cb_event_kind_t event_kind, + chpl_task_bundle_t* bundle) { + if (chpl_task_have_callbacks(event_kind)) { + if (bundle->id == chpl_nullTaskID) + bundle->id = qthread_incr(&next_task_id, 1); + chpl_task_do_callbacks(event_kind, + bundle->requested_fid, + bundle->filename, + bundle->lineno, + bundle->id, + bundle->is_executeOn); + } +} + + +static aligned_t dummy_wrapper() {} + +static aligned_t main_wrapper(void *arg) +{ + chpl_atmi_tls_t *tls = chpl_atmi_get_tasklocal(); + main_wrapper_bundle_t *m_bundle = (main_wrapper_bundle_t*) arg; + chpl_task_bundle_t *bundle = &m_bundle->arg; + chpl_atmi_tls_t pv = {.bundle = bundle}; + + *tls = pv; + + // main call doesn't need countRunning / chpl_taskRunningCntInc + + wrap_callbacks(chpl_task_cb_event_kind_begin, bundle); + + 
(m_bundle->chpl_main)(); + + wrap_callbacks(chpl_task_cb_event_kind_end, bundle); + + return 0; +} + +static aligned_t chapel_wrapper(void *arg) +{ + chpl_atmi_tls_t *tls = chpl_atmi_get_tasklocal(); + chpl_task_bundle_t *bundle = (chpl_task_bundle_t*) arg; + chpl_atmi_tls_t pv = {.bundle = bundle}; + + *tls = pv; + + if (bundle->countRunning) + chpl_taskRunningCntInc(0, 0); + + wrap_callbacks(chpl_task_cb_event_kind_begin, bundle); + + // launch GPU kernel here? + (bundle->requested_fn)(arg); + + wrap_callbacks(chpl_task_cb_event_kind_end, bundle); + + if (bundle->countRunning) + chpl_taskRunningCntDec(0, 0); + + return 0; +} + +typedef struct { + chpl_fn_p fn; + void *arg; +} comm_task_wrapper_info_t; + +static void *comm_task_wrapper(void *arg) +{ + comm_task_wrapper_info_t *rarg = arg; + chpl_moveToLastCPU(); + (*(chpl_fn_p)(rarg->fn))(rarg->arg); + return 0; +} + +// Start the main task. +// +// Warning: this method is not called within a Qthread task context. Do +// not use methods that require task context (e.g., task-local storage). 
+void chpl_task_callMain(void (*chpl_main)(void)) +{ + main_wrapper_bundle_t arg; + + arg.arg.serial_state = false; + arg.arg.countRunning = false; + arg.arg.is_executeOn = false; + arg.arg.requestedSubloc = c_sublocid_any_val; + arg.arg.requested_fid = FID_NONE; + arg.arg.requested_fn = NULL; + arg.arg.requested_fn_name = NULL; + arg.arg.lineno = 0; + arg.arg.filename = CHPL_FILE_IDX_MAIN_TASK; + arg.arg.id = chpl_qthread_process_bundle.id; + arg.chpl_main = chpl_main; + + wrap_callbacks(chpl_task_cb_event_kind_create, &arg.arg); + + int cpu_id = 0;//subloc; + ATMI_LPARM_CPU(lparm, cpu_id); + lparm->kernel_id = CPU_FUNCTION_IMPL; + lparm->synchronous = ATMI_TRUE; + void *kernargs[1]; + //void *arg1 = (void *)&arg; + kernargs[0] = (void *)&arg; + //atmi_task_launch(lparm, main_kernel, kernargs); + + main_wrapper(&arg); + //qthread_fork_syncvar_copyargs(main_wrapper, &arg, sizeof(arg), &exit_ret); + //qthread_syncvar_readFF(NULL, &exit_ret); +} + +void chpl_task_stdModulesInitialized(void) +{ + // + // It's not safe to call the module code to count the main task as + // running until after the modules have been initialized. That's + // when this function is called, so now count the main task. + // + canCountRunningTasks = true; + chpl_taskRunningCntInc(0, 0); +} + +int chpl_task_createCommTask(chpl_fn_p fn, + void *arg) +{ + // + // The wrapper info must be static because it won't be referred to + // until the new pthread calls comm_task_wrapper(). And, it is + // safe for it to be static because we will be called at most once + // on each node. 
+ // + static + comm_task_wrapper_info_t wrapper_info; + wrapper_info.fn = fn; + wrapper_info.arg = arg; + return pthread_create(&chpl_qthread_comm_pthread, + NULL, comm_task_wrapper, &wrapper_info); +} + +void *chpl_taskGroupGet() { + void *ret = (void *)get_atmi_task_group(); + return ret; +} + +void *chpl_taskGroupInit(int lineno, int32_t filename) { + atmi_task_group_t *tg = (atmi_task_group_t *)chpl_malloc(sizeof(atmi_task_group_t)); + // TODO: add to a list of task groups and free all of them at the very end. + int next_id = atomic_fetch_add_explicit_uint_least64_t(&atmi_tg_id, 1, memory_order_relaxed); + // loop around the task groups. + // FIXME: how will this affect the main task that is not an ATMI task? Incr by 1 after this? + next_id %= max_num_task_groups; + tg->id = next_id; + tg->ordered = ATMI_FALSE; + return tg; +} + +void chpl_taskGroupComplete() { + // no-op in ATMI. down count in other tasking layers +} + +void chpl_taskGroupFinalize(void *tg) { + int cpu_id = 0; // which subloc? 
+ atmi_task_group_t *cur_tg = get_atmi_task_group(); + + if(cur_tg) { + // if I am within a task, provide a chain dependency to the incoming group + ATMI_LPARM_CPU(lparm, cpu_id); + lparm->kernel_id = CPU_FUNCTION_IMPL; + lparm->groupable = ATMI_TRUE; + lparm->group = cur_tg; + lparm->num_required_groups = 1; + lparm->required_groups = (atmi_task_group_t **)&tg; + atmi_task_launch(lparm, dummy_kernel, NULL); + } + else { + // if I am not within a task (main task), simply sync + atmi_task_group_sync((atmi_task_group_t *)tg); + } + //chpl_free(tg); +} + +void chpl_task_addToTaskList(chpl_fn_int_t fid, + chpl_task_bundle_t *arg, + size_t arg_size, + c_sublocid_t subloc, + void **task_list, + int32_t task_list_locale, + chpl_bool is_begin_stmt, + void *task_group, + int lineno, + int32_t filename) +{ + //printf("Adding %d fn to task list\n", fid); + chpl_bool serial_state = chpl_task_getSerial(); + chpl_fn_p requested_fn = chpl_ftable[fid]; + + assert(subloc != c_sublocid_none); + + PROFILE_INCR(profile_task_addToTaskList,1); + + if (serial_state) { + // call the function directly. 
+ requested_fn(arg); + } else { + arg->serial_state = false; + arg->countRunning = false; + arg->is_executeOn = false; + arg->requestedSubloc = subloc; + arg->requested_fid = fid; + arg->requested_fn = requested_fn; + arg->requested_fn_name = NULL; + arg->lineno = lineno; + arg->filename = filename; + arg->id = chpl_nullTaskID; + + wrap_callbacks(chpl_task_cb_event_kind_create, arg); + + int cpu_id = 0;//subloc; + ATMI_LPARM_CPU(lparm, cpu_id); + lparm->kernel_id = CPU_FUNCTION_IMPL; + //lparm->synchronous = ATMI_TRUE; + lparm->groupable = ATMI_TRUE; + if(task_group) { + lparm->group = (atmi_task_group_t *)task_group; + } + + void *kernargs[1]; + if(!cpu_kernels_initialized[fid]) { + atmi_kernel_create_empty(&cpu_kernels[fid], 1, &arg_size); + atmi_kernel_add_cpu_impl(cpu_kernels[fid], (atmi_generic_fp)chapel_wrapper, CPU_FUNCTION_IMPL); + cpu_kernels_initialized[fid] = 1; + } + kernargs[0] = (void *)arg; + atmi_task_launch(lparm, cpu_kernels[fid], kernargs); + /*if (subloc == c_sublocid_any) { + qthread_fork_copyargs(chapel_wrapper, arg, arg_size, NULL); + } else { + qthread_fork_copyargs_to(chapel_wrapper, arg, arg_size, + NULL, (qthread_shepherd_id_t) subloc); + }*/ + } +} + +void chpl_task_executeTasksInList(void **task_list) +{ + PROFILE_INCR(profile_task_executeTasksInList,1); +} + +static inline void taskCallBody(chpl_fn_int_t fid, chpl_fn_name fname, chpl_fn_p fp, + void *arg, size_t arg_size, + c_sublocid_t subloc, chpl_bool serial_state, + int lineno, int32_t filename) +{ + chpl_task_bundle_t *bundle = (chpl_task_bundle_t*) arg; + + bundle->serial_state = serial_state; + bundle->countRunning = canCountRunningTasks; + bundle->is_executeOn = true; + bundle->requestedSubloc = subloc; + bundle->requested_fid = fid; + bundle->requested_fn = fp; + bundle->requested_fn_name = fname; + bundle->lineno = lineno; + bundle->filename = filename; + bundle->id = chpl_nullTaskID; + + wrap_callbacks(chpl_task_cb_event_kind_create, bundle); + + int cpu_id = 0;//subloc; + 
ATMI_LPARM_CPU(lparm, cpu_id); + lparm->kernel_id = CPU_FUNCTION_IMPL; + //lparm->synchronous = ATMI_TRUE; + void *kernargs[1]; + if(!cpu_kernels_initialized[fid]) { + atmi_kernel_create_empty(&cpu_kernels[fid], 1, &arg_size); + atmi_kernel_add_cpu_impl(cpu_kernels[fid], (atmi_generic_fp)chapel_wrapper, CPU_FUNCTION_IMPL); + cpu_kernels_initialized[fid] = 1; + } + kernargs[0] = (void *)arg; + atmi_task_launch(lparm, cpu_kernels[fid], kernargs); + /*if (subloc < 0) { + qthread_fork_copyargs(chapel_wrapper, arg, arg_size, NULL); + } else { + qthread_fork_copyargs_to(chapel_wrapper, arg, arg_size, + NULL, (qthread_shepherd_id_t) 0); + }*/ +} + +void chpl_task_taskCallFTable(chpl_fn_int_t fid, + chpl_task_bundle_t *arg, size_t arg_size, + c_sublocid_t subloc, + int lineno, int32_t filename) +{ + PROFILE_INCR(profile_task_taskCall,1); + + taskCallBody(fid, NULL, chpl_ftable[fid], arg, arg_size, subloc, false, lineno, filename); +} + +void chpl_task_startMovedTask(chpl_fn_int_t fid, + chpl_fn_p fp, + chpl_task_bundle_t *arg, + size_t arg_size, + c_sublocid_t subloc, + chpl_taskID_t id, + chpl_bool serial_state) +{ + // + // For now the incoming task ID is simply dropped, though we check + // to make sure the caller wasn't expecting us to do otherwise. If + // we someday make task IDs global we will need to be able to set + // the ID of this moved task. + // + assert(id == chpl_nullTaskID); + + PROFILE_INCR(profile_task_startMovedTask,1); + + taskCallBody(fid, NULL, fp, arg, arg_size, subloc, serial_state, 0, CHPL_FILE_IDX_UNKNOWN); +} + +// +// chpl_task_getSubloc() is in tasks-qthreads.h +// + +// +// chpl_task_setSubloc() is in tasks-qthreads.h +// + +// +// chpl_task_getRequestedSubloc() is in tasks-qthreads.h +// + + +// Returns '(unsigned int)-1' if called outside of the tasking layer. 
+chpl_taskID_t chpl_task_getId(void) +{ + PROFILE_INCR(profile_task_getId,1); + + atmi_task_handle_t t = get_atmi_task_handle(); + // Rationale to return 0 instead of chpl_nullTaskID: ATMI returns -1 as the NULL + // task, but we maintain 0 as the null task in chpl-atmi. + // Impl: chpl-atmi does not create a new task for the main wrapper. Instead it + // creates a dummy ATMI task in the beginning but does not launch/activate + // it. This helps us recognize 0 as the main/dummy task and not -1 that is + // expected by ATMI as the NULL task. + // FIXME: this could go against the chpl design of having *every* task being + // launched, including the main wrapper task. Any problems with that? + if(t == ATMI_NULL_TASK_HANDLE) + return (chpl_taskID_t) 0; + + return t; +} + +void chpl_task_sleep(double secs) +{ + if (qthread_shep() == NO_SHEPHERD) { + struct timeval deadline; + struct timeval now; + + // + // Figure out when this task can proceed again, and until then, keep + // yielding. + // + gettimeofday(&deadline, NULL); + deadline.tv_usec += (suseconds_t) lround((secs - trunc(secs)) * 1.0e6); + if (deadline.tv_usec > 1000000) { + deadline.tv_sec++; + deadline.tv_usec -= 1000000; + } + deadline.tv_sec += (time_t) trunc(secs); + + do { + chpl_task_yield(); + gettimeofday(&now, NULL); + } while (now.tv_sec < deadline.tv_sec + || (now.tv_sec == deadline.tv_sec + && now.tv_usec < deadline.tv_usec)); + } else { + qtimer_t t = qtimer_create(); + qtimer_start(t); + do { + qthread_yield(); + qtimer_stop(t); + } while (qtimer_secs(t) < secs); + qtimer_destroy(t); + } +} + +/* The get- and setSerial() methods assume the beginning of the task-local + * data segment holds a chpl_bool denoting the serial state. 
*/ +#if 0 +chpl_bool get_serial(chpl_taskID_t id) { + if(g_task_map.find(id) == g_task_map.end()) + return true; + else + return g_task_map[id]; +} +#endif + +chpl_bool chpl_task_getSerial(void) +{ + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); + + PROFILE_INCR(profile_task_getSerial,1); + + return data->bundle->serial_state; +} + +void chpl_task_setSerial(chpl_bool state) +{ + chpl_atmi_tls_t * data = chpl_atmi_get_tasklocal(); + data->bundle->serial_state = state; + + PROFILE_INCR(profile_task_setSerial,1); +} + +uint32_t chpl_task_getMaxPar(void) { + //chpl_internal_error("Qthreads max tasks par asked\n"); + //printf("Qthreads max task par asked\n"); + // + // We assume here that the caller (in the LocaleModel module code) + // is interested in the number of workers on the whole node, and + // will decide itself how much parallelism to create across and + // within sublocales, if there are any. + // + return qthread_num_workers(); + //return (uint32_t) g_machine->devices_by_type[ATMI_DEVTYPE_CPU][0].core_count; +} + +c_sublocid_t chpl_task_getNumSublocales(void) +{ + // FIXME: What we really want here is the number of NUMA + // sublocales we are supporting. For now we use the number of + // shepherds as a proxy for that. + // return (c_sublocid_t) qthread_num_shepherds(); + return (c_sublocid_t) g_machine->device_count_by_type[ATMI_DEVTYPE_ALL]; +} + +size_t chpl_task_getCallStackSize(void) +{ + PROFILE_INCR(profile_task_getCallStackSize,1); + + return qthread_readstate(STACK_SIZE); +} + +// XXX: Should probably reflect all shepherds +uint32_t chpl_task_getNumQueuedTasks(void) +{ + return qthread_readstate(NODE_BUSYNESS); +} + +uint32_t chpl_task_getNumRunningTasks(void) +{ + chpl_internal_error("chpl_task_getNumRunningTasks() called"); + return 1; +} + +int32_t chpl_task_getNumBlockedTasks(void) +{ + // This isn't accurate, but in the absence of better information + // it's the best we can do. 
+ return 0; +} + +// Threads + +uint32_t chpl_task_getNumThreads(void) +{ + return qthread_num_workers(); + //return (uint32_t) g_machine->devices_by_type[ATMI_DEVTYPE_CPU][0].core_count; +} + +// Ew. Talk about excessive bookkeeping. +uint32_t chpl_task_getNumIdleThreads(void) +{ + return 0; +} + +/* vim:set expandtab: */ diff --git a/util/printchplenv b/util/printchplenv index e66b1d2694..5ad79dff86 100755 --- a/util/printchplenv +++ b/util/printchplenv @@ -175,6 +175,8 @@ def print_mode(mode='list', anonymize=False): mode, filters=('runtime',)) if tasks == 'qthreads': link_args_3p.extend(chpl_3p_qthreads_configs.get_link_args()) + if tasks == 'atmi': + link_args_3p.extend(chpl_3p_qthreads_configs.get_link_args()) print_var(' CHPL_RE2_UNIQ_CFG_PATH', chpl_3p_re2_configs.get_uniq_cfg_path(), diff --git a/util/setchplenv_hsa.bash b/util/setchplenv_hsa.bash index cfd76b1c41..0dee283cc9 100644 --- a/util/setchplenv_hsa.bash +++ b/util/setchplenv_hsa.bash @@ -61,8 +61,8 @@ export CHPL_COMM=none echo " to none" echo -n "Setting CHPL_TASKS" -export CHPL_TASKS=qthreads -echo " to qthreads" +export CHPL_TASKS=atmi +echo " to atmi" echo -n "Setting CHPL_ATOMICS" export CHPL_ATOMICS=intrinsics @@ -92,6 +92,10 @@ echo -n "Disabling NUMA" export CHPL_HWLOC_CFG_OPTIONS=" --disable-libnuma" echo " done" +echo -n "Setting CHPL_HWLOC" +export CHPL_HWLOC=hwloc +echo " to hwloc" + echo -n "Disabling LLVM support" export CHPL_LLVM=none echo " done" @@ -105,6 +109,10 @@ if [ "$1" == "debug" ]; then export CHPL_DEBUG=1 fi +echo -n "Setting CHPL_ROCM" +export CHPL_ROCM=1 +echo " to 1" + echo -n "Setting CHPL_LOCALE_MODEL" export CHPL_LOCALE_MODEL=hsa echo " to hsa"