Skip to content

Commit

Permalink
Merge pull request #8884 from iyamazaki/frosch-gpu
Browse files Browse the repository at this point in the history
Frosch: use paralllel-for in compute phase
  • Loading branch information
searhein authored Mar 30, 2021
2 parents ae5bc32 + 766c329 commit 0277006
Show file tree
Hide file tree
Showing 3 changed files with 216 additions and 49 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -126,16 +126,48 @@ namespace FROSch {
XMapPtr serialMap = MapFactory<LO,GO,NO>::Build(AssembledBasisMap_->lib(),totalSize,0,this->SerialComm_);

AssembledBasis_ = MultiVectorFactory<SC,LO,GO,NO >::Build(serialMap,AssembledBasisMap_->getNodeNumElements());
for (UN i=0; i<UnassembledSubspaceBases_.size(); i++) {
if (!UnassembledSubspaceBases_[i].is_null()) {
for (UN j=0; j<UnassembledSubspaceBases_[i]->getNumVectors(); j++) {
ConstSCVecPtr unassembledSubspaceBasesData = UnassembledSubspaceBases_[i]->getData(j);
for (UN k=0; k<UnassembledSubspaceBases_[i]->getLocalLength(); k++) {
FROSCH_ASSERT(itmp<AssembledBasis_->getNumVectors(),"FROSch::CoarseSpace: itmp>=AssembledBasis_->getNumVectors()");
FROSCH_ASSERT(k+Offsets_[i]<AssembledBasis_->getLocalLength(),"FROSch::CoarseSpace: k+Offsets_[i]>=AssembledBasis_->getLocalLength()");
AssembledBasis_->replaceLocalValue(k+Offsets_[i],itmp,unassembledSubspaceBasesData[k]);
#if defined(HAVE_XPETRA_KOKKOS_REFACTOR) && defined(HAVE_XPETRA_TPETRA)
if (AssembledBasis_->getMap()->lib() == UseTpetra) {
UN itmp = 0;
for (UN i=0; i<UnassembledSubspaceBases_.size(); i++) {
if (!UnassembledSubspaceBases_[i].is_null()) {
const UN Offset_i = Offsets_[i];
const UN NumVectors_i = UnassembledSubspaceBases_[i]->getNumVectors();
const UN LocalLength_i = UnassembledSubspaceBases_[i]->getLocalLength();

FROSCH_ASSERT(NumVectors_i+itmp <= AssembledBasis_->getNumVectors(),"FROSch::CoarseSpace: NumVectors_i+itmp <= AssembledBasis_->getNumVectors()");
FROSCH_ASSERT(LocalLength_i+Offsets_[i] <= AssembledBasis_->getLocalLength(),"FROSch::CoarseSpace: LocalLength_i+Offsets_[i] <= AssembledBasis_");

for (UN j=0; j < NumVectors_i; j++) {
auto unassembledSubspaceBasesData = UnassembledSubspaceBases_[i]->getData(j).getRawPtr();
auto assembledSubspaceBasesData = AssembledBasis_->getDataNonConst(itmp+j);

using execution_space = typename XMap::local_map_type::execution_space;
Kokkos::RangePolicy<execution_space> policy (0, LocalLength_i);
Kokkos::parallel_for(
"FROSch_CoarseSpace::assembleCoarseSpace", policy,
KOKKOS_LAMBDA(const UN k) {
assembledSubspaceBasesData[k+Offset_i] = unassembledSubspaceBasesData[k];
});
}
itmp += NumVectors_i;
}
}
Kokkos::fence();
} else
#endif
{
for (UN i=0; i<UnassembledSubspaceBases_.size(); i++) {
if (!UnassembledSubspaceBases_[i].is_null()) {
for (UN j=0; j<UnassembledSubspaceBases_[i]->getNumVectors(); j++) {
ConstSCVecPtr unassembledSubspaceBasesData = UnassembledSubspaceBases_[i]->getData(j);
for (UN k=0; k<UnassembledSubspaceBases_[i]->getLocalLength(); k++) {
FROSCH_ASSERT(itmp<AssembledBasis_->getNumVectors(),"FROSch::CoarseSpace: itmp>=AssembledBasis_->getNumVectors()");
FROSCH_ASSERT(k+Offsets_[i]<AssembledBasis_->getLocalLength(),"FROSch::CoarseSpace: k+Offsets_[i]>=AssembledBasis_->getLocalLength()");
AssembledBasis_->replaceLocalValue(k+Offsets_[i],itmp,unassembledSubspaceBasesData[k]);
}
itmp++;
}
itmp++;
}
}
}
Expand Down Expand Up @@ -171,48 +203,132 @@ namespace FROSch {
FROSCH_ASSERT(!AssembledBasisMap_.is_null(),"FROSch::CoarseSpace: AssembledBasisMap_.is_null().");
FROSCH_ASSERT(!AssembledBasis_.is_null(),"FROSch::CoarseSpace: AssembledBasis_.is_null().");

if (rowMap->lib()==UseEpetra) {
GlobalBasisMatrix_ = MatrixFactory<SC,LO,GO,NO>::Build(rowMap,AssembledBasisMap_->getNodeNumElements()); // Nonzeroes abhängig von dim/dofs!!!
LO iD;
SC valueTmp;
for (UN i=0; i<AssembledBasis_->getLocalLength(); i++) {
GOVec indices;
SCVec values;
for (UN j=0; j<AssembledBasis_->getNumVectors(); j++) {
valueTmp=AssembledBasis_->getData(j)[i];
if (fabs(valueTmp)>treshold) {
indices.push_back(AssembledBasisMap_->getGlobalElement(j));
values.push_back(valueTmp);
#if defined(HAVE_XPETRA_KOKKOS_REFACTOR) && defined(HAVE_XPETRA_TPETRA)
if (rowMap->lib() == UseTpetra) {
UN numRows = AssembledBasis_->getLocalLength();
UN numCols = AssembledBasis_->getNumVectors();

using crsmat_type = typename XMatrix::local_matrix_type;
using graph_type = typename crsmat_type::StaticCrsGraphType;
using rowptr_type = typename graph_type::row_map_type::non_const_type;
using indices_type = typename graph_type::entries_type::non_const_type;
using values_type = typename crsmat_type::values_type::non_const_type;

using execution_space = typename XMap::local_map_type::execution_space;
Kokkos::RangePolicy<execution_space> policy_col (0, numCols);
Kokkos::RangePolicy<execution_space> policy_row (0, numRows);

auto repeatedLocalMap = repeatedMap->getLocalMap();
auto rowLocalMap = rowMap->getLocalMap();
auto AssembledBasisView = AssembledBasis_->getDeviceLocalView();

// count number of nonzeros per row
UN numLocalRows = rowMap->getNodeNumElements();
rowptr_type Rowptr ("Rowptr", numLocalRows+1);
Kokkos::deep_copy(Rowptr, 0);
Kokkos::parallel_for(
"FROSch_CoarseSpace::countGlobalBasisMatrix", policy_row,
KOKKOS_LAMBDA(const UN i) {
LO iD = repeatedLocalMap.getGlobalElement(i);
LO lo = rowLocalMap.getLocalElement(iD);
if (lo != -1) {
for (UN j=0; j<numCols; j++) {
SC valueTmp=AssembledBasisView(i, j);
if (fabs(valueTmp) > treshold) {
Rowptr[lo+1] ++;
}
}
}
}
iD = repeatedMap->getGlobalElement(i);
});
Kokkos::fence();

// make it into offsets
KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum<rowptr_type, execution_space>
(1+numLocalRows, Rowptr);

// fill into the local matrix
UN nnz = Rowptr[numLocalRows];
indices_type Indices ("Indices", nnz);
values_type Values ("Values", nnz);
auto AssembledBasisLocalMap = AssembledBasisMap_->getLocalMap();
Kokkos::parallel_for(
"FROSch_CoarseSpace::fillGlobalBasisMatrix", policy_row,
KOKKOS_LAMBDA(const UN i) {
LO iD = repeatedLocalMap.getGlobalElement(i);
LO lo = rowLocalMap.getLocalElement(iD);
if (lo != -1) { // This should prevent duplicate entries on the interface
UN nnz_i = Rowptr[lo];
for (UN j=0; j<numCols; j++) {
SC valueTmp=AssembledBasisView(i, j);
if (fabs(valueTmp) > treshold) {
Values[nnz_i] = valueTmp;
Indices[nnz_i] = j;

nnz_i ++;
}
}
}
});
Kokkos::fence();

// create local matrix
graph_type crsgraph (Indices, Rowptr);
crsmat_type LocalBasisMatrix = crsmat_type ("CrsMatrix", numCols, Values, crsgraph);

/// build into GlobalBasisMatrix
Teuchos::RCP<Teuchos::ParameterList> params = Teuchos::rcp (new Teuchos::ParameterList());
params->set("sorted", false);
GlobalBasisMatrix_ = MatrixFactory<SC,LO,GO,NO>::Build(LocalBasisMatrix,
rowMap, AssembledBasisMap_,
AssembledBasisMapUnique_, rangeMap,
params);
} else
#endif
{
if (rowMap->lib()==UseEpetra) {
GlobalBasisMatrix_ = MatrixFactory<SC,LO,GO,NO>::Build(rowMap,AssembledBasisMap_->getNodeNumElements()); // Nonzeroes abhängig von dim/dofs!!!
LO iD;
SC valueTmp;
for (UN i=0; i<AssembledBasis_->getLocalLength(); i++) {
GOVec indices;
SCVec values;
for (UN j=0; j<AssembledBasis_->getNumVectors(); j++) {
valueTmp=AssembledBasis_->getData(j)[i];
if (fabs(valueTmp)>treshold) {
indices.push_back(AssembledBasisMap_->getGlobalElement(j));
values.push_back(valueTmp);
}
}
iD = repeatedMap->getGlobalElement(i);

if (rowMap->getLocalElement(iD)!=-1) { // This should prevent duplicate entries on the interface
GlobalBasisMatrix_->insertGlobalValues(iD,indices(),values());
}
}
} else {
GlobalBasisMatrix_ = MatrixFactory<SC,LO,GO,NO>::Build(rowMap,AssembledBasisMap_,AssembledBasisMap_->getNodeNumElements()); // Nonzeroes abhängig von dim/dofs!!!
LO iD;
SC valueTmp;
for (UN i=0; i<AssembledBasis_->getLocalLength(); i++) {
LOVec indices;
SCVec values;
for (UN j=0; j<AssembledBasis_->getNumVectors(); j++) {
valueTmp=AssembledBasis_->getData(j)[i];
if (fabs(valueTmp)>treshold) {
indices.push_back(j);
values.push_back(valueTmp);
if (rowMap->getLocalElement(iD)!=-1) { // This should prevent duplicate entries on the interface
GlobalBasisMatrix_->insertGlobalValues(iD,indices(),values());
}
}
iD = rowMap->getLocalElement(repeatedMap->getGlobalElement(i));
} else {
GlobalBasisMatrix_ = MatrixFactory<SC,LO,GO,NO>::Build(rowMap,AssembledBasisMap_,AssembledBasisMap_->getNodeNumElements()); // Nonzeroes abhängig von dim/dofs!!!
LO iD;
SC valueTmp;
for (UN i=0; i<AssembledBasis_->getLocalLength(); i++) {
LOVec indices;
SCVec values;
for (UN j=0; j<AssembledBasis_->getNumVectors(); j++) {
valueTmp=AssembledBasis_->getData(j)[i];
if (fabs(valueTmp)>treshold) {
indices.push_back(j);
values.push_back(valueTmp);
}
}
iD = rowMap->getLocalElement(repeatedMap->getGlobalElement(i));

if (iD!=-1) { // This should prevent duplicate entries on the interface
GlobalBasisMatrix_->insertLocalValues(iD,indices(),values());
if (iD!=-1) { // This should prevent duplicate entries on the interface
GlobalBasisMatrix_->insertLocalValues(iD,indices(),values());
}
}
}
GlobalBasisMatrix_->fillComplete(AssembledBasisMapUnique_,rangeMap); //RCP<FancyOStream> fancy = fancyOStream(rcpFromRef(cout)); GlobalBasisMatrix_->describe(*fancy,VERB_EXTREME);
}
GlobalBasisMatrix_->fillComplete(AssembledBasisMapUnique_,rangeMap); //RCP<FancyOStream> fancy = fancyOStream(rcpFromRef(cout)); GlobalBasisMatrix_->describe(*fancy,VERB_EXTREME);

return 0;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,27 @@ namespace FROSch {

XMapPtr computeCoarseSpace(CoarseSpacePtr coarseSpace);

#if defined(HAVE_XPETRA_KOKKOS_REFACTOR) && defined(HAVE_XPETRA_TPETRA)
template<class GOIndView>
struct CopyPhiDataFunctor
{
CopyPhiDataFunctor(SCVecPtr data_out_, ConstSCVecPtr data_in_, GOIndView indices_) :
data_out(data_out_),
data_in (data_in_),
indices (indices_)
{}

KOKKOS_INLINE_FUNCTION
void operator()(const int k) const {
data_out[indices[k]] = data_in[k];
}

SCVecPtr data_out;
ConstSCVecPtr data_in;
GOIndView indices;
};
#endif

protected:

int intializeCoarseMap();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -770,14 +770,44 @@ namespace FROSch {
}
}

for (UN j=0; j<numLocalBlockColumns[i]; j++) {
ConstSCVecPtr mVPhiIData = mVPhiI->getData(itmp);
for (UN ii=0; ii<extensionBlocks.size(); ii++) {
for (LO k=bound[extensionBlocks[ii]]; k<bound[extensionBlocks[ii]+1]; k++) {
mVPhi->replaceLocalValue(indicesIDofsAll[k],itmp,mVPhiIData[k]);
#if defined(HAVE_XPETRA_KOKKOS_REFACTOR) && defined(HAVE_XPETRA_TPETRA)
if (mVPhi->getMap()->lib() == UseTpetra) {
using XMap = typename SchwarzOperator<SC,LO,GO,NO>::XMap;
using execution_space = typename XMap::local_map_type::execution_space;

// explicitly copying indicesIDofsAll in Teuchos::Array to Kokkos::View on "device"
using GOIndViewHost = Kokkos::View<GO*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
using GOIndView = Kokkos::View<GO*, execution_space>;
UN numIndices = indicesIDofsAll.size();
GOIndViewHost indicesIDofsAllHostData(indicesIDofsAll.getRawPtr(), numIndices);
GOIndView indicesIDofsAllData ("indicesIDofsAllData", numIndices);
Kokkos::deep_copy(indicesIDofsAllData, indicesIDofsAllHostData);

for (UN j=0; j<numLocalBlockColumns[i]; j++) {
auto mVPhiIData = mVPhiI->getData(itmp);
auto mVPhiData = mVPhi->getDataNonConst(itmp);

for (UN ii=0; ii<extensionBlocks.size(); ii++) {
Kokkos::RangePolicy<execution_space> policy (bound[extensionBlocks[ii]], bound[extensionBlocks[ii]+1]);
CopyPhiDataFunctor<GOIndView> functor(mVPhiData, mVPhiIData, indicesIDofsAllData);
Kokkos::parallel_for(
"FROSch_HarmonicCoarseOperator::fillPhiData", policy, functor);
}
itmp++;
}
Kokkos::fence();
} else
#endif
{
for (UN j=0; j<numLocalBlockColumns[i]; j++) {
ConstSCVecPtr mVPhiIData = mVPhiI->getData(itmp);
for (UN ii=0; ii<extensionBlocks.size(); ii++) {
for (LO k=bound[extensionBlocks[ii]]; k<bound[extensionBlocks[ii]+1]; k++) {
mVPhi->replaceLocalValue(indicesIDofsAll[k],itmp,mVPhiIData[k]);
}
}
itmp++;
}
itmp++;
}
}
return mVPhi;
Expand Down

0 comments on commit 0277006

Please sign in to comment.