diff --git a/src/hamiltonian/energy.hpp b/src/hamiltonian/energy.hpp index c35a6ea4..65efb6aa 100644 --- a/src/hamiltonian/energy.hpp +++ b/src/hamiltonian/energy.hpp @@ -78,12 +78,8 @@ namespace hamiltonian { eigenvalues_ += occ_sum(el.occupations()[iphi], el.eigenvalues()[iphi]); } - { - CALI_CXX_MARK_SCOPE("energy::calculate::non_local"); - auto nl_me = operations::overlap_diagonal_normalized(ham.non_local(phi), phi); - non_local_ += occ_sum(el.occupations()[iphi], nl_me); - } - + non_local_ += ham.non_local_energy(phi, el.occupations()[iphi], /*reduce_states = */ false); + if(ham.exchange().enabled()){ CALI_CXX_MARK_SCOPE("energy::calculate::exchange"); auto exchange_me = operations::overlap_diagonal_normalized(ham.exchange()(phi), phi); diff --git a/src/hamiltonian/ks_hamiltonian.hpp b/src/hamiltonian/ks_hamiltonian.hpp index 8aa823b5..ab859421 100644 --- a/src/hamiltonian/ks_hamiltonian.hpp +++ b/src/hamiltonian/ks_hamiltonian.hpp @@ -49,7 +49,30 @@ class ks_hamiltonian { std::unordered_map projectors_fourier_map_; std::vector::iterator> projectors_fourier_; states::ks_states states_; - + +#ifdef ENABLE_CUDA +public: +#endif + + template + struct occ_sum_func { + OccType occ; + ArrayType arr; + + GPU_FUNCTION double operator()(long ip) const { + return occ[ip]*real(arr[ip]); + } + }; + + template + static double occ_sum(OccType const & occupations, ArrayType const & array) { + CALI_CXX_MARK_FUNCTION; + + assert(occupations.size() == array.size()); + auto func = occ_sum_func{begin(occupations), begin(array)}; + return gpu::run(gpu::reduce(array.size()), func); + } + public: void update_projectors(const basis::real_space & basis, const atomic_potential & pot, systems::ions const & ions){ @@ -98,7 +121,7 @@ class ks_hamiltonian { it->second(phi, vnlphi); } } - + //////////////////////////////////////////////////////////////////////////////////////////// auto non_local(const states::orbital_set & phi) const { @@ -129,6 +152,26 @@ class ks_hamiltonian { //////////////////////////////////////////////////////////////////////////////////////////// + template + auto non_local_energy(states::orbital_set const & phi, Occupations const & occupations, bool const reduce_states = true) const { + + CALI_CXX_MARK_FUNCTION; + + if(non_local_in_fourier_) { + + auto nl_me = operations::overlap_diagonal_normalized(non_local(phi), phi); + auto en = occ_sum(occupations, nl_me); + if(reduce_states and phi.set_comm().size() > 1) phi.set_comm().all_reduce_in_place_n(&en, 1, std::plus<>{}); + return en; + + } else { + return projectors_all_.energy(phi, phi.kpoint() + uniform_vector_potential_, occupations, reduce_states); + } + + } + + //////////////////////////////////////////////////////////////////////////////////////////// + auto operator()(const states::orbital_set & phi) const { CALI_CXX_MARK_SCOPE("hamiltonian_real"); diff --git a/src/hamiltonian/projector_all.hpp b/src/hamiltonian/projector_all.hpp index 203e7492..1ee88a67 100644 --- a/src/hamiltonian/projector_all.hpp +++ b/src/hamiltonian/projector_all.hpp @@ -244,6 +244,104 @@ class projector_all { }; //////////////////////////////////////////////////////////////////////////////////////////// + + template + struct energy_reduction { + Projections proj; + Coeff coe; + Occupations occ; + long spinor_size; + + GPU_FUNCTION auto operator()(long ist, long ilm, long iproj) const { + auto ist_spinor = ist%spinor_size; + auto pp = proj[iproj][ilm][ist]; + return real(conj(pp)*pp)*coe[iproj][ilm]*occ[ist_spinor]; + } + + }; + + //////////////////////////////////////////////////////////////////////////////////////////// + + template + double energy(states::orbital_set const & phi, KpointType const & kpoint, Occupations const & occupations, bool const reduce_states = true) const { + + gpu::array sphere_phi_all({nprojs_, max_sphere_size_, phi.local_set_size()}); + gpu::array projections_all({nprojs_, max_nlm_, phi.local_set_size()}, 0.0); + + { CALI_CXX_MARK_SCOPE("projector::gather"); + + gpu::run(phi.local_set_size(), max_sphere_size_, nprojs_, + [sgr = begin(sphere_phi_all), gr = begin(phi.hypercubic()), poi = begin(points_), pos = begin(positions_), kpoint] GPU_LAMBDA (auto ist, auto ipoint, auto iproj){ + if(poi[iproj][ipoint][0] >= 0){ + auto phase = polar(1.0, dot(kpoint, pos[iproj][ipoint])); + sgr[iproj][ipoint][ist] = phase*gr[poi[iproj][ipoint][0]][poi[iproj][ipoint][1]][poi[iproj][ipoint][2]][ist]; + } else { + sgr[iproj][ipoint][ist] = complex(0.0, 0.0); + } + }); + } + +#ifndef ENABLE_CUDA + for(auto iproj = 0; iproj < nprojs_; iproj++){ + CALI_CXX_MARK_SCOPE("projector_gemm_1"); + + if(locally_empty_[iproj]) continue; + + namespace blas = boost::multi::blas; + blas::real_doubled(projections_all[iproj]) = blas::gemm(phi.basis().volume_element(), matrices_[iproj], blas::real_doubled(sphere_phi_all[iproj])); + } +#else + if(max_sphere_size_ > 0) { + CALI_CXX_MARK_SCOPE("projector_gemm_1"); + + const double zero = 0.0; + const double vol = phi.basis().volume_element(); + + auto status = cublasDgemmStridedBatched(/*cublasHandle_t handle = */ boost::multi::cuda::cublas::context::get_instance().get(), + /*cublasOperation_t transa = */ CUBLAS_OP_N, + /*cublasOperation_t transb = */ CUBLAS_OP_N, + /*int m = */ 2*phi.local_set_size(), + /*int n = */ max_nlm_, + /*int k = */ max_sphere_size_, + /*const double *alpha = */ &vol, + /*const double *A = */ reinterpret_cast(raw_pointer_cast(sphere_phi_all.data_elements())), + /*int lda = */ 2*phi.local_set_size(), + /*long long int strideA = */ 2*max_sphere_size_*phi.local_set_size(), + /*const double *B = */ raw_pointer_cast(matrices_.data_elements()), + /*int ldb = */ max_sphere_size_, + /*long long int strideB =*/ max_nlm_*max_sphere_size_, + /*const double *beta = */ &zero, + /*double *C = */ reinterpret_cast(raw_pointer_cast(projections_all.data_elements())), + /*int ldc = */ 2*phi.local_set_size(), + /*long long int strideC = */ 2*max_nlm_*phi.local_set_size(), + /*int batchCount = */ nprojs_); + gpu::sync(); + + assert(status == CUBLAS_STATUS_SUCCESS); + + } +#endif + + if(phi.basis().comm().size() > 1) { + CALI_CXX_MARK_SCOPE("projector_all::energy::reduce_basis"); + phi.basis().comm().all_reduce_in_place_n(raw_pointer_cast(projections_all.data_elements()), projections_all.num_elements(), std::plus<>{}); + } + + auto en = gpu::run(gpu::reduce(phi.local_set_size()), gpu::reduce(max_nlm_), gpu::reduce(nprojs_), + energy_reduction + {begin(projections_all), begin(coeff_), begin(occupations), phi.local_spinor_set_size()}); + + if(reduce_states and phi.set_comm().size() > 1) { + CALI_CXX_MARK_SCOPE("projector_all::energy::reduce_states"); + phi.set_comm().all_reduce_in_place_n(&en, 1, std::plus<>{}); + } + + return en; + + } + + //////////////////////////////////////////////////////////////////////////////////////////// + template void force(PhiType & phi, GPhiType const & gphi, MetricType const & metric,