diff --git a/src/hamiltonian/energy.hpp b/src/hamiltonian/energy.hpp
index c35a6ea4..65efb6aa 100644
--- a/src/hamiltonian/energy.hpp
+++ b/src/hamiltonian/energy.hpp
@@ -78,12 +78,8 @@ namespace hamiltonian {
 					eigenvalues_ += occ_sum(el.occupations()[iphi], el.eigenvalues()[iphi]);
 				}
 
-				{
-					CALI_CXX_MARK_SCOPE("energy::calculate::non_local");
-					auto nl_me = operations::overlap_diagonal_normalized(ham.non_local(phi), phi);
-					non_local_ += occ_sum(el.occupations()[iphi], nl_me);
-				}
-
+				non_local_ += ham.non_local_energy(phi, el.occupations()[iphi], /*reduce_states = */ false);
+				
 				if(ham.exchange().enabled()){
 					CALI_CXX_MARK_SCOPE("energy::calculate::exchange");
 					auto exchange_me = operations::overlap_diagonal_normalized(ham.exchange()(phi), phi);
diff --git a/src/hamiltonian/ks_hamiltonian.hpp b/src/hamiltonian/ks_hamiltonian.hpp
index 8aa823b5..ab859421 100644
--- a/src/hamiltonian/ks_hamiltonian.hpp
+++ b/src/hamiltonian/ks_hamiltonian.hpp
@@ -49,7 +49,30 @@ class ks_hamiltonian {
 	std::unordered_map<std::string, projector_fourier> projectors_fourier_map_;
 	std::vector<std::unordered_map<std::string, projector_fourier>::iterator> projectors_fourier_;
 	states::ks_states states_;
-	
+
+#ifdef ENABLE_CUDA
+public:
+#endif
+		
+		template <typename OccType, typename ArrayType>
+		struct occ_sum_func {
+			OccType   occ;
+			ArrayType arr;
+
+			GPU_FUNCTION double operator()(long ip) const {
+				return occ[ip]*real(arr[ip]);
+			}
+		};
+		
+		template <typename OccType, typename ArrayType>
+		static double occ_sum(OccType const & occupations, ArrayType const & array) {
+			CALI_CXX_MARK_FUNCTION;
+			
+			assert(occupations.size() == array.size());
+			auto func = occ_sum_func<decltype(begin(occupations)), decltype(begin(array))>{begin(occupations), begin(array)};
+			return gpu::run(gpu::reduce(array.size()), func);
+		}
+
 public:
 	
 	void update_projectors(const basis::real_space & basis, const atomic_potential & pot, systems::ions const & ions){
@@ -98,7 +121,7 @@ class ks_hamiltonian {
 			it->second(phi, vnlphi);
 		}
 	}
-		
+	
 	////////////////////////////////////////////////////////////////////////////////////////////
 		
 	auto non_local(const states::orbital_set<basis::real_space, complex> & phi) const {
@@ -129,6 +152,26 @@ class ks_hamiltonian {
 
 	////////////////////////////////////////////////////////////////////////////////////////////
 
+	template <typename Occupations>
+	auto non_local_energy(states::orbital_set<basis::real_space, complex> const & phi, Occupations const & occupations, bool const reduce_states = true) const {
+
+		CALI_CXX_MARK_FUNCTION;
+
+		if(non_local_in_fourier_) {
+			
+			auto nl_me = operations::overlap_diagonal_normalized(non_local(phi), phi);
+			auto en = occ_sum(occupations, nl_me);
+			if(reduce_states and phi.set_comm().size() > 1) phi.set_comm().all_reduce_in_place_n(&en, 1, std::plus<>{});
+			return en;
+
+		} else {
+			return projectors_all_.energy(phi, phi.kpoint() + uniform_vector_potential_, occupations, reduce_states);
+		}
+		
+	}
+	
+	////////////////////////////////////////////////////////////////////////////////////////////
+
 	auto operator()(const states::orbital_set<basis::real_space, complex> & phi) const {
 			
 		CALI_CXX_MARK_SCOPE("hamiltonian_real");
diff --git a/src/hamiltonian/projector_all.hpp b/src/hamiltonian/projector_all.hpp
index 203e7492..1ee88a67 100644
--- a/src/hamiltonian/projector_all.hpp
+++ b/src/hamiltonian/projector_all.hpp
@@ -244,6 +244,104 @@ class projector_all {
 	};
 
 	////////////////////////////////////////////////////////////////////////////////////////////
+
+	template <typename Projections, typename Coeff, typename Occupations>
+	struct energy_reduction {
+		Projections proj;
+		Coeff coe;
+		Occupations occ;
+		long spinor_size;
+
+		GPU_FUNCTION auto operator()(long ist, long ilm, long iproj) const {
+			auto ist_spinor = ist%spinor_size;
+			auto pp = proj[iproj][ilm][ist];
+			return real(conj(pp)*pp)*coe[iproj][ilm]*occ[ist_spinor];
+		}
+		
+	};
+	
+	////////////////////////////////////////////////////////////////////////////////////////////
+
+	template <typename KpointType, typename Occupations>
+	double energy(states::orbital_set<basis::real_space, complex> const & phi, KpointType const & kpoint, Occupations const & occupations, bool const reduce_states = true) const {
+    
+		gpu::array<complex, 3> sphere_phi_all({nprojs_, max_sphere_size_, phi.local_set_size()});
+		gpu::array<complex, 3> projections_all({nprojs_, max_nlm_, phi.local_set_size()}, 0.0);
+
+		{ CALI_CXX_MARK_SCOPE("projector::gather");
+				
+			gpu::run(phi.local_set_size(), max_sphere_size_, nprojs_,
+							 [sgr = begin(sphere_phi_all), gr = begin(phi.hypercubic()), poi = begin(points_), pos = begin(positions_), kpoint] GPU_LAMBDA (auto ist, auto ipoint, auto iproj){
+								 if(poi[iproj][ipoint][0] >= 0){
+									 auto phase = polar(1.0, dot(kpoint, pos[iproj][ipoint]));
+									 sgr[iproj][ipoint][ist] = phase*gr[poi[iproj][ipoint][0]][poi[iproj][ipoint][1]][poi[iproj][ipoint][2]][ist];
+								 } else {
+									 sgr[iproj][ipoint][ist] = complex(0.0, 0.0);
+								 }
+							 });
+		}
+
+#ifndef ENABLE_CUDA
+		for(auto iproj = 0; iproj < nprojs_; iproj++){
+			CALI_CXX_MARK_SCOPE("projector_gemm_1");
+
+			if(locally_empty_[iproj]) continue;
+			
+			namespace blas = boost::multi::blas;
+			blas::real_doubled(projections_all[iproj]) = blas::gemm(phi.basis().volume_element(), matrices_[iproj], blas::real_doubled(sphere_phi_all[iproj]));
+		}
+#else
+		if(max_sphere_size_ > 0) {
+			CALI_CXX_MARK_SCOPE("projector_gemm_1");			
+
+			const double zero = 0.0;
+			const double vol = phi.basis().volume_element();
+
+			auto status = cublasDgemmStridedBatched(/*cublasHandle_t handle = */ boost::multi::cuda::cublas::context::get_instance().get(),
+																							/*cublasOperation_t transa = */ CUBLAS_OP_N,
+																							/*cublasOperation_t transb = */ CUBLAS_OP_N,
+																							/*int m = */ 2*phi.local_set_size(),
+																							/*int n = */ max_nlm_,
+																							/*int k = */ max_sphere_size_,
+																							/*const double *alpha = */ &vol,
+																							/*const double *A = */ reinterpret_cast<double const *>(raw_pointer_cast(sphere_phi_all.data_elements())),
+																							/*int lda = */ 2*phi.local_set_size(),
+																							/*long long int strideA = */ 2*max_sphere_size_*phi.local_set_size(),
+																							/*const double *B = */ raw_pointer_cast(matrices_.data_elements()),
+																							/*int ldb = */ max_sphere_size_,
+																							/*long long int strideB =*/ max_nlm_*max_sphere_size_,
+																							/*const double *beta = */ &zero,
+																							/*double *C = */ reinterpret_cast<double *>(raw_pointer_cast(projections_all.data_elements())),
+																							/*int ldc = */ 2*phi.local_set_size(),
+																							/*long long int strideC = */ 2*max_nlm_*phi.local_set_size(),
+																							/*int batchCount = */ nprojs_);
+			gpu::sync();
+			
+			assert(status == CUBLAS_STATUS_SUCCESS);
+			
+		}
+#endif
+
+		if(phi.basis().comm().size() > 1) {
+			CALI_CXX_MARK_SCOPE("projector_all::energy::reduce_basis");
+			phi.basis().comm().all_reduce_in_place_n(raw_pointer_cast(projections_all.data_elements()), projections_all.num_elements(), std::plus<>{});
+		}
+		
+		auto en = gpu::run(gpu::reduce(phi.local_set_size()), gpu::reduce(max_nlm_), gpu::reduce(nprojs_),
+											 energy_reduction<decltype(begin(projections_all)), decltype(begin(coeff_)), decltype(begin(occupations))>
+											 {begin(projections_all), begin(coeff_), begin(occupations), phi.local_spinor_set_size()});
+		
+		if(reduce_states and phi.set_comm().size() > 1) {
+			CALI_CXX_MARK_SCOPE("projector_all::energy::reduce_states");
+			phi.set_comm().all_reduce_in_place_n(&en, 1, std::plus<>{});
+		}
+
+		return en;
+			
+	}
+
+	////////////////////////////////////////////////////////////////////////////////////////////
+
 	
 	template <typename PhiType, typename GPhiType, typename MetricType, typename OccsType>
 	void force(PhiType & phi, GPhiType const & gphi, MetricType const & metric,