// Copyright 2023 The Abseil Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // ----------------------------------------------------------------------------- // File: prefetch.h // ----------------------------------------------------------------------------- // // This header file defines prefetch functions to prefetch memory contents // into the first level cache (L1) for the current CPU. The prefetch logic // offered in this header is limited to prefetching first level cachelines // only, and is aimed at relatively 'simple' prefetching logic. // #ifndef ABSL_BASE_PREFETCH_H_ #define ABSL_BASE_PREFETCH_H_ #include "absl/base/attributes.h" #include "absl/base/config.h" #if defined(ABSL_INTERNAL_HAVE_SSE) #include #endif #if defined(_MSC_VER) #include #if defined(ABSL_INTERNAL_HAVE_SSE) #pragma intrinsic(_mm_prefetch) #endif #endif namespace absl { ABSL_NAMESPACE_BEGIN // Moves data into the L1 cache before it is read, or "prefetches" it. // // The value of `addr` is the address of the memory to prefetch. If // the target and compiler support it, data prefetch instructions are // generated. If the prefetch is done some time before the memory is // read, it may be in the cache by the time the read occurs. // // This method prefetches data with the highest degree of temporal locality; // data is prefetched where possible into all levels of the cache. // // Incorrect or gratuitous use of this function can degrade performance. // Use this function only when representative benchmarks show an improvement. // // Example: // // // Computes incremental checksum for `data`. // int ComputeChecksum(int sum, absl::string_view data); // // // Computes cumulative checksum for all values in `data` // int ComputeChecksum(absl::Span data) { // int sum = 0; // auto it = data.begin(); // auto pit = data.begin(); // auto end = data.end(); // for (int dist = 8; dist > 0 && pit != data.end(); --dist, ++pit) { // absl::PrefetchToLocalCache(pit->data()); // } // for (; pit != end; ++pit, ++it) { // sum = ComputeChecksum(sum, *it); // absl::PrefetchToLocalCache(pit->data()); // } // for (; it != end; ++it) { // sum = ComputeChecksum(sum, *it); // } // return sum; // } // void PrefetchToLocalCache(const void* addr); // Moves data into the L1 cache before it is read, or "prefetches" it. // // This function is identical to `PrefetchToLocalCache()` except that it has // non-temporal locality: the fetched data should not be left in any of the // cache tiers. This is useful for cases where the data is used only once / // short term, for example, invoking a destructor on an object. // // Incorrect or gratuitous use of this function can degrade performance. // Use this function only when representative benchmarks show an improvement. // // Example: // // template // void DestroyPointers(Iterator begin, Iterator end) { // size_t distance = std::min(8U, bars.size()); // // int dist = 8; // auto prefetch_it = begin; // while (prefetch_it != end && --dist;) { // absl::PrefetchToLocalCacheNta(*prefetch_it++); // } // while (prefetch_it != end) { // delete *begin++; // absl::PrefetchToLocalCacheNta(*prefetch_it++); // } // while (begin != end) { // delete *begin++; // } // } // void PrefetchToLocalCacheNta(const void* addr); // Moves data into the L1 cache with the intent to modify it. // // This function is similar to `PrefetchToLocalCache()` except that it // prefetches cachelines with an 'intent to modify' This typically includes // invalidating cache entries for this address in all other cache tiers, and an // exclusive access intent. // // Incorrect or gratuitous use of this function can degrade performance. As this // function can invalidate cached cachelines on other caches and computer cores, // incorrect usage of this function can have an even greater negative impact // than incorrect regular prefetches. // Use this function only when representative benchmarks show an improvement. // // Example: // // void* Arena::Allocate(size_t size) { // void* ptr = AllocateBlock(size); // absl::PrefetchToLocalCacheForWrite(ptr); // return ptr; // } // void PrefetchToLocalCacheForWrite(const void* addr); #if ABSL_HAVE_BUILTIN(__builtin_prefetch) || defined(__GNUC__) #define ABSL_HAVE_PREFETCH 1 // See __builtin_prefetch: // https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html. // ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCache( const void* addr) { __builtin_prefetch(addr, 0, 3); } ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCacheNta( const void* addr) { __builtin_prefetch(addr, 0, 0); } ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCacheForWrite( const void* addr) { // [x86] gcc/clang don't generate PREFETCHW for __builtin_prefetch(.., 1) // unless -march=broadwell or newer; this is not generally the default, so we // manually emit prefetchw. PREFETCHW is recognized as a no-op on older Intel // processors and has been present on AMD processors since the K6-2. #if defined(__x86_64__) && !defined(__PRFCHW__) asm("prefetchw %0" : : "m"(*reinterpret_cast(addr))); #else __builtin_prefetch(addr, 1, 3); #endif } #elif defined(ABSL_INTERNAL_HAVE_SSE) #define ABSL_HAVE_PREFETCH 1 ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCache( const void* addr) { _mm_prefetch(reinterpret_cast(addr), _MM_HINT_T0); } ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCacheNta( const void* addr) { _mm_prefetch(reinterpret_cast(addr), _MM_HINT_NTA); } ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCacheForWrite( const void* addr) { #if defined(_MM_HINT_ET0) _mm_prefetch(reinterpret_cast(addr), _MM_HINT_ET0); #elif !defined(_MSC_VER) && defined(__x86_64__) // _MM_HINT_ET0 is not universally supported. As we commented further // up, PREFETCHW is recognized as a no-op on older Intel processors // and has been present on AMD processors since the K6-2. We have this // disabled for MSVC compilers as this miscompiles on older MSVC compilers. asm("prefetchw %0" : : "m"(*reinterpret_cast(addr))); #endif } #else ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCache( const void* addr) {} ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCacheNta( const void* addr) {} ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCacheForWrite( const void* addr) {} #endif ABSL_NAMESPACE_END } // namespace absl #endif // ABSL_BASE_PREFETCH_H_