/* Copyright 2018 The Chromium Authors. All rights reserved. * Use of this source code is governed by a BSD-style license that can be * found in the Chromium source repository LICENSE file. */ #ifndef __SLIDE_HASH__NEON__ #define __SLIDE_HASH__NEON__ #include "deflate.h" #include inline static void ZLIB_INTERNAL neon_slide_hash_update(Posf *hash, const uInt hash_size, const ush w_size) { /* NEON 'Q' registers allow to store 128 bits, so we can load 8x16-bits * values. For further details, check: * ARM DHT 0002A, section 1.3.2 NEON Registers. */ const size_t chunk = sizeof(uint16x8_t) / sizeof(uint16_t); /* Unrolling the operation yielded a compression performance boost in both * ARMv7 (from 11.7% to 13.4%) and ARMv8 (from 3.7% to 7.5%) for HTML4 * content. For full benchmarking data, check: http://crbug.com/863257. */ const size_t stride = 2*chunk; const uint16x8_t v = vdupq_n_u16(w_size); for (Posf *end = hash + hash_size; hash != end; hash += stride) { uint16x8_t m_low = vld1q_u16(hash); uint16x8_t m_high = vld1q_u16(hash + chunk); /* The first 'q' in vqsubq_u16 makes these subtracts saturate to zero, * replacing the ternary operator expression in the original code: * (m >= wsize ? m - wsize : NIL). */ m_low = vqsubq_u16(m_low, v); m_high = vqsubq_u16(m_high, v); vst1q_u16(hash, m_low); vst1q_u16(hash + chunk, m_high); } } inline static void ZLIB_INTERNAL neon_slide_hash(Posf *head, Posf *prev, const unsigned short w_size, const uInt hash_size) { /* * SIMD implementation for hash table rebase assumes: * 1. hash chain offset (Pos) is 2 bytes. * 2. hash table size is multiple of 32 bytes. * #1 should be true as Pos is defined as "ush" * #2 should be true as hash_bits are greater than 7 */ const size_t size = hash_size * sizeof(head[0]); Assert(sizeof(Pos) == 2, "Wrong Pos size."); Assert((size % sizeof(uint16x8_t) * 2) == 0, "Hash table size error."); neon_slide_hash_update(head, hash_size, w_size); #ifndef FASTEST neon_slide_hash_update(prev, w_size, w_size); #endif } #endif