// Copyright 2013 Google Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "text_processing.h" #include #include namespace chrome_lang_id { namespace CLD2 { namespace { static const int kMaxSpaceScan = 32; // Bytes int minint(int a, int b) { return (a < b) ? a : b; } // Counts number of spaces; a little faster than one-at-a-time // Doesn't count odd bytes at end int CountSpaces4(const char *src, int src_len) { int s_count = 0; for (int i = 0; i < (src_len & ~3); i += 4) { s_count += (src[i] == ' '); s_count += (src[i + 1] == ' '); s_count += (src[i + 2] == ' '); s_count += (src[i + 3] == ' '); } return s_count; } // This uses a cheap predictor to get a measure of compression, and // hence a measure of repetitiveness. It works on complete UTF-8 characters // instead of bytes, because three-byte UTF-8 Indic, etc. text compress highly // all the time when done with a byte-based count. Sigh. // // To allow running prediction across multiple chunks, caller passes in current // 12-bit hash value and int[4096] prediction table. Caller inits these to 0. // // Returns the number of *bytes* correctly predicted, increments by 1..4 for // each correctly-predicted character. // // NOTE: Overruns by up to three bytes. Not a problem with valid UTF-8 text // // TODO(dsites) make this use just one byte per UTF-8 char and incr by charlen int CountPredictedBytes(const char *isrc, int src_len, int *hash, int *tbl) { typedef unsigned char uint8; int p_count = 0; const uint8 *src = reinterpret_cast(isrc); const uint8 *srclimit = src + src_len; int local_hash = *hash; while (src < srclimit) { int c = src[0]; int incr = 1; // Pick up one char and length if (c < 0xc0) { // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx // Do nothing more } else if ((c & 0xe0) == 0xc0) { // Two-byte c = (c << 8) | src[1]; incr = 2; } else if ((c & 0xf0) == 0xe0) { // Three-byte c = (c << 16) | (src[1] << 8) | src[2]; incr = 3; } else { // Four-byte c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3]; incr = 4; } src += incr; int p = tbl[local_hash]; // Prediction tbl[local_hash] = c; // Update prediction if (c == p) { p_count += incr; // Count bytes of good predictions } local_hash = ((local_hash << 4) ^ c) & 0xfff; } *hash = local_hash; return p_count; } // Backscan to word boundary, returning how many bytes n to go back // so that src - n is non-space ans src - n - 1 is space. // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary int BackscanToSpace(const char *src, int limit) { int n = 0; limit = minint(limit, kMaxSpaceScan); while (n < limit) { if (src[-n - 1] == ' ') { return n; } // We are at _X ++n; } n = 0; while (n < limit) { if ((src[-n] & 0xc0) != 0x80) { return n; } // We are at char begin ++n; } return 0; } // Forwardscan to word boundary, returning how many bytes n to go forward // so that src + n is non-space ans src + n - 1 is space. // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary int ForwardscanToSpace(const char *src, int limit) { int n = 0; limit = minint(limit, kMaxSpaceScan); while (n < limit) { if (src[n] == ' ') { return n + 1; } // We are at _X ++n; } n = 0; while (n < limit) { if ((src[n] & 0xc0) != 0x80) { return n; } // We are at char begin ++n; } return 0; } } // namespace // Must be exactly 4096 for cheap compressor. static const int kPredictionTableSize = 4096; static const int kChunksizeDefault = 48; // Squeeze 48-byte chunks static const int kSpacesThreshPercent = 30; // Squeeze if >=30% spaces static const int kPredictThreshPercent = 40; // Squeeze if >=40% predicted // Remove portions of text that have a high density of spaces, or that are // overly repetitive, squeezing the remaining text in-place to the front of the // input buffer. // // Squeezing looks at density of space/prediced chars in fixed-size chunks, // specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes. // // Return the new, possibly-shorter length // // Result Buffer ALWAYS has leading space and trailing space space space NUL, // if input does // int CheapSqueezeInplace(char *isrc, int src_len, int ichunksize) { char *src = isrc; char *dst = src; char *srclimit = src + src_len; bool skipping = false; int hash = 0; // Allocate local prediction table. int *predict_tbl = new int[kPredictionTableSize]; memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); int chunksize = ichunksize; if (chunksize == 0) { chunksize = kChunksizeDefault; } int space_thresh = (chunksize * kSpacesThreshPercent) / 100; int predict_thresh = (chunksize * kPredictThreshPercent) / 100; while (src < srclimit) { int remaining_bytes = srclimit - src; int len = minint(chunksize, remaining_bytes); // Make len land us on a UTF-8 character boundary. // Ah. Also fixes mispredict because we could get out of phase // Loop always terminates at trailing space in buffer while ((src[len] & 0xc0) == 0x80) { ++len; } // Move past continuation bytes int space_n = CountSpaces4(src, len); int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl); if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) { // Skip the text if (!skipping) { // Keeping-to-skipping transition; do it at a space int n = BackscanToSpace(dst, static_cast(dst - isrc)); dst -= n; if (dst == isrc) { // Force a leading space if the first chunk is deleted *dst++ = ' '; } skipping = true; } } else { // Keep the text if (skipping) { // Skipping-to-keeping transition; do it at a space int n = ForwardscanToSpace(src, len); src += n; remaining_bytes -= n; // Shrink remaining length len -= n; skipping = false; } // "len" can be negative in some cases if (len > 0) { memmove(dst, src, len); dst += len; } } src += len; } if ((dst - isrc) < (src_len - 3)) { // Pad and make last char clean UTF-8 by putting following spaces dst[0] = ' '; dst[1] = ' '; dst[2] = ' '; dst[3] = '\0'; } else if ((dst - isrc) < src_len) { // Make last char clean UTF-8 by putting following space off the end dst[0] = ' '; } // Deallocate local prediction table delete[] predict_tbl; return static_cast(dst - isrc); } } // namespace CLD2 } // namespace chrome_lang_id