Crates.io | utf-c |
lib.rs | utf-c |
version | |
source | src |
created_at | 2024-10-30 03:28:35.491824 |
updated_at | 2024-11-20 08:15:09.912278 |
description | A very small and simple compression for short UTF-8 texts |
homepage | |
repository | https://github.com/utf-c/rust |
max_upload_size | |
id | 1427963 |
Cargo.toml error: | TOML parse error at line 22, column 1 | 22 | autolib = false | ^^^^^^^ unknown field `autolib`, expected one of `name`, `version`, `edition`, `authors`, `description`, `readme`, `license`, `repository`, `homepage`, `documentation`, `build`, `resolver`, `links`, `default-run`, `default_dash_run`, `rust-version`, `rust_dash_version`, `rust_version`, `license-file`, `license_dash_file`, `license_file`, `licenseFile`, `license_capital_file`, `forced-target`, `forced_dash_target`, `autobins`, `autotests`, `autoexamples`, `autobenches`, `publish`, `metadata`, `keywords`, `categories`, `exclude`, `include` |
size | 0 |
UTF-C is a compression scheme for short UTF-8 texts that contain non-ASCII characters.
[!TIP] Use our `helper::only_ascii()` function (if possible, together with the SIMD feature) to check whether the bytes consist only of ASCII characters and, if so, skip compression.
Ζω στην Ευρώπη
In this example, we were able to remove 7 bytes.
Uncompressed(26): [206, 150, 207, 137, 32, 207, 131, 207, 132, 206, 183, 206, 189, 32, 206, 149, 207, 133, 207, 129, 207, 142, 207, 128, 206, 183]
Compressed(19): [206, 150, 207, 137, 32, 131, 132, 206, 183, 189, 32, 149, 207, 133, 129, 142, 128, 206, 183]
私はヨーロッパに住んでいます
In this example, we were able to remove 14 bytes.
Uncompressed(42): [231, 167, 129, 227, 129, 175, 227, 131, 168, 227, 131, 188, 227, 131, 173, 227, 131, 131, 227, 131, 145, 227, 129, 171, 228, 189, 143, 227, 130, 147, 227, 129, 167, 227, 129, 132, 227, 129, 190, 227, 129, 153]
Compressed(28): [231, 167, 129, 227, 129, 175, 227, 131, 168, 188, 173, 131, 145, 227, 129, 171, 228, 189, 143, 227, 130, 147, 227, 129, 167, 132, 190, 153]
[!IMPORTANT] Please create your own comparison and check if this compression is suitable for your project!
`GzEncoder`, `Compression::fast()` and `GzDecoder` were used for gzip.
# ...
[dependencies]
# ...
flate2 = { version = "1.0.34", features = ["zlib-ng"], default-features = false }
[profile.release]
strip = true # Automatically strip symbols from the binary
opt-level = 3 # Enable all optimizations (optimizes for speed, not size)
lto = true # Enable link time optimization
codegen-units = 1 # Maximize size reduction optimizations
"Ζω στην Ευρώπη" compression and decompression 50000x
[gzip | compression ] finished after 233.239 µs
[gzip | decompression] finished after 39.512 µs
[utf-c | compression ] finished after 4.274 µs
[utf-c | decompression] finished after 5.818 µs
========== gzip (48) ==========
[31, 139, 8, 0, 0, 0, 0, 0, 4, 255, 59, 55, 237, 124, 167, 194, 249, 230, 243, 45, 231, 182, 159, 219, 171, 112, 110, 234, 249, 214, 243, 141, 231, 251, 206, 55, 156, 219, 14, 0, 107, 59, 158, 137, 26, 0, 0, 0]
========== utf-c (19) ==========
[206, 150, 207, 137, 32, 131, 132, 206, 183, 189, 32, 149, 207, 133, 129, 142, 128, 206, 183]
========== original (26) ==========
[206, 150, 207, 137, 32, 207, 131, 207, 132, 206, 183, 206, 189, 32, 206, 149, 207, 133, 207, 129, 207, 142, 207, 128, 206, 183]
"Ζω στην Ευρώπη 私はヨーロッパに住んでいます ฉนอาศยอยในยโรป" compression and decompression 50000x
[gzip | compression ] finished after 306.908 µs
[gzip | decompression] finished after 82.742 µs
[utf-c | compression ] finished after 12.420 µs
[utf-c | decompression] finished after 13.892 µs
========== gzip (134) ==========
[31, 139, 8, 0, 0, 0, 0, 0, 4, 255, 59, 55, 237, 124, 167, 194, 249, 230, 243, 45, 231, 182, 159, 219, 171, 112, 110, 234, 249, 214, 243, 141, 231, 251, 206, 55, 156, 219, 174, 240, 124, 121, 227, 227, 198, 245, 143, 155, 87, 60, 110, 222, 243, 184, 121, 237, 227, 230, 230, 199, 205, 19, 31, 55, 174, 126, 178, 183, 255, 113, 211, 228, 199, 141, 203, 31, 55, 182, 60, 110, 220, 247, 184, 113, 166, 194, 131, 29, 157, 15, 118, 204, 124, 176, 99, 237, 131, 29, 155, 30, 236, 88, 241, 96, 199, 34, 48, 123, 209, 131, 157, 205, 96, 241, 69, 15, 118, 54, 61, 216, 177, 248, 193, 142, 217, 0, 117, 185, 227, 58, 112, 0, 0, 0]
========== utf-c (73) ==========
[206, 150, 207, 137, 32, 131, 132, 206, 183, 189, 32, 149, 207, 133, 129, 142, 128, 206, 183, 32, 231, 167, 129, 227, 129, 175, 227, 131, 168, 188, 173, 131, 145, 227, 129, 171, 228, 189, 143, 227, 130, 147, 227, 129, 167, 132, 190, 153, 32, 224, 184, 137, 153, 173, 178, 168, 162, 173, 162, 224, 185, 131, 224, 184, 153, 162, 224, 185, 130, 224, 184, 163, 155]
========== original (112) ==========
[206, 150, 207, 137, 32, 207, 131, 207, 132, 206, 183, 206, 189, 32, 206, 149, 207, 133, 207, 129, 207, 142, 207, 128, 206, 183, 32, 231, 167, 129, 227, 129, 175, 227, 131, 168, 227, 131, 188, 227, 131, 173, 227, 131, 131, 227, 131, 145, 227, 129, 171, 228, 189, 143, 227, 130, 147, 227, 129, 167, 227, 129, 132, 227, 129, 190, 227, 129, 153, 32, 224, 184, 137, 224, 184, 153, 224, 184, 173, 224, 184, 178, 224, 184, 168, 224, 184, 162, 224, 184, 173, 224, 184, 162, 224, 185, 131, 224, 184, 153, 224, 184, 162, 224, 185, 130, 224, 184, 163, 224, 184, 155]
"טקסט זה תורגם באמצעות Google Translate לצורך השוואה בין UTF-C ו-GZIP!" compression and decompression 50000x
[gzip | compression ] finished after 305.008 µs
[gzip | decompression] finished after 78.478 µs
[utf-c | compression ] finished after 10.715 µs
[utf-c | decompression] finished after 12.208 µs
===== gzip (124) =====
[31, 139, 8, 0, 0, 0, 0, 0, 4, 255, 187, 62, 227, 250, 242, 235, 11, 175, 207, 80, 184, 62, 237, 250, 20, 133, 235, 171, 174, 79, 189, 190, 226, 250, 164, 235, 115, 21, 174, 79, 188, 62, 225, 250, 188, 235, 203, 174, 47, 186, 62, 245, 250, 42, 5, 247, 252, 252, 244, 156, 84, 133, 144, 162, 196, 188, 226, 156, 196, 146, 84, 133, 235, 115, 174, 47, 3, 43, 158, 165, 112, 125, 202, 245, 149, 215, 167, 94, 159, 122, 125, 2, 200, 136, 137, 215, 103, 94, 159, 175, 16, 26, 226, 166, 235, 172, 112, 125, 170, 174, 123, 148, 103, 128, 34, 0, 169, 163, 170, 30, 102, 0, 0, 0]
===== utf-c (70) =====
[215, 152, 167, 161, 152, 32, 150, 148, 32, 170, 149, 168, 146, 157, 32, 145, 144, 158, 166, 162, 149, 170, 32, 71, 111, 111, 103, 108, 101, 32, 84, 114, 97, 110, 115, 108, 97, 116, 101, 32, 156, 166, 149, 168, 154, 32, 148, 169, 149, 149, 144, 148, 32, 145, 153, 159, 32, 85, 84, 70, 45, 67, 32, 149, 45, 71, 90, 73, 80, 33]
===== original (102) =====
[215, 152, 215, 167, 215, 161, 215, 152, 32, 215, 150, 215, 148, 32, 215, 170, 215, 149, 215, 168, 215, 146, 215, 157, 32, 215, 145, 215, 144, 215, 158, 215, 166, 215, 162, 215, 149, 215, 170, 32, 71, 111, 111, 103, 108, 101, 32, 84, 114, 97, 110, 115, 108, 97, 116, 101, 32, 215, 156, 215, 166, 215, 149, 215, 168, 215, 154, 32, 215, 148, 215, 169, 215, 149, 215, 149, 215, 144, 215, 148, 32, 215, 145, 215, 153, 215, 159, 32, 85, 84, 70, 45, 67, 32, 215, 149, 45, 71, 90, 73, 80, 33]
"הטקסט הזה נדחס עם UTF-C ו-GZIP ולאחר מכן הושווה. טקסט זה תורגם עם Google Translate ואנו מקווים שהוא תורגם כהלכה אך אין ערובה לכך" compression and decompression 100000x
[gzip | compression ] finished after 359.859 µs
[gzip | decompression] finished after 112.994 µs
[utf-c | compression ] finished after 20.419 µs
[utf-c | decompression] finished after 27.969 µs
===== gzip (197) =====
[31, 139, 8, 0, 0, 0, 0, 0, 4, 255, 187, 62, 229, 250, 140, 235, 203, 175, 47, 188, 62, 67, 225, 250, 148, 235, 211, 174, 79, 81, 184, 190, 224, 250, 228, 235, 211, 175, 47, 84, 184, 190, 232, 250, 92, 133, 208, 16, 55, 93, 103, 133, 235, 83, 117, 221, 163, 60, 3, 20, 174, 79, 189, 62, 231, 250, 132, 235, 211, 175, 175, 80, 184, 62, 239, 250, 236, 235, 243, 65, 154, 166, 94, 95, 121, 125, 234, 245, 169, 215, 167, 232, 41, 32, 204, 2, 155, 180, 234, 250, 212, 235, 43, 174, 79, 186, 62, 23, 98, 150, 123, 126, 126, 122, 78, 170, 66, 72, 81, 98, 94, 113, 78, 98, 73, 42, 200, 184, 9, 215, 23, 92, 159, 10, 50, 108, 57, 216, 140, 153, 32, 181, 43, 175, 79, 185, 62, 245, 250, 4, 133, 235, 72, 250, 103, 95, 159, 114, 125, 206, 245, 217, 32, 247, 77, 184, 62, 75, 225, 250, 132, 235, 51, 65, 182, 47, 186, 190, 226, 250, 212, 235, 19, 65, 194, 32, 217, 89, 0, 254, 230, 105, 81, 207, 0, 0, 0]
===== utf-c (129) =====
[215, 148, 152, 167, 161, 152, 32, 148, 150, 148, 32, 160, 147, 151, 161, 32, 162, 157, 32, 85, 84, 70, 45, 67, 32, 149, 45, 71, 90, 73, 80, 32, 149, 156, 144, 151, 168, 32, 158, 155, 159, 32, 148, 149, 169, 149, 149, 148, 46, 32, 152, 167, 161, 152, 32, 150, 148, 32, 170, 149, 168, 146, 157, 32, 162, 157, 32, 71, 111, 111, 103, 108, 101, 32, 84, 114, 97, 110, 115, 108, 97, 116, 101, 32, 149, 144, 160, 149, 32, 158, 167, 149, 149, 153, 157, 32, 169, 148, 149, 144, 32, 170, 149, 168, 146, 157, 32, 155, 148, 156, 155, 148, 32, 144, 154, 32, 144, 153, 159, 32, 162, 168, 149, 145, 148, 32, 156, 155, 154]
===== original (207) =====
[215, 148, 215, 152, 215, 167, 215, 161, 215, 152, 32, 215, 148, 215, 150, 215, 148, 32, 215, 160, 215, 147, 215, 151, 215, 161, 32, 215, 162, 215, 157, 32, 85, 84, 70, 45, 67, 32, 215, 149, 45, 71, 90, 73, 80, 32, 215, 149, 215, 156, 215, 144, 215, 151, 215, 168, 32, 215, 158, 215, 155, 215, 159, 32, 215, 148, 215, 149, 215, 169, 215, 149, 215, 149, 215, 148, 46, 32, 215, 152, 215, 167, 215, 161, 215, 152, 32, 215, 150, 215, 148, 32, 215, 170, 215, 149, 215, 168, 215, 146, 215, 157, 32, 215, 162, 215, 157, 32, 71, 111, 111, 103, 108, 101, 32, 84, 114, 97, 110, 115, 108, 97, 116, 101, 32, 215, 149, 215, 144, 215, 160, 215, 149, 32, 215, 158, 215, 167, 215, 149, 215, 149, 215, 153, 215, 157, 32, 215, 169, 215, 148, 215, 149, 215, 144, 32, 215, 170, 215, 149, 215, 168, 215, 146, 215, 157, 32, 215, 155, 215, 148, 215, 156, 215, 155, 215, 148, 32, 215, 144, 215, 154, 32, 215, 144, 215, 153, 215, 159, 32, 215, 162, 215, 168, 215, 149, 215, 145, 215, 148, 32, 215, 156, 215, 155, 215, 154]