| Crates.io | bytepunch-rs |
| lib.rs | bytepunch-rs |
| version | 0.1.0 |
| created_at | 2025-12-22 21:47:57.956118+00 |
| updated_at | 2025-12-22 21:47:57.956118+00 |
| description | Profile-aware semantic compression for structured documents (CML and beyond) |
| homepage | |
| repository | https://github.com/Blackfall-Labs/bytepunch-rs |
| max_upload_size | |
| id | 2000319 |
| size | 59,287 |
Byte Punch is a lossless compression library designed specifically for CML (Content Markup Language) documents. It uses profile-aware semantic tokenization to achieve 40-70% compression ratios while maintaining 100% fidelity.
Byte Punch provides a small API built around three types: Compressor, Decompressor, and Dictionary. A quick-start example:
use byte_punch::{Compressor, Decompressor, Dictionary};
// Load profile dictionary
let dict = Dictionary::from_file("dictionaries/code-api.json")?;
// Create compressor
let compressor = Compressor::new(dict.clone());
// Compress
let cml_xml = r#"<cml version="0.1" encoding="utf-8" profile="code:api">...</cml>"#;
let compressed = compressor.compress(cml_xml)?;
println!("Original: {} bytes", cml_xml.len());
println!("Compressed: {} bytes", compressed.len());
println!("Ratio: {:.1}%", (compressed.len() as f64 / cml_xml.len() as f64) * 100.0);
// Decompress
let decompressor = Decompressor::new(dict);
let decompressed = decompressor.decompress(&compressed)?;
assert_eq!(cml_xml, decompressed); // Perfect fidelity!
Byte Punch uses a multi-level tokenization strategy:
| Encoding | Token Size | Capacity | Use Case |
|---|---|---|---|
| UTF-8 | 2-byte | 256 tokens | Core elements, common attributes |
| UTF-16 | 4-byte | 65,536 tokens | Profile-specific elements |
| UTF-32 | 8-byte | 4B tokens | Common phrases, boilerplate |
Input CML → Tokenizer → Replace with tokens → Compressed binary

For example, a fragment like `<cml version="0.1">` is replaced by a token stream such as `[0xF001][0xF002]...`.
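As a rough sketch of the idea (not the crate's implementation), a compressor can try the longest-phrase map first and fall back to shorter token maps, passing unmatched text through as raw UTF-8. The reverse string-to-bytes maps are assumed here for illustration:

use std::collections::HashMap;

// Sketch only: greedy tokenization over the three token maps, longest maps first.
// A real format also needs escaping so raw bytes never collide with token prefixes.
fn tokenize(input: &str, dicts: &[HashMap<String, Vec<u8>>]) -> Vec<u8> {
    let mut out = Vec::new();
    let mut rest = input;
    'outer: while !rest.is_empty() {
        // dicts are ordered 8-byte, then 4-byte, then 2-byte maps,
        // so long boilerplate phrases win over single keywords.
        for dict in dicts {
            let mut best: Option<(&str, &[u8])> = None;
            for (text, token) in dict {
                if rest.starts_with(text.as_str())
                    && best.map_or(true, |(b, _)| text.len() > b.len())
                {
                    best = Some((text.as_str(), token.as_slice()));
                }
            }
            if let Some((text, token)) = best {
                out.extend_from_slice(token);
                rest = &rest[text.len()..];
                continue 'outer;
            }
        }
        // Nothing matched: pass the next character through as raw UTF-8.
        let ch = rest.chars().next().unwrap();
        let mut buf = [0u8; 4];
        out.extend_from_slice(ch.encode_utf8(&mut buf).as_bytes());
        rest = &rest[ch.len_utf8()..];
    }
    out
}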
Each profile has a JSON dictionary defining token mappings:
{
"profile": "code:api",
"version": "1.0",
"encoding": "utf-8",
"2-byte-tokens": {
"0xF001": "pub",
"0xF002": "fn",
"0xF003": "struct",
...
},
"4-byte-tokens": {
"0xFFFF0001": "public",
"0xFFFF0002": "function",
...
},
"8-byte-tokens": {
"0xFFFFFFFFFFFF0001": "A contiguous growable array",
...
}
}
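The dashed keys line up with the Dictionary fields listed in the API reference below. As a minimal sketch, assuming serde and serde_json are used for loading (the crate does not state this), the mapping could be expressed as:

use serde::Deserialize;
use std::collections::HashMap;

// Sketch: map the dashed JSON keys onto the Dictionary fields from the API reference.
#[derive(Deserialize)]
struct Dictionary {
    profile: String,
    version: String,
    encoding: String,
    #[serde(rename = "2-byte-tokens", default)]
    tokens_2byte: HashMap<String, String>,
    #[serde(rename = "4-byte-tokens", default)]
    tokens_4byte: HashMap<String, String>,
    #[serde(rename = "8-byte-tokens", default)]
    tokens_8byte: HashMap<String, String>,
}

fn from_json(json: &str) -> Result<Dictionary, serde_json::Error> {
    serde_json::from_str(json)
}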
Profile: code:api
Target: 50-60% compression
Optimized for:
- Rust keywords: pub, fn, struct, enum, impl
- Common std types: Vec, String, Option, Result

Example:
let dict = Dictionary::from_file("dictionaries/code-api.json")?;
let compressor = Compressor::new(dict);
let cml = r#"
<cml version="0.1" encoding="utf-8" profile="code:api">
<body>
<struct id="std.vec.Vec" name="Vec">
<method name="push">
<signature>pub fn push(&mut self, value: T)</signature>
</method>
</struct>
</body>
</cml>
"#;
let compressed = compressor.compress(cml)?;
// Typical ratio: 50-60%
Profile: legal:constitution
Target: 60-70% compression
Optimized for legal document structure: article, section, and clause elements (see the example below).
Example:
let dict = Dictionary::from_file("dictionaries/legal-constitution.json")?;
let compressor = Compressor::new(dict);
let cml = r#"
<cml version="0.1" encoding="utf-8" profile="legal:constitution">
<body>
<article num="I" title="Legislative Branch" id="article-1">
<section num="1" id="article-1-section-1">
<clause num="1" id="article-1-section-1-clause-1">
All legislative Powers herein granted shall be vested in a Congress...
</clause>
</section>
</article>
</body>
</cml>
"#;
let compressed = compressor.compress(cml)?;
// Typical ratio: 60-70%
Profile: bookstack:wiki
Target: 50-60% compression
Optimized for wiki structure: book, chapter, and page elements with embedded Markdown content (see the example below).
Example:
let dict = Dictionary::from_file("dictionaries/bookstack-wiki.json")?;
let compressor = Compressor::new(dict);
let cml = r#"
<cml version="0.1" encoding="utf-8" profile="bookstack:wiki">
<body>
<book id="book-1" title="Rust Guide">
<chapter id="ch-1" title="Getting Started">
<page id="page-1" title="Setup">
<content format="markdown"><![CDATA[
# Development Environment Setup
...
]]></content>
</page>
</chapter>
</book>
</body>
</cml>
"#;
let compressed = compressor.compress(cml)?;
// Typical ratio: 50-60%
┌─────────────────────────────────────────────────────┐
│ Header (64 bytes) │
├─────────────────────────────────────────────────────┤
│ Magic: 0x42 0x50 0x43 0x4D (BPCM) │
│ Version: 0x01 0x00 │
│ Profile: UTF-8 string (32 bytes) │
│ Dictionary Hash: SHA-256 (32 bytes) │
├─────────────────────────────────────────────────────┤
│ Token Table (variable) │
├─────────────────────────────────────────────────────┤
│ Entry count: u32 │
│ Entries: [token_id, original_length, token_value] │
├─────────────────────────────────────────────────────┤
│ Compressed Content (variable) │
└─────────────────────────────────────────────────────┘
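A quick sanity check of the fixed fields is straightforward. The sketch below validates only the magic and version bytes and leaves the profile and dictionary-hash checks to Decompressor::validate_header (anything beyond the first six bytes is treated as an assumption):

// Sketch: check the fixed header fields described above before decoding.
fn quick_check(data: &[u8]) -> Result<(), String> {
    if data.len() < 6 {
        return Err("truncated header".into());
    }
    // Magic bytes 0x42 0x50 0x43 0x4D spell "BPCM".
    if data[0..4] != *b"BPCM" {
        return Err("not a Byte Punch file (bad magic)".into());
    }
    // Format version 0x01 0x00.
    if data[4..6] != [0x01, 0x00] {
        return Err(format!("unsupported version bytes {:?}", &data[4..6]));
    }
    Ok(())
}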
pub struct Compressor {
dictionary: Dictionary,
}
impl Compressor {
pub fn new(dictionary: Dictionary) -> Self;
pub fn compress(&self, input: &str) -> Result<Vec<u8>, CompressionError>;
pub fn stats(&self, compressed: &[u8]) -> CompressionStats;
}
pub struct CompressionStats {
pub original_size: usize,
pub compressed_size: usize,
pub ratio: f64,
pub tokens_used: usize,
}
pub struct Decompressor {
dictionary: Dictionary,
}
impl Decompressor {
pub fn new(dictionary: Dictionary) -> Self;
pub fn decompress(&self, compressed: &[u8]) -> Result<String, DecompressionError>;
pub fn validate_header(&self, compressed: &[u8]) -> Result<Header, DecompressionError>;
}
pub struct Header {
pub magic: [u8; 4],
pub version: [u8; 2],
pub profile: String,
pub dictionary_hash: [u8; 32],
}
pub struct Dictionary {
pub profile: String,
pub version: String,
pub encoding: String,
pub tokens_2byte: HashMap<String, String>,
pub tokens_4byte: HashMap<String, String>,
pub tokens_8byte: HashMap<String, String>,
}
impl Dictionary {
pub fn from_file(path: &str) -> Result<Self, DictionaryError>;
pub fn from_json(json: &str) -> Result<Self, DictionaryError>;
pub fn hash(&self) -> [u8; 32];
}
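Putting the types above together, a cautious decompression path might validate the header and compare the embedded dictionary hash first. This is illustrative glue code; dict and compressed are the values from the quick-start example:

// Verify that the file was produced with the dictionary we are about to use.
let decompressor = Decompressor::new(dict.clone());
let header = decompressor.validate_header(&compressed)?;
if header.dictionary_hash != dict.hash() {
    eprintln!("warning: file was compressed with a different {} dictionary", header.profile);
}
let text = decompressor.decompress(&compressed)?;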
use byte_punch::{CompressionError, DecompressionError};
// Compression errors
match compressor.compress(input) {
Ok(compressed) => { /* ... */ },
Err(CompressionError::InvalidInput(msg)) => {
eprintln!("Invalid input: {}", msg);
},
Err(CompressionError::DictionaryMismatch { expected, got }) => {
eprintln!("Dictionary mismatch: expected {}, got {}", expected, got);
},
}
// Decompression errors
match decompressor.decompress(compressed) {
Ok(decompressed) => { /* ... */ },
Err(DecompressionError::InvalidMagic) => {
eprintln!("Not a Byte Punch compressed file");
},
Err(DecompressionError::UnsupportedVersion(v)) => {
eprintln!("Unsupported version: {}", v);
},
Err(DecompressionError::ProfileMismatch { expected, got }) => {
eprintln!("Profile mismatch: expected {}, got {}", expected, got);
},
}
# Run all tests
cargo test -p byte-punch
# Run with output
cargo test -p byte-punch -- --nocapture
# Run specific test
cargo test -p byte-punch test_roundtrip
# Run integration tests
cargo test -p byte-punch --test cml_integration_test
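A roundtrip test in the spirit of the test_roundtrip target above might look like this (sketch; the dictionary path is illustrative):

use byte_punch::{Compressor, Decompressor, Dictionary};

#[test]
fn test_roundtrip() {
    // Any profile dictionary works here; the path is illustrative.
    let dict = Dictionary::from_file("dictionaries/code-api.json").unwrap();
    let compressor = Compressor::new(dict.clone());
    let decompressor = Decompressor::new(dict);

    let input = r#"<cml version="0.1" encoding="utf-8" profile="code:api"><body/></cml>"#;
    let compressed = compressor.compress(input).unwrap();

    // Lossless: decompression must reproduce the exact input.
    assert_eq!(decompressor.decompress(&compressed).unwrap(), input);
}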
Test Coverage:
# Run benchmarks
cargo bench -p byte-punch
# Example results on modern CPU:
# compress_1mb_code: ~80ms
# decompress_1mb_code: ~60ms
# compress_1mb_legal: ~75ms
# decompress_1mb_legal: ~55ms
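The numbers above will vary by machine and corpus. If you want to reproduce them, a bench such as compress_1mb_code could be written with Criterion (an assumption about the harness; the fixture path is illustrative):

use byte_punch::{Compressor, Dictionary};
use criterion::{criterion_group, criterion_main, Criterion};

fn compress_1mb_code(c: &mut Criterion) {
    let dict = Dictionary::from_file("dictionaries/code-api.json").unwrap();
    let compressor = Compressor::new(dict);
    // Roughly 1 MB of code:api CML; path is illustrative.
    let input = std::fs::read_to_string("benches/fixtures/code_1mb.cml").unwrap();
    c.bench_function("compress_1mb_code", |b| b.iter(|| compressor.compress(&input).unwrap()));
}

criterion_group!(benches, compress_1mb_code);
criterion_main!(benches);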
# Count common tags in your CML documents
grep -o '<[^>]*>' *.cml | sort | uniq -c | sort -nr | head -50
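If you prefer to do the counting in Rust, an illustrative helper (not part of the crate) can rank tags and suggest 2-byte token ids following the 0xF0__ convention above:

use std::collections::HashMap;

// Hypothetical helper: count '<...>' tags in a corpus and print the most
// frequent ones as candidates for 2-byte token slots.
fn top_tags(corpus: &str, n: usize) -> Vec<(String, usize)> {
    let mut counts: HashMap<String, usize> = HashMap::new();
    let mut rest = corpus;
    while let Some(start) = rest.find('<') {
        match rest[start..].find('>') {
            Some(end) => {
                let tag = rest[start..start + end + 1].to_string();
                *counts.entry(tag).or_insert(0) += 1;
                rest = &rest[start + end + 1..];
            }
            None => break,
        }
    }
    let mut ranked: Vec<_> = counts.into_iter().collect();
    ranked.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));
    ranked.truncate(n);
    ranked
}

fn main() -> std::io::Result<()> {
    let corpus = std::fs::read_to_string("docs/sample.cml")?; // illustrative path
    for (i, (tag, count)) in top_tags(&corpus, 50).iter().enumerate() {
        println!("\"0xF0{:02X}\": \"{}\"  // {} occurrences", i + 1, tag, count);
    }
    Ok(())
}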
{
"profile": "my-profile",
"version": "1.0",
"encoding": "utf-8",
"description": "Custom dictionary for my-profile",
"target_compression": "50-60%",
"2-byte-tokens": {
"0xF001": "most-common-word",
"0xF002": "second-most-common",
...
},
"4-byte-tokens": {
"0xFFFF0001": "common-phrase",
...
},
"8-byte-tokens": {
"0xFFFFFFFFFFFF0001": "very-long-common-phrase",
...
}
}
let dict = Dictionary::from_file("dictionaries/my-profile.json")?;
let compressor = Compressor::new(dict);
let stats = compressor.stats(&compressed);
println!("Compression ratio: {:.1}%", stats.ratio * 100.0);
Adjust token assignments based on compression stats.
Byte Punch is designed to work seamlessly with SAM CML:
use sam_cml::{CmlParser, CmlGenerator};
use byte_punch::{Compressor, Dictionary};
// Parse CML from an existing document (source path is illustrative)
let xml = std::fs::read_to_string("docs/api.cml")?;
let cml = CmlParser::parse_cml(&xml)?;
// Generate XML
let generator = CmlGenerator;
let xml = generator.generate_cml(&cml)?;
// Compress
let dict = Dictionary::from_file("dictionaries/code-api.json")?;
let compressor = Compressor::new(dict);
let compressed = compressor.compress(&xml)?;
// Store compressed binary
std::fs::write("output.cml.bp", compressed)?;
Same input + dictionary = same output (always)
let compressed1 = compressor.compress(input)?;
let compressed2 = compressor.compress(input)?;
assert_eq!(compressed1, compressed2);
Decompress, edit, recompress:
let decompressed = decompressor.decompress(&compressed)?;
let edited = decompressed.replace("old", "new");
let recompressed = compressor.compress(&edited)?;
Dictionaries are versioned, diffs are meaningful:
# Dictionary changes are tracked
git diff dictionaries/code-api.json
# Compressed files can be versioned
git add docs/api.cml.bp
git commit -m "Update API docs"
let compressor = Compressor::new(dict); // Load once
for file in files {
let compressed = compressor.compress(&file)?; // Reuse
}
use rayon::prelude::*; // brings par_iter() into scope

let files: Vec<String> = load_files()?;
let compressed: Vec<Vec<u8>> = files
.par_iter() // Use rayon for parallelism
.map(|f| compressor.compress(f))
.collect::<Result<_, _>>()?;
let stats = compressor.stats(&compressed);
if stats.ratio > 0.7 {
eprintln!("Warning: Poor compression ratio: {:.1}%", stats.ratio * 100.0);
}
MIT OR Apache-2.0