#![allow(unused_variables)] #![allow(non_snake_case)] #![allow(non_camel_case_types)] #![allow(unused_parens)] #![allow(unused_assignments)] #![allow(unused_mut)] #![allow(unused_imports)] #![allow(dead_code)] // BWT + RLE Compression. pub struct compactstr { chrs : [u8; N], olen : usize, // original length clen : usize, // compact length, including a zero // eof : usize, or, position of first zero } // compactstr impl compactstr { pub fn new() -> Self { compactstr { chrs : [0;N], olen : 0, clen : 0, } } fn repeats(&self) -> usize { let mut ax = 0; let mut counts = [0u8;N]; for i in 0..self.clen { let uc = self.chrs[i] as usize; if uc != 0 { ax += counts[uc] as usize; counts[uc] += 1; } }//for ax }// counts the number of character repetitions found fn rle(s:&str) -> Self { let bytes = s.as_bytes(); let blen = bytes.len(); let mut compact = compactstr::new(); if blen==0 {return compact;} compact.chrs[0] = bytes[0]; compact.chrs[1] = 1; // at least one of these bytes let mut ci = 1; // indexes compact.chrs for bi in 1..blen { // bi indexes bytes if bytes[bi] == bytes[bi-1] { compact.chrs[ci] += 1; } else if ci+2 >= N {break;} else { compact.chrs[ci+1] = bytes[bi]; compact.chrs[ci+2] = 1; ci += 2; } } //for compact }//rle compression, character followed by instances fn inverse_rle(&self) -> [u8;M] { // truncates let mut ax = [0u8;M]; let mut i = 0; // indexes self.chrs let mut k = 0; // indexes ax while i+1 usize { let bytes = s.as_bytes(); let mut ax = 0; let mut counts = [0u8;256]; for i in 0..s.len() { let uc = bytes[i] as usize; ax += counts[uc] as usize; counts[uc] += 1; }//for ax }//count_repeats /// calculates the *compression ratio* of a string as the number of repeated /// characters divided by its length. pub fn compression_ratio(s:&str) -> f32 { let reps = count_repeats(s) as f32; if s.len()==0 {0.0} else { reps / (s.len() as f32) } } /* // first attempt, uses type vector plus direct recursion ///// SA-IS algorithm (Nong), false for S, true for L ///// Also identifies locations of LMS substrings as (usize:usize) ///// where the second index is the last position in SA of the bucket for ///// the char, and the first index allows us to fill the bucket from ///// left to right. fn construct_vector(t:&[u8], len:usize) -> ([bool;M], [(usize,usize);M], usize) { assert!(len0 { counts[t[i-1] as usize] +=1; // determine types[i-1] if t[i-1] > t[i] || (t[i-1]==t[i] && types[i]) { types[i-1] = true; } // else stays false by default (S) if !types[i-1] && !types[i] { // if SS lmsend = i; // one past i-1 } else if !types[i-1] && types[i] { // if SL, found new LMS substring LMS[k] = (i,lmsend); k += 1; lmsend = i; // LMS substring starts with the L suffix } else if types[i-1] && !types[i] {// LS, found LMS char // bucket sort the LMS chars - store where? - chars could be same?? } //else if (LL) do nothing i -= 1; }//while i>0 (types,LMS,k) }//construct_type_vector // this runs in O(n) by memoization of types vector */ // Section 3.2 // K is always 256 fn rename(t:&mut [u8;M], sa:&mut [usize;M],n:usize,) -> [usize;M] { assert!(M>=256 && n0 { sa[k] = sum; sum += counts[k]; if sum>=n {break;} } }//prefix sum loop, meaning of counts[k] changed // First loop will assume that all chars are type L, will change later for bi in 0..n { //set tp[bi] to be start index of tp[bi] char's bucket tp[bi] = sa[t[bi] as usize]; } // each char is also first char of a unique suffix // can inferr the "buckets" array from counts // Second loop will change S type chars to be index of end of buckets let mut chartype = false; // previous type of sentinel, false=S, true=L let mut i = n-1; tp[n-1] = 0; //sa[t[n-1] as usize]; // + counts[t[i] as usize] - 1; while i>0 { chartype = t[i-1]>t[i] || (t[i-1]==t[i] && chartype); if !chartype { // t[i-1] is of S type tp[i-1] = sa[t[i-1] as usize] + counts[t[i-1] as usize] - 1; } i -= 1; }// while bi>0 // next step : sort all LMS characters. left-most S type chars // Goal is to place the correct, sorted index of each LMS char // at the end of the corresponding bucket in SA. //Right now, sa stores the starting location of buckets by byte value // We'll do it using more memory at first. // 3.3 2, (1) use SAEntry::*; let mut SA = [Counter(0);M]; SA[0] = Index(n-1); let mut prevtype = false; // type of sentinel chartype = false; i = n-1; let mut lmschars = 0; // counter while i>0 { prevtype = t[i-1]>t[i] || (t[i-1]==t[i] && chartype); if !chartype && prevtype { //found LMS char at position i (tp[i] is LMS) match SA[tp[i]] { Counter(n) => { SA[tp[i]] = Counter(n+1); }, _ => {}, }//match lmschars += 1; }//found LMS char chartype = prevtype; i -= 1; }//while i>0 // (2) chartype = false; // type of sentinel i = n-1; while i>0 { prevtype = t[i-1]>t[i] || (t[i-1]==t[i] && chartype); if !chartype && prevtype { //found LMS char at position i (tp[i] is LMS) match SA[tp[i]] { Counter(1) => { SA[tp[i]] = Index(i); } Counter(count) if count>1 => { SA[tp[i]-count+1] = Index(i); SA[tp[i]] = Counter(count-1); }, _ => {}, }//match }//found LMS char chartype = prevtype; i -= 1; }//while i>0 println!("SA: {:?}",&SA[..n]); // 3.4 induce sort all LMS-substrings (prefixes) from sorted LMS chars i = n-1; // this is S type chartype = false; while i>0 { chartype = t[i-1]>t[i] || (t[i-1]==t[i] && chartype); if chartype { // t[i-1] is L-type if let Counter(c) = SA[tp[i]] { SA[tp[i]] = Counter(c+1); } } i -= 1; }//while i SA[0] = Index(n-1); for i in 1..n { // scan SA left to right, SA[0] already correct match SA[i] { Index(ind) => { let j = ind-1; //determine if suf(j) is L-type if j+1t[j+1] || (t[j]==t[j+1] && !SA[j].is_index()) { //L let mut lfentry = tp[j]; while let Index(_) = SA[lfentry] {lfentry += 1;} // expensive! SA[lfentry] = Index(j); } }, Counter(1) => { // unique SA[tp[i]] = Index(i); }, Counter(c) if c>1 => { SA[tp[i] + c-1] = Index(i); SA[i] = Counter(c-1); }, _ => {}, // ignore empty entries }//match } // for first n entries in SA tp }//rename // uses more memory than paper specifies. - reuse t? // brute force, use separate type array fn main() { let t0 = "2113311331210".as_bytes(); let n= t0.len(); let mut t = [0u8;256]; t[0..n].copy_from_slice(&t0); let mut sa = [0usize;256]; let tp = rename(&mut t, &mut sa, 13); println!("Tprime: {:?}",&tp[..n]); }//main /////////////////// // bkt (buckets) entry are (start of LMS in T, end of LMS in T)), index is // the first byte of the LMS char at the end of LMS. #[derive(PartialEq,Eq,Copy,Clone,Debug)] enum SAEntry { Index(usize), Counter(usize), // Counter(0) is empty, Counter(1) is unique }//SAEntry impl Default for SAEntry { fn default() -> Self { SAEntry::Counter(0) } }//default for SAEntry impl SAEntry { fn is_index(&self) -> bool { if let &SAEntry::Index(_) = self {true} else {false} } fn is_empty(&self) -> bool { if let &SAEntry::Counter(0) = self {true} else {false} } } /* Counter(1) => { SA[tp[i]] = Index(i); }, Counter(count) if count>1 && tp[i]>0 && SA[tp[i]-1]==Counter(0) => { if tp[i]>1 && SA[tp[i]-2].empty() { SA[tp[i]-2] = Index(i); } else { SA[tp[i]] = Index(i); SA[tp[i]-1] = Counter(0); } }, Counter(count) if count>1 && tp[i]>0 => { //(3) SA[tp[i]-1] is counter match SA[tp[i]-1] { Counter(c) if c>0 && tp[i]>=c+2 => { if let Counter(0) = SA[tp[i]-c-2] { SA[tp[i]-c-2] = Index(i); SA[tp[i]-1] = Counter(c+1); } else { // reaching another bucket // shift?? } }, }//match } */ // if multi, just insert one by one