// Copyright (C) 2024 The Software Heritage developers // See the AUTHORS file at the top-level directory of this distribution // License: GNU General Public License version 3, or any later version // See top-level LICENSE file for more information /* * This example is translated from https://archive.softwareheritage.org/swh:1:cnt:3b8644b100f586474f43335e5f057d4c82dcb225;origin=https://github.com/jermp/pthash;visit=swh:1:snp:86eb28469f3ee79401a63b464948e17c210bdb37;anchor=swh:1:rev:28aedcb03af096c0e9988b1ad240da7b4cf010d7;path=/src/example.cpp * which is distributed under the following license: * * Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included * in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ use std::collections::HashSet; use std::time::Instant; use anyhow::{Context, Result}; use rand::prelude::*; use pthash::{ BuildConfiguration, DictionaryDictionary, Hashable, Minimal, MurmurHash2_64, Phf, SinglePhf, }; fn main() { if let Err(e) = main_() { eprintln!("Error: {}", e); std::process::exit(1); } } fn main_() -> Result<()> { stderrlog::new() .verbosity(2) .timestamp(stderrlog::Timestamp::Second) .init() .expect("Could not initialize stderrlog"); /* Generate 10M random 64-bit keys as input data. */ let num_keys: usize = 10000000; let seed: u64 = 1234567890; log::info!("generating input data..."); let mut keys = HashSet::::with_capacity(num_keys); let mut rng = StdRng::seed_from_u64(seed); while keys.len() < num_keys { keys.insert(rng.gen()); } let mut keys: Vec<_> = keys.into_iter().collect(); // iterating on a HashSet does not have a guaranteed order, and we want determinism keys.sort(); keys.shuffle(&mut rng); /* Set up a build configuration. */ let temp_dir = tempfile::tempdir()?; let mut config = BuildConfiguration::new(temp_dir.path().to_owned()); config.c = 6.0; config.alpha = 0.94; // config.minimal_output = true; // unlike the C++ API, this is implicit from f's type config.verbose_output = true; /* Declare the PTHash function. */ let mut f = SinglePhf::::new(); // config.num_partitions = 50; // config.num_threads = 4; // let f = PartitionedPhf_Minimal::::new() /* Build the function in internal memory. */ log::info!("building the function..."); let start = Instant::now(); let timings = f .build_in_internal_memory_from_bytes(&keys, &config) .context("Could not build PHF")?; // let timings = f.build_in_external_memory(keys, config); log::info!("function built in {} seconds", start.elapsed().as_secs()); let total_seconds = timings.partitioning_seconds + timings.mapping_ordering_seconds + timings.searching_seconds + timings.encoding_seconds; log::info!("computed: {} seconds", total_seconds.as_secs_f64()); /* Compute and print the number of bits spent per key. */ let bits_per_key = (f.num_bits() as f64) / (f.num_keys() as f64); log::info!("function uses {} [bits/key]", bits_per_key); /* Sanity check! */ check(&keys, &f)?; log::info!("EVERYTHING OK!"); /* Now evaluate f on some keys. */ for i in 0..10 { log::info!("f({}) = {}", keys[i], f.hash(keys[i])); } /* Serialize the data structure to a file. */ log::info!("serializing the function to disk..."); let output_path = temp_dir.path().join("pthash.bin"); f.save(&output_path).context("Could not write PHF")?; log::info!("reading the function from disk..."); { /* Now reload from disk and query. */ let other = SinglePhf::::load( &output_path, ) .context("Could not read PHF")?; for i in 0..10 { log::info!("f({}) = {}", keys[i], other.hash(keys[i])); assert_eq!(f.hash(keys[i]), other.hash(keys[i])); } } Ok(()) }