pub use inner::TickCounter;

#[cfg(target_arch = "x86_64")]
mod inner {
    // The cpuid / rdtsc / rdtscp instructions are used for benchmarking
    // because their execution time is very short, which gives more chance
    // of having several threads trying to take the lock at the same time.
    //
    // Cpuid is used to serialize instructions, see:
    // https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf
    use core::arch::x86_64::{__cpuid, __rdtscp, _rdtsc};
    use core::sync::atomic::{compiler_fence, Ordering};
    use std::hint::black_box;
    use std::time::{Duration, Instant};

    /// Timer based on the processor time-stamp counter.
    ///
    /// Field `0` is the measured overhead, in ticks, of an empty
    /// `raw_start`/`raw_end` pair; field `1` is the estimated number of
    /// nanoseconds per tick.
    #[derive(Copy, Clone)]
    pub struct TickCounter(u64, f64);

    impl Default for TickCounter {
        /// Equivalent to [`TickCounter::new`], including the calibration run.
        fn default() -> Self {
            Self::new()
        }
    }

    impl TickCounter {
        /// Calibrates a new counter.
        ///
        /// First measures the overhead of an empty measurement, then
        /// estimates the duration of one tick against `Instant`, and prints
        /// the value `1 / ns_per_tick` (rounded to two decimals) to stdout.
        pub fn new() -> TickCounter {
            let zero = Self::measure_overhead();
            let ns_per_tick = Self::estimate_ns_per_tick(zero);
            println!(
                "Estimated processor frequency: {}",
                (100f64 / ns_per_tick).round() / 100f64
            );
            TickCounter(zero, ns_per_tick)
        }

        /// Returns the winsorized mean, in ticks, of empty start/end pairs.
        fn measure_overhead() -> u64 {
            const SAMPLES: usize = 10_000;
            const TRIM: usize = SAMPLES / 10;

            let mut deltas = Vec::with_capacity(SAMPLES);

            // Warm up, yield once, then warm up briefly again before sampling.
            for _ in 1..1000 {
                let s = Self::raw_start();
                let e = Self::raw_end();
                black_box(e.wrapping_sub(s));
            }
            std::thread::yield_now();
            for _ in 1..10 {
                let s = Self::raw_start();
                let e = Self::raw_end();
                black_box(e.wrapping_sub(s));
            }

            for _ in 0..SAMPLES {
                let s = Self::raw_start();
                let e = Self::raw_end();
                // `wrapping_sub` avoids a debug-build overflow panic if the
                // counter appears to go backwards; such a sample becomes a
                // huge value that the winsorization below clamps.
                deltas.push(e.wrapping_sub(s));
            }

            // Winsorize: clamp the lowest and highest tenth to the nearest
            // retained value so that outliers do not skew the mean.
            deltas.sort_unstable();
            let low = deltas[TRIM];
            let high = deltas[SAMPLES - TRIM - 1];
            deltas[..TRIM].fill(low);
            deltas[SAMPLES - TRIM..].fill(high);

            let total: u64 = deltas.iter().sum();
            total / SAMPLES as u64
        }

        /// Estimates the number of nanoseconds per tick.
        ///
        /// `zero` is the per-measurement overhead in ticks, subtracted from
        /// every tick delta. Returns NaN if no sample has a positive tick
        /// count; `time` then reports zero durations.
        fn estimate_ns_per_tick(zero: u64) -> f64 {
            const SAMPLES: usize = 200;

            let mut raw: Vec<(u64, Duration)> = Vec::with_capacity(SAMPLES);

            // Heat up the wall clock path.
            for _ in 1..100 {
                black_box(Instant::now().elapsed());
            }
            std::thread::yield_now();
            for _ in 1..10 {
                black_box(Instant::now().elapsed());
            }

            let mut i = 0;
            while i < SAMPLES {
                let wall = Instant::now();
                let t0 = black_box(Self::raw_start());
                // The amount of timed work grows with the sample index so
                // that the samples cover a range of tick counts.
                for _ in 0..i + 1 {
                    black_box(Self::raw_start());
                    black_box(Self::raw_end());
                }
                let t1 = black_box(Self::raw_start());
                let elapsed = wall.elapsed();
                if t1 < t0 {
                    // The counter went backwards: retry this sample.
                    continue;
                }
                i += 1;
                raw.push(((t1 - t0).saturating_sub(zero), elapsed));
            }

            // Regularize: turn each sample into (ticks, ns per tick).
            // Samples without a positive tick count are dropped because
            // their ratio would be infinite or NaN.
            let mut points: Vec<(f64, f64)> = raw
                .into_iter()
                .filter(|&(ticks, _)| ticks > 0)
                .map(|(ticks, elapsed)| {
                    let ticks = ticks as f64;
                    (ticks, elapsed.as_nanos() as f64 / ticks)
                })
                .collect();

            // Winsorize the ratios.
            points.sort_unstable_by(|a, b| a.1.total_cmp(&b.1));
            let len = points.len();
            let trim = len / 10;
            if len > 0 {
                let low = points[trim].1;
                let high = points[len - trim - 1].1;
                for p in &mut points[..trim] {
                    p.1 = low;
                }
                for p in &mut points[len - trim..] {
                    p.1 = high;
                }
            }

            // The slope is the tick-weighted mean of the per-sample ratios:
            // sum(ticks * ratio) / sum(ticks).
            let tick_sum: f64 = points.iter().map(|p| p.0).sum();
            let nanos_sum: f64 = points.iter().map(|p| p.0 * p.1).sum();
            nanos_sum / tick_sum
        }

        /// Runs `f` once and returns the estimated time it took.
        ///
        /// Returns `None` when the end tick is lower than the start tick.
        /// Otherwise the calibrated overhead is subtracted and the result is
        /// converted to nanoseconds; a negative (or NaN) result is reported
        /// as a zero duration.
        #[inline(always)]
        pub fn time<R, F: FnOnce() -> R>(&self, f: F) -> Option<Duration> {
            let s = Self::raw_start();
            black_box(f());
            let e = Self::raw_end();
            let ticks = e.checked_sub(s)?;
            let nanos = ((ticks as f64 - self.0 as f64) * self.1).round();
            if nanos >= 0f64 {
                Some(Duration::from_nanos(nanos as u64))
            } else {
                Some(Duration::from_nanos(0))
            }
        }

        /// Reads the time-stamp counter at the start of a measurement.
        #[inline(always)]
        fn raw_start() -> u64 {
            // The fences restrict compiler reordering around the read.
            compiler_fence(Ordering::AcqRel);
            // SAFETY: `_rdtsc` only reads the time-stamp counter and touches
            // no memory.
            let r = unsafe {
                //__cpuid(0);
                _rdtsc()
            };
            compiler_fence(Ordering::AcqRel);
            r
            //let cpuid_ask: u64 = 0;
            //let high: u64;
            //let low: u64;
            //unsafe {
            //    asm!(
            //        "cpuid",
            //        "rdtsc",
            //        out("rdx") high,
            //        inout("rax") cpuid_ask => low,
            //        out("rbx") _,
            //        out("rcx") _,
            //        options(nostack,preserves_flags)
            //    )
            //};
            //(high << 32) | low
        }

        /// Reads the time-stamp counter at the end of a measurement, then
        /// executes `cpuid`.
        #[inline(always)]
        fn raw_end() -> u64 {
            let mut aux = 0u32;
            compiler_fence(Ordering::AcqRel);
            // SAFETY: `aux` is a live local, so the pointer handed to
            // `__rdtscp` is valid for writes. This assumes the processor
            // supports the RDTSCP instruction.
            let c = unsafe {
                let c = __rdtscp(&mut aux);
                __cpuid(0);
                c
            };
            compiler_fence(Ordering::AcqRel);
            c
            //let high: u64;
            //let low: u64;
            //unsafe {
            //    asm!(
            //        "rdtscp",
            //        "mov {high}, rdx",
            //        "mov {low}, rax",
            //        "mov rax, 0",
            //        "cpuid",
            //        high = out(reg) high,
            //        low = out(reg) low,
            //        out("rax") _,
            //        out("rbx") _,
            //        out("rcx") _,
            //        out("rdx") _,
            //        options(nostack,preserves_flags)
            //    )
            //};
            //(high << 32) | low
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
mod inner {
    use std::hint::black_box;
    use std::time::{Duration, Instant};

    /// Timer based on `Instant`.
    ///
    /// The single field is the measured overhead of an empty
    /// `raw_start`/`raw_end` pair.
    #[derive(Copy, Clone)]
    pub struct TickCounter(Duration);

    impl Default for TickCounter {
        /// Equivalent to [`TickCounter::new`], including the calibration run.
        fn default() -> Self {
            Self::new()
        }
    }

    impl TickCounter {
        /// Calibrates a new counter by taking the winsorized mean of the
        /// duration of empty measurements.
        pub fn new() -> TickCounter {
            const SAMPLES: usize = 10_000;
            const TRIM: usize = SAMPLES / 10;

            let mut deltas = [Duration::from_secs(0); SAMPLES];

            // Warm up.
            for _ in 1..1000 {
                let s = Self::raw_start();
                let e = Self::raw_end();
                black_box(e.saturating_duration_since(s));
            }

            for slot in deltas.iter_mut() {
                let s = Self::raw_start();
                let e = Self::raw_end();
                *slot = e.saturating_duration_since(s);
            }

            // Winsorize: clamp the lowest and highest tenth to the nearest
            // retained value so that outliers do not skew the mean.
            deltas.sort_unstable();
            let low = deltas[TRIM];
            let high = deltas[SAMPLES - TRIM - 1];
            deltas[..TRIM].fill(low);
            deltas[SAMPLES - TRIM..].fill(high);

            let total: Duration = deltas.iter().sum();
            TickCounter(total / SAMPLES as u32)
        }

        /// Runs `f` once and returns the time it took, minus the calibrated
        /// overhead (never below zero).
        ///
        /// Returns `None` when the end instant is earlier than the start
        /// instant.
        #[inline(always)]
        pub fn time<R, F: FnOnce() -> R>(&self, f: F) -> Option<Duration> {
            let s = Self::raw_start();
            black_box(f());
            let e = Self::raw_end();
            let elapsed = e.checked_duration_since(s)?;
            Some(elapsed.saturating_sub(self.0))
        }

        /// Reads the clock at the start of a measurement.
        #[inline(always)]
        fn raw_start() -> Instant {
            Instant::now()
        }

        /// Reads the clock at the end of a measurement.
        #[inline(always)]
        fn raw_end() -> Instant {
            Instant::now()
        }
    }
}