#!/usr/bin/env python3 from itertools import combinations, product import os import shutil import numpy as np from numpy.random import normal from scipy import stats data_dir = 'data' kat_dir = 'kat' means = [1, 1.1, 10] stddevs = [0.1, 1] sizes = [100, 1000] params = list(product(means, stddevs, sizes)) def fmt_path(m, std, n): return os.path.join(data_dir, '{}_{}_{}'.format(m, std, n)) def read(path): return np.fromfile(path, dtype=np.dtype('float64'), sep='\n') def summarize(d): # Use an unbiased estimator of variance and derived values. ddof = 1 # Use the percentile interpolation strategy under test. interpolation = 'linear' return { 'n': len(d), 'min': np.min(d), 'max': np.max(d), 'lower_quartile': np.percentile(d, 25.0, interpolation=interpolation), 'upper_quartile': np.percentile(d, 75.0, interpolation=interpolation), 'median': np.median(d), 'mean': np.mean(d), 'std': np.std(d, ddof=ddof), 'sem': stats.sem(d, ddof=ddof), 'var': np.var(d, ddof=ddof), } def fmt_summary(summary): fmt = lambda k: '{}\t{}'.format(k, summary[k]) return '\n'.join([fmt(k) for k in sorted(summary.keys())]) def write_summary(path, summary): name = os.path.basename(path) kat_name = 'summary_{}'.format(name) kat_path = os.path.join(kat_dir, kat_name) with open(kat_path, 'w') as f: f.write('src\t{}\n'.format(name)) f.write(fmt_summary(summary)) f.write('\n') def t_test(x, y): t, p = stats.ttest_ind(x, y, equal_var=False) return t, p def write_t_test(path1, path2, t, p): name1 = os.path.basename(path1) name2 = os.path.basename(path2) kat_name = 'ttest-{}-{}'.format(name1, name2) kat_path = os.path.join(kat_dir, kat_name) with open(kat_path, 'w') as f: f.write('src1\t{}\n'.format(name1)) f.write('src2\t{}\n'.format(name2)) f.write('t\t{}\n'.format(t)) f.write('p\t{}\n'.format(p)) # See: http://www.itl.nist.gov/div898/handbook/eda/section3/eda3672.htm def make_t_critical_value_table(): MAX_DF = 100 degrees_freedom = [[df] for df in range(1, MAX_DF + 1)] sig_levels_1_sided = [0.100, 0.050, 0.025, 0.010, 0.005, 0.001] sig_levels_2_sided = [a/2.0 for a in sig_levels_1_sided] norm_row = stats.norm.isf(sig_levels_1_sided) t_table = stats.t.isf(sig_levels_2_sided, degrees_freedom) fmt_row = lambda r: ''.join(map(str, [' ', list(r), ','])) print('[') # The t-distribution is approximately normal as d.f. → ∞, so the normal # distribution's critical values will be used to fill index 0 of the # 2-dimensional array we will emit. If 1 <= d.f. <= MAX_DF, then the d.f. # value is exactly the index into the table. Otherwise, we approximate the # t-distribution with the normal, and just use index 0. print(fmt_row(norm_row)) for row in t_table: print(fmt_row(row)) print('];') def make_data(): shutil.rmtree(data_dir, ignore_errors=True) os.makedirs(data_dir, exist_ok=True) for ps in params: d = normal(*ps) path = fmat_path(*ps) d.tofile(path, sep='\n') def make_summary_kats(path): for p in paths: d = read(p) s = summarize(d) write_summary(p, s) def make_t_test_kats(paths): for (p1, p2) in combinations(paths, r=2): d1 = read(p1) d2 = read(p2) t, p = t_test(d1, d2) write_t_test(p1, p2, t, p) def write_lr_data(ps, x, y): data = np.array([x, y]).T x_name = 'lr-{}_{}_{}-x'.format(*ps) x_path = os.path.join(data_dir, x_name) np.savetxt(x_path, x, delimiter='\t', fmt='%.8f') y_name = 'lr-{}_{}_{}-y'.format(*ps) y_path = os.path.join(data_dir, y_name) np.savetxt(y_path, y, delimiter='\t', fmt='%.8f') def write_lr(ps, x, y): slope, intercept, r, p, se = stats.linregress(x, y) lr_name = 'lr-{}_{}_{}'.format(*ps) lr_path = os.path.join(kat_dir, lr_name) with open(lr_path, 'w') as f: f.write('src\t{}\n'.format(lr_name)) f.write('slope\t{}\n'.format(slope)) f.write('intercept\t{}\n'.format(intercept)) f.write('r\t{}\n'.format(r)) f.write('p\t{}\n'.format(p)) f.write('se\t{}\n'.format(se)) def make_lr_kats(): m = 2.0 b = 5.0 params = [ (0, 1, 100), (0, 1, 1000), (1, 5, 1000), ] for ps in params: x = np.random.normal(*ps) y = b + m * np.random.normal(*ps) write_lr_data(ps, x, y) write_lr(ps, x, y) def make_kats(): shutil.rmtree(kat_dir, ignore_errors=True) os.makedirs(kat_dir, exist_ok=True) paths = [os.path.join(data_dir, f) for f in os.listdir(data_dir)] make_summary_kat(paths) make_t_test_kats(paths) make_lr_kats() if __name__ == '__main__': make_kats()