#!/bin/bash
# Performs build and search tests on disk and memory indices
# (parameters are tuned for datasets with 100K-1M points).
# All indices and logs are stored in the working folder after the run completes.
# To run, create a catalog text file listing every dataset to test.
# For each dataset, specify the following 5 lines (one value per line), then
# move on to the next dataset (see the example at the end of this file):
# dataset_name[used for save file names]
# /path/to/base.bin
# /path/to/query.bin
# data_type[float/uint8/int8]
# metric[l2/mips]

if [ "$#" -ne "3" ]; then
    echo "usage: ./unit_test.sh [build_folder_path] [catalog] [working_folder]"
else
    BUILD_FOLDER=${1}
    CATALOG1=${2}
    WORK_FOLDER=${3}
    mkdir -p "${WORK_FOLDER}"
    CATALOG="${WORK_FOLDER}/catalog_formatted.txt"
    # Strip empty lines so the 5-lines-per-dataset reads below stay aligned
    sed -e '/^$/d' "${CATALOG1}" > "${CATALOG}"
    echo "Running unit tests on all datasets in the catalog, with build folder ${BUILD_FOLDER} and working folder ${WORK_FOLDER}"

    # Iterate over the catalog and run build/search tests for each dataset
    while IFS= read -r line; do
        DATASET=${line}
        read -r BASE
        read -r QUERY
        read -r TYPE
        read -r METRIC
        GT="${WORK_FOLDER}/${DATASET}_gt30_${METRIC}"
        MEM="${WORK_FOLDER}/${DATASET}_mem"
        DISK="${WORK_FOLDER}/${DATASET}_disk"
        MBLOG="${WORK_FOLDER}/${DATASET}_mb.log"
        DBLOG="${WORK_FOLDER}/${DATASET}_db.log"
        MSLOG="${WORK_FOLDER}/${DATASET}_ms.log"
        DSLOG="${WORK_FOLDER}/${DATASET}_ds.log"
        # DRAM budgets (in GiB) scale with the base file size: roughly 1/5th of the
        # file size for building and 1/10th for serving, plus a small constant floor
        FILESIZE=$(wc -c "${BASE}" | awk '{print $1}')
        BUDGETBUILD=$(bc <<< "scale=4; 0.0001 + ${FILESIZE}/(5*1024*1024*1024)")
        BUDGETSERVE=$(bc <<< "scale=4; 0.0001 + ${FILESIZE}/(10*1024*1024*1024)")
        echo "============================================================================================================================================="
        echo "Running tests on ${DATASET} dataset, ${TYPE} datatype, ${METRIC} metric, ${BUDGETBUILD} GiB build and ${BUDGETSERVE} GiB serve budget"
        echo "============================================================================================================================================="
        rm -f ${DISK}_*

        echo "Computing Groundtruth"
        ${BUILD_FOLDER}/tests/utils/compute_groundtruth --data_type ${TYPE} --base_file ${BASE} --query_file ${QUERY} \
            --K 30 --gt_file ${GT} --dist_fn ${METRIC} > /dev/null

        echo "Building Mem Index"
        /usr/bin/time ${BUILD_FOLDER}/tests/build_memory_index --data_type ${TYPE} --dist_fn ${METRIC} --data_path ${BASE} \
            --index_path_prefix ${MEM} -R 32 -L 50 --alpha 1.2 -T 0 > ${MBLOG}
        awk '/^Degree/' ${MBLOG}
        awk '/^Indexing/' ${MBLOG}

        echo "Searching Mem Index"
        ${BUILD_FOLDER}/tests/search_memory_index --data_type ${TYPE} --dist_fn ${METRIC} --index_path_prefix ${MEM} -T 16 \
            --query_file ${QUERY} --gt_file ${GT} -K 10 --result_path /tmp/res -L 10 20 30 40 50 60 70 80 90 100 > ${MSLOG}
        awk '/===/{x=NR+10}(NR<=x){print}' ${MSLOG}

        echo "Building Disk Index"
        ${BUILD_FOLDER}/tests/build_disk_index --data_type ${TYPE} --dist_fn ${METRIC} --data_path ${BASE} \
            --index_path_prefix ${DISK} -R 32 -L 50 -B ${BUDGETSERVE} -M ${BUDGETBUILD} -T 32 --PQ_disk_bytes 0 > ${DBLOG}
        awk '/^Compressing/' ${DBLOG}
        echo "#shards in disk index"
        awk '/^Indexing/' ${DBLOG}

        echo "Searching Disk Index"
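        # The disk-index search below sweeps every candidate list size given after -L
        # and reports recall@10 against the groundtruth; -T sets the number of search
        # threads, -W the beam width per query, and --num_nodes_to_cache how many graph
        # nodes are held in RAM (these flag summaries are descriptive, not the tool's
        # exact help text).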
        ${BUILD_FOLDER}/tests/search_disk_index --data_type ${TYPE} --dist_fn ${METRIC} --index_path_prefix ${DISK} \
            --num_nodes_to_cache 10000 -T 10 -W 4 --query_file ${QUERY} --gt_file ${GT} -K 10 --result_path /tmp/res \
            -L 20 40 60 80 100 > ${DSLOG}
        echo "# shards used during index construction:"
        awk '/medoids/{x=NR+1}(NR<=x){print}' ${DSLOG}
        awk '/===/{x=NR+10}(NR<=x){print}' ${DSLOG}
    done < "${CATALOG}"
fi
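
# ---------------------------------------------------------------------------
# Example catalog and invocation (the dataset name and paths below are
# hypothetical and only illustrate the 5-line-per-dataset format described
# at the top of this file):
#
#   cat > /tmp/catalog.txt <<EOF
#   sift100k
#   /data/sift/sift100k_base.bin
#   /data/sift/sift100k_query.bin
#   float
#   l2
#   EOF
#
#   ./unit_test.sh /path/to/DiskANN/build /tmp/catalog.txt /tmp/unit_test_run
# ---------------------------------------------------------------------------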