// sort -R sparse_binary.tsv |head -10 > sparse_binary_query_10.tsv // ./jaccard-sparse create -d 100 -D J sparse // ./jaccard-sparse append sparse sparse_binary.tsv // ./jaccard-sparse search sparse sparse_binary_query_10.tsv // #include "NGT/Command.h" using namespace std; void help() { cerr << "Usage : jaccard-sparse command index [data]" << endl; cerr << " command : info create search append" << endl; } void append(NGT::Args &args) { const string usage = "Usage: jaccard-sparse append [-p #-of-thread] [-n data-size] " "index(output) [data.tsv(input)]"; string database; try { database = args.get("#1"); } catch (...) { cerr << "jaccard-sparse: Error: DB is not specified." << endl; cerr << usage << endl; return; } string data; try { data = args.get("#2"); } catch (...) { cerr << "jaccard-sparse: Warning: No specified object file. Just build an index for the existing objects." << endl; } int threadSize = args.getl("p", 50); size_t dataSize = args.getl("n", 0); std::istream *is; std::ifstream *ifs = 0; try { NGT::Index index(database); if (data == "-") { is = &std::cin; } else { ifs = new std::ifstream; ifs->std::ifstream::open(data); if (!(*ifs)) { cerr << "Cannot open the specified data file. " << data << endl; return; } is = ifs; } string line; size_t count = 0; while(getline(*is, line)) { if (dataSize > 0 && count >= dataSize) { break; } count++; vector object; stringstream linestream(line); while (!linestream.eof()) { uint32_t value; linestream >> value; if (linestream.fail()) { object.clear(); break; } object.push_back(value); } if (object.empty()) { std::cerr << "jaccard-sparse: Empty line or invalid value. " << count << ":" << line << std::endl; continue; } } if (data != "-") { delete ifs; } index.createIndex(threadSize); index.saveIndex(database); } catch (NGT::Exception &err) { if (data != "-") { delete ifs; } cerr << "jaccard-sparse: Error " << err.what() << endl; cerr << usage << endl; } return; } void search(NGT::Index &index, NGT::Command::SearchParameters &searchParameters, ostream &stream) { std::ifstream is(searchParameters.query); if (!is) { std::cerr << "Cannot open the specified file. " << searchParameters.query << std::endl; return; } if (searchParameters.outputMode[0] == 'e') { stream << "# Beginning of Evaluation" << endl; } string line; double totalTime = 0; size_t queryCount = 0; double epsilon = searchParameters.beginOfEpsilon; while(getline(is, line)) { if (searchParameters.querySize > 0 && queryCount >= searchParameters.querySize) { break; } vector query; stringstream linestream(line); while (!linestream.eof()) { uint32_t value; linestream >> value; query.push_back(value); } auto sparseQuery = index.makeSparseObject(query); queryCount++; NGT::SearchQuery sc(sparseQuery); NGT::ObjectDistances objects; sc.setResults(&objects); sc.setSize(searchParameters.size); sc.setRadius(searchParameters.radius); if (searchParameters.accuracy > 0.0) { sc.setExpectedAccuracy(searchParameters.accuracy); } else { sc.setEpsilon(epsilon); } sc.setEdgeSize(searchParameters.edgeSize); NGT::Timer timer; switch (searchParameters.indexType) { case 't': timer.start(); index.search(sc); timer.stop(); break; case 'g': timer.start(); index.searchUsingOnlyGraph(sc); timer.stop(); break; case 's': timer.start(); index.linearSearch(sc); timer.stop(); break; } totalTime += timer.time; if (searchParameters.outputMode[0] == 'e') { stream << "# Query No.=" << queryCount << endl; stream << "# Query=" << line.substr(0, 20) + " ..." << endl; stream << "# Index Type=" << searchParameters.indexType << endl; stream << "# Size=" << searchParameters.size << endl; stream << "# Radius=" << searchParameters.radius << endl; stream << "# Epsilon=" << epsilon << endl; stream << "# Query Time (msec)=" << timer.time * 1000.0 << endl; stream << "# Distance Computation=" << sc.distanceComputationCount << endl; stream << "# Visit Count=" << sc.visitCount << endl; } else { stream << "Query No." << queryCount << endl; stream << "Rank\tID\tDistance" << endl; } for (size_t i = 0; i < objects.size(); i++) { stream << i + 1 << "\t" << objects[i].id << "\t"; stream << objects[i].distance << endl; } if (searchParameters.outputMode[0] == 'e') { stream << "# End of Search" << endl; } else { stream << "Query Time= " << timer.time << " (sec), " << timer.time * 1000.0 << " (msec)" << endl; } if (searchParameters.outputMode[0] == 'e') { stream << "# End of Query" << endl; } } if (searchParameters.outputMode[0] == 'e') { stream << "# Average Query Time (msec)=" << totalTime * 1000.0 / (double)queryCount << endl; stream << "# Number of queries=" << queryCount << endl; stream << "# End of Evaluation" << endl; } else { stream << "Average Query Time= " << totalTime / (double)queryCount << " (sec), " << totalTime * 1000.0 / (double)queryCount << " (msec), (" << totalTime << "/" << queryCount << ")" << endl; } } void search(NGT::Args &args) { const string usage = "Usage: ngt search [-i index-type(g|t|s)] [-n result-size] [-e epsilon] [-E edge-size] " "[-m open-mode(r|w)] [-o output-mode] index(input) query.tsv(input)"; string database; try { database = args.get("#1"); } catch (...) { cerr << "jaccard-sparse: Error: DB is not specified" << endl; cerr << usage << endl; return; } NGT::Command::SearchParameters searchParameters(args); try { NGT::Index index(database, searchParameters.openMode == 'r'); search(index, searchParameters, cout); } catch (NGT::Exception &err) { cerr << "jaccard-sparse: Error " << err.what() << endl; cerr << usage << endl; } catch (...) { cerr << "jaccard-sparse: Error" << endl; cerr << usage << endl; } } int main(int argc, char **argv) { NGT::Args args(argc, argv); NGT::Command ngt; string command; try { command = args.get("#0"); } catch(...) { help(); return 0; } try { if (command == "create") { ngt.create(args); } else if (command == "append") { append(args); } else if (command == "search") { search(args); } else { cerr << "jaccard-sparse: Error: Illegal command. " << command << endl; help(); } } catch(NGT::Exception &err) { cerr << "jaccard-sparse: Error: " << err.what() << endl; help(); return 0; } return 0; }