/* * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Intel Corporation nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Hyperscan pattern benchmarker. * * This program allows users to detect which signatures may be the most * expensive in a set of patterns. It is designed for use with small to medium * pattern set sizes (e.g. 5-500). If used with very large pattern sets it may * take a very long time - the number of recompiles done is g * O(lg2(n)) where * g is the number of generations and n is the number of patterns (assuming * that n >> g). * * This utility will return a cumulative series of removed patterns. The first * generation will find and remove a single pattern. The second generation will * begin with the first pattern removed and find another pattern to remove, * etc. So if we have 100 patterns and 15 generations, the final generation's * score will be a run over 85 patterns. * * This utility is probabilistic. It is possible that the pattern removed in a * generation is not a particularly expensive pattern. To reduce noise in the * results use 'taskset' and set the number of repeats to a level that still * completes in reasonable time (this will reduce the effect of random * measurement noise). * * The criterion for performance can be altered by use of the -C flag where * can be t,r,s,c,b, selecting pattern matching throughput, scratch size, * stream state size (only available in streaming mode), compile time and * bytecode size respectively. * * This utility will also not produce good results if all the patterns are * roughly equally expensive. * * Factor Group Size: * * If there are multiple expensive patterns that are very similar on the * left-hand-side or identical, this utility will typically not find these * groups unless the -F flag is used to search for a group size that is equal * to or larger than the size of the group of similar patterns. * * Otherwise, removing a portion of the similar patterns will have no or almost * no effect, and the search procedure used relies on the ability to remove all * of the similar patterns in at least one search case, something which will * only happen if the factor_group_size is large enough. * * This alters the operation of our tool so that instead of trying to find the * single pattern whose removal has the most effect by binary search (the * default with factor_group_size == 1), we attempt to find the N patterns * whose removal has the most effect by searching over N+1 evenly sized groups, * removing only 1/(N+1) of the search signatures per iteration. * * Note that the number of recompiles done greatly increases with increased * factor group size. For example, with factor_group_size = 1, we do g * 2 * * lg2(n) recompiles, while with factor_group_size = 4, we do g * 4 * * log(5/4)(n). Informally the number of generations we require goes up as we * eliminate a smaller number of signatures and the we have to do more work per * generation. * * * Build instructions: * * g++ -o patbench patbench.cc $(pkg-config --cflags --libs libhs) -lpcap * * Usage: * * ./patbench [ -n repeats] [ -G generations] [ -C criterion ] * [ -F factor_group_size ] [ -N | -S ] * * -n repeats sets the number of times the PCAP is repeatedly scanned * with the pattern * -G generations sets the number of generations that the algorithm is * run for * -N sets non-streaming mode, -S sets streaming mode (default) * -F sets the factor group size (must be >0); this allows the detection * of multiple interacting factors * * -C sets the "criterion", which can be either: * t throughput (the default) - this requires a pcap file * r scratch size * s stream state size * c compile time * b bytecode size * * We recommend the use of a utility like 'taskset' on multiprocessor hosts to * lock execution to a single processor: this will remove processor migration * by the scheduler as a source of noise in the results. * */ #include #include #include #include #include #include #include #include #include #include #include #include // We use the BSD primitives throughout as they exist on both BSD and Linux. #define __FAVOR_BSD #include #include #include #include #include #include #include #include #include #include using std::cerr; using std::cout; using std::endl; using std::ifstream; using std::string; using std::unordered_map; using std::vector; using std::set; using std::min; using std::max; using std::copy; using std::random_device; using std::mt19937; enum Criterion { CRITERION_THROUGHPUT, CRITERION_BYTECODE_SIZE, CRITERION_COMPILE_TIME, CRITERION_STREAM_STATE, CRITERION_SCRATCH_SIZE }; static bool higher_is_better(Criterion c) { return c == CRITERION_THROUGHPUT; } static void print_criterion(Criterion c, double val) { std::ios::fmtflags f(cout.flags()); switch (c) { case CRITERION_THROUGHPUT: cout << std::fixed << std::setprecision(3) << val << " Megabits/s"; break; case CRITERION_COMPILE_TIME: cout << std::fixed << std::setprecision(3) << val << " seconds"; break; case CRITERION_BYTECODE_SIZE: case CRITERION_STREAM_STATE: case CRITERION_SCRATCH_SIZE: default: cout << static_cast(val) << " bytes"; break; } cout.flags(f); } // Key for identifying a stream in our pcap input data, using data from its IP // headers. struct FiveTuple { unsigned int protocol; unsigned int srcAddr; unsigned int srcPort; unsigned int dstAddr; unsigned int dstPort; // Construct a FiveTuple from a TCP or UDP packet. FiveTuple(const struct ip *iphdr) { // IP fields protocol = iphdr->ip_p; srcAddr = iphdr->ip_src.s_addr; dstAddr = iphdr->ip_dst.s_addr; // UDP/TCP ports const struct udphdr *uh = (const struct udphdr *) (((const char *)iphdr) + (iphdr->ip_hl * 4)); srcPort = uh->uh_sport; dstPort = uh->uh_dport; } bool operator==(const FiveTuple &a) const { return protocol == a.protocol && srcAddr == a.srcAddr && srcPort == a.srcPort && dstAddr == a.dstAddr && dstPort == a.dstPort; } }; // A *very* simple hash function, used when we create an unordered_map of // FiveTuple objects. struct FiveTupleHash { size_t operator()(const FiveTuple &x) const { return x.srcAddr ^ x.dstAddr ^ x.protocol ^ x.srcPort ^ x.dstPort; } }; // Helper function. See end of file. static bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset, unsigned int *length); // Match event handler: called every time Hyperscan finds a match. static int onMatch(unsigned int id, unsigned long long from, unsigned long long to, unsigned int flags, void *ctx) { // Our context points to a size_t storing the match count size_t *matches = (size_t *)ctx; (*matches)++; return 0; // continue matching } // Simple timing class class Clock { public: void start() { time_start = std::chrono::system_clock::now(); } void stop() { time_end = std::chrono::system_clock::now(); } double seconds() const { std::chrono::duration delta = time_end - time_start; return delta.count(); } private: std::chrono::time_point time_start, time_end; }; // Class wrapping all state associated with the benchmark class Benchmark { private: // Packet data to be scanned vector packets; // Stream ID for each packet vector stream_ids; // Map used to construct stream_ids unordered_map stream_map; // Hyperscan compiled database hs_database_t *db = nullptr; // Hyperscan temporary scratch space hs_scratch_t *scratch = nullptr; // Vector of Hyperscan stream state vector streams; // Count of matches found while scanning size_t matchCount = 0; public: ~Benchmark() { hs_free_scratch(scratch); hs_free_database(db); } // Initialisation; after this call, Benchmark owns the database and will // ensure it is freed. void setDatabase(hs_database_t *hs_db) { hs_free_database(db); // Free previous database. db = hs_db; // (Re)allocate scratch to ensure that it is large enough to handle the // database. hs_error_t err = hs_alloc_scratch(db, &scratch); if (err != HS_SUCCESS) { cerr << "ERROR: could not allocate scratch space. Exiting." << endl; exit(-1); } } const hs_database_t *getDatabase() const { return db; } size_t getScratchSize() const { size_t scratch_size; hs_error_t err = hs_scratch_size(scratch, &scratch_size); if (err != HS_SUCCESS) { cerr << "ERROR: could not query scratch space size. Exiting." << endl; exit(-1); } return scratch_size; } // Read a set of streams from a pcap file bool readStreams(const char *pcapFile) { // Open PCAP file for input char errbuf[PCAP_ERRBUF_SIZE]; pcap_t *pcapHandle = pcap_open_offline(pcapFile, errbuf); if (pcapHandle == nullptr) { cerr << "ERROR: Unable to open pcap file \"" << pcapFile << "\": " << errbuf << endl; return false; } struct pcap_pkthdr pktHeader; const unsigned char *pktData; while ((pktData = pcap_next(pcapHandle, &pktHeader)) != nullptr) { unsigned int offset = 0, length = 0; if (!payloadOffset(pktData, &offset, &length)) { continue; } // Valid TCP or UDP packet const struct ip *iphdr = (const struct ip *)(pktData + sizeof(struct ether_header)); const char *payload = (const char *)pktData + offset; size_t id = stream_map.insert(std::make_pair(FiveTuple(iphdr), stream_map.size())).first->second; packets.push_back(string(payload, length)); stream_ids.push_back(id); } pcap_close(pcapHandle); return !packets.empty(); } // Return the number of bytes scanned size_t bytes() const { size_t sum = 0; for (const auto &packet : packets) { sum += packet.size(); } return sum; } // Return the number of matches found. size_t matches() const { return matchCount; } // Clear the number of matches found. void clearMatches() { matchCount = 0; } // Open a Hyperscan stream for each stream in stream_ids void openStreams() { streams.resize(stream_map.size()); for (auto &stream : streams) { hs_error_t err = hs_open_stream(db, 0, &stream); if (err != HS_SUCCESS) { cerr << "ERROR: Unable to open stream. Exiting." << endl; exit(-1); } } } // Close all open Hyperscan streams (potentially generating any // end-anchored matches) void closeStreams() { for (auto &stream : streams) { hs_error_t err = hs_close_stream(stream, scratch, onMatch, &matchCount); if (err != HS_SUCCESS) { cerr << "ERROR: Unable to close stream. Exiting." << endl; exit(-1); } } } // Scan each packet (in the ordering given in the PCAP file) through // Hyperscan using the streaming interface. void scanStreams() { for (size_t i = 0; i != packets.size(); ++i) { const std::string &pkt = packets[i]; hs_error_t err = hs_scan_stream(streams[stream_ids[i]], pkt.c_str(), pkt.length(), 0, scratch, onMatch, &matchCount); if (err != HS_SUCCESS) { cerr << "ERROR: Unable to scan packet. Exiting." << endl; exit(-1); } } } // Scan each packet (in the ordering given in the PCAP file) through // Hyperscan using the block-mode interface. void scanBlock() { for (size_t i = 0; i != packets.size(); ++i) { const std::string &pkt = packets[i]; hs_error_t err = hs_scan(db, pkt.c_str(), pkt.length(), 0, scratch, onMatch, &matchCount); if (err != HS_SUCCESS) { cerr << "ERROR: Unable to scan packet. Exiting." << endl; exit(-1); } } } }; // helper function - see end of file static void parseFile(const char *filename, vector &patterns, vector &flags, vector &ids, vector &originals); class Sigdata { vector flags; vector ids; vector patterns; vector originals; public: Sigdata() {} Sigdata(const char *filename) { parseFile(filename, patterns, flags, ids, originals); } const string &get_original(unsigned index) const { return originals[index]; } hs_database_t *compileDatabase(unsigned mode, double *compileTime) const { hs_database_t *db = nullptr; hs_compile_error_t *compileErr; // Turn our vector of strings into a vector of char*'s to pass in to // hs_compile_multi. (This is just using the vector of strings as // dynamic storage.) vector cstrPatterns; cstrPatterns.reserve(patterns.size()); for (const auto &pattern : patterns) { cstrPatterns.push_back(pattern.c_str()); } Clock clock; clock.start(); hs_error_t err = hs_compile_multi(cstrPatterns.data(), flags.data(), ids.data(), cstrPatterns.size(), mode, nullptr, &db, &compileErr); clock.stop(); if (err != HS_SUCCESS) { if (compileErr->expression < 0) { // The error does not refer to a particular expression. cerr << "ERROR: " << compileErr->message << endl; } else { cerr << "ERROR: Pattern '" << patterns[compileErr->expression] << "' failed with error '" << compileErr->message << "'" << endl; } // As the compileErr pointer points to dynamically allocated memory, // if we get an error, we must be sure to release it. This is not // necessary when no error is detected. hs_free_compile_error(compileErr); exit(-1); } *compileTime = clock.seconds(); return db; } unsigned size() const { return patterns.size(); } Sigdata cloneExclude(const set &excludeIndexSet) const { Sigdata c; for (unsigned i = 0, e = size(); i != e; ++i) { if (excludeIndexSet.find(i) == excludeIndexSet.end()) { c.flags.push_back(flags[i]); c.ids.push_back(ids[i]); c.patterns.push_back(patterns[i]); c.originals.push_back(originals[i]); } } return c; } }; static void usage(const char *) { cerr << "Usage:" << endl << endl; cerr << " patbench [-n repeats] [ -G generations] [ -C criterion ]" << endl << " [ -F factor_group_size ] [ -N | -S ] " << " " << endl << endl << " -n repeats sets the number of times the PCAP is repeatedly " "scanned" << endl << " with the pattern." << endl << " -G generations sets the number of generations that the " "algorithm is" << endl << " run for." << endl << " -N sets non-streaming mode, -S sets streaming mode (default)." << endl << " -F sets the factor group size (must be >0); this " "allows the detection" << endl << " of multiple interacting factors." << endl << "" << endl << " -C sets the 'criterion', which can be either:" << endl << " t throughput (the default) - this requires a pcap file" << endl << " r scratch size" << endl << " s stream state size" << endl << " c compile time" << endl << " b bytecode size" << endl << endl << "We recommend the use of a utility like 'taskset' on " "multiprocessor hosts to" << endl << "lock execution to a single processor: this will remove processor " "migration" << endl << "by the scheduler as a source of noise in the results." << endl; } static double measure_stream_time(Benchmark &bench, unsigned int repeatCount) { Clock clock; bench.clearMatches(); clock.start(); for (unsigned int i = 0; i < repeatCount; i++) { bench.openStreams(); bench.scanStreams(); bench.closeStreams(); } clock.stop(); double secsScan = clock.seconds(); return secsScan; } static double measure_block_time(Benchmark &bench, unsigned int repeatCount) { Clock clock; bench.clearMatches(); clock.start(); for (unsigned int i = 0; i < repeatCount; i++) { bench.scanBlock(); } clock.stop(); double secsScan = clock.seconds(); return secsScan; } static double eval_set(Benchmark &bench, Sigdata &sigs, unsigned int mode, unsigned repeatCount, Criterion criterion, bool diagnose = true) { double compileTime = 0; bench.setDatabase(sigs.compileDatabase(mode, &compileTime)); switch (criterion) { case CRITERION_BYTECODE_SIZE: { size_t dbSize; hs_error_t err = hs_database_size(bench.getDatabase(), &dbSize); if (err != HS_SUCCESS) { cerr << "ERROR: could not retrieve bytecode size" << endl; exit(1); } return dbSize; } case CRITERION_COMPILE_TIME: return compileTime; case CRITERION_STREAM_STATE: { size_t streamStateSize; hs_error_t err = hs_stream_size(bench.getDatabase(), &streamStateSize); if (err != HS_SUCCESS) { cerr << "ERROR: could not retrieve stream state size" << endl; exit(1); } return streamStateSize; } case CRITERION_SCRATCH_SIZE: return bench.getScratchSize(); case CRITERION_THROUGHPUT: default: break; // do nothing - we are THROUGHPUT } double scan_time; if (mode == HS_MODE_NOSTREAM) { scan_time = measure_block_time(bench, repeatCount); } else { scan_time = measure_stream_time(bench, repeatCount); } size_t bytes = bench.bytes(); size_t matches = bench.matches(); if (diagnose) { std::ios::fmtflags f(cout.flags()); cout << "Scan time " << std::fixed << std::setprecision(3) << scan_time << " sec, Scanned " << bytes * repeatCount << " bytes, Throughput " << std::fixed << std::setprecision(3) << (bytes * 8 * repeatCount) / (scan_time * 1000000) << " Mbps, Matches " << matches << endl; cout.flags(f); } return (bytes * 8 * repeatCount) / (scan_time * 1000000); } // Main entry point. int main(int argc, char **argv) { unsigned int repeatCount = 1; unsigned int mode = HS_MODE_STREAM; Criterion criterion = CRITERION_THROUGHPUT; unsigned int gen_max = 10; unsigned int factor_max = 1; // Process command line arguments. int opt; while ((opt = getopt(argc, argv, "SNn:G:F:C:")) != -1) { switch (opt) { case 'F': factor_max = atoi(optarg); break; case 'G': gen_max = atoi(optarg); break; case 'S': mode = HS_MODE_STREAM; break; case 'N': mode = HS_MODE_NOSTREAM; break; case 'C': switch (optarg[0]) { case 't': criterion = CRITERION_THROUGHPUT; break; case 'b': criterion = CRITERION_BYTECODE_SIZE; break; case 'c': criterion = CRITERION_COMPILE_TIME; break; case 's': criterion = CRITERION_STREAM_STATE; break; case 'r': criterion = CRITERION_SCRATCH_SIZE; break; default: cerr << "Unrecognised criterion: " << optarg[0] << endl; usage(argv[0]); exit(-1); } break; case 'n': repeatCount = atoi(optarg); break; default: usage(argv[0]); exit(-1); } } if (argc - optind != ((criterion == CRITERION_THROUGHPUT) ? 2 : 1)) { usage(argv[0]); exit(-1); } const char *patternFile = argv[optind]; const char *pcapFile = argv[optind + 1]; // Read our input PCAP file in Benchmark bench; if (criterion == CRITERION_THROUGHPUT) { if (!bench.readStreams(pcapFile)) { cerr << "Unable to read packets from PCAP file. Exiting." << endl; exit(-1); } } if ((criterion == CRITERION_STREAM_STATE) && (mode != HS_MODE_STREAM)) { cerr << "Cannot evaluate stream state for block mode compile. Exiting." << endl; exit(-1); } cout << "Base signatures: " << patternFile; if (pcapFile) { cout << "\tPCAP input file: " << pcapFile << "\tRepeat count: " << repeatCount; } if (mode == HS_MODE_STREAM) { cout << "\tMode: streaming"; } else { cout << "\tMode: block"; } cout << endl; Sigdata sigs(patternFile); // calculate and show a baseline eval_set(bench, sigs, mode, repeatCount, criterion); set work_sigs, exclude; for (unsigned i = 0; i < sigs.size(); ++i) { work_sigs.insert(i); } double score_base = eval_set(bench, sigs, mode, repeatCount, criterion, false); bool maximize = higher_is_better(criterion); cout << "Number of signatures: " << sigs.size() << endl; cout << "Base performance: "; print_criterion(criterion, score_base); cout << endl; unsigned generations = min(gen_max, (sigs.size() - 1) / factor_max); cout << "Cutting signatures cumulatively for " << generations << " generations" << endl; for (unsigned gen = 0; gen < generations; ++gen) { cout << "Generation " << gen << " "; set s(work_sigs.begin(), work_sigs.end()); double best = maximize ? 0 : 1000000000000.0; unsigned count = 0; while (s.size() > factor_max) { count++; cout << "." << std::flush; vector sv(s.begin(), s.end()); random_device rng; mt19937 urng(rng()); shuffle(sv.begin(), sv.end(), urng); unsigned groups = factor_max + 1; for (unsigned current_group = 0; current_group < groups; current_group++) { unsigned sz = sv.size(); unsigned lo = (current_group * sz) / groups; unsigned hi = ((current_group + 1) * sz) / groups; set s_part1(sv.begin(), sv.begin() + lo); set s_part2(sv.begin() + hi, sv.end()); set s_tmp = s_part1; s_tmp.insert(s_part2.begin(), s_part2.end()); set tmp = s_tmp; tmp.insert(exclude.begin(), exclude.end()); Sigdata sigs_tmp = sigs.cloneExclude(tmp); double score = eval_set(bench, sigs_tmp, mode, repeatCount, criterion, false); if ((current_group == 0) || (!maximize ? (score < best) : (score > best))) { s = s_tmp; best = score; } } } for (unsigned i = count; i < 16; i++) { cout << " "; } std::ios::fmtflags out_f(cout.flags()); cout << "Performance: "; print_criterion(criterion, best); cout << " (" << std::fixed << std::setprecision(3) << (best / score_base) << "x) after cutting:" << endl; cout.flags(out_f); // s now has factor_max signatures for (const auto &found : s) { exclude.insert(found); work_sigs.erase(found); cout << sigs.get_original(found) << endl; } cout << endl; } return 0; } /** * Helper function to locate the offset of the first byte of the payload in the * given ethernet frame. Offset into the packet, and the length of the payload * are returned in the arguments @a offset and @a length. */ static bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset, unsigned int *length) { const ip *iph = (const ip *)(pkt_data + sizeof(ether_header)); const tcphdr *th = nullptr; // Ignore packets that aren't IPv4 if (iph->ip_v != 4) { return false; } // Ignore fragmented packets. if (iph->ip_off & htons(IP_MF | IP_OFFMASK)) { return false; } // IP header length, and transport header length. unsigned int ihlen = iph->ip_hl * 4; unsigned int thlen = 0; switch (iph->ip_p) { case IPPROTO_TCP: th = (const tcphdr *)((const char *)iph + ihlen); thlen = th->th_off * 4; break; case IPPROTO_UDP: thlen = sizeof(udphdr); break; default: return false; } *offset = sizeof(ether_header) + ihlen + thlen; *length = sizeof(ether_header) + ntohs(iph->ip_len) - *offset; return *length != 0; } static unsigned parseFlags(const string &flagsStr) { unsigned flags = 0; for (const auto &c : flagsStr) { switch (c) { case 'i': flags |= HS_FLAG_CASELESS; break; case 'm': flags |= HS_FLAG_MULTILINE; break; case 's': flags |= HS_FLAG_DOTALL; break; case 'H': flags |= HS_FLAG_SINGLEMATCH; break; case 'V': flags |= HS_FLAG_ALLOWEMPTY; break; case '8': flags |= HS_FLAG_UTF8; break; case 'W': flags |= HS_FLAG_UCP; break; case '\r': // stray carriage-return break; default: cerr << "Unsupported flag \'" << c << "\'" << endl; exit(-1); } } return flags; } static void parseFile(const char *filename, vector &patterns, vector &flags, vector &ids, vector &originals) { ifstream inFile(filename); if (!inFile.good()) { cerr << "ERROR: Can't open pattern file \"" << filename << "\"" << endl; exit(-1); } for (unsigned i = 1; !inFile.eof(); ++i) { string line; getline(inFile, line); // if line is empty, or a comment, we can skip it if (line.empty() || line[0] == '#') { continue; } // otherwise, it should be ID:PCRE, e.g. // 10001:/foobar/is size_t colonIdx = line.find_first_of(':'); if (colonIdx == string::npos) { cerr << "ERROR: Could not parse line " << i << endl; exit(-1); } // we should have an unsigned int as an ID, before the colon unsigned id = std::stoi(line.substr(0, colonIdx).c_str()); // rest of the expression is the PCRE const string expr(line.substr(colonIdx + 1)); size_t flagsStart = expr.find_last_of('/'); if (flagsStart == string::npos) { cerr << "ERROR: no trailing '/' char" << endl; exit(-1); } string pcre(expr.substr(1, flagsStart - 1)); string flagsStr(expr.substr(flagsStart + 1, expr.size() - flagsStart)); unsigned flag = parseFlags(flagsStr); originals.push_back(line); patterns.push_back(pcre); flags.push_back(flag); ids.push_back(id); } }