/* * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Intel Corporation nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /** \file * \brief Pattern lifetime analysis. */ #include "config.h" #include "ng_find_matches.h" #include "nfagraph/ng_holder.h" #include "nfagraph/ng_util.h" #include "parser/position.h" #include "util/container.h" #include "util/compare.h" #include "util/report.h" #include "util/report_manager.h" #include "util/unordered.h" #include using namespace std; using namespace ue2; using MatchSet = set>; using StateBitSet = boost::dynamic_bitset<>; namespace { /** \brief Max number of states (taking edit distance into account). */ static constexpr size_t STATE_COUNT_MAX = 15000; // returns all successors up to a given depth in a vector of sets, indexed by // zero-based depth from source vertex static vector> gatherSuccessorsByDepth(const NGHolder &g, const NFAVertex &src, u32 depth) { assert(depth > 0); vector> result(depth); // populate current set of successors for (auto v : adjacent_vertices_range(src, g)) { // ignore self-loops if (src == v) { continue; } DEBUG_PRINTF("Node %zu depth 1\n", g[v].index); result[0].insert(v); } for (u32 d = 1; d < depth; d++) { // collect all successors for all current level vertices const auto &cur = result[d - 1]; auto &next = result[d]; for (auto u : cur) { // don't go past special nodes if (is_special(u, g)) { continue; } for (auto v : adjacent_vertices_range(u, g)) { // ignore self-loops if (u == v) { continue; } DEBUG_PRINTF("Node %zu depth %u\n", g[v].index, d + 1); next.insert(v); } } } return result; } // returns all predecessors up to a given depth in a vector of sets, indexed by // zero-based depth from source vertex static vector> gatherPredecessorsByDepth(const NGHolder &g, NFAVertex src, u32 depth) { assert(depth > 0); vector> result(depth); // populate current set of successors for (auto v : inv_adjacent_vertices_range(src, g)) { // ignore self-loops if (src == v) { continue; } DEBUG_PRINTF("Node %zu depth 1\n", g[v].index); result[0].insert(v); } for (u32 d = 1; d < depth; d++) { // collect all successors for all current level vertices const auto &cur = result[d - 1]; auto &next = result[d]; for (auto v : cur) { for (auto u : inv_adjacent_vertices_range(v, g)) { // ignore self-loops if (v == u) { continue; } DEBUG_PRINTF("Node %zu depth %u\n", g[u].index, d + 1); next.insert(u); } } } return result; } // this is a per-vertex, per-shadow level state transition table struct GraphCache { GraphCache(u32 dist_in, u32 hamm_in, const NGHolder &g) : hamming(hamm_in > 0), size(num_vertices(g)), edit_distance(hamming ? hamm_in : dist_in) { auto dist_max = edit_distance + 1; allocateStateTransitionTable(dist_max); populateTransitionCache(g, dist_max); populateAcceptCache(g, dist_max); } void allocateStateTransitionTable(u32 dist_max) { // resize level 1 - per vertex shadow_transitions.resize(size); helper_transitions.resize(size); // resize level 2 - per shadow level for (u32 i = 0; i < size; i++) { shadow_transitions[i].resize(dist_max); helper_transitions[i].resize(dist_max); // resize level 3 - per vertex for (u32 d = 0; d < dist_max; d++) { shadow_transitions[i][d].resize(size); helper_transitions[i][d].resize(size); } } // accept states are indexed by edit distance accept_states.resize(dist_max); accept_eod_states.resize(dist_max); // vertex report maps are indexed by edit distance vertex_reports_by_level.resize(dist_max); vertex_eod_reports_by_level.resize(dist_max); } /* * certain transitions to helpers are disallowed: * 1. transitions from accept/acceptEod * 2. transitions to accept/acceptEod * 3. from start to startDs * 4. to a virtual/multiline start * * everything else is allowed. */ bool canTransitionToHelper(NFAVertex u, NFAVertex v, const NGHolder &g) const { if (is_any_accept(u, g)) { return false; } if (is_any_accept(v, g)) { return false; } if (u == g.start && v == g.startDs) { return false; } if (is_virtual_start(v, g)) { return false; } return true; } void populateTransitionCache(const NGHolder &g, u32 dist_max) { // populate mapping of vertex index to vertex vector idx_to_v(size); for (auto v : vertices_range(g)) { idx_to_v[g[v].index] = v; } for (u32 i = 0; i < size; i++) { auto cur_v = idx_to_v[i]; // set up transition tables auto succs = gatherSuccessorsByDepth(g, cur_v, dist_max); assert(succs.size() == dist_max); for (u32 d = 0; d < dist_max; d++) { auto &v_shadows = shadow_transitions[i][d]; auto cur_v_bit = i; // enable transition to next level helper (this handles insertion) if (!hamming && d < edit_distance && !is_any_accept(cur_v, g)) { auto &next_v_helpers = helper_transitions[i][d + 1]; next_v_helpers.set(cur_v_bit); } // if vertex has a self-loop, we can also transition to it, // but only if we're at shadow level 0 if (edge(cur_v, cur_v, g).second && d == 0) { v_shadows.set(cur_v_bit); } if (hamming && d > 0) { continue; } // populate state transition tables for (auto v : succs[d]) { auto v_bit = g[v].index; // we cannot transition to startDs on any level other than // level 0 if (v != g.startDs || d == 0) { // this handles direct transitions as well as removals v_shadows.set(v_bit); } // we can also transition to next-level helper (handles // replace), provided we meet the criteria if (d < edit_distance && canTransitionToHelper(cur_v, v, g)) { auto &next_v_helpers = helper_transitions[i][d + 1]; next_v_helpers.set(v_bit); } } } } } void populateAcceptCache(const NGHolder &g, u32 dist_max) { // set up accept states masks StateBitSet accept(size); accept.set(g[g.accept].index); StateBitSet accept_eod(size); accept_eod.set(g[g.acceptEod].index); // gather accept and acceptEod states for (u32 base_dist = 0; base_dist < dist_max; base_dist++) { auto &states = accept_states[base_dist]; auto &eod_states = accept_eod_states[base_dist]; states.resize(size); eod_states.resize(size); // inspect each vertex for (u32 i = 0; i < size; i++) { // inspect all shadow levels from base_dist to dist_max for (u32 d = 0; d < dist_max - base_dist; d++) { auto &shadows = shadow_transitions[i][d]; // if this state transitions to accept, set its bit if ((shadows & accept).any()) { states.set(i); } if ((shadows & accept_eod).any()) { eod_states.set(i); } } } } // populate accepts cache for (auto v : inv_adjacent_vertices_range(g.accept, g)) { const auto &rs = g[v].reports; for (u32 d = 0; d <= edit_distance; d++) { // add self to report list at all levels vertex_reports_by_level[d][v].insert(rs.begin(), rs.end()); } if (edit_distance == 0 || hamming) { // if edit distance is 0, no predecessors will have reports continue; } auto preds_by_depth = gatherPredecessorsByDepth(g, v, edit_distance); for (u32 pd = 0; pd < preds_by_depth.size(); pd++) { const auto &preds = preds_by_depth[pd]; // for each predecessor, add reports up to maximum edit distance // for current depth from source vertex for (auto pred : preds) { for (u32 d = 0; d < edit_distance - pd; d++) { vertex_reports_by_level[d][pred].insert(rs.begin(), rs.end()); } } } } for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) { const auto &rs = g[v].reports; if (v == g.accept) { continue; } for (u32 d = 0; d <= edit_distance; d++) { // add self to report list at all levels vertex_eod_reports_by_level[d][v].insert(rs.begin(), rs.end()); } if (edit_distance == 0 || hamming) { // if edit distance is 0, no predecessors will have reports continue; } auto preds_by_depth = gatherPredecessorsByDepth(g, v, edit_distance); for (u32 pd = 0; pd < preds_by_depth.size(); pd++) { const auto &preds = preds_by_depth[pd]; // for each predecessor, add reports up to maximum edit distance // for current depth from source vertex for (auto pred : preds) { for (u32 d = 0; d < edit_distance - pd; d++) { vertex_eod_reports_by_level[d][pred].insert(rs.begin(), rs.end()); } } } } } #ifdef DEBUG void dumpStateTransitionTable(const NGHolder &g) { StateBitSet accept(size); accept.set(g[g.accept].index); StateBitSet accept_eod(size); accept_eod.set(g[g.acceptEod].index); DEBUG_PRINTF("Dumping state transition tables\n"); DEBUG_PRINTF("Shadows:\n"); for (u32 i = 0; i < num_vertices(g); i++) { DEBUG_PRINTF("%-7s %3u:", "Vertex", i); for (u32 j = 0; j < num_vertices(g); j++) { printf("%3i", j); } printf("\n"); for (u32 d = 0; d <= edit_distance; d++) { DEBUG_PRINTF("%-7s %3u:", "Level", d); const auto &s = getShadowTransitions(i, d); for (u32 j = 0; j < num_vertices(g); j++) { printf("%3i", s.test(j)); } printf("\n"); } DEBUG_PRINTF("\n"); } DEBUG_PRINTF("Helpers:\n"); for (u32 i = 0; i < num_vertices(g); i++) { DEBUG_PRINTF("%-7s %3u:", "Vertex", i); for (u32 j = 0; j < num_vertices(g); j++) { printf("%3i", j); } printf("\n"); for (u32 d = 0; d <= edit_distance; d++) { DEBUG_PRINTF("%-7s %3u:", "Level", d); const auto &s = getHelperTransitions(i, d); for (u32 j = 0; j < num_vertices(g); j++) { printf("%3i", s.test(j)); } printf("\n"); } DEBUG_PRINTF("\n"); } DEBUG_PRINTF("Accept transitions:\n"); DEBUG_PRINTF("%-12s", "Vertex idx:"); for (u32 j = 0; j < num_vertices(g); j++) { printf("%3i", j); } printf("\n"); for (u32 d = 0; d <= edit_distance; d++) { DEBUG_PRINTF("%-7s %3u:", "Level", d); const auto &s = getAcceptTransitions(d); for (u32 j = 0; j < num_vertices(g); j++) { printf("%3i", s.test(j)); } printf("\n"); } DEBUG_PRINTF("\n"); DEBUG_PRINTF("Accept EOD transitions:\n"); DEBUG_PRINTF("%-12s", "Vertex idx:"); for (u32 j = 0; j < num_vertices(g); j++) { printf("%3i", j); } printf("\n"); for (u32 d = 0; d <= edit_distance; d++) { DEBUG_PRINTF("%-7s %3u:", "Level", d); const auto &s = getAcceptEodTransitions(d); for (u32 j = 0; j < num_vertices(g); j++) { printf("%3i", s.test(j)); } printf("\n"); } DEBUG_PRINTF("\n"); DEBUG_PRINTF("%-12s ", "Accepts:"); for (u32 i = 0; i < num_vertices(g); i++) { printf("%3i", accept.test(i)); } printf("\n"); DEBUG_PRINTF("%-12s ", "EOD Accepts:"); for (u32 i = 0; i < num_vertices(g); i++) { printf("%3i", accept_eod.test(i)); } printf("\n"); DEBUG_PRINTF("Reports\n"); for (auto v : vertices_range(g)) { for (u32 d = 0; d <= edit_distance; d++) { const auto &r = vertex_reports_by_level[d][v]; const auto &e = vertex_eod_reports_by_level[d][v]; DEBUG_PRINTF("%-7s %3zu %-8s %3zu %-8s %3zu\n", "Vertex", g[v].index, "rs:", r.size(), "eod:", e.size()); } } printf("\n"); } #endif const StateBitSet& getShadowTransitions(u32 idx, u32 level) const { assert(idx < size); assert(level <= edit_distance); return shadow_transitions[idx][level]; } const StateBitSet& getHelperTransitions(u32 idx, u32 level) const { assert(idx < size); assert(level <= edit_distance); return helper_transitions[idx][level]; } const StateBitSet& getAcceptTransitions(u32 level) const { assert(level <= edit_distance); return accept_states[level]; } const StateBitSet& getAcceptEodTransitions(u32 level) const { assert(level <= edit_distance); return accept_eod_states[level]; } /* * the bitsets are indexed by vertex and shadow level. the bitset's length is * equal to the total number of vertices in the graph. * * for convenience, helper functions are provided. */ vector> shadow_transitions; vector> helper_transitions; // accept states masks, indexed by shadow level vector accept_states; vector accept_eod_states; // map of all reports associated with any vertex, indexed by shadow level vector>> vertex_reports_by_level; vector>> vertex_eod_reports_by_level; bool hamming; u32 size; u32 edit_distance; }; /* * SOM workflow is expected to be the following: * - Caller calls getActiveStates, which reports SOM for each active states * - Caller calls getSuccessorStates on each of the active states, which *doesn't* * report SOM * - Caller decides if the successor state should be activated, and calls * activateState with SOM set to that of previous active state (not successor!) * - activateState then resolves any conflicts between SOMs that may arise from * multiple active states progressing to the same successor */ struct StateSet { struct State { enum node_type { NODE_SHADOW = 0, NODE_HELPER }; State(size_t idx_in, u32 level_in, size_t som_in, node_type type_in) : idx(idx_in), level(level_in), som(som_in), type(type_in) {} size_t idx; u32 level; size_t som; node_type type; }; // Temporary working data used for step() which we want to keep around // (rather than reallocating vectors all the time). struct WorkingData { vector active; vector succ_list; }; StateSet(size_t sz, u32 dist_in) : shadows(dist_in + 1), helpers(dist_in + 1), shadows_som(dist_in + 1), helpers_som(dist_in + 1), edit_distance(dist_in) { for (u32 dist = 0; dist <= dist_in; dist++) { shadows[dist].resize(sz, false); helpers[dist].resize(sz, false); shadows_som[dist].resize(sz, 0); helpers_som[dist].resize(sz, 0); } } void reset() { for (u32 dist = 0; dist <= edit_distance; dist++) { shadows[dist].reset(); helpers[dist].reset(); fill(shadows_som[dist].begin(), shadows_som[dist].end(), 0); fill(helpers_som[dist].begin(), helpers_som[dist].end(), 0); } } bool empty() const { for (u32 dist = 0; dist <= edit_distance; dist++) { if (shadows[dist].any()) { return false; } if (helpers[dist].any()) { return false; } } return true; } size_t count() const { size_t result = 0; for (u32 dist = 0; dist <= edit_distance; dist++) { result += shadows[dist].count(); result += helpers[dist].count(); } return result; } bool setActive(const State &s) { switch (s.type) { case State::NODE_HELPER: return helpers[s.level].test_set(s.idx); case State::NODE_SHADOW: return shadows[s.level].test_set(s.idx); } assert(0); return false; } size_t getCachedSom(const State &s) const { switch (s.type) { case State::NODE_HELPER: return helpers_som[s.level][s.idx]; case State::NODE_SHADOW: return shadows_som[s.level][s.idx]; } assert(0); return 0; } void setCachedSom(const State &s, const size_t som_val) { switch (s.type) { case State::NODE_HELPER: helpers_som[s.level][s.idx] = som_val; break; case State::NODE_SHADOW: shadows_som[s.level][s.idx] = som_val; break; default: assert(0); } } #ifdef DEBUG void dumpActiveStates() const { vector states; getActiveStates(states); DEBUG_PRINTF("Dumping active states\n"); for (const auto &state : states) { DEBUG_PRINTF("type: %s idx: %zu level: %u som: %zu\n", state.type == State::NODE_HELPER ? "HELPER" : "SHADOW", state.idx, state.level, state.som); } } #endif void getActiveStates(vector &result) const { result.clear(); for (u32 dist = 0; dist <= edit_distance; dist++) { // get all shadow vertices (including original graph) const auto &cur_shadow_vertices = shadows[dist]; for (size_t id = cur_shadow_vertices.find_first(); id != cur_shadow_vertices.npos; id = cur_shadow_vertices.find_next(id)) { result.emplace_back(id, dist, shadows_som[dist][id], State::NODE_SHADOW); } // the rest is only valid for edited graphs if (dist == 0) { continue; } // get all helper vertices const auto &cur_helper_vertices = helpers[dist]; for (size_t id = cur_helper_vertices.find_first(); id != cur_helper_vertices.npos; id = cur_helper_vertices.find_next(id)) { result.emplace_back(id, dist, helpers_som[dist][id], State::NODE_HELPER); } } sort_and_unique(result); } // does not return SOM void getSuccessors(const State &state, const GraphCache &gc, vector &result) const { result.clear(); // maximum shadow depth that we can go from current level u32 max_depth = edit_distance - state.level + 1; for (u32 d = 0; d < max_depth; d++) { const auto &shadow_succ = gc.getShadowTransitions(state.idx, d); for (size_t id = shadow_succ.find_first(); id != shadow_succ.npos; id = shadow_succ.find_next(id)) { auto new_level = state.level + d; result.emplace_back(id, new_level, 0, State::NODE_SHADOW); } const auto &helper_succ = gc.getHelperTransitions(state.idx, d); for (size_t id = helper_succ.find_first(); id != helper_succ.npos; id = helper_succ.find_next(id)) { auto new_level = state.level + d; result.emplace_back(id, new_level, 0, State::NODE_HELPER); } } sort_and_unique(result); } void getAcceptStates(const GraphCache &gc, vector &result) const { result.clear(); for (u32 dist = 0; dist <= edit_distance; dist++) { // get all shadow vertices (including original graph) auto cur_shadow_vertices = shadows[dist]; cur_shadow_vertices &= gc.getAcceptTransitions(dist); for (size_t id = cur_shadow_vertices.find_first(); id != cur_shadow_vertices.npos; id = cur_shadow_vertices.find_next(id)) { result.emplace_back(id, dist, shadows_som[dist][id], State::NODE_SHADOW); } auto cur_helper_vertices = helpers[dist]; cur_helper_vertices &= gc.getAcceptTransitions(dist); for (size_t id = cur_helper_vertices.find_first(); id != cur_helper_vertices.npos; id = cur_helper_vertices.find_next(id)) { result.emplace_back(id, dist, helpers_som[dist][id], State::NODE_HELPER); } } sort_and_unique(result); } void getAcceptEodStates(const GraphCache &gc, vector &result) const { result.clear(); for (u32 dist = 0; dist <= edit_distance; dist++) { // get all shadow vertices (including original graph) auto cur_shadow_vertices = shadows[dist]; cur_shadow_vertices &= gc.getAcceptEodTransitions(dist); for (size_t id = cur_shadow_vertices.find_first(); id != cur_shadow_vertices.npos; id = cur_shadow_vertices.find_next(id)) { result.emplace_back(id, dist, shadows_som[dist][id], State::NODE_SHADOW); } auto cur_helper_vertices = helpers[dist]; cur_helper_vertices &= gc.getAcceptEodTransitions(dist); for (size_t id = cur_helper_vertices.find_first(); id != cur_helper_vertices.npos; id = cur_helper_vertices.find_next(id)) { result.emplace_back(id, dist, helpers_som[dist][id], State::NODE_HELPER); } } sort_and_unique(result); } // the caller must specify SOM at current offset, and must not attempt to // resolve SOM inheritance conflicts void activateState(const State &state) { size_t cur_som = state.som; if (setActive(state)) { size_t cached_som = getCachedSom(state); cur_som = min(cur_som, cached_som); } setCachedSom(state, cur_som); } vector shadows; vector helpers; vector> shadows_som; vector> helpers_som; u32 edit_distance; }; // for flat_set bool operator<(const StateSet::State &a, const StateSet::State &b) { ORDER_CHECK(idx); ORDER_CHECK(level); ORDER_CHECK(type); ORDER_CHECK(som); return false; } bool operator==(const StateSet::State &a, const StateSet::State &b) { return a.idx == b.idx && a.level == b.level && a.type == b.type && a.som == b.som; } /** \brief Cache to speed up edge lookups, rather than hitting the graph. */ struct EdgeCache { explicit EdgeCache(const NGHolder &g) { cache.reserve(num_vertices(g)); for (auto e : edges_range(g)) { cache.emplace(make_pair(source(e, g), target(e, g)), e); } } NFAEdge get(NFAVertex u, NFAVertex v) const { auto it = cache.find(make_pair(u, v)); if (it != cache.end()) { return it->second; } return NFAEdge(); } private: ue2_unordered_map, NFAEdge> cache; }; struct fmstate { const size_t num_states; // number of vertices in graph StateSet states; // currently active states StateSet next; // states on after this iteration GraphCache &gc; vector vertices; // mapping from index to vertex EdgeCache edge_cache; size_t offset = 0; unsigned char cur = 0; unsigned char prev = 0; const bool utf8; const bool allowStartDs; const ReportManager &rm; fmstate(const NGHolder &g, GraphCache &gc_in, bool utf8_in, bool aSD_in, const u32 edit_distance, const ReportManager &rm_in) : num_states(num_vertices(g)), states(num_states, edit_distance), next(num_states, edit_distance), gc(gc_in), vertices(num_vertices(g), NGHolder::null_vertex()), edge_cache(g), utf8(utf8_in), allowStartDs(aSD_in), rm(rm_in) { // init states states.activateState( StateSet::State {g[g.start].index, 0, 0, StateSet::State::NODE_SHADOW}); if (allowStartDs) { states.activateState( StateSet::State {g[g.startDs].index, 0, 0, StateSet::State::NODE_SHADOW}); } // fill vertex mapping for (auto v : vertices_range(g)) { vertices[g[v].index] = v; } } }; } // namespace static bool isWordChar(const unsigned char c) { // check if it's an alpha character if (ourisalpha(c)) { return true; } // check if it's a digit if (c >= '0' && c <= '9') { return true; } // check if it's an underscore if (c == '_') { return true; } return false; } static bool isUtf8CodePoint(const char c) { // check if this is a start of 4-byte character if ((c & 0xF8) == 0xF0) { return true; } // check if this is a start of 3-byte character if ((c & 0xF0) == 0xE0) { return true; } // check if this is a start of 2-byte character if ((c & 0xE0) == 0xC0) { return true; } // check if this is a single-byte character if ((c & 0x80) == 0) { return true; } return false; } static bool canReach(const NGHolder &g, const NFAEdge &e, struct fmstate &state) { auto flags = g[e].assert_flags; if (!flags) { return true; } if (flags & POS_FLAG_ASSERT_WORD_TO_NONWORD) { if (isWordChar(state.prev) && !isWordChar(state.cur)) { return true; } } if (flags & POS_FLAG_ASSERT_NONWORD_TO_WORD) { if (!isWordChar(state.prev) && isWordChar(state.cur)) { return true; } } if (flags & POS_FLAG_ASSERT_WORD_TO_WORD) { if (isWordChar(state.prev) && isWordChar(state.cur)) { return true; } } if (flags & POS_FLAG_ASSERT_NONWORD_TO_NONWORD) { if (!isWordChar(state.prev) && !isWordChar(state.cur)) { return true; } } return false; } static void getAcceptMatches(const NGHolder &g, MatchSet &matches, struct fmstate &state, NFAVertex accept_vertex, vector &active_states) { assert(accept_vertex == g.accept || accept_vertex == g.acceptEod); const bool eod = accept_vertex == g.acceptEod; if (eod) { state.states.getAcceptEodStates(state.gc, active_states); } else { state.states.getAcceptStates(state.gc, active_states); } DEBUG_PRINTF("Number of active states: %zu\n", active_states.size()); for (const auto &cur : active_states) { auto u = state.vertices[cur.idx]; // we can't accept anything from startDs in between UTF-8 codepoints if (state.utf8 && u == g.startDs && !isUtf8CodePoint(state.cur)) { continue; } const auto &reports = eod ? state.gc.vertex_eod_reports_by_level[cur.level][u] : state.gc.vertex_reports_by_level[cur.level][u]; NFAEdge e = state.edge_cache.get(u, accept_vertex); // we assume edge assertions only exist at level 0 if (e && !canReach(g, e, state)) { continue; } DEBUG_PRINTF("%smatch found at %zu\n", eod ? "eod " : "", state.offset); assert(!reports.empty()); for (const auto &report_id : reports) { const Report &ri = state.rm.getReport(report_id); DEBUG_PRINTF("report %u has offset adjustment %d\n", report_id, ri.offsetAdjust); DEBUG_PRINTF("match from (i:%zu,l:%u,t:%u): (%zu,%zu)\n", cur.idx, cur.level, cur.type, cur.som, state.offset + ri.offsetAdjust); matches.emplace(cur.som, state.offset + ri.offsetAdjust); } } } static void getMatches(const NGHolder &g, MatchSet &matches, struct fmstate &state, StateSet::WorkingData &wd, bool allowEodMatches) { getAcceptMatches(g, matches, state, g.accept, wd.active); if (allowEodMatches) { getAcceptMatches(g, matches, state, g.acceptEod, wd.active); } } static void step(const NGHolder &g, fmstate &state, StateSet::WorkingData &wd) { state.next.reset(); state.states.getActiveStates(wd.active); for (const auto &cur : wd.active) { auto u = state.vertices[cur.idx]; state.states.getSuccessors(cur, state.gc, wd.succ_list); for (auto succ : wd.succ_list) { auto v = state.vertices[succ.idx]; if (is_any_accept(v, g)) { continue; } if (!state.allowStartDs && v == g.startDs) { continue; } // GraphCache doesn't differentiate between successors for shadows // and helpers, and StateSet does not know anything about the graph, // so the only place we can do it is here. we can't self-loop on a // startDs if we're startDs's helper, so disallow it. if (u == g.startDs && v == g.startDs && succ.level != 0 && succ.level == cur.level) { continue; } // for the reasons outlined above, also putting this here. // disallow transitions from start to startDs on levels other than zero if (u == g.start && v == g.startDs && cur.level != 0 && succ.level != 0) { continue; } bool can_reach = false; if (succ.type == StateSet::State::NODE_HELPER) { can_reach = true; } else { // we assume edge assertions only exist on level 0 const CharReach &cr = g[v].char_reach; NFAEdge e = state.edge_cache.get(u, v); if (cr.test(state.cur) && (!e || canReach(g, e, state))) { can_reach = true; } } // check edge assertions if we are allowed to reach accept DEBUG_PRINTF("reaching %zu->%zu ('%c'->'%c'): %s\n", g[u].index, g[v].index, ourisprint(state.prev) ? state.prev : '?', ourisprint(state.cur) ? state.cur : '?', can_reach ? "yes" : "no"); if (can_reach) { // we should use current offset as SOM if: // - we're at level 0 and we're a start vertex // - we're a fake start shadow size_t next_som; bool reset = is_any_start(u, g) && cur.level == 0; reset |= is_virtual_start(u, g) && cur.type == StateSet::State::NODE_SHADOW; if (reset) { next_som = state.offset; } else { // else, inherit SOM from predecessor next_som = cur.som; } succ.som = next_som; DEBUG_PRINTF("src: idx %zu level: %u som: %zu type: %s\n", cur.idx, cur.level, cur.som, cur.type == StateSet::State::NODE_HELPER ? "H" : "S"); DEBUG_PRINTF("dst: idx %zu level: %u som: %zu type: %s\n", succ.idx, succ.level, succ.som, succ.type == StateSet::State::NODE_HELPER ? "H" : "S"); // activate successor (SOM will be handled by activateState) state.next.activateState(succ); } } } } // filter extraneous matches static void filterMatches(MatchSet &matches) { set eom; // first, collect all end-offset matches for (const auto &match : matches) { eom.insert(match.second); } // now, go through all the end-offsets and filter extra matches for (const auto &elem : eom) { // find minimum SOM for this EOM size_t min_som = -1U; for (const auto &match : matches) { // skip entries with wrong EOM if (match.second != elem) { continue; } min_som = min(min_som, match.first); } auto msit = matches.begin(); while (msit != matches.end()) { // skip everything that doesn't match if (msit->second != elem || msit->first <= min_som) { ++msit; continue; } DEBUG_PRINTF("erasing match %zu, %zu\n", msit->first, msit->second); matches.erase(msit++); } } } /** \brief Find all matches for a given graph when executed against \a input. * * Fills \a matches with offsets into the data stream where a match is found. */ bool findMatches(const NGHolder &g, const ReportManager &rm, const string &input, MatchSet &matches, const u32 edit_distance, const u32 hamm_distance, const bool notEod, const bool utf8) { assert(hasCorrectlyNumberedVertices(g)); // cannot match fuzzy utf8 patterns, this should've been filtered out at // compile time, so make it an assert assert(!edit_distance || !utf8); // cannot be both edit and Hamming distance at once assert(!edit_distance || !hamm_distance); bool hamming = hamm_distance > 0; auto dist = hamming ? hamm_distance : edit_distance; const size_t total_states = num_vertices(g) * (3 * dist + 1); DEBUG_PRINTF("Finding matches (%zu total states)\n", total_states); if (total_states > STATE_COUNT_MAX) { DEBUG_PRINTF("too big\n"); return false; } GraphCache gc(edit_distance, hamm_distance, g); #ifdef DEBUG gc.dumpStateTransitionTable(g); #endif const bool allowStartDs = (proper_out_degree(g.startDs, g) > 0); struct fmstate state(g, gc, utf8, allowStartDs, dist, rm); StateSet::WorkingData wd; for (auto it = input.begin(), ite = input.end(); it != ite; ++it) { #ifdef DEBUG state.states.dumpActiveStates(); #endif state.offset = std::distance(input.begin(), it); state.cur = *it; step(g, state, wd); getMatches(g, matches, state, wd, false); DEBUG_PRINTF("offset %zu, %zu states on\n", state.offset, state.next.count()); if (state.next.empty()) { filterMatches(matches); return true; } state.states = state.next; state.prev = state.cur; } #ifdef DEBUG state.states.dumpActiveStates(); #endif state.offset = input.size(); state.cur = 0; // do additional step to get matches after stream end, this time count eod // matches also (or not, if we're in notEod mode) DEBUG_PRINTF("Looking for EOD matches\n"); getMatches(g, matches, state, wd, !notEod); filterMatches(matches); return true; }