#!/usr/bin/env python3
# Scan for any external dependencies that were last updated before known CVEs
# (and near relatives). We also try a fuzzy match on version information.

from collections import defaultdict, namedtuple
import datetime as dt
import gzip
import json
import re
import sys
import textwrap
import urllib.request

import utils as dep_utils

# These CVEs are false positives for the match heuristics. An explanation, as a
# comment, is required when adding a new entry to this list.
IGNORES_CVES = set([
    # Node.js issue unrelated to http-parser (napi_ API implementation).
    'CVE-2020-8174',
    # Node.js HTTP desync attack. Request smuggling due to CR and hyphen
    # conflation in llhttp
    # (https://github.com/nodejs/llhttp/commit/9d9da1d0f18599ceddd8f484df5a5ad694d23361).
    # This was a result of using llparse's toLowerUnsafe() for header keys.
    # http-parser uses a TOKEN method that doesn't have the same issue for
    # header fields.
    'CVE-2020-8201',
    # Node.js issue unrelated to http-parser. This is a DoS due to a lack of
    # request/connection timeouts, see
    # https://github.com/nodejs/node/commit/753f3b247a.
    'CVE-2020-8251',
    # Node.js issue unrelated to http-parser (libuv).
    'CVE-2020-8252',
    # Fixed via the nghttp2 1.41.0 bump in Envoy 8b6ea4.
    'CVE-2020-11080',
    # Node.js issue rooted in a c-ares bug. Does not appear to affect
    # http-parser or our use of c-ares; c-ares has been bumped regardless.
    'CVE-2020-8277',
    # gRPC issue that only affects JavaScript bindings.
    'CVE-2020-7768',
    # Node.js issue unrelated to http-parser, see
    # https://github.com/mhart/StringStream/issues/7.
    'CVE-2018-21270',
])

# Subset of CVE fields that are useful below.
Cve = namedtuple(
    'Cve',
    ['id', 'description', 'cpes', 'score', 'severity', 'published_date', 'last_modified_date'])


class Cpe(namedtuple('CPE', ['part', 'vendor', 'product', 'version'])):
    '''Model a subset of CPE fields that are used in CPE matching.'''

    @classmethod
    def FromString(cls, cpe_str):
        assert (cpe_str.startswith('cpe:2.3:'))
        components = cpe_str.split(':')
        assert (len(components) >= 6)
        return cls(*components[2:6])

    def __str__(self):
        return f'cpe:2.3:{self.part}:{self.vendor}:{self.product}:{self.version}'

    def VendorNormalized(self):
        '''Return a normalized CPE where only part and vendor are significant.'''
        return Cpe(self.part, self.vendor, '*', '*')
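
# Illustrative example of the class above (values are hypothetical, not taken
# from any dependency metadata):
#
#   cpe = Cpe.FromString('cpe:2.3:a:nghttp2:nghttp2:1.41.0')
#   # -> Cpe(part='a', vendor='nghttp2', product='nghttp2', version='1.41.0')
#   str(cpe.VendorNormalized())
#   # -> 'cpe:2.3:a:nghttp2:*:*'
#
# The vendor-normalized form is what keys cpe_revmap below: all CVEs for a
# vendor are grouped under one key, and the finer product/version checks are
# deferred to CpeMatch().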

def ParseCveJson(cve_json, cves, cpe_revmap):
    '''Parse CVE JSON dictionary.

    Args:
        cve_json: a NIST CVE JSON dictionary.
        cves: dictionary mapping CVE ID string to Cve object (output).
        cpe_revmap: a reverse map from vendor normalized CPE to CVE ID string (output).
    '''

    # This provides an over-approximation of possible CPEs affected by CVE node
    # metadata; it traverses the entire AND-OR tree and just gathers every CPE
    # observed. Generally we expect most of Envoy's CVE-CPE matches to be simple,
    # plus it's interesting to consumers of this data to understand when a CPE
    # pops up, even in a conditional setting.
    def GatherCpes(nodes, cpe_set):
        for node in nodes:
            for cpe_match in node.get('cpe_match', []):
                cpe_set.add(Cpe.FromString(cpe_match['cpe23Uri']))
            GatherCpes(node.get('children', []), cpe_set)

    for cve in cve_json['CVE_Items']:
        cve_id = cve['cve']['CVE_data_meta']['ID']
        description = cve['cve']['description']['description_data'][0]['value']
        cpe_set = set()
        GatherCpes(cve['configurations']['nodes'], cpe_set)
        if len(cpe_set) == 0:
            continue
        cvss_v3_score = cve['impact']['baseMetricV3']['cvssV3']['baseScore']
        cvss_v3_severity = cve['impact']['baseMetricV3']['cvssV3']['baseSeverity']

        def ParseCveDate(date_str):
            assert (date_str.endswith('Z'))
            return dt.date.fromisoformat(date_str.split('T')[0])

        published_date = ParseCveDate(cve['publishedDate'])
        last_modified_date = ParseCveDate(cve['lastModifiedDate'])
        cves[cve_id] = Cve(cve_id, description, cpe_set, cvss_v3_score, cvss_v3_severity,
                           published_date, last_modified_date)
        for cpe in cpe_set:
            cpe_revmap[str(cpe.VendorNormalized())].add(cve_id)
    return cves, cpe_revmap


def DownloadCveData(urls):
    '''Download NIST CVE JSON databases from given URLs and parse them.

    Args:
        urls: a list of URLs.

    Returns:
        cves: dictionary mapping CVE ID string to Cve object.
        cpe_revmap: a reverse map from vendor normalized CPE to CVE ID string.
    '''
    cves = {}
    cpe_revmap = defaultdict(set)
    for url in urls:
        print(f'Loading NIST CVE database from {url}...')
        with urllib.request.urlopen(url) as request:
            with gzip.GzipFile(fileobj=request) as json_data:
                ParseCveJson(json.loads(json_data.read()), cves, cpe_revmap)
    return cves, cpe_revmap


def FormatCveDetails(cve, deps):
    formatted_deps = ', '.join(sorted(deps))
    wrapped_description = '\n  '.join(textwrap.wrap(cve.description))
    return f'''
  CVE ID: {cve.id}
  CVSS v3 score: {cve.score}
  Severity: {cve.severity}
  Published date: {cve.published_date}
  Last modified date: {cve.last_modified_date}
  Dependencies: {formatted_deps}
  Description: {wrapped_description}
  Affected CPEs:
  ''' + '\n  '.join(f'- {cpe}' for cpe in cve.cpes)


FUZZY_DATE_RE = re.compile(r'(\d{4}).?(\d{2}).?(\d{2})')
FUZZY_SEMVER_RE = re.compile(r'(\d+)[:\.\-_](\d+)[:\.\-_](\d+)')
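
# Illustrative behavior of the fuzzy patterns above (version strings are
# hypothetical):
#
#   FUZZY_DATE_RE.search('fips-20190304').groups()   # -> ('2019', '03', '04')
#   FUZZY_DATE_RE.search('2019-03-04').groups()      # -> ('2019', '03', '04')
#   FUZZY_SEMVER_RE.search('2.1.0-beta3').groups()   # -> ('2', '1', '0')
#   FUZZY_SEMVER_RE.search('v2_1_0').groups()        # -> ('2', '1', '0')
#
# RegexGroupsMatch() below treats two strings as equivalent when their captured
# groups agree, which is how a dependency version such as 'fips-20190304' can
# match a CPE version expressed as a date or a dotted semver.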

def RegexGroupsMatch(regex, lhs, rhs):
    '''Do two strings match modulo a regular expression?

    Args:
        regex: regular expression.
        lhs: LHS string.
        rhs: RHS string.

    Returns:
        A boolean indicating a match.
    '''
    lhs_match = regex.search(lhs)
    if lhs_match:
        rhs_match = regex.search(rhs)
        if rhs_match and lhs_match.groups() == rhs_match.groups():
            return True
    return False


def CpeMatch(cpe, dep_metadata):
    '''Heuristically match dependency metadata against a CPE.

    We have a number of rules below that are easy to compute without having to
    look beyond the dependency metadata. In the future, with additional access
    to repository information, we could do the following:
    - For dependencies at a non-release version, walk back through git history
      to the last known release version and attempt a match with that.
    - For dependencies at a non-release version, use the commit date to look
      for a version match where the version is YYYY-MM-DD.

    Args:
        cpe: Cpe object to match against.
        dep_metadata: dependency metadata dictionary.

    Returns:
        A boolean indicating a match.
    '''
    dep_cpe = Cpe.FromString(dep_metadata['cpe'])
    dep_version = dep_metadata['version']
    # The 'part' and 'vendor' must be an exact match.
    if cpe.part != dep_cpe.part:
        return False
    if cpe.vendor != dep_cpe.vendor:
        return False
    # We allow Envoy dependency CPEs to wildcard the 'product'; this is useful
    # for LLVM, where multiple products need to be covered.
    if dep_cpe.product != '*' and cpe.product != dep_cpe.product:
        return False
    # Wildcard versions always match.
    if cpe.version == '*':
        return True
    # An exact version match is a hit.
    if cpe.version == dep_version:
        return True
    # Allow the 'release_date' dependency metadata to substitute for a date version.
    # TODO(htuch): Consider fuzzier date ranges.
    if cpe.version == dep_metadata['release_date']:
        return True
    # Try a fuzzy date match to deal with versions like fips-20190304 in the
    # dependency version.
    if RegexGroupsMatch(FUZZY_DATE_RE, dep_version, cpe.version):
        return True
    # Try a fuzzy semver match to deal with things like 2.1.0-beta3.
    if RegexGroupsMatch(FUZZY_SEMVER_RE, dep_version, cpe.version):
        return True
    # Fall-thru.
    return False


def CveMatch(cve, dep_metadata):
    '''Heuristically match dependency metadata against a CVE.

    In general, we allow false positives but want to keep the noise low, to
    avoid the toil around having to populate IGNORES_CVES.

    Args:
        cve: Cve object to match against.
        dep_metadata: dependency metadata dictionary.

    Returns:
        A boolean indicating a match.
    '''
    wildcard_version_match = False
    # Consider each CPE attached to the CVE for a match against the dependency CPE.
    for cpe in cve.cpes:
        if CpeMatch(cpe, dep_metadata):
            # Wildcard version matches need additional heuristics unrelated to CPE
            # to qualify, e.g. the last updated date.
            if cpe.version == '*':
                wildcard_version_match = True
            else:
                return True
    if wildcard_version_match:
        # If the CVE was published after the dependency was last updated, it's a
        # potential match.
        last_dep_update = dt.date.fromisoformat(dep_metadata['release_date'])
        if last_dep_update <= cve.published_date:
            return True
    return False


def CveScan(cves, cpe_revmap, cve_allowlist, repository_locations):
    '''Scan for CVEs in a parsed NIST CVE database.

    Args:
        cves: CVE dictionary as provided by DownloadCveData().
        cpe_revmap: CPE-CVE reverse map as provided by DownloadCveData().
        cve_allowlist: an allowlist of CVE IDs to ignore.
        repository_locations: a dictionary of dependency metadata in the format
            described in api/bazel/external_deps.bzl.

    Returns:
        possible_cves: a dictionary mapping CVE IDs to Cve objects.
        cve_deps: a dictionary mapping CVE IDs to dependency names.
    '''
    possible_cves = {}
    cve_deps = defaultdict(list)
    for dep, metadata in repository_locations.items():
        cpe = metadata.get('cpe', 'N/A')
        if cpe == 'N/A':
            continue
        candidate_cve_ids = cpe_revmap.get(str(Cpe.FromString(cpe).VendorNormalized()), [])
        for cve_id in candidate_cve_ids:
            cve = cves[cve_id]
            if cve.id in cve_allowlist:
                continue
            if CveMatch(cve, metadata):
                possible_cves[cve_id] = cve
                cve_deps[cve_id].append(dep)
    return possible_cves, cve_deps


if __name__ == '__main__':
    # Allow local overrides for NIST CVE database URLs via args.
    urls = sys.argv[1:]
    if not urls:
        # We only look back a few years, since we shouldn't have any ancient deps.
        current_year = dt.datetime.now().year
        scan_years = range(2018, current_year + 1)
        urls = [
            f'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-{year}.json.gz'
            for year in scan_years
        ]
    cves, cpe_revmap = DownloadCveData(urls)
    possible_cves, cve_deps = CveScan(cves, cpe_revmap, IGNORES_CVES,
                                      dep_utils.RepositoryLocations())
    if possible_cves:
        print('\nBased on heuristic matching with the NIST CVE database, Envoy may be vulnerable to:')
        for cve_id in sorted(possible_cves):
            print(f'{FormatCveDetails(possible_cves[cve_id], cve_deps[cve_id])}')
        sys.exit(1)
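
# Example invocations (illustrative; assumes this script is saved as
# cve_scan.py next to the utils module imported above as dep_utils):
#
#   python3 cve_scan.py
#       Scans against the default NVD 1.1 JSON feeds from 2018 through the
#       current year.
#
#   python3 cve_scan.py file:///tmp/nvdcve-1.1-2021.json.gz
#       Scans against a locally mirrored gzipped feed (urllib.request accepts
#       file:// URLs), useful for offline or reproducible runs.
#
# The script exits with status 1 when any non-allowlisted CVE heuristically
# matches a dependency, so it can be used as a CI gate.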