# This script takes a GitHub access token and the name of an organization, and runs choctaw_hog against each repo.
# It runs the scans in a multiprocessing pool, collects the results, and writes them to output.csv.
# It is meant to be used with false_positives.py.
# It requires a single third-party library: https://github.com/PyGithub/PyGithub

from github import Github
import subprocess
from multiprocessing import Pool
import json
import tempfile
import uuid
import os
import csv
import sys

g = Github(os.environ['GITHUB_ACCESS_TOKEN'])

# Collect every repo in the organization passed as the first command-line argument
repos_to_scan = []
for repo in g.get_organization(sys.argv[1]).get_repos(type="all"):
    repos_to_scan.append(repo)
print(f"Scanning {len(repos_to_scan)} repos...")

tempdir = tempfile.gettempdir()


def f(x):
    # Write each repo's findings to a uniquely named JSON file in the temp directory
    filename = os.path.join(tempdir, str(uuid.uuid4()))
    # expects choctaw_hog in your PATH
    subprocess.run(
        ["choctaw_hog", "--outputfile", filename, "--regex", "trufflehog_rules.json", x.ssh_url],
        capture_output=True,
    )
    return {"repo": x.name, "results": filename}


output = []
# Increase the pool size to match your core count - runs great on a c5n.4xlarge with 14
with Pool(4) as p:
    output.extend(p.map(f, repos_to_scan))

print("Complete! Dumping output to output.csv...")

with open('output.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Repository', 'reason', 'stringsFound', 'path', 'commit', 'commitHash', 'date'])
    for result in output:
        try:
            with open(result['results'], 'r') as results_file:
                result_list = json.load(results_file)
            for finding in result_list:
                writer.writerow([
                    result['repo'],
                    finding['reason'],
                    str(finding['stringsFound']),
                    finding['path'],
                    finding['commit'],
                    finding['commitHash'],
                    finding['date'],
                ])
        except (OSError, ValueError, KeyError):
            # Skip repos whose scan produced no output file or produced malformed JSON
            pass

print("Output written to output.csv")
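
# A minimal usage sketch, assuming choctaw_hog and trufflehog_rules.json are in the working
# directory and this file is saved as scan_org.py (the filename and organization name below
# are assumptions, not part of the script itself):
#
#   export GITHUB_ACCESS_TOKEN=<your token>
#   python scan_org.py my-org-name
#
# output.csv can then be fed to false_positives.py for triage.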