/*-
 * Public Domain 2014-present MongoDB, Inc.
 * Public Domain 2008-2014 WiredTiger, Inc.
 *
 * This is free and unencumbered software released into the public domain.
 *
 * Anyone is free to copy, modify, publish, use, compile, sell, or
 * distribute this software, either in source code form or as a compiled
 * binary, for any purpose, commercial or non-commercial, and by any
 * means.
 *
 * In jurisdictions that recognize copyright laws, the author or authors
 * of this software dedicate any and all copyright interest in the
 * software to the public domain. We make this dedication for the benefit
 * of the public at large and to the detriment of our heirs and
 * successors. We intend this dedication to be an overt act of
 * relinquishment in perpetuity of all present and future rights to this
 * software under copyright law.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "test_util.h"

#include <sys/wait.h>
#include <signal.h>

static char home[1024]; /* Program working dir */

/*
 * Create three tables that we will write the same data to and verify that
 * all the types of usage have the expected data in them after a crash and
 * recovery.  We want:
 * 1. A table that is logged and is not involved in timestamps.  This table
 * simulates a user local table.
 * 2. A table that is logged and involved in timestamps.  This simulates
 * the oplog.
 * 3. A table that is not logged and involved in timestamps.  This simulates
 * a typical collection file.
 *
 * We also have most threads perform schema operations such as create/drop.
 *
 * We also create several files that are not WiredTiger tables.  The checkpoint
 * thread creates a file indicating that a checkpoint has completed.  The parent
 * process uses this to know when at least one checkpoint is done and it can
 * start the timer to abort.
 *
 * Each worker thread creates its own records file that records the data it
 * inserted and it records the timestamp that was used for that insertion.
 */
#define INVALID_KEY UINT64_MAX
#define MAX_CKPT_INVL 2 /* Maximum interval between checkpoints */
/* Set large, some slow I/O systems take tens of seconds to fsync. */
#define MAX_STARTUP 30 /* Seconds to start up and set stable */
#define MAX_TH 12
#define MAX_TIME 40
#define MAX_VAL 1024
#define MIN_TH 5
#define MIN_TIME 10
#define PREPARE_FREQ 5
#define PREPARE_YIELD (PREPARE_FREQ * 10)
#define RECORDS_FILE "records-%" PRIu32
#define STABLE_PERIOD 100

static const char *const uri = "table:wt";
static const char *const uri_local = "table:local";
static const char *const uri_oplog = "table:oplog";
static const char *const uri_collection = "table:collection";

static const char *const ckpt_file = "checkpoint_done";

static bool compat, inmem, stable_set, use_ts, use_txn;
static volatile uint64_t global_ts = 1;
static volatile uint64_t uid = 1;
typedef struct {
    uint64_t ts;
    const char *op;
} THREAD_TS;
static volatile THREAD_TS th_ts[MAX_TH];

#define ENV_CONFIG_COMPAT ",compatibility=(release=\"2.9\")"
#define ENV_CONFIG_DEF                                        \
    "create,"                                                 \
    "eviction_updates_trigger=95,eviction_updates_target=80," \
    "log=(archive=false,file_max=10M,enabled)"
#define ENV_CONFIG_TXNSYNC \
    ENV_CONFIG_DEF         \
    ",transaction_sync=(enabled,method=none)"
#define ENV_CONFIG_REC "log=(archive=false,recover=on)"

/*
 * A minimum width of 10, along with zero filling, means that all the keys sort according to their
 * integer value, making each thread's key space distinct.
 */
#define KEY_FORMAT ("%010" PRIu64)

typedef struct {
    uint64_t absent_key; /* Last absent key */
    uint64_t exist_key;  /* First existing key after miss */
    uint64_t first_key;  /* First key in range */
    uint64_t first_miss; /* First missing key */
    uint64_t last_key;   /* Last key in range */
} REPORT;

typedef struct {
    WT_CONNECTION *conn;
    uint64_t start;
    uint32_t info;
    const char *op;
} THREAD_DATA;

#define NOOP "noop"
#define BULK "bulk"
#define BULK_UNQ "bulk_unique"
#define CREATE "create"
#define CREATE_UNQ "create_unique"
#define CURSOR "cursor"
#define DROP "drop"
#define UPGRADE "upgrade"
#define VERIFY "verify"

static void sig_handler(int) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn));
static void usage(void) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn));
static void
usage(void)
{
    fprintf(stderr, "usage: %s [-h dir] [-T threads] [-t time] [-Cmvxz]\n", progname);
    exit(EXIT_FAILURE);
}

static const char *const config = NULL;

/*
 * subtest_error_handler --
 *     Error event handler.
 */
static int
subtest_error_handler(
  WT_EVENT_HANDLER *handler, WT_SESSION *session, int error, const char *message)
{
    (void)(handler);
    (void)(session);
    (void)(error);

    /* Filter out errors about bulk load usage - they are annoying */
    if (strstr(message, "bulk-load is only supported on newly") == NULL)
        fprintf(stderr, "%s", message);
    return (0);
}

static WT_EVENT_HANDLER event_handler = {
  subtest_error_handler, NULL, /* Message handler */
  NULL,                        /* Progress handler */
  NULL                         /* Close handler */
};

/*
 * The following are various schema-related functions to have some threads performing during the
 * test. The goal is to make sure that after a random abort, the database is left in a recoverable
 * state. Yield during the schema operations to increase chance of abort during them.
 *
 * TODO: Currently only verifies insert data, it would be ideal to modify the schema operations so
 * that we can verify the state of the schema too.
 */

static void
dump_ts(uint64_t nth)
{
    uint64_t i;

    for (i = 0; i < nth; ++i)
        fprintf(stderr, "THREAD %" PRIu64 ": ts: %" PRIu64 " op %s\n", i, th_ts[i].ts, th_ts[i].op);
}

/*
 * test_bulk --
 *     Test creating a bulk cursor.
 */
static void
test_bulk(THREAD_DATA *td)
{
    WT_CURSOR *c;
    WT_DECL_RET;
    WT_SESSION *session;
    bool create;

    testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session));

    if (use_txn)
        testutil_check(session->begin_transaction(session, NULL));
    create = false;
    if ((ret = session->create(session, uri, config)) != 0)
        if (ret != EEXIST && ret != EBUSY)
            testutil_die(ret, "session.create");

    if (ret == 0) {
        create = true;
        if ((ret = session->open_cursor(session, uri, NULL, "bulk", &c)) == 0) {
            __wt_yield();
            testutil_check(c->close(c));
        } else if (ret != ENOENT && ret != EBUSY && ret != EINVAL)
            testutil_die(ret, "session.open_cursor bulk");
    }

    if (use_txn) {
        /* If create fails, rollback else will commit.*/
        if (!create)
            ret = session->rollback_transaction(session, NULL);
        else
            ret = session->commit_transaction(session, NULL);

        if (ret == EINVAL) {
            fprintf(stderr, "BULK: EINVAL on %s. ABORT\n", create ? "commit" : "rollback");
            testutil_die(ret, "session.commit bulk");
        }
    }
    testutil_check(session->close(session, NULL));
}

/*
 * test_bulk_unique --
 *     Test creating a bulk cursor with a unique name.
 */
static void
test_bulk_unique(THREAD_DATA *td, int force)
{
    WT_CURSOR *c;
    WT_DECL_RET;
    WT_SESSION *session;
    uint64_t my_uid;
    char new_uri[64];

    testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session));

    /* Generate a unique object name. */
    my_uid = __wt_atomic_addv64(&uid, 1);
    testutil_check(__wt_snprintf(new_uri, sizeof(new_uri), "%s.%" PRIu64, uri, my_uid));

    if (use_txn)
        testutil_check(session->begin_transaction(session, NULL));
    testutil_check(session->create(session, new_uri, config));

    __wt_yield();
    /*
     * Opening a bulk cursor may have raced with a forced checkpoint which created a checkpoint of
     * the empty file, and triggers an EINVAL.
     */
    if ((ret = session->open_cursor(session, new_uri, NULL, "bulk", &c)) == 0)
        testutil_check(c->close(c));
    else if (ret != EINVAL)
        testutil_die(ret, "session.open_cursor bulk unique: %s, new_uri");

    while ((ret = session->drop(session, new_uri, force ? "force" : NULL)) != 0)
        if (ret != EBUSY)
            testutil_die(ret, "session.drop: %s", new_uri);

    if (use_txn && (ret = session->commit_transaction(session, NULL)) != 0 && ret != EINVAL)
        testutil_die(ret, "session.commit bulk unique");
    testutil_check(session->close(session, NULL));
}

/*
 * test_cursor --
 *     Open a cursor on a data source.
 */
static void
test_cursor(THREAD_DATA *td)
{
    WT_CURSOR *cursor;
    WT_DECL_RET;
    WT_SESSION *session;

    testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session));

    if (use_txn)
        testutil_check(session->begin_transaction(session, NULL));
    if ((ret = session->open_cursor(session, uri, NULL, NULL, &cursor)) != 0) {
        if (ret != ENOENT && ret != EBUSY)
            testutil_die(ret, "session.open_cursor");
    } else {
        __wt_yield();
        testutil_check(cursor->close(cursor));
    }

    if (use_txn && (ret = session->commit_transaction(session, NULL)) != 0 && ret != EINVAL)
        testutil_die(ret, "session.commit cursor");
    testutil_check(session->close(session, NULL));
}

/*
 * test_create --
 *     Create a table.
 */
static void
test_create(THREAD_DATA *td)
{
    WT_DECL_RET;
    WT_SESSION *session;

    testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session));

    if (use_txn)
        testutil_check(session->begin_transaction(session, NULL));
    if ((ret = session->create(session, uri, config)) != 0)
        if (ret != EEXIST && ret != EBUSY)
            testutil_die(ret, "session.create");
    __wt_yield();
    if (use_txn && (ret = session->commit_transaction(session, NULL)) != 0 && ret != EINVAL)
        testutil_die(ret, "session.commit create");
    testutil_check(session->close(session, NULL));
}

/*
 * test_create_unique --
 *     Create a uniquely named table.
 */
static void
test_create_unique(THREAD_DATA *td, int force)
{
    WT_DECL_RET;
    WT_SESSION *session;
    uint64_t my_uid;
    char new_uri[64];

    testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session));

    /* Generate a unique object name. */
    my_uid = __wt_atomic_addv64(&uid, 1);
    testutil_check(__wt_snprintf(new_uri, sizeof(new_uri), "%s.%" PRIu64, uri, my_uid));

    if (use_txn)
        testutil_check(session->begin_transaction(session, NULL));
    testutil_check(session->create(session, new_uri, config));
    if (use_txn && (ret = session->commit_transaction(session, NULL)) != 0 && ret != EINVAL)
        testutil_die(ret, "session.commit create unique");

    __wt_yield();
    if (use_txn)
        testutil_check(session->begin_transaction(session, NULL));
    while ((ret = session->drop(session, new_uri, force ? "force" : NULL)) != 0)
        if (ret != EBUSY)
            testutil_die(ret, "session.drop: %s", new_uri);
    if (use_txn && (ret = session->commit_transaction(session, NULL)) != 0 && ret != EINVAL)
        testutil_die(ret, "session.commit create unique");

    testutil_check(session->close(session, NULL));
}

/*
 * test_drop --
 *     Test dropping a table.
 */
static void
test_drop(THREAD_DATA *td, int force)
{
    WT_DECL_RET;
    WT_SESSION *session;

    testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session));

    if (use_txn)
        testutil_check(session->begin_transaction(session, NULL));
    if ((ret = session->drop(session, uri, force ? "force" : NULL)) != 0)
        if (ret != ENOENT && ret != EBUSY)
            testutil_die(ret, "session.drop");

    if (use_txn) {
        /*
         * As the operations are being performed concurrently, return value can be ENOENT or EBUSY
         * will set error to transaction opened by session. In these cases the transaction has to be
         * aborted.
         */
        if (ret != ENOENT && ret != EBUSY)
            ret = session->commit_transaction(session, NULL);
        else
            ret = session->rollback_transaction(session, NULL);
        if (ret == EINVAL)
            testutil_die(ret, "session.commit drop");
    }
    testutil_check(session->close(session, NULL));
}

/*
 * test_upgrade --
 *     Upgrade a tree.
 */
static void
test_upgrade(THREAD_DATA *td)
{
    WT_DECL_RET;
    WT_SESSION *session;

    testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session));

    if ((ret = session->upgrade(session, uri, NULL)) != 0)
        if (ret != ENOENT && ret != EBUSY)
            testutil_die(ret, "session.upgrade");

    testutil_check(session->close(session, NULL));
}

/*
 * test_verify --
 *     Verify a tree.
 */
static void
test_verify(THREAD_DATA *td)
{
    WT_DECL_RET;
    WT_SESSION *session;

    testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session));

    if ((ret = session->verify(session, uri, NULL)) != 0)
        if (ret != ENOENT && ret != EBUSY)
            testutil_die(ret, "session.verify");

    testutil_check(session->close(session, NULL));
}

/*
 * thread_ts_run --
 *     Runner function for a timestamp thread.
 */
static WT_THREAD_RET
thread_ts_run(void *arg)
{
    WT_SESSION *session;
    THREAD_DATA *td;
    uint64_t i, last_ts, oldest_ts, this_ts;
    char tscfg[64];

    td = (THREAD_DATA *)arg;
    last_ts = 0;

    testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session));
    /*
     * Every N records we will record our stable timestamp into the stable table. That will define
     * our threshold where we expect to find records after recovery.
     */
    for (;;) {
        oldest_ts = UINT64_MAX;
        /*
         * For the timestamp thread, the info field contains the number of worker threads.
         */
        for (i = 0; i < td->info; ++i) {
            /*
             * We need to let all threads get started, so if we find any thread still with a zero
             * timestamp we go to sleep.
             */
            this_ts = th_ts[i].ts;
            if (this_ts == 0)
                goto ts_wait;
            else if (this_ts < oldest_ts)
                oldest_ts = this_ts;
        }

        if (oldest_ts != UINT64_MAX && oldest_ts - last_ts > STABLE_PERIOD) {
            /*
             * Set both the oldest and stable timestamp so that we don't need to maintain read
             * availability at older timestamps.
             */
            testutil_check(__wt_snprintf(tscfg, sizeof(tscfg),
              "oldest_timestamp=%" PRIx64 ",stable_timestamp=%" PRIx64, oldest_ts, oldest_ts));
            testutil_check(td->conn->set_timestamp(td->conn, tscfg));
            last_ts = oldest_ts;
            if (!stable_set) {
                stable_set = true;
                printf("SET STABLE: %" PRIx64 " %" PRIu64 "\n", oldest_ts, oldest_ts);
            }
        } else
ts_wait:
            __wt_sleep(0, 1000);
    }
    /* NOTREACHED */
}

/*
 * thread_ckpt_run --
 *     Runner function for the checkpoint thread.
 */
static WT_THREAD_RET
thread_ckpt_run(void *arg)
{
    struct timespec now, start;
    FILE *fp;
    WT_RAND_STATE rnd;
    WT_SESSION *session;
    THREAD_DATA *td;
    uint64_t ts;
    uint32_t sleep_time;
    int i;
    bool first_ckpt;

    __wt_random_init(&rnd);

    td = (THREAD_DATA *)arg;
    /*
     * Keep a separate file with the records we wrote for checking.
     */
    (void)unlink(ckpt_file);
    testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session));
    first_ckpt = true;
    ts = 0;
    /*
     * Keep writing checkpoints until killed by parent.
     */
    __wt_epoch(NULL, &start);
    for (i = 0;;) {
        sleep_time = __wt_random(&rnd) % MAX_CKPT_INVL;
        sleep(sleep_time);
        if (use_ts) {
            ts = global_ts;
            /*
             * If we're using timestamps wait for the stable timestamp to get set the first time.
             */
            if (!stable_set) {
                __wt_epoch(NULL, &now);
                if (WT_TIMEDIFF_SEC(now, start) >= 1)
                    printf("CKPT: !stable_set time %" PRIu64 "\n", WT_TIMEDIFF_SEC(now, start));
                if (WT_TIMEDIFF_SEC(now, start) > MAX_STARTUP) {
                    fprintf(
                      stderr, "After %d seconds stable still not set. Aborting.\n", MAX_STARTUP);
                    /*
                     * For the checkpoint thread the info contains the number of threads.
                     */
                    dump_ts(td->info);
                    abort();
                }
                continue;
            }
        }
        /*
         * Since this is the default, send in this string even if running without timestamps.
         */
        testutil_check(session->checkpoint(session, "use_timestamp=true"));
        printf("Checkpoint %d complete.  Minimum ts %" PRIu64 "\n", ++i, ts);
        fflush(stdout);
        /*
         * Create the checkpoint file so that the parent process knows at least one checkpoint has
         * finished and can start its timer. Start the timer for stable after the first checkpoint
         * completes because a slow I/O lag during the checkpoint can cause a false positive for a
         * timeout.
         */
        if (first_ckpt) {
            testutil_checksys((fp = fopen(ckpt_file, "w")) == NULL);
            first_ckpt = false;
            testutil_checksys(fclose(fp) != 0);
        }
    }
    /* NOTREACHED */
}

/*
 * thread_run --
 *     Runner function for the worker threads.
 */
static WT_THREAD_RET
thread_run(void *arg)
{
    FILE *fp;
    WT_CURSOR *cur_coll, *cur_local, *cur_oplog;
    WT_ITEM data;
    WT_RAND_STATE rnd;
    WT_SESSION *oplog_session, *session;
    THREAD_DATA *td;
    uint64_t i, stable_ts;
    char cbuf[MAX_VAL], lbuf[MAX_VAL], obuf[MAX_VAL];
    char kname[64], tscfg[64];
    bool use_prep;

    __wt_random_init(&rnd);
    memset(cbuf, 0, sizeof(cbuf));
    memset(lbuf, 0, sizeof(lbuf));
    memset(obuf, 0, sizeof(obuf));
    memset(kname, 0, sizeof(kname));

    td = (THREAD_DATA *)arg;
    /*
     * Set up the separate file for checking.
     */
    testutil_check(__wt_snprintf(cbuf, sizeof(cbuf), RECORDS_FILE, td->info));
    (void)unlink(cbuf);
    testutil_checksys((fp = fopen(cbuf, "w")) == NULL);
    /*
     * Set to line buffering. But that is advisory only. We've seen cases where the result files end
     * up with partial lines.
     */
    __wt_stream_set_line_buffer(fp);

    /*
     * Have half the threads use prepared transactions if timestamps are in use.
     */
    use_prep = (use_ts && td->info % 2 == 0) ? true : false;
    /*
     * We may have two sessions so that the oplog session can have its own transaction in parallel
     * with the collection session for threads that are going to be using prepared transactions. We
     * need this because prepared transactions cannot have any operations that modify a table that
     * is logged. But we also want to test mixed logged and not-logged transactions.
     */
    testutil_check(td->conn->open_session(td->conn, NULL, "isolation=snapshot", &session));
    /*
     * Open a cursor to each table.
     */
    testutil_check(session->open_cursor(session, uri_collection, NULL, NULL, &cur_coll));
    testutil_check(session->open_cursor(session, uri_local, NULL, NULL, &cur_local));
    oplog_session = NULL;
    if (use_prep) {
        testutil_check(
          td->conn->open_session(td->conn, NULL, "isolation=snapshot", &oplog_session));
        testutil_check(session->open_cursor(oplog_session, uri_oplog, NULL, NULL, &cur_oplog));
    } else
        testutil_check(session->open_cursor(session, uri_oplog, NULL, NULL, &cur_oplog));

    /*
     * Write our portion of the key space until we're killed.
     */
    printf("Thread %" PRIu32 " starts at %" PRIu64 "\n", td->info, td->start);
    stable_ts = 0;
    for (i = td->start;; ++i) {
        /*
         * Allow some threads to skip schema operations so that they are generating sufficient dirty
         * data.
         */
        WT_PUBLISH(th_ts[td->info].op, NOOP);
        if (td->info != 0 && td->info != 1)
            /*
             * Do a schema operation about 50% of the time by having a case for only about half the
             * possible mod values.
             */
            switch (__wt_random(&rnd) % 20) {
            case 0:
                WT_PUBLISH(th_ts[td->info].op, BULK);
                test_bulk(td);
                break;
            case 1:
                WT_PUBLISH(th_ts[td->info].op, BULK_UNQ);
                test_bulk_unique(td, __wt_random(&rnd) & 1);
                break;
            case 2:
                WT_PUBLISH(th_ts[td->info].op, CREATE);
                test_create(td);
                break;
            case 3:
                WT_PUBLISH(th_ts[td->info].op, CREATE_UNQ);
                test_create_unique(td, __wt_random(&rnd) & 1);
                break;
            case 4:
                WT_PUBLISH(th_ts[td->info].op, CURSOR);
                test_cursor(td);
                break;
            case 5:
                WT_PUBLISH(th_ts[td->info].op, DROP);
                test_drop(td, __wt_random(&rnd) & 1);
                break;
            case 6:
                WT_PUBLISH(th_ts[td->info].op, UPGRADE);
                test_upgrade(td);
                break;
            case 7:
                WT_PUBLISH(th_ts[td->info].op, VERIFY);
                test_verify(td);
                break;
            }
        if (use_ts)
            stable_ts = __wt_atomic_addv64(&global_ts, 1);
        testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_FORMAT, i));

        testutil_check(session->begin_transaction(session, NULL));
        if (use_prep)
            testutil_check(oplog_session->begin_transaction(oplog_session, NULL));
        cur_coll->set_key(cur_coll, kname);
        cur_local->set_key(cur_local, kname);
        cur_oplog->set_key(cur_oplog, kname);
        /*
         * Put an informative string into the value so that it can be viewed well in a binary dump.
         */
        testutil_check(__wt_snprintf(cbuf, sizeof(cbuf),
          "COLL: thread:%" PRIu32 " ts:%" PRIu64 " key: %" PRIu64, td->info, stable_ts, i));
        testutil_check(__wt_snprintf(lbuf, sizeof(lbuf),
          "LOCAL: thread:%" PRIu32 " ts:%" PRIu64 " key: %" PRIu64, td->info, stable_ts, i));
        testutil_check(__wt_snprintf(obuf, sizeof(obuf),
          "OPLOG: thread:%" PRIu32 " ts:%" PRIu64 " key: %" PRIu64, td->info, stable_ts, i));
        data.size = __wt_random(&rnd) % MAX_VAL;
        data.data = cbuf;
        cur_coll->set_value(cur_coll, &data);
        testutil_check(cur_coll->insert(cur_coll));
        data.size = __wt_random(&rnd) % MAX_VAL;
        data.data = obuf;
        cur_oplog->set_value(cur_oplog, &data);
        testutil_check(cur_oplog->insert(cur_oplog));
        if (use_ts) {
            /*
             * Run with prepare every once in a while. And also yield after prepare sometimes too.
             * This is only done on the regular session.
             */
            if (use_prep && i % PREPARE_FREQ == 0) {
                testutil_check(
                  __wt_snprintf(tscfg, sizeof(tscfg), "prepare_timestamp=%" PRIx64, stable_ts));
                testutil_check(session->prepare_transaction(session, tscfg));
                if (i % PREPARE_YIELD == 0)
                    __wt_yield();

                testutil_check(__wt_snprintf(tscfg, sizeof(tscfg),
                  "commit_timestamp=%" PRIx64 ",durable_timestamp=%" PRIx64, stable_ts, stable_ts));
            } else
                testutil_check(
                  __wt_snprintf(tscfg, sizeof(tscfg), "commit_timestamp=%" PRIx64, stable_ts));

            testutil_check(session->commit_transaction(session, tscfg));
            if (use_prep) {
                /*
                 * Durable timestamp should not be passed as oplog transaction is a non-prepared
                 * transaction.
                 */
                testutil_check(
                  __wt_snprintf(tscfg, sizeof(tscfg), "commit_timestamp=%" PRIx64, stable_ts));
                testutil_check(oplog_session->commit_transaction(oplog_session, tscfg));
            }
            /*
             * Update the thread's last-committed timestamp. Don't let the compiler re-order this
             * statement, if we were to race with the timestamp thread, it might see our thread
             * update before the commit.
             */
            WT_PUBLISH(th_ts[td->info].ts, stable_ts);
        } else {
            testutil_check(session->commit_transaction(session, NULL));
            if (use_prep)
                testutil_check(oplog_session->commit_transaction(oplog_session, NULL));
        }
        /*
         * Insert into the local table outside the timestamp txn.
         */
        data.size = __wt_random(&rnd) % MAX_VAL;
        data.data = lbuf;
        cur_local->set_value(cur_local, &data);
        testutil_check(cur_local->insert(cur_local));

        /*
         * Save the timestamp and key separately for checking later.
         */
        if (fprintf(fp, "%" PRIu64 " %" PRIu64 "\n", stable_ts, i) < 0)
            testutil_die(EIO, "fprintf");
    }
    /* NOTREACHED */
}

/*
 * Child process creates the database and table, and then creates worker threads to add data until
 * it is killed by the parent.
 */
static void run_workload(uint32_t) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn));
static void
run_workload(uint32_t nth)
{
    WT_CONNECTION *conn;
    WT_SESSION *session;
    THREAD_DATA *td;
    wt_thread_t *thr;
    uint32_t ckpt_id, i, ts_id;
    char envconf[512];

    thr = dcalloc(nth + 2, sizeof(*thr));
    td = dcalloc(nth + 2, sizeof(THREAD_DATA));
    stable_set = false;
    if (chdir(home) != 0)
        testutil_die(errno, "Child chdir: %s", home);
    if (inmem)
        strcpy(envconf, ENV_CONFIG_DEF);
    else
        strcpy(envconf, ENV_CONFIG_TXNSYNC);
    if (compat)
        strcat(envconf, ENV_CONFIG_COMPAT);

    testutil_check(wiredtiger_open(NULL, &event_handler, envconf, &conn));
    testutil_check(conn->open_session(conn, NULL, NULL, &session));
    /*
     * Create all the tables.
     */
    testutil_check(
      session->create(session, uri_collection, "key_format=S,value_format=u,log=(enabled=false)"));
    testutil_check(session->create(session, uri_local, "key_format=S,value_format=u"));
    testutil_check(session->create(session, uri_oplog, "key_format=S,value_format=u"));
    /*
     * Don't log the stable timestamp table so that we know what timestamp was stored at the
     * checkpoint.
     */
    testutil_check(session->close(session, NULL));

    /*
     * The checkpoint thread and the timestamp threads are added at the end.
     */
    ckpt_id = nth;
    td[ckpt_id].conn = conn;
    td[ckpt_id].info = nth;
    printf("Create checkpoint thread\n");
    testutil_check(__wt_thread_create(NULL, &thr[ckpt_id], thread_ckpt_run, &td[ckpt_id]));
    ts_id = nth + 1;
    if (use_ts) {
        td[ts_id].conn = conn;
        td[ts_id].info = nth;
        printf("Create timestamp thread\n");
        testutil_check(__wt_thread_create(NULL, &thr[ts_id], thread_ts_run, &td[ts_id]));
    }
    printf("Create %" PRIu32 " writer threads\n", nth);
    for (i = 0; i < nth; ++i) {
        td[i].conn = conn;
        td[i].start = WT_BILLION * (uint64_t)i;
        td[i].info = i;
        testutil_check(__wt_thread_create(NULL, &thr[i], thread_run, &td[i]));
    }
    /*
     * The threads never exit, so the child will just wait here until it is killed.
     */
    fflush(stdout);
    for (i = 0; i <= ts_id; ++i)
        testutil_check(__wt_thread_join(NULL, &thr[i]));
    /*
     * NOTREACHED
     */
    free(thr);
    free(td);
    exit(EXIT_SUCCESS);
}

extern int __wt_optind;
extern char *__wt_optarg;

/*
 * Initialize a report structure. Since zero is a valid key we cannot just clear it.
 */
static void
initialize_rep(REPORT *r)
{
    r->first_key = r->first_miss = INVALID_KEY;
    r->absent_key = r->exist_key = r->last_key = INVALID_KEY;
}

/*
 * Print out information if we detect missing records in the middle of the data of a report
 * structure.
 */
static void
print_missing(REPORT *r, const char *fname, const char *msg)
{
    if (r->exist_key != INVALID_KEY)
        printf("%s: %s error %" PRIu64 " absent records %" PRIu64 "-%" PRIu64 ". Then keys %" PRIu64
               "-%" PRIu64 " exist. Key range %" PRIu64 "-%" PRIu64 "\n",
          fname, msg, (r->exist_key - r->first_miss) - 1, r->first_miss, r->exist_key - 1,
          r->exist_key, r->last_key, r->first_key, r->last_key);
}

/*
 * Signal handler to catch if the child died unexpectedly.
 */
static void
sig_handler(int sig)
{
    pid_t pid;

    WT_UNUSED(sig);
    pid = wait(NULL);
    /*
     * The core file will indicate why the child exited. Choose EINVAL here.
     */
    testutil_die(EINVAL, "Child process %" PRIu64 " abnormally exited", (uint64_t)pid);
}

int
main(int argc, char *argv[])
{
    struct sigaction sa;
    struct stat sb;
    FILE *fp;
    REPORT c_rep[MAX_TH], l_rep[MAX_TH], o_rep[MAX_TH];
    WT_CONNECTION *conn;
    WT_CURSOR *cur_coll, *cur_local, *cur_oplog;
    WT_DECL_RET;
    WT_RAND_STATE rnd;
    WT_SESSION *session;
    pid_t pid;
    uint64_t absent_coll, absent_local, absent_oplog, count, key, last_key;
    uint64_t stable_fp, stable_val;
    uint32_t i, nth, timeout;
    int ch, status;
    char buf[512], statname[1024];
    char fname[64], kname[64];
    const char *working_dir;
    bool fatal, rand_th, rand_time, verify_only;

    (void)testutil_set_progname(argv);

    compat = inmem = false;
    use_ts = true;
    /*
     * Setting this to false forces us to use internal library code. Allow an override but default
     * to using that code.
     */
    use_txn = false;
    nth = MIN_TH;
    rand_th = rand_time = true;
    timeout = MIN_TIME;
    verify_only = false;
    working_dir = "WT_TEST.schema-abort";

    while ((ch = __wt_getopt(progname, argc, argv, "Ch:mT:t:vxz")) != EOF)
        switch (ch) {
        case 'C':
            compat = true;
            break;
        case 'h':
            working_dir = __wt_optarg;
            break;
        case 'm':
            inmem = true;
            break;
        case 'T':
            rand_th = false;
            nth = (uint32_t)atoi(__wt_optarg);
            break;
        case 't':
            rand_time = false;
            timeout = (uint32_t)atoi(__wt_optarg);
            break;
        case 'v':
            verify_only = true;
            break;
        case 'x':
            use_txn = true;
            break;
        case 'z':
            use_ts = false;
            break;
        default:
            usage();
        }
    argc -= __wt_optind;
    if (argc != 0)
        usage();

    testutil_work_dir_from_path(home, sizeof(home), working_dir);
    /*
     * If the user wants to verify they need to tell us how many threads there were so we can find
     * the old record files.
     */
    if (verify_only && rand_th) {
        fprintf(stderr, "Verify option requires specifying number of threads\n");
        exit(EXIT_FAILURE);
    }
    if (!verify_only) {
        testutil_make_work_dir(home);

        __wt_random_init_seed(NULL, &rnd);
        if (rand_time) {
            timeout = __wt_random(&rnd) % MAX_TIME;
            if (timeout < MIN_TIME)
                timeout = MIN_TIME;
        }
        if (rand_th) {
            nth = __wt_random(&rnd) % MAX_TH;
            if (nth < MIN_TH)
                nth = MIN_TH;
        }

        printf("Parent: compatibility: %s, in-mem log sync: %s, timestamp in use: %s\n",
          compat ? "true" : "false", inmem ? "true" : "false", use_ts ? "true" : "false");
        printf("Parent: Create %" PRIu32 " threads; sleep %" PRIu32 " seconds\n", nth, timeout);
        printf("CONFIG: %s%s%s%s -h %s -T %" PRIu32 " -t %" PRIu32 "\n", progname,
          compat ? " -C" : "", inmem ? " -m" : "", !use_ts ? " -z" : "", working_dir, nth, timeout);
        /*
         * Fork a child to insert as many items. We will then randomly kill the child, run recovery
         * and make sure all items we wrote exist after recovery runs.
         */
        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = sig_handler;
        testutil_checksys(sigaction(SIGCHLD, &sa, NULL));
        testutil_checksys((pid = fork()) < 0);

        if (pid == 0) { /* child */
            run_workload(nth);
            return (EXIT_SUCCESS);
        }

        /* parent */
        /*
         * Sleep for the configured amount of time before killing the child. Start the timeout from
         * the time we notice that the file has been created. That allows the test to run correctly
         * on really slow machines.
         */
        testutil_check(__wt_snprintf(statname, sizeof(statname), "%s/%s", home, ckpt_file));
        while (stat(statname, &sb) != 0)
            testutil_sleep_wait(1, pid);
        sleep(timeout);
        sa.sa_handler = SIG_DFL;
        testutil_checksys(sigaction(SIGCHLD, &sa, NULL));

        /*
         * !!! It should be plenty long enough to make sure more than
         * one log file exists.  If wanted, that check would be added
         * here.
         */
        printf("Kill child\n");
        testutil_checksys(kill(pid, SIGKILL) != 0);
        testutil_checksys(waitpid(pid, &status, 0) == -1);
    }
    /*
     * !!! If we wanted to take a copy of the directory before recovery,
     * this is the place to do it. Don't do it all the time because
     * it can use a lot of disk space, which can cause test machine
     * issues.
     */
    if (chdir(home) != 0)
        testutil_die(errno, "parent chdir: %s", home);

    /* Copy the data to a separate folder for debugging purpose. */
    testutil_copy_data(home);
    printf("Open database, run recovery and verify content\n");

    /*
     * Open the connection which forces recovery to be run.
     */
    testutil_check(wiredtiger_open(NULL, &event_handler, ENV_CONFIG_REC, &conn));
    testutil_check(conn->open_session(conn, NULL, NULL, &session));
    /*
     * Open a cursor on all the tables.
     */
    testutil_check(session->open_cursor(session, uri_collection, NULL, NULL, &cur_coll));
    testutil_check(session->open_cursor(session, uri_local, NULL, NULL, &cur_local));
    testutil_check(session->open_cursor(session, uri_oplog, NULL, NULL, &cur_oplog));

    /*
     * Find the biggest stable timestamp value that was saved.
     */
    stable_val = 0;
    if (use_ts) {
        testutil_check(conn->query_timestamp(conn, buf, "get=recovery"));
        sscanf(buf, "%" SCNx64, &stable_val);
        printf("Got stable_val %" PRIu64 "\n", stable_val);
    }

    count = 0;
    absent_coll = absent_local = absent_oplog = 0;
    fatal = false;
    for (i = 0; i < nth; ++i) {
        initialize_rep(&c_rep[i]);
        initialize_rep(&l_rep[i]);
        initialize_rep(&o_rep[i]);
        testutil_check(__wt_snprintf(fname, sizeof(fname), RECORDS_FILE, i));
        if ((fp = fopen(fname, "r")) == NULL)
            testutil_die(errno, "fopen: %s", fname);

        /*
         * For every key in the saved file, verify that the key exists in the table after recovery.
         * If we're doing in-memory log buffering we never expect a record missing in the middle,
         * but records may be missing at the end. If we did write-no-sync, we expect every key to
         * have been recovered.
         */
        for (last_key = INVALID_KEY;; ++count, last_key = key) {
            ret = fscanf(fp, "%" SCNu64 "%" SCNu64 "\n", &stable_fp, &key);
            if (last_key == INVALID_KEY) {
                c_rep[i].first_key = key;
                l_rep[i].first_key = key;
                o_rep[i].first_key = key;
            }
            if (ret != EOF && ret != 2) {
                /*
                 * If we find a partial line, consider it like an EOF.
                 */
                if (ret == 1 || ret == 0)
                    break;
                testutil_die(errno, "fscanf");
            }
            if (ret == EOF)
                break;
            /*
             * If we're unlucky, the last line may be a partially written key at the end that can
             * result in a false negative error for a missing record. Detect it.
             */
            if (last_key != INVALID_KEY && key != last_key + 1) {
                printf("%s: Ignore partial record %" PRIu64 " last valid key %" PRIu64 "\n", fname,
                  key, last_key);
                break;
            }
            testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_FORMAT, key));
            cur_coll->set_key(cur_coll, kname);
            cur_local->set_key(cur_local, kname);
            cur_oplog->set_key(cur_oplog, kname);
            /*
             * The collection table should always only have the data as of the checkpoint.
             */
            if ((ret = cur_coll->search(cur_coll)) != 0) {
                if (ret != WT_NOTFOUND)
                    testutil_die(ret, "search");
                /*
                 * If we don't find a record, the stable timestamp written to our file better be
                 * larger than the saved one.
                 */
                if (!inmem && stable_fp != 0 && stable_fp <= stable_val) {
                    printf("%s: COLLECTION no record with key %" PRIu64 " record ts %" PRIu64
                           " <= stable ts %" PRIu64 "\n",
                      fname, key, stable_fp, stable_val);
                    absent_coll++;
                }
                if (c_rep[i].first_miss == INVALID_KEY)
                    c_rep[i].first_miss = key;
                c_rep[i].absent_key = key;
            } else if (c_rep[i].absent_key != INVALID_KEY && c_rep[i].exist_key == INVALID_KEY) {
                /*
                 * If we get here we found a record that exists after absent records, a hole in our
                 * data.
                 */
                c_rep[i].exist_key = key;
                fatal = true;
            } else if (!inmem && stable_fp != 0 && stable_fp > stable_val) {
                /*
                 * If we found a record, the stable timestamp written to our file better be no
                 * larger than the checkpoint one.
                 */
                printf("%s: COLLECTION record with key %" PRIu64 " record ts %" PRIu64
                       " > stable ts %" PRIu64 "\n",
                  fname, key, stable_fp, stable_val);
                fatal = true;
            }
            /*
             * The local table should always have all data.
             */
            if ((ret = cur_local->search(cur_local)) != 0) {
                if (ret != WT_NOTFOUND)
                    testutil_die(ret, "search");
                if (!inmem)
                    printf("%s: LOCAL no record with key %" PRIu64 "\n", fname, key);
                absent_local++;
                if (l_rep[i].first_miss == INVALID_KEY)
                    l_rep[i].first_miss = key;
                l_rep[i].absent_key = key;
            } else if (l_rep[i].absent_key != INVALID_KEY && l_rep[i].exist_key == INVALID_KEY) {
                /*
                 * We should never find an existing key after we have detected one missing.
                 */
                l_rep[i].exist_key = key;
                fatal = true;
            }
            /*
             * The oplog table should always have all data.
             */
            if ((ret = cur_oplog->search(cur_oplog)) != 0) {
                if (ret != WT_NOTFOUND)
                    testutil_die(ret, "search");
                if (!inmem)
                    printf("%s: OPLOG no record with key %" PRIu64 "\n", fname, key);
                absent_oplog++;
                if (o_rep[i].first_miss == INVALID_KEY)
                    o_rep[i].first_miss = key;
                o_rep[i].absent_key = key;
            } else if (o_rep[i].absent_key != INVALID_KEY && o_rep[i].exist_key == INVALID_KEY) {
                /*
                 * We should never find an existing key after we have detected one missing.
                 */
                o_rep[i].exist_key = key;
                fatal = true;
            }
        }
        c_rep[i].last_key = last_key;
        l_rep[i].last_key = last_key;
        o_rep[i].last_key = last_key;
        testutil_checksys(fclose(fp) != 0);
        print_missing(&c_rep[i], fname, "COLLECTION");
        print_missing(&l_rep[i], fname, "LOCAL");
        print_missing(&o_rep[i], fname, "OPLOG");
    }
    testutil_check(conn->close(conn, NULL));
    if (!inmem && absent_coll) {
        printf("COLLECTION: %" PRIu64 " record(s) absent from %" PRIu64 "\n", absent_coll, count);
        fatal = true;
    }
    if (!inmem && absent_local) {
        printf("LOCAL: %" PRIu64 " record(s) absent from %" PRIu64 "\n", absent_local, count);
        fatal = true;
    }
    if (!inmem && absent_oplog) {
        printf("OPLOG: %" PRIu64 " record(s) absent from %" PRIu64 "\n", absent_oplog, count);
        fatal = true;
    }
    if (fatal)
        return (EXIT_FAILURE);
    printf("%" PRIu64 " records verified\n", count);
    return (EXIT_SUCCESS);
}