#!/usr/bin/env python3

import math
import random
import string

from argparse import ArgumentParser


TOKEN_LENGTH = 9


CONFIG = {
    "table1": {
        "prefix": {
            "allow_empty": False,
            "datatype": "prefix",
            "structure": {
                "type": "primary",
            },
        },
        "base": {
            "allow_empty": False,
            "datatype": "IRI",
            "structure": {
                "type": "unique",
            },
        },
        "ontology IRI": {
            "allow_empty": True,
            "datatype": "IRI",
        },
        "version IRI": {
            "allow_empty": True,
            "datatype": "IRI",
        },
    },
    "table2": {
        "child": {
            "allow_empty": False,
            "datatype": "trimmed_line",
            "structure": {
                "type": "foreign",
                "ftable": "table4",
                "fcolumn": "other_foreign_column",
            },
        },
        "parent": {
            "allow_empty": True,
            "datatype": "trimmed_line",
            "structure": {
                "type": "tree",
                "tcolumn": "child",
            },
        },
        "xyzzy": {
            "allow_empty": True,
            "datatype": "trimmed_line",
            "structure": {
                "type": "under",
                "ttable": "table2",
                "tcolumn": "child",
                "uval": "d",
            },
        },
        "foo": {
            "allow_empty": True,
            "datatype": "integer",
            "structure": {
                "type": "foreign",
                "ftable": "table4",
                "fcolumn": "numeric_foreign_column",
            },
        },
        "bar": {
            "allow_empty": True,
            "datatype": "text",
        },
    },
    "table3": {
        "source": {
            "allow_empty": False,
            "datatype": "prefix",
            "structure": {
                "type": "foreign",
                "ftable": "table1",
                "fcolumn": "prefix",
            },
        },
        "id": {
            "allow_empty": False,
            "datatype": "curie",
            "structure": {
                "type": "unique",
            },
        },
        "label": {
            "allow_empty": False,
            "datatype": "label",
            "structure": {
                "type": "primary",
            },
        },
        "parent": {
            "allow_empty": True,
            "datatype": "label",
            "structure": {
                "type": "tree",
                "tcolumn": "label",
            },
        },
        "related": {
            "allow_empty": True,
            "datatype": "trimmed_line",
        },
    },
    "table4": {
        "foreign_column": {
            "allow_empty": False,
            "datatype": "text",
            "structure": {
                "type": "unique",
            },
        },
        "other_foreign_column": {
            "allow_empty": False,
            "datatype": "text",
            "structure": {
                "type": "unique",
            },
        },
        "numeric_foreign_column": {
            "allow_empty": False,
            "datatype": "integer",
            "structure": {
                "type": "primary",
            },
        },
    },
    "table5": {
        "foo": {
            "allow_empty": False,
            "datatype": "word",
            "structure": {
                "type": "primary",
            },
        },
        "bar": {
            "allow_empty": False,
            "datatype": "integer",
        },
    },
    "table6": {
        "child": {
            "allow_empty": False,
            "datatype": "integer",
            "structure": {
                "type": "foreign",
                "ftable": "table4",
                "fcolumn": "numeric_foreign_column",
            },
        },
        "parent": {
            "allow_empty": True,
            "datatype": "integer",
            "structure": {
                "type": "tree",
                "tcolumn": "child",
            },
        },
        "xyzzy": {
            "allow_empty": True,
            "datatype": "integer",
            "structure": {
                "type": "under",
                "ttable": "table6",
                "tcolumn": "child",
                "uval": "4",
            },
        },
        "foo": {
            "allow_empty": True,
            "datatype": "text",
        },
        "bar": {
            "allow_empty": True,
            "datatype": "integer",
        },
    },
}


def get_value_from_prev_insert(prev_inserts, from_table, from_column, to_table, to_column):
    global CONFIG

    # Note: because we are loading the tables and columns in the correct order (i.e. such that
    # all dependencies are loaded before the tables and columns they depend on), the list of
    # previous inserts for the from_table/from_column will never be empty.
    if len(prev_inserts[from_table][from_column]) == 1:
        if CONFIG[to_table][to_column]["allow_empty"]:
            return ""
        else:
            return prev_inserts[from_table][from_column][0]
    else:
        # Select at random from the last 100 inserted values:
        prev_inserts[from_table][from_column] = prev_inserts[from_table][from_column][-100:]
        from_values = prev_inserts[from_table][from_column]
        # We'd ideally like to exclude the last inserted value from consideration, but we save it
        # here in case we cannot:
        last_val = from_values[len(from_values) - 1]
        from_values = from_values[0 : len(from_values) - 1]

        to_values = set(prev_inserts[to_table][to_column])
        values_to_choose_from = [f for f in from_values if f not in to_values]

        if not values_to_choose_from:
            return last_val
        else:
            return values_to_choose_from[random.randrange(len(values_to_choose_from))]


def get_constrained_cell_value(table, column, row_num, prev_inserts):
    global TOKEN_LENGTH
    global CONFIG

    structure = CONFIG[table][column].get("structure")
    if structure and structure["type"] == "foreign":
        ftable = structure["ftable"]
        fcolumn = structure["fcolumn"]
        cell = get_value_from_prev_insert(prev_inserts, ftable, fcolumn, table, column)
    elif structure and structure["type"] == "tree":
        tcolumn = structure["tcolumn"]
        cell = get_value_from_prev_insert(prev_inserts, table, tcolumn, table, column)
    elif structure and structure["type"] == "under":
        # Note that properly satisfying the under constraint requires, not only that
        # the cell is in the specified tree column, but also (a) that the tree
        # actually exists, and (b) that the value is "under" the under value. To do
        # this properly, though, would require a decent amount of memory. So perhaps
        # it's not worth it to check for (a) and (b) and allow any offending cells
        # to generate errors which we can then verify are handled properly by valve.
        ttable = structure["ttable"]
        tcolumn = structure["tcolumn"]
        cell = get_value_from_prev_insert(prev_inserts, ttable, tcolumn, table, column)
    elif CONFIG[table][column]["datatype"] in [
        "prefix",
        "IRI",
        "trimmed_line",
        "label",
        "word",
    ]:
        cell = "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH))
    elif CONFIG[table][column]["datatype"] == "curie":
        cell = (
            "".join(random.choices(string.ascii_lowercase, k=3)).upper()
            + ":"
            + "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH))
        )
    elif CONFIG[table][column]["datatype"] == "text":
        cell = (
            "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH))
            + " "
            + "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH))
        )
    elif CONFIG[table][column]["datatype"] == "integer":
        # No leading 0s:
        cell = "".join(random.choices("123456789", k=1)) + "".join(
            random.choices(string.digits, k=TOKEN_LENGTH - 1)
        )
    else:
        print(
            f"Warning: Unknown datatype: {CONFIG[table][column]['datatype']}. "
            "Generating a random string."
        )
        cell = "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH))

    return cell


def main():
    global TOKEN_LENGTH
    global CONFIG

    parser = ArgumentParser(
        description="""
    Deterministically generate a specified amount of data, a specified percentage of which are
    errors, using a hard-coded VALVE configuration, given the specified seed, to a specified
    output directory.
    """
    )
    parser.add_argument("seed", help="The seed to use to generate the random data")
    parser.add_argument("num_rows", help="The number of rows per table to generate")
    parser.add_argument(
        "pct_errors", help="The percentage of rows in each table that should have errors"
    )
    parser.add_argument(
        "output_dir", help="The output directory to write the new table configuration to"
    )
    args = parser.parse_args()
    seed = int(args.seed)
    num_rows = int(args.num_rows)
    pct_errors = int(args.pct_errors)
    outdir = args.output_dir

    random.seed(seed)

    # This is a record of the last inserted values for each table and column. When one column
    # takes its values from another column, then we look here and fetch the last inserted value of
    # the second column.
    prev_inserts = {}
    tsv_files = {}
    tables_in_order = ["table4", "table1", "table2", "table3", "table5", "table6"]
    for table in tables_in_order:
        tsv_files[table] = open(f"{outdir}/{table}.tsv", "w")
        columns = [column for column in CONFIG[table]]
        print("\t".join(columns), file=tsv_files[table])

    num_error_rows = math.ceil((pct_errors / 100) * num_rows)
    error_proportion = None if not num_error_rows else math.floor(num_rows / num_error_rows)
    for row_num in range(1, num_rows + 1):
        for table in tables_in_order:
            is_error_row = error_proportion and row_num % error_proportion == 1
            columns = [column for column in CONFIG[table]]
            error_column = random.randrange(len(columns))
            row = {}
            for column_num, column in enumerate(columns):
                is_error_column = is_error_row and column_num == error_column
                if (
                    not is_error_column
                    and CONFIG[table][column]["allow_empty"]
                    and row_num % random.randrange(2, num_rows) == 1
                ):
                    # If the column allows empty values, assign an empty value "sometimes":
                    cell = ""
                elif not is_error_column:
                    cell = get_constrained_cell_value(table, column, row_num, prev_inserts)
                else:
                    if CONFIG[table][column].get("structure") and CONFIG[table][column][
                        "structure"
                    ]["type"] in ["unique", "primary"]:
                        cell = ""
                    elif CONFIG[table][column]["datatype"] in [
                        "prefix",
                        "IRI",
                        "word",
                        "curie",
                    ]:
                        cell = (
                            "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH))
                            + " "
                            + "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH))
                        )
                    else:
                        if CONFIG[table][column]["datatype"] == "integer":
                            cell = "".join(random.choices(string.ascii_lowercase, k=TOKEN_LENGTH))
                        else:
                            # No leading 0s:
                            cell = "".join(random.choices("123456789", k=1)) + "".join(
                                random.choices(string.digits, k=TOKEN_LENGTH - 1)
                            )

                row[column] = cell
                if not prev_inserts.get(table):
                    prev_inserts[table] = {}
                if not prev_inserts[table].get(column):
                    prev_inserts[table][column] = []
                prev_inserts[table][column].append(cell)

            row = "\t".join([row[column] for column in row])
            print(row, file=tsv_files[table])


if __name__ == "__main__":
    main()