"""Generate the ORC fixture file ``fixtures/pyorc/test.orc`` using pyorc."""

import os

import pyorc

# Column name -> list of column values. Every column has five rows; each
# "*_nullable" column carries a None in its third row.
data = {
    "float_nullable": [1.0, 2.0, None, 4.0, 5.0],
    "float_required": [1.0, 2.0, 3.0, 4.0, 5.0],
    "bool_nullable": [True, False, None, True, False],
    "bool_required": [True, False, True, True, False],
    "int_nullable": [5, -5, None, 5, 5],
    "int_required": [5, -5, 1, 5, 5],
    "double_nullable": [1.0, 2.0, None, 4.0, 5.0],
    "double_required": [1.0, 2.0, 3.0, 4.0, 5.0],
    "bigint_nullable": [5, -5, None, 5, 5],
    "bigint_required": [5, -5, 1, 5, 5],
    "utf8_required": ["a", "bb", "ccc", "dddd", "eeeee"],
    "utf8_nullable": ["a", "bb", None, "dddd", "eeeee"],
}

# Exact Python type -> ORC type name. The lookup is keyed on ``type(value)``
# rather than done with ``isinstance`` so that ``bool`` (a subclass of ``int``)
# maps to "boolean" and not to "int".
_ORC_TYPE_NAMES = {
    float: "float",
    int: "int",
    bool: "boolean",
    str: "string",
}


def infer_schema(data):
    """Build an ORC ``struct<...>`` schema string from a column mapping.

    The type of each column is taken from its first non-``None`` value, so a
    nullable column may start with ``None``. Columns whose name starts with
    ``"double"`` or ``"bigint"`` are declared with that ORC type instead of
    the one inferred from the Python value.

    Args:
        data: Mapping of column name to a sequence of column values.

    Returns:
        The schema string, e.g. ``"struct<a:int,b:string>"``.

    Raises:
        NotImplementedError: If a column has no non-``None`` value, or its
            sample value is not exactly a float, int, bool or str.
    """
    fields = []
    for key, values in data.items():
        # Skip leading nulls: None carries no type information.
        sample = next((item for item in values if item is not None), None)
        try:
            dt = _ORC_TYPE_NAMES[type(sample)]
        except KeyError:
            raise NotImplementedError(
                f"cannot infer an ORC type for column {key!r}"
            ) from None

        # The column-name prefix selects the wider ORC type; Python has a
        # single float type and a single int type, so the value cannot.
        if key.startswith("double"):
            dt = "double"
        if key.startswith("bigint"):
            dt = "bigint"

        fields.append(f"{key}:{dt}")

    return "struct<" + ",".join(fields) + ">"


def _write(
    data,
    file_name: str,
    compression=pyorc.CompressionKind.NONE,
    dict_key_size_threshold=0.0,
):
    """Write the columns in ``data`` to ``file_name`` as an ORC file.

    Args:
        data: Mapping of column name to a sequence of column values. The
            number of rows written is the length of the first column.
        file_name: Path of the file to create; it is opened in ``"wb"`` mode.
        compression: Forwarded to ``pyorc.Writer`` as ``compression``.
        dict_key_size_threshold: Forwarded to ``pyorc.Writer`` as
            ``dict_key_size_threshold``.
    """
    schema = infer_schema(data)
    columns = list(data.values())
    num_rows = len(columns[0])

    # The context manager guarantees the file handle is closed (and therefore
    # flushed) even if writing a row raises.
    with open(file_name, "wb") as output:
        writer = pyorc.Writer(
            output,
            schema,
            dict_key_size_threshold=dict_key_size_threshold,
            compression=compression,
        )
        for index in range(num_rows):
            # Transpose: one value from every column makes up a row tuple.
            writer.write(tuple(column[index] for column in columns))
        # Close the writer before the file handle is closed by ``with``.
        writer.close()


os.makedirs("fixtures/pyorc", exist_ok=True)
_write(data, "fixtures/pyorc/test.orc")