Parquet Lua Module
1. Overview
Lua wrapper for the parquet-cpp library allowing for Parquet file output.
2. Module
2.1. Example Usage
require "parquet"
local r1 = {
DocId = 10,
Links = {Forward = {20, 40, 60}},
Name = {
{
Language = {
{Code = "en-us", Country = "us"},
{Code = "en"}
},
Url = "http://A"
},
{
Url = "http://B"
},
{
Language = {Code = "en-gb", Country = "gb"}
}
}
}
local r2 = {
DocId = 20,
Links = {Backward = {10, 30}, Forward = 80},
Name = {Url = "http://C"}
}
local doc = parquet.schema("Document")
doc:add_column("DocId", "required", "int64")
local links = doc:add_group("Links", "optional")
links:add_column("Backward", "repeated", "int64")
links:add_column("Forward", "repeated", "int64")
local name = doc:add_group("Name", "repeated")
local language = name:add_group("Language", "repeated")
language:add_column("Code", "required", "binary")
language:add_column("Country", "optional", "binary")
name:add_column("Url", "optional", "binary")
doc:finalize()
local writer = parquet.writer("example.parquet", doc, {
created_by = "hindsight",
enable_statistics = true,
columns = {
["Name.Url"] = {compression = "gzip"}
}
})
writer:dissect_record(r1)
writer:write_rowgroup() -- writes out the first row group
-- normally one would just close at the end (writing out
-- a single row group)
writer:dissect_record(r2)
writer:close() -- writes out a second row group with the remaining record
2.2. Functions
2.2.1. schema
Creates a parquet schema for the writer.
local doc = parquet.schema("Document", hive_compatible)
Arguments
- name (string) - Parquet schema name
- hive_compatible (bool, nil/none default: false) - When true the Parquet column names are converted to snake case (alphanumeric and underscore only)
Return
- schema (userdata) or an error is thrown
2.2.2. writer
Creates a Parquet writer.
local writer = parquet.writer("foo.parquet", schema, properties)
Arguments
- filename (string) - Filename of the output
- schema (userdata) - Parquet schema
- properties (table, nil/none) - Writer properties
{
    enable_dictionary = bool,
    dictionary_pagesize_limit = int64,
    write_batch_size = int64,
    data_pagesize = int64,
    version = string, -- ("1.0", "2.0")
    created_by = string,
    encoding = string, -- ("plain", "plain_dictionary", "rle", "bit_packed",
        -- "delta_binary_packed", "delta_length_byte_array", "delta_byte_array",
        -- "rle_dictionary")
    compression = string, -- ("uncompressed", "snappy", "gzip", "lzo", "brotli")
    enable_statistics = bool,
    columns = {
        col_name1 = {
            enable_dictionary = bool,
            encoding = string,
            compression = string,
            enable_statistics = bool
        },
        ["col.nested.nameN"] = {}
    }
}
Return
- writer (userdata) or an error is thrown
2.2.3. version
Returns a string with the running version of the Parquet module.
require "parquet"
local v = parquet.version()
-- v == "0.0.5"
Arguments
- none
Return
- Semantic version string
2.3. Schema/Group Methods
2.3.1. add_group
Adds a structure to the schema.
local links = doc:add_group("Links", "optional", logical)
Arguments
- name (string)
- repetition ("required", "optional", "repeated")
- logical_type (string, nil/none default: "none") - see add_column for the full list
Return
- group (userdata) or throws an error
2.3.2. add_column
Adds a data column to the schema.
doc:add_column("DocId", "required", "int64", logical, flba_len, precision, scale)
Arguments
- name (string)
- repetition (string) - "required", "optional", "repeated"
- data_type (string) - "boolean", "int32", "int64", "int96", "float", "double", "binary", "fixed_len_byte_array"
- logical_type (string, nil/none default: "none") - "none", "utf8", "map", "list", "enum", "decimal", "date", "time_millis", "time_micros", "timestamp_millis", "timestamp_micros", "uint_8", "uint_16", "uint_32", "uint_64", "int_8", "int_16", "int_32", "int_64", "json", "bson", "interval" see: LogicalTypes (MAP_KEY_VALUE is no longer used)
- flba_len (int, nil/none) - fixed length byte array length
- precision (int, nil/none) - decimal precision
- scale (int, nil/none) - decimal scale
Return
- none
2.4. Schema Methods
2.4.1. finalize
Builds the schema structure after it has been completely defined. The schema must be finalized before it is passed to the writer. Also, once the schema is finalized it cannot be modified (an error will be thrown).
doc:finalize()
Arguments
- none
Return
- none or throws an error
2.5. Writer Methods
2.5.1. dissect_record
Dissects a record into columns based on the schema.
writer:dissect_record(record)
Arguments
- record (table) - structure matching the schema
Return
- none or throws an error if the structure does not match the schema (fields that exist in the record but are not specified in the schema are ignored)
2.5.2. dissect_message (Heka sandbox only)
Dissects a message into columns based on the schema.
writer:dissect_message()
Arguments
- none
Return
- none or throws an error if the structure does not match the schema (fields that exist in the record but are not specified in the schema are ignored)
2.5.3. write_rowgroup
Writes the currently collected data out as a row group.
writer:write_rowgroup()
Arguments
- none
Return
- none or throws an error if the write fails
2.5.4. close
Closes the writer flushing any remaining data in the rowgroup.
writer:close()
Arguments
- none
Return
- none or throws an error on failure