These functions support flexible schema inspection: sdf_schema_json returns the schema in a machine-readable (JSON) form, while sdf_schema_viewer renders it in a human-friendly widget.

Usage

sdf_schema_json(x, parse_json = TRUE, simplify = FALSE,
  append_complex_type = TRUE)

sdf_schema_viewer(x, simplify = TRUE, append_complex_type = TRUE)

Arguments

x

An R object wrapping, or containing, a Spark DataFrame.

parse_json

Logical. If TRUE, the JSON returned by Spark is parsed into an R list before being returned.

simplify

Logical. If TRUE, the schema is folded into itself, such that

{"name" : "field1", "type" : {"type" : "array", "elementType" : "string", "containsNull" : true}, "nullable" : true, "metadata" : { } }

will be rendered simply as

{"field1 (array)" : "[string]"}

append_complex_type

Logical. Only consulted when parse_json = TRUE and simplify = TRUE. In that case, type indicators such as "(array)" and "(struct)" are appended to the names of array- and struct-typed fields in the return value (see the sketch after this list).
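To make simplify and append_complex_type concrete, here is a minimal sketch (jsonlite only, no Spark connection needed; "field1" is the placeholder field from the description above) contrasting the two renderings:

library(jsonlite)

# the full StructField JSON as Spark reports it
full_form <- '{"name":"field1","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}}'

# the folded rendering; the "(array)" suffix is the indicator appended
# when append_complex_type = TRUE
simplified_form <- '{"field1 (array)":"[string]"}'

str(fromJSON(full_form))        # nested list mirroring the Spark schema
str(fromJSON(simplified_form))  # a single name/value pair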

Examples

# NOT RUN {
library(testthat)
library(jsonlite)
library(sparklyr)
library(sparklyr.nested)
# a schema already rendered in simplified form (as produced with
# simplify = TRUE and append_complex_type = TRUE)
sample_json <- paste0(
  '{"aircraft_id":["string"],"phase_sequence":["string"],"phases (array)":{"start_point (struct)":',
  '{"segment_phase":["string"],"agl":["double"],"elevation":["double"],"time":["long"],',
  '"latitude":["double"],"longitude":["double"],"altitude":["double"],"course":["double"],',
  '"speed":["double"],"source_point_keys (array)":["[string]"],"primary_key":["string"]},',
  '"end_point (struct)":{"segment_phase":["string"],"agl":["double"],"elevation":["double"],',
  '"time":["long"],"latitude":["double"],"longitude":["double"],"altitude":["double"],',
  '"course":["double"],"speed":["double"],"source_point_keys (array)":["[string]"],',
  '"primary_key":["string"]},"phase":["string"],"primary_key":["string"]},"primary_key":["string"]}'
)

with_mock(
  # I am mocking functions so that the example works without a real spark connection
  spark_read_parquet = function(x, ...){return("this is a spark dataframe")},
  sdf_schema_json = function(x, ...){return(fromJSON(sample_json))},
  spark_connect = function(...){return("this is a spark connection")},

  # the meat of the example is here; the unnamed expressions below are
  # evaluated in order with the mocks above in effect
  sc <- spark_connect(),
  spark_data <- spark_read_parquet(sc, path="path/to/data/*.parquet", name="some_name"),
  sdf_schema_viewer(spark_data)
)
# }
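
Against a live Spark connection the mocks are unnecessary. A minimal sketch, assuming a local Spark installation and parquet data at a placeholder path:

sc <- spark_connect(master = "local")
spark_data <- spark_read_parquet(sc, name = "some_name", path = "path/to/data/*.parquet")
sdf_schema_json(spark_data, simplify = TRUE)  # schema as a nested R list
sdf_schema_viewer(spark_data)                 # schema rendered in the viewer widget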