RumbleDB now supports user-defined array and object types both with the JSound compact syntax and the JSound verbose syntax.
RumbleDB user-defined types can be defined with the JSound syntax. A tutorial for the JSound syntax can be found here.
For now, RumbleDB only allows the definition of user-defined types for objects and arrays. User-defined atomic types and union types will follow soon. The @ (primary key) and ? (nullable) shortcuts are supported as of version 2.0.5. The behavior of nulls in absent vs. nullable fields can be tweaked in the configuration (e.g., if a null is present in an optional, non-nullable field, RumbleDB can be lenient and simply remove it instead of throwing an error).
The implementation is still experimental and bugs are to be expected; we would appreciate being informed of any you encounter.
A new type can be declared in the prolog, at the same location where you also define global variables and user-defined functions.
In the above query, although the type is defined, the query returns an object that was not validated against this type.
To validate and annotate a sequence of objects, you need to use the validate-type expression, like so:
You can use user-defined types wherever other types can appear: as type annotation for FLWOR variables or global variables, as function parameter or return types, in instance-of or treat-as expressions, etc.
You can validate larger sequences:
You can also validate, in parallel, an entire JSON Lines file, like so:
By default, fields are optional:
You can, however, make a field required by adding a ! in front of its name:
Or you can provide a default value with the equal sign:
Extra fields are rejected. The verbose JSound syntax, however, does allow extra fields (open objects); this will be supported in a future version of RumbleDB.
With the JSound compact syntax, you can easily define nested array structures:
You can even further nest objects:
Or split your definitions into several types that refer to each other:
In fact, RumbleDB will internally convert the sequence of objects to a Spark DataFrame, leading to faster execution times.
In other words, the JSound Compact Schema Syntax is perfect for defining DataFrames schema!
For advanced JSound features, such as open object types or subtypes, the verbose syntax must be used, like so:
The JSound type system, as its name indicates, is sound: a subtype can only be more restrictive than its supertype. The complete specification of both syntaxes is available in the JSound specification.
In the future, RumbleDB will support user-defined atomic types and union types via the verbose syntax.
Once you have validated your data as a DataFrame with a user-defined type, you are all set to use the RumbleDB ML machine learning library and feed your data through ML pipelines!
declare type local:my-type as {
"foo" : "string",
"bar" : "integer"
};
{ "foo" : "this is a string", "bar" : 42 }

declare type local:my-type as {
"foo" : "string",
"bar" : "integer"
};
validate type local:my-type* {
{ "foo" : "this is a string", "bar" : 42 }
}

declare type local:my-type as {
"foo" : "string",
"bar" : "integer"
};
declare function local:proj($x as local:my-type+) as string*
{
$x.foo
};
let $a as local:my-type* := validate type local:my-type* {
{ "foo" : "this is a string", "bar" : 42 }
}
return if($a instance of local:my-type*)
then local:proj($a)
else "Not an instance."

declare type local:my-type as {
"foo" : "string",
"bar" : "integer"
};
validate type local:my-type* {
{ "foo" : "this is a string", "bar" : 42 },
{ "foo" : "this is another string", "bar" : 1 },
{ "foo" : "this is yet another string", "bar" : 2 },
{ "foo" : "this is a string", "bar" : 12 },
{ "foo" : "this is a string", "bar" : 42345 },
{ "foo" : "this is a string", "bar" : 42 }
}

declare type local:my-type as {
"foo" : "string",
"bar" : "integer"
};
validate type local:my-type* {
json-lines("hdfs:///directory-file.json")
}

declare type local:my-type as {
"foo" : "string",
"bar" : "integer"
};
validate type local:my-type* {
{ "foo" : "this is a string", "bar" : 42 },
{ "bar" : 1 },
{ "foo" : "this is yet another string", "bar" : 2 },
{ "foo" : "this is a string" },
{ "foo" : "this is a string", "bar" : 42345 },
{ "foo" : "this is a string", "bar" : 42 }
}

declare type local:my-type as {
"foo" : "string",
"!bar" : "integer"
};
validate type local:my-type* {
{ "foo" : "this is a string", "bar" : 42 },
{ "bar" : 1 },
{ "foo" : "this is yet another string", "bar" : 2 },
{ "foo" : "this is a string", "bar" : 1234 },
{ "foo" : "this is a string", "bar" : 42345 },
{ "foo" : "this is a string", "bar" : 42 }
}

declare type local:my-type as {
"foo" : "string=foobar",
"!bar" : "integer"
};
validate type local:my-type* {
{ "foo" : "this is a string", "bar" : 42 },
{ "bar" : 1 },
{ "foo" : "this is yet another string", "bar" : 2 },
{ "foo" : "this is a string", "bar" : 1234 },
{ "foo" : "this is a string", "bar" : 42345 },
{ "foo" : "this is a string", "bar" : 42 }
}

declare type local:my-type as {
"foo" : "string",
"!bar" : [ "integer" ]
};
validate type local:my-type* {
{ "foo" : "this is a string", "bar" : [ 42, 1234 ] },
{ "bar" : [ 1 ] },
{ "foo" : "this is yet another string", "bar" : [ 2 ] },
{ "foo" : "this is a string", "bar" : [ ] },
{ "foo" : "this is a string", "bar" : [ 1, 2, 3, 4, 5, 6 ] },
{ "foo" : "this is a string", "bar" : [ 42 ] }
}

declare type local:my-type as {
"foo" : { "bar" : "integer" },
"!bar" : [ { "first" : "string", "last" : "string" } ]
};
validate type local:my-type* {
{
"foo" : { "bar" : 1 },
"bar" : [
{ "first" : "Albert", "last" : "Einstein" },
{ "first" : "Erwin", "last" : "Schrodinger" }
]
},
{
"foo" : { "bar" : 2 },
"bar" : [
{ "first" : "Alan", "last" : "Turing" },
{ "first" : "John", "last" : "Von Neumann" }
]
},
{
"foo" : { "bar" : 3 },
"bar" : [
]
}
}

declare type local:person as {
"first" : "string",
"last" : "string"
};
declare type local:my-type as {
"foo" : { "bar" : "integer" },
"!bar" : [ "local:person" ]
};
validate type local:my-type* {
{
"foo" : { "bar" : 1 },
"bar" : [
{ "first" : "Albert", "last" : "Einstein" },
{ "first" : "Erwin", "last" : "Schrodinger" }
]
},
{
"foo" : { "bar" : 2 },
"bar" : [
{ "first" : "Alan", "last" : "Turing" },
{ "first" : "John", "last" : "Von Neumann" }
]
},
{
"foo" : { "bar" : 3 },
"bar" : [
]
}
}

declare type local:x as jsound verbose {
"kind" : "object",
"baseType" : "object",
"content" : [
{ "name" : "foo", "type" : "integer" }
],
"closed" : false
};
declare type local:y as jsound verbose {
"kind" : "object",
"baseType" : "local:x",
"content" : [
{ "name" : "bar", "type" : "date" }
],
"closed" : true
};

We list here the most important functions supported by RumbleDB and introduce them by means of examples. Highly detailed specifications can be found in the underlying W3C standard, unless a function is marked as specific to JSON or RumbleDB, in which case its documentation can be found here. JSONiq and RumbleDB intentionally do not support builtin functions on XML nodes, NOTATION, or QNames. RumbleDB supports almost all other W3C-standardized functions; please contact us if you are still missing one.
For the sake of ease of use, all W3C standard builtin functions and JSONiq builtin functions are in the RumbleDB namespace, which is the default function namespace and does not require any prefix in front of function names.
It is recommended to put user-defined functions in the local namespace, i.e., their names should have the local: prefix (which is predefined). Otherwise, your code risks becoming incompatible with subsequent releases if new (unprefixed) builtin functions are introduced.
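For example, builtin functions such as sum are called with no prefix, while user-defined functions carry the local: prefix (a minimal sketch; the function is ours):

declare function local:discounted-price($price as double, $discount as double) as double {
  $price * (1 - $discount)
};
sum((local:discounted-price(100.0, 0.1), local:discounted-price(50.0, 0.2)))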
Fully implemented
returns (1, 2, 3) and logs it in the log-path if specified
Fully implemented
returns 2.0
Fully implemented
returns 3.0
Fully implemented
returns 2.0
Fully implemented
returns 2.0
returns 2.23
Fully implemented
Fully implemented
returns 15 as a double
returns NaN as a double
returns 15 as a double
Not implemented
## Formatting numbers
Not implemented
## Trigonometric and exponential functions
### pi
Fully implemented
returns 3.141592653589793
### exp
Fully implemented
### exp10
Fully implemented
Fully implemented
Fully implemented
Fully implemented
Fully implemented
returns 2
Fully implemented
Fully implemented
JSONiq-specific. Fully implemented
JSONiq-specific. Fully implemented
Fully implemented
Fully implemented
Fully implemented
Fully implemented
Fully implemented
Not implemented
Fully implemented
returns (84, 104, 233, 114, 232, 115, 101)
returns ()
Fully implemented
returns "अशॊक"
returns ""
Fully implemented
returns -1
Fully implemented
returns true
returns ()
Not implemented
Not implemented
Fully implemented
returns "foobarfoobar"
Fully implemented
returns "foobarfoobar"
returns "foo-bar-foobar"
Fully implemented
returns "bar"
returns "ba"
Fully implemented
Returns the length of the supplied string, or 0 if the empty sequence is supplied.
returns 3.
returns 0.
### normalize-space
Fully implemented
Normalization of spaces in a string.
returns "The wealthy curled darlings of our nation."
Fully implemented
Returns the value of the input after applying Unicode normalization.
returns the Unicode-normalized version of the input string. Normalization forms NFC, NFD, NFKC, and NFKD are supported. "FULLY-NORMALIZED", though supported, should be used with caution: the only composition exclusion characters supported are those that are uncommented in the Unicode composition exclusions table.
Fully implemented
returns "ABCD0"
Fully implemented
returns "abc!d"
Fully implemented
returns "BAr"
returns "AAA"
Fully implemented
returns true.
Fully implemented
returns true
Fully implemented
returns true.
Fully implemented
returns "foo"
returns "f"
Fully implemented
returns "bar"
returns ""
Arity 2 implemented, arity 3 is not.
Regular expression matching. The semantics of regular expressions are those of Java's Pattern class.
returns true.
returns true.
Arity 3 implemented, arity 4 is not.
Regular expression matching and replacing. The semantics of regular expressions are those of Java's Pattern class.
returns "a*cada*"
returns "abbraccaddabbra"
Arity 2 implemented, arity 3 is not.
returns ("aa", "bb", "cc", "dd")
returns ("aa", "bb", "cc", "dd")
Not implemented
Fully implemented
returns http://www.examples.com/examples
Fully implemented
returns 100%25%20organic
Not implemented
Not implemented
Fully implemented
returns true
Fully implemented
returns false
Fully implemented
returns true
returns false
Fully implemented
returns false
returns true
Fully implemented
returns 2021.
Fully implemented
returns 6.
Fully implemented
returns 17.
Fully implemented
returns 12.
Fully implemented
returns 35.
Fully implemented
returns 30.
Fully implemented
returns 2004-04-12T13:20:00+14:00
Fully implemented
returns 2021.
Fully implemented
returns 04.
Fully implemented
returns 12.
Fully implemented
returns 13.
Fully implemented
returns 20.
Fully implemented
returns 32.
Fully implemented
returns PT2H.
Fully implemented
returns 2021.
Fully implemented
returns 6.
Fully implemented
returns 4.
Fully implemented
returns -PT14H.
Fully implemented
returns 13.
Fully implemented
returns 20.
Fully implemented
returns 32.123.
Fully implemented
returns PT2H.
Fully implemented
returns 2004-04-12T03:25:15+04:05.
Fully implemented
returns 2014-03-12+04:00.
Fully implemented
returns 04:20:00-14:00.
The functions in this section accept a simplified version of the picture string, in which a variable marker accepts only:
- one of the following component specifiers: Y, M, d, D, F, H, m, s, P
- a first presentation modifier, for which the value can be:
    - Nn, for all supported component specifiers besides P
    - N, if the component specifier is P
    - a format token that indicates a numbering sequence of the following form: '0001'
- a second presentation modifier, for which the value can be t or c, which are also the default values
- a width modifier, with both minimum and maximum values
Fully implemented
returns 20-13-12-4-2004
Fully implemented
returns 12-4-2004
Fully implemented
returns 13-20-0
Not implemented
Fully implemented
Returns a boolean indicating whether the input sequence is empty.
returns false.
Fully implemented
Returns a boolean indicating whether the input sequence has at least one item.
returns true.
returns false.
This is pushed down to Spark and works on big sequences.
Fully implemented
Returns the first item of a sequence, or the empty sequence if it is empty.
returns 1.
returns ().
This is pushed down to Spark and works on big sequences.
Fully implemented
Returns all but the first item of a sequence, or the empty sequence if it is empty.
returns (2, 3, 4, 5).
returns ().
This is pushed down to Spark and works on big sequences.
Fully implemented
returns (1, 2, 3, 4, 5).
Fully implemented
returns (1, 2).
Fully implemented
returns (3, 2, 1).
Fully implemented
returns (2, 3).
Fully implemented
returns (1, 2, 3).
Fully implemented
Eliminates duplicates from a sequence of atomic items.
returns (1, 4, 3, "foo", true, 5).
This is pushed down to Spark and works on big sequences.
Fully implemented
returns 3.
returns "".
Fully implemented
returns true.
returns false.
Fully implemented
returns "a".
returns an error.
Fully implemented
returns "a".
returns an error.
Fully implemented
returns "a".
returns an error.
Fully implemented
returns 4.
Count calls are pushed down to Spark, so this works on billions of items as well:
Fully implemented
returns 2.5.
Avg calls are pushed down to Spark, so this works on billions of items as well:
Fully implemented
returns 4.
returns (1, 2, 3).
Max calls are pushed down to Spark, so this works on billions of items as well:
Fully implemented
returns 1.
returns (1, 2, 3).
Min calls are pushed down to Spark, so this works on billions of items as well:
Fully implemented
returns 10.
Sum calls are pushed down to Spark, so this works on billions of items as well:
Fully implemented
Returns the corresponding document node
Not implemented
Fully implemented
Serializes the supplied input sequence, returning the serialized representation of the sequence as a string
returns { "hello" : "world" }
Fully implemented
returns 5
Fully implemented
returns 10
returns 10
Fully implemented
returns 2020-02-26T11:22:48.423+01:00
Fully implemented
returns 2020-02-26+01:00 (in the Europe/Zurich time zone)
Fully implemented
returns 11:24:10.064+01:00
Fully implemented
returns PT1H.
Fully implemented
returns http://www.w3.org/2005/xpath-functions/collation/codepoint.
Not implemented
Not implemented
Not implemented
Not implemented
Not implemented
Not implemented
Not implemented
Not implemented
Fully implemented
returns ("foo", "bar"). Also works on an input sequence, eliminating duplicates
Keys calls are pushed down to Spark, so this works on billions of items as well:
Fully implemented
This function returns the members of an array, but not recursively, i.e., nested arrays are not unboxed.
Returns the first 100 integers as a sequence. Also works on an input sequence, in a distributive way.
Fully implemented
Returns a JSON null (also available as the literal null).
Fully implemented
Fully implemented
returns 100. Also works if the empty sequence is supplied, in which case it returns the empty sequence.
Fully implemented
returns
Fully implemented
returns
Fully implemented
returns
Fully implemented
returns
Fully implemented
Unboxes arrays recursively, stopping the recursion when any other item is reached (object or atomic). Also works on an input sequence, in a distributive way.
Returns (1, 2, 3, 4, 5, 6, 7, 8, 9).
Fully implemented
returns
Fully implemented
returns the object {"foo" : "bar", "bar" : "foobar"}. Also works on an input sequence, in a distributive way.
Fully implemented
returns the object {"foobar" : "foo"}. Also works on an input sequence, in a distributive way.
Fully implemented
returns ("bar", "foobar"). Also works on an input sequence, in a distributive way.
Values calls are pushed down to Spark, so this works on billions of items as well:
Not implemented
Not implemented
returns the (unique) JSON value parsed from a local JSON (but not necessarily JSON Lines) file where this value may be spread over multiple lines.
trace(1 to 3)
abs(-2)
ceiling(2.3)
floor(2.3)
round(2.3)
round(2.2345, 2)
round-half-to-even(2.2345, 2), round-half-to-even(2.2345)
number("15")
number("foo")
number(15)
pi()
exp(10)
exp10(10)
log(100)
log10(100)
pow(10, 2)
sqrt(4)
sin(pi())
cos(pi())
cosh(pi())
sinh(pi())
tan(pi())
asin(1)
acos(1)
atan(1)
atan2(1)
string-to-codepoints("Thérèse")
string-to-codepoints("")
codepoints-to-string((2309, 2358, 2378, 2325))
codepoints-to-string(())
compare("aa", "bb")
codepoint-equal("abcd", "abcd")
codepoint-equal("", ())
concat("foo", "bar", "foobar")
string-join(("foo", "bar", "foobar"))
string-join(("foo", "bar", "foobar"), "-")
substring("foobar", 4)
substring("foobar", 4, 2)
string-length("foo")
string-length(())
normalize-space(" The wealthy curled darlings of our nation. ")
normalize-unicode("hello world", "NFC")
upper-case("abCd0")
lower-case("ABc!D")
translate("bar","abc","ABC")
translate("--aaa--","abc-","ABC")
contains("foobar", "ob")
starts-with("foobar", "foo")
ends-with("foobar", "bar")
substring-before("foobar", "bar")
substring-before("foobar", "o")
substring-after("foobar", "foo")
substring-after("foobar", "r")
matches("foobar", "o+")
matches("foobar", "^fo+.*")
replace("abracadabra", "bra", "*")
replace("abracadabra", "a(.)", "a$1$1")
tokenize("aa bb cc dd")
tokenize("aa;bb;cc;dd", ";")
string(resolve-uri("examples","http://www.examples.com/"))
encode-for-uri("100% organic")
fn:true()
fn:false()
boolean(9)
boolean("")
not(9)
not("")
years-from-duration(duration("P2021Y6M"))
months-from-duration(duration("P2021Y6M"))
days-from-duration(duration("P2021Y6M17D"))
hours-from-duration(duration("P2021Y6M17DT12H35M30S"))
minutes-from-duration(duration("P2021Y6M17DT12H35M30S"))
seconds-from-duration(duration("P2021Y6M17DT12H35M30S"))
dateTime("2004-04-12T13:20:00+14:00")
year-from-dateTime(dateTime("2021-04-12T13:20:32.123+02:00"))
month-from-dateTime(dateTime("2021-04-12T13:20:32.123+02:00"))
day-from-dateTime(dateTime("2021-04-12T13:20:32.123+02:00"))
hours-from-dateTime(dateTime("2021-04-12T13:20:32.123+02:00"))
minutes-from-dateTime(dateTime("2021-04-12T13:20:32.123+02:00"))
seconds-from-dateTime(dateTime("2021-04-12T13:20:32.123+02:00"))
timezone-from-dateTime(dateTime("2021-04-12T13:20:32.123+02:00"))
year-from-date(date("2021-06-04"))
month-from-date(date("2021-06-04"))
day-from-date(date("2021-06-04"))
timezone-from-date(date("2021-06-04-14:00"))
hours-from-time(time("13:20:32.123+02:00"))
minutes-from-time(time("13:20:32.123+02:00"))
seconds-from-time(time("13:20:32.123+02:00"))
timezone-from-time(time("13:20:32.123+02:00"))
adjust-dateTime-to-timezone(dateTime("2004-04-12T13:20:15+14:00"), dayTimeDuration("PT4H5M"))
adjust-date-to-timezone(date("2014-03-12"), dayTimeDuration("PT4H"))
adjust-time-to-timezone(time("13:20:00-05:00"), dayTimeDuration("-PT14H"))
format-dateTime(dateTime("2004-04-12T13:20:00"), "[m]-[H]-[D]-[M]-[Y]")
format-date(date("2004-04-12"), "[D]-[M]-[Y]")
format-time(time("13:20:00"), "[H]-[m]-[s]")
empty(1 to 10)
exists(1 to 10)
exists(())
exists(json-lines("file.json"))
head(1 to 10)
head(())
head(json-lines("file.json"))
tail(1 to 5)
tail(())
tail(json-lines("file.json"))
insert-before((3, 4, 5), 0, (1, 2))
remove((1, 2, 10), 3)
remove((1, 2, 3))
subsequence((1, 2, 3), 2, 5)
unordered((1, 2, 3))
distinct-values((1, 1, 4, 3, 1, 1, "foo", 4, "foo", true, 3, 1, true, 5, 3, 1, 1))
distinct-values(json-lines("file.json").foo)
distinct-values(text-file("file.txt"))
index-of((10, 20, 30, 40), 30)
index-of((10, 20, 30, 40), 35)
deep-equal((10, 20, "a"), (10, 20, "a"))
deep-equal(("b", "0"), ("b", 0))
zero-or-one(("a"))
zero-or-one(("a", "b"))
one-or-more(("a"))
one-or-more(())
exactly-one(("a"))
exactly-one(("a", "b"))

let $x := (1, 2, 3, 4)
return count($x)

count(json-lines("file.json"))

count(
for $i in json-lines("file.json")
where $i.foo eq "bar"
return $i
)

let $x := (1, 2, 3, 4)
return avg($x)

avg(json-lines("file.json").foo)

let $x := (1, 2, 3, 4)
return max($x)

for $i in 1 to 3
return max($i)

max(json-lines("file.json").foo)

let $x := (1, 2, 3, 4)
return min($x)

for $i in 1 to 3
return min($i)

min(json-lines("file.json").foo)

let $x := (1, 2, 3, 4)
return sum($x)

sum(json-lines("file.json").foo)

doc("path/to/file.xml")
serialize({hello: "world"})
(1 to 10)[position() eq 5]
(1 to 10)[position() eq last()]
(1 to 10)[last()]
current-dateTime()
current-date()
current-time()
implicit-timezone()
default-collation()
keys({"foo" : "bar", "bar" : "foobar"})
keys(({"foo" : "bar", "bar" : "foobar"}, {"foo": "bar2"}))
keys(json-lines("file.json"))
members([1 to 100])
members(([1 to 100], [ 300 to 1000 ]))
null()
size([1 to 100])
size(())

accumulate(({ "b" : 2 }, { "c" : 3 }, { "b" : [1, "abc"] }, {"c" : {"d" : 0.17}}))
returns { "b" : [ 2, [ 1, "abc" ] ], "c" : [ 3, { "d" : 0.17 } ] }

descendant-arrays(([0, "x", { "a" : [1, {"b" : 2}, [2.5]], "o" : {"c" : 3} }]))
returns
[ 0, "x", { "a" : [ 1, { "b" : 2 }, [ 2.5 ] ], "o" : {"c" : 3} } ]
[ 1, { "b" : 2 }, [ 2.5 ] ]
[ 2.5 ]

descendant-objects(([0, "x", { "a" : [1, {"b" : 2}, [2.5]], "o" : {"c" : 3} }]))
returns
{ "a" : [ 1, { "b" : 2 }, [ 2.5 ] ], "o" : { "c" : 3 } }
{ "b" : 2 }
{ "c" : 3 }

descendant-pairs(({ "a" : [1, {"b" : 2}], "d" : {"c" : 3} }))
returns
{ "a" : [ 1, { "b" : 2 } ] }
{ "b" : 2 }
{ "d" : { "c" : 3 } }
{ "c" : 3 }

flatten(([1, 2], [[3, 4], [5, 6]], [7, [8, 9]]))

intersect(({"a" : "abc", "b" : 2, "c" : [1, 2], "d" : "0"}, { "a" : 2, "b" : "ab", "c" : "foo" }))
returns { "a" : [ "abc", 2 ], "b" : [ 2, "ab" ], "c" : [ [ 1, 2 ], "foo" ] }

project({"foo" : "bar", "bar" : "foobar", "foobar" : "foo" }, ("foo", "bar"))
project(({"foo" : "bar", "bar" : "foobar", "foobar" : "foo" }, {"foo": "bar2"}), ("foo", "bar"))
remove-keys({"foo" : "bar", "bar" : "foobar", "foobar" : "foo" }, ("foo", "bar"))
remove-keys(({"foo" : "bar", "bar" : "foobar", "foobar" : "foo" }, {"foo": "bar2"}), ("foo", "bar"))
values({"foo" : "bar", "bar" : "foobar"})
values(({"foo" : "bar", "bar" : "foobar"}, {"foo" : "bar2"}))
values(json-lines("file.json"))
json-doc("/Users/sheldon/object.json")

RumbleDB relies on the JSONiq language.
The complete specification can be found here and on the JSONiq.org website. The implementation is now at a very advanced stage, and only a few core JSONiq features remain unsupported.
A tutorial can be found here. All queries in this tutorial will work with RumbleDB.
A tutorial aimed at Python users can be found here. Please keep in mind, though, that examples using unsupported features may not work (see below).
FLWOR expressions now support nesting, for example like so:
However, keep in mind that parallelization cannot be nested in Spark (there cannot be a job within a job), that is, the following will not work:
Many expressions are pushed down to Spark out of the box. For example, this will work on a large file leveraging the parallelism of Spark:
What is pushed down so far is:
FLWOR expressions (as soon as a for clause is encountered, binding a variable to a sequence generated with json-lines() or parallelize())
aggregation functions such as count
JSON navigation expressions: object lookup (as well as keys() call), array lookup, array unboxing, filtering predicates
predicates on positions, including use of the context-dependent functions position() and last(), e.g.,
type checking (instance of, treat as)
many builtin function calls (head, tail, exists, etc.)
More expressions working on sequences will be pushed down in the future, prioritized on the feedback we receive.
We also started to push down some expressions to DataFrames and Spark SQL (obtained via structured-json-lines, csv-file and parquet-file calls). In particular, keys() pushes down the schema lookup if used on parquet-file() and structured-json-lines(). Likewise, count() as well as object lookup, array unboxing and array lookup is also pushed down on DataFrames.
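For example, the following schema lookup is answered from the Parquet schema rather than by scanning the data (the file name is hypothetical):

keys(parquet-file("hdfs:///directory/dataset.parquet"))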
When an expression does not support pushdown, it will materialize automatically. To avoid issues, the materialization is capped by default at 200 items, but this can be changed on the command line with --materialization-cap. A warning is issued if a materialization happened and the sequence was truncated on screen. An error is thrown if this happens within a query.
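For example, to raise the cap to one million items when running a query (the file name is ours):

spark-submit rumbledb.jar run query.jq --materialization-cap 1000000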
Prologs with user-defined functions and global variables are supported. Global external variables are supported (use "--variable:foo bar" on the command line to assign values to them). If the declared type is not string, then the literal supplied on the command line is cast. If the declared type is anyURI, the path supplied on the command line is also resolved against the working directory to an absolute URI. Thus, anyURI should be used to supply paths dynamically through an external variable.
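As a minimal sketch (the names are ours), the following query reads its input path from an external variable; invoking it with --variable:input file.json resolves the path against the working directory because the declared type is anyURI:

declare variable $input as anyURI external;
count(json-lines($input))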
Context item declarations are supported and a global context item value can be passed with the "--context-item" or "-I" parameter on the command line.
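As a minimal sketch, with --context-item hello on the command line, the following query returns "HELLO":

declare context item external;
upper-case($$)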
Library modules are now supported (experimental, please report bugs), and their namespace URI is used for resolution. If it is relative, it is resolved against the importing module location.
The same schemes are supported as for reading queries and data: file, hdfs, and so on. HTTP is also supported: you can import modules from the Web!
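For instance, a library module hosted on the Web can be imported like so (the URL is hypothetical):

import module namespace m = "http://www.example.com/modules/library-module.jq";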
Example of library module (the file name is library-module.jq):
Example of importing module (assuming it is in the same directory):
Try/catch expressions are supported. Error codes are in the default, RumbleDB namespace and do not need prefixes.
The JSONiq type system is fully supported. Below is a complete list of JSONiq types and their support status. All builtin types are in the default type namespace, so that no prefix is needed. These types are defined in the XML Schema standard. Note that some types specific to XML (e.g., NOTATION, NMTOKENS, NMTOKEN, ID, IDREF, ENTITY, etc) are not part of the JSONiq standard and not supported by RumbleDB.
Most core features of JSONiq are now in place, and we are working on getting the last (less used) ones into RumbleDB as well. We prioritize their implementation on user requests.
Some prolog settings (base URI, ordering mode, decimal format, namespace declarations) are not supported yet.
Location hints for the resolution of modules are not supported yet.
Window clauses are not supported, because they are not compatible with the Spark execution model.
Function type syntax is supported.
Function annotations are not supported (%public, %private...), but this is planned.
Most JSONiq and XQuery builtin functions are now supported (see function documentation), except XML-specific functions. A few are still missing, do not hesitate to reach out if you need them.
Constructors for atomic types are fully supported.
Builtin functions cannot yet be used with named function reference expressions (example: concat#2).
Error variables ($err:code, ...) for inside catch blocks are not supported.
There are future plans to support JSONiq updates and scripting.
| Type | Status |
|---|---|
| atomic | JSONiq 1.0 only |
| anyAtomicType | supported |
| anyURI | supported |
| base64Binary | supported |
| boolean | supported |
| byte | supported |
| date | supported |
| dateTime | supported |
| dateTimeStamp | supported |
| dayTimeDuration | supported |
| decimal | supported |
| double | supported |
| duration | supported |
| float | supported |
| gDay | supported |
| gMonth | supported |
| gYear | supported |
| gYearMonth | supported |
| hexBinary | supported |
| int | supported |
| integer | supported |
| long | supported |
| negativeInteger | supported |
| nonPositiveInteger | supported |
| nonNegativeInteger | supported |
| numeric | supported |
| positiveInteger | supported |
| short | supported |
| string | supported |
| time | supported |
| unsignedByte | supported |
| unsignedInt | supported |
| unsignedLong | supported |
| unsignedShort | supported |
| yearMonthDuration | supported |
let $x := for $x in json-lines("file.json")
where $x.field eq "foo"
return $x
return count($x)

for $x in json-lines("file1.json")
let $z := for $y in json-lines("file2.json")
where $y.foo eq $x.fbar
return $y
return count($z)

count(json-lines("file.json")[$$.field eq "foo"].bar[].foo[[1]])

json-lines("file.json")[position() ge 10 and position() le last() - 2]

module namespace m = "library-module.jq";
declare variable $m:x := 2;
declare function m:func($v) {
$m:x + $v
};

import module namespace mod = "library-module.jq";
mod:func($mod:x)

try { 1 div 0 } catch FOAR0001 { "Division by zero!" }

The parameters that can be used on the command line as well as on the planned HTTP server are shown below. They are also accessible via the Java API and via Python through the RumbleRuntimeConfiguration class.
RumbleDB runs in three modes. You can select the mode by passing a verb as the first parameter. For example:

spark-submit rumbledb.jar run file.jq -o output-dir -P 1
spark-submit rumbledb.jar run -q '1+1'
spark-submit rumbledb.jar serve -p 8001
spark-submit rumbledb.jar repl -c 10

The previous parameters (--shell, --query-path, --server) continue to work in a backward-compatible fashion; however, we recommend switching to the new verb-based format.

| Parameter | Shorthand or verb | HTTP parameter | Example values | Description |
|---|---|---|---|---|
| --shell | repl | N/A | yes, no | yes runs the interactive shell. no executes a query specified with --query-path. |
| --shell-filter | N/A | N/A | jq . | Post-processes the output of JSONiq queries on the shell with the specified command (reading the RumbleDB output via stdin). |
| --query | -q | query | 1+1 | A JSONiq query directly provided as a string. |
| --query-path | (any text without -- or - is recognized as a query path) | query-path | file:///folder/file.jq | A JSONiq query file to read from (from any file system, even the Web!). |
| --output-path | -o | output-path | file:///folder/output | Where to output to (if the output is large, it will create a sharded directory, otherwise it will create a file). |
| --output-format | -f | N/A | json, csv, avro, parquet, or any other format supported by Spark | An output format to use for the output. Formats other than json can only be output if the query outputs a highly structured sequence of objects (you can nest your query in an annotate() call to specify a schema if it does not). |
| --output-format-option:foo | N/A | N/A | bar | Options to further specify the output format (example: separator character for CSV, compression format...). |
| --overwrite | -O (meaning --overwrite yes) | overwrite | yes, no | Whether to overwrite to --output-path. no throws an error if the output file/folder exists. |
| --materialization-cap | -c | materialization-cap | 100000 | A cap on the maximum number of items to materialize during the query execution for large sequences within a query, for example, when nesting an expression producing a large sequence of items (that RumbleDB chose to physically store as an RDD or DataFrame) into an array constructor. |
| --result-size | N/A | result-size | 10 | A cap on the maximum number of items to output on the screen or to a local list. |
| --number-of-output-partitions | -P | N/A | ad hoc | How many partitions to create in the output, i.e., the number of files that will be created in the output path directory. |
| --log-path | N/A | log-path | file:///folder/log.txt | Where to output log information. |
| --print-iterator-tree | N/A | N/A | yes, no | For debugging purposes, prints out the expression tree and runtime iterator tree. |
| --show-error-info | -v (meaning --show-error-info yes) | show-error-info | yes, no | For debugging purposes. If you want to report a bug, you can use this to get the full exception stack. If no, only a short message is shown in case of error. |
| --static-typing | -t (meaning --static-typing yes) | static-typing | yes, no | Activates static type analysis, which annotates the expression tree with inferred types at compile time and enables more optimizations (experimental). Deactivated by default. |
| --server | serve | N/A | yes, no | yes runs RumbleDB as a server on port 8001. Run queries with http://localhost:8001/jsoniq?query-path=/folder/foo.json |
| --port | -p | N/A | 8001 (default) | Changes the port of the RumbleDB HTTP server to any of your liking. |
| --host | -h | N/A | localhost (default) | Changes the host of the RumbleDB HTTP server to any of your liking. |
| --variable:foo | N/A | variable:foo | bar | --variable:foo bar initializes the global variable $foo to "bar". The query must contain the corresponding global variable declaration, e.g., "declare variable $foo external;". |
| --context-item | -I | context-item | bar | Initializes the global context item $$ to "bar". The query must contain the corresponding declaration, e.g., "declare context item external;". |
| --context-item-input | -i | context-item-input | - | Reads the context item value from the standard input. |
| --context-item-input-format | N/A | context-item-input-format | text or json | Sets the input format to use for parsing the standard input (as text or as a serialized JSON value). |
| --dates-with-timezone | N/A | dates-with-timezone | yes or no | Activates timezone support for the type xs:date (deactivated by default). |
| --lax-json-null-validation | N/A | lax-json-null-validation | yes or no | Allows conflating JSON nulls with absent values when validating nillable object fields, for more flexibility (activated by default). |
| --optimize-general-comparison-to-value-comparison | N/A | optimize-general-comparison-to-value-comparison | yes or no | Activates automatic conversion of general comparisons to value comparisons when applicable (activated by default). |
| --function-inlining | N/A | function-inlining | yes or no | Activates function inlining for non-recursive functions (activated by default). |
| --parallel-execution | N/A | parallel-execution | yes or no | Activates parallel execution when possible (activated by default). |
| --native-execution | N/A | native-execution | yes or no | Activates native (Spark SQL) execution when possible (activated by default). |
| --default-language | N/A | N/A | jsoniq10, jsoniq31, xquery31 | Specifies the query language to be used. |
| --optimize-steps | N/A | N/A | yes or no | Allows RumbleDB to optimize steps; might violate stability of document order (activated by default). |
| --optimize-steps-experimental | N/A | N/A | yes or no | Experimentally optimizes steps further by skipping uniqueness checks and sorting in some cases. Correctness is not yet verified (disabled by default). |
| --optimize-parent-pointers | N/A | N/A | yes or no | Allows RumbleDB to remove parent pointers from items if no steps requiring parent pointers are detected statically (activated by default). |
| --static-base-uri | N/A | N/A | "../data/" | Sets the static base URI for the execution. This option overrides the module location but is itself overridden by a declaration inside the query. |
RumbleDB ML is a Machine Learning library built on top of the RumbleDB engine that makes it more productive and easier to perform ML tasks thanks to the abstraction layer provided by JSONiq.
The machine learning capabilities are exposed through JSONiq function items. The concepts of "estimator" and "transformer", which are core to Machine Learning, are naturally function items and fit seamlessly in the JSONiq data model.
Training sets, test sets, and validation sets, which contain features and labels, are exposed through JSONiq sequences of object items: the keys of these objects are the features and labels.
The names of the estimators and of the transformers, as well as the functionality they encapsulate, are directly inherited from the SparkML library which RumbleDB ML is based on: we chose not to reinvent the wheel.
A transformer is a function item that maps a sequence of objects to a sequence of objects.
It is an abstraction that either performs a feature transformation or generates predictions based on trained models. For example:
Tokenizer is a feature transformer that receives textual input data and splits it into individual terms (usually words), which are called tokens.
KMeansModel is a trained model and a transformer that can read a dataset containing features and generate predictions as its output.
An estimator is a function item that maps a sequence of objects to a transformer (yes, you got it right: that's a function item returned by a function item. This is why they are also called higher-order functions!).
Estimators abstract the concept of a Machine Learning algorithm or any algorithm that fits or trains on data. For example, a learning algorithm such as KMeans is implemented as an Estimator. Calling this estimator on data essentially trains a KMeansModel, which is a Model and hence a Transformer.
Transformers and estimators are function items in the RumbleDB Data Model. Their first argument is the sequence of objects that represents, for example, the training set or test set. Parameters can be provided as their second argument, which is expected to be an object item; the machine learning parameters form the key-value pairs of this object.
RumbleDB ML works on highly structured data, because it requires full type information for all the fields in the training set or test set. It is on our development plan to automate the detection of these types when the sequence of objects gets created on the fly.
RumbleDB supports a user-defined type system with which you can validate and annotate datasets against a JSound schema.
This annotation must be applied to any dataset used as input to RumbleDB ML, but it is superfluous if the data was read directly from a structured input format such as Parquet, CSV, Avro, SVM, or ROOT.
Tokenizer Example:
KMeans Example:
declare type local:id-and-sentence as {
"id": "integer",
"sentence": "string"
};
let $local-data := (
{"id": 1, "sentence": "Hi I heard about Spark"},
{"id": 2, "sentence": "I wish Java could use case classes"},
{"id": 3, "sentence": "Logistic regression models are neat"}
)
let $df-data := validate type local:id-and-sentence* { $local-data }
let $transformer := get-transformer("Tokenizer")
for $i in $transformer(
$df-data,
{"inputCol": "sentence", "outputCol": "output"}
)
return $i
// returns
// { "id" : 1, "sentence" : "Hi I heard about Spark", "output" : [ "hi", "i", "heard", "about", "spark" ] }
// { "id" : 2, "sentence" : "I wish Java could use case classes", "output" : [ "i", "wish", "java", "could", "use", "case", "classes" ] }
// { "id" : 3, "sentence" : "Logistic regression models are neat", "output" : [ "logistic", "regression", "models", "are", "neat" ] }

declare type local:col-1-2-3 as {
"id": "integer",
"col1": "decimal",
"col2": "decimal",
"col3": "decimal"
};
let $vector-assembler := get-transformer("VectorAssembler")(
?,
{ "inputCols" : [ "col1", "col2", "col3" ], "outputCol" : "features" }
)
let $local-data := (
{"id": 0, "col1": 0.0, "col2": 0.0, "col3": 0.0},
{"id": 1, "col1": 0.1, "col2": 0.1, "col3": 0.1},
{"id": 2, "col1": 0.2, "col2": 0.2, "col3": 0.2},
{"id": 3, "col1": 9.0, "col2": 9.0, "col3": 9.0},
{"id": 4, "col1": 9.1, "col2": 9.1, "col3": 9.1},
{"id": 5, "col1": 9.2, "col2": 9.2, "col3": 9.2}
)
let $df-data := validate type local:col-1-2-3* {$local-data }
let $df-data := $vector-assembler($df-data)
let $est := get-estimator("KMeans")
let $tra := $est(
$df-data,
{"featuresCol": "features"}
)
for $i in $tra(
$df-data,
{"featuresCol": "features"}
)
return $i
// returns
// { "id" : 0, "col1" : 0, "col2" : 0, "col3" : 0, "prediction" : 0 }
// { "id" : 1, "col1" : 0.1, "col2" : 0.1, "col3" : 0.1, "prediction" : 0 }
// { "id" : 2, "col1" : 0.2, "col2" : 0.2, "col3" : 0.2, "prediction" : 0 }
// { "id" : 3, "col1" : 9, "col2" : 9, "col3" : 9, "prediction" : 1 }
// { "id" : 4, "col1" : 9.1, "col2" : 9.1, "col3" : 9.1, "prediction" : 1 }
// { "id" : 5, "col1" : 9.2, "col2" : 9.2, "col3" : 9.2, "prediction" : 1 }

- aggregationDepth: integer
- censorCol: string
- featuresCol: string
- fitIntercept: boolean
- labelCol: string
- maxIter: integer
- predictionCol: string
- quantileProbabilities: array (of double)
- quantilesCol: string
- tol: double

- alpha: double
- checkpointInterval: integer
- coldStartStrategy: string
- finalStorageLevel: string
- implicitPrefs: boolean
- intermediateStorageLevel: string
- itemCol: string
- maxIter: integer
- nonnegative: boolean
- numBlocks: integer
- numItemBlocks: integer
- numUserBlocks: integer
- predictionCol: string
- rank: integer
- ratingCol: string
- regParam: double
- seed: double
- userCol: string

- distanceMeasure: string
- featuresCol: string
- k: integer
- maxIter: integer
- minDivisibleClusterSize: double
- predictionCol: string
- seed: double

- bucketLength: double
- inputCol: string
- numHashTables: integer
- outputCol: string
- seed: double

- fdr: double
- featuresCol: string
- fpr: double
- fwe: double
- labelCol: string
- numTopFeatures: integer
- outputCol: string
- percentile: double
- selectorType: string

- binary: boolean
- inputCol: string
- maxDF: double
- minDF: double
- minTF: double
- outputCol: string
- vocabSize: integer

- collectSubModels: boolean
- estimator: estimator (i.e., function(object*, object) as function(object*, object) as object*)
- numFolds: integer
- parallelism: integer
- seed: double

- cacheNodeIds: boolean
- checkpointInterval: integer
- featuresCol: string
- impurity: string
- labelCol: string
- maxBins: integer
- maxDepth: integer
- maxMemoryInMB: integer
- minInfoGain: double
- minInstancesPerNode: integer
- predictionCol: string
- probabilityCol: string
- rawPredictionCol: string
- seed: double
- thresholds: array (of double)

- cacheNodeIds: boolean
- checkpointInterval: integer
- featuresCol: string
- impurity: string
- labelCol: string
- maxBins: integer
- maxDepth: integer
- maxMemoryInMB: integer
- minInfoGain: double
- minInstancesPerNode: integer
- predictionCol: string
- seed: double
- varianceCol: string

- itemsCol: string
- minConfidence: double
- minSupport: double
- numPartitions: integer
- predictionCol: string

- cacheNodeIds: boolean
- checkpointInterval: integer
- featuresCol: string
- featureSubsetStrategy: string
- impurity: string
- labelCol: string
- lossType: string
- maxBins: integer
- maxDepth: integer
- maxIter: integer
- maxMemoryInMB: integer
- minInfoGain: double
- minInstancesPerNode: integer
- predictionCol: string
- probabilityCol: string
- rawPredictionCol: string
- seed: double
- stepSize: double
- subsamplingRate: double
- thresholds: array (of double)
- validationIndicatorCol: string

- cacheNodeIds: boolean
- checkpointInterval: integer
- featuresCol: string
- featureSubsetStrategy: string
- impurity: string
- labelCol: string
- lossType: string
- maxBins: integer
- maxDepth: integer
- maxIter: integer
- maxMemoryInMB: integer
- minInfoGain: double
- minInstancesPerNode: integer
- predictionCol: string
- seed: double
- stepSize: double
- subsamplingRate: double
- validationIndicatorCol: string

- featuresCol: string
- k: integer
- maxIter: integer
- predictionCol: string
- probabilityCol: string
- seed: double
- tol: double

- family: string
- featuresCol: string
- fitIntercept: boolean
- labelCol: string
- link: string
- linkPower: double
- linkPredictionCol: string
- maxIter: integer
- offsetCol: string
- predictionCol: string
- regParam: double
- solver: string
- tol: double
- variancePower: double
- weightCol: string

- inputCol: string
- minDocFreq: integer
- outputCol: string

- inputCols: array (of string)
- missingValue: double
- outputCols: array (of string)
- strategy: string

- featureIndex: integer
- featuresCol: string
- isotonic: boolean
- labelCol: string
- predictionCol: string
- weightCol: string

- distanceMeasure: string
- featuresCol: string
- initMode: string
- initSteps: integer
- k: integer
- maxIter: integer
- predictionCol: string
- seed: double
- tol: double

- checkpointInterval: integer
- docConcentration: double
- docConcentration: array (of double)
- featuresCol: string
- k: integer
- keepLastCheckpoint: boolean
- learningDecay: double
- learningOffset: double
- maxIter: integer
- optimizeDocConcentration: boolean
- optimizer: string
- seed: double
- subsamplingRate: double
- topicConcentration: double
- topicDistributionCol: string

- aggregationDepth: integer
- elasticNetParam: double
- epsilon: double
- featuresCol: string
- fitIntercept: boolean
- labelCol: string
- loss: string
- maxIter: integer
- predictionCol: string
- regParam: double
- solver: string
- standardization: boolean
- tol: double
- weightCol: string

- aggregationDepth: integer
- featuresCol: string
- fitIntercept: boolean
- labelCol: string
- maxIter: integer
- predictionCol: string
- rawPredictionCol: string
- regParam: double
- standardization: boolean
- threshold: double
- tol: double
- weightCol: string

- aggregationDepth: integer
- elasticNetParam: double
- family: string
- featuresCol: string
- fitIntercept: boolean
- labelCol: string
- lowerBoundsOnCoefficients: object (of object of double)
- lowerBoundsOnIntercepts: object (of double)
- maxIter: integer
- predictionCol: string
- probabilityCol: string
- rawPredictionCol: string
- regParam: double
- standardization: boolean
- threshold: double
- thresholds: array (of double)
- tol: double
- upperBoundsOnCoefficients: object (of object of double)
- upperBoundsOnIntercepts: object (of double)
- weightCol: string

- inputCol: string
- outputCol: string

- inputCol: string
- numHashTables: integer
- outputCol: string
- seed: double

- inputCol: string
- max: double
- min: double
- outputCol: string

- blockSize: integer
- featuresCol: string
- initialWeights: object (of double)
- labelCol: string
- layers: array (of integer)
- maxIter: integer
- predictionCol: string
- probabilityCol: string
- rawPredictionCol: string
- seed: double
- solver: string
- stepSize: double
- thresholds: array (of double)
- tol: double

- featuresCol: string
- labelCol: string
- modelType: string
- predictionCol: string
- probabilityCol: string
- rawPredictionCol: string
- smoothing: double
- thresholds: array (of double)
- weightCol: string

- dropLast: boolean
- handleInvalid: string
- inputCols: array (of string)
- outputCols: array (of string)

- featuresCol: string
- labelCol: string
- parallelism: integer
- predictionCol: string
- rawPredictionCol: string
- weightCol: string

- inputCol: string
- k: integer
- outputCol: string

- handleInvalid: string
- inputCol: string
- inputCols: array (of string)
- numBuckets: integer
- numBucketsArray: array (of integer)
- outputCol: string
- outputCols: array (of string)
- relativeError: double

- featuresCol: string
- forceIndexLabel: boolean
- formula: string
- handleInvalid: string
- labelCol: string
- stringIndexerOrderType: string

- cacheNodeIds: boolean
- checkpointInterval: integer
- featuresCol: string
- featureSubsetStrategy: string
- impurity: string
- labelCol: string
- maxBins: integer
- maxDepth: integer
- maxMemoryInMB: integer
- minInfoGain: double
- minInstancesPerNode: integer
- numTrees: integer
- predictionCol: string
- probabilityCol: string
- rawPredictionCol: string
- seed: double
- subsamplingRate: double
- thresholds: array (of double)

- cacheNodeIds: boolean
- checkpointInterval: integer
- featuresCol: string
- featureSubsetStrategy: string
- impurity: string
- labelCol: string
- maxBins: integer
- maxDepth: integer
- maxMemoryInMB: integer
- minInfoGain: double
- minInstancesPerNode: integer
- numTrees: integer
- predictionCol: string
- seed: double
- subsamplingRate: double

- inputCol: string
- outputCol: string
- withMean: boolean
- withStd: boolean

- handleInvalid: string
- inputCol: string
- outputCol: string
- stringOrderType: string

- collectSubModels: boolean
- estimator: estimator (i.e., function(object*, object) as function(object*, object) as object*)
- parallelism: integer
- seed: double
- trainRatio: double

- handleInvalid: string
- inputCol: string
- maxCategories: integer
- outputCol: string

- inputCol: string
- maxIter: integer
- maxSentenceLength: integer
- minCount: integer
- numPartitions: integer
- outputCol: string
- seed: double
- stepSize: double
- vectorSize: integer
- windowSize: integer

- featuresCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)
- predictionCol: string
- quantileProbabilities: array (of double)
- quantilesCol: string

- coldStartStrategy: string
- itemCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)
- predictionCol: string
- userCol: string

- inputCol: string
- outputCol: string
- threshold: double

- featuresCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)
- predictionCol: string

- inputCol: string
- outputCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)

- handleInvalid: string
- inputCol: string
- inputCols: array (of string)
- outputCol: string
- outputCols: array (of string)
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)
- splits: array (of double)
- splitsArray: array (of array of double)

- featuresCol: string
- outputCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)

- binary: boolean
- inputCol: string
- minTF: double
- outputCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)

- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)

- inputCol: string
- inverse: boolean
- outputCol: string

- cacheNodeIds: boolean
- checkpointInterval: integer
- featuresCol: string
- impurity: string
- maxBins: integer
- maxDepth: integer
- maxMemoryInMB: integer
- minInfoGain: double
- minInstancesPerNode: integer
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)
- predictionCol: string
- probabilityCol: string
- rawPredictionCol: string
- seed: double
- thresholds: array (of double)

- cacheNodeIds: boolean
- checkpointInterval: integer
- featuresCol: string
- impurity: string
- maxBins: integer
- maxDepth: integer
- maxMemoryInMB: integer
- minInfoGain: double
- minInstancesPerNode: integer
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)
- predictionCol: string
- seed: double
- varianceCol: string

- featuresCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)
- seed: double
- topicDistributionCol: string

- inputCol: string
- outputCol: string
- scalingVec: object (of double)

- itemsCol: string
- minConfidence: double
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)
- predictionCol: string

- categoricalCols: array (of string)
- inputCols: array (of string)
- numFeatures: integer
- outputCol: string

- cacheNodeIds: boolean
- checkpointInterval: integer
- featuresCol: string
- featureSubsetStrategy: string
- impurity: string
- maxBins: integer
- maxDepth: integer
- maxIter: integer
- maxMemoryInMB: integer
- minInfoGain: double
- minInstancesPerNode: integer
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)
- predictionCol: string
- probabilityCol: string
- rawPredictionCol: string
- seed: double
- stepSize: double
- subsamplingRate: double
- thresholds: array (of double)

- cacheNodeIds: boolean
- checkpointInterval: integer
- featuresCol: string
- featureSubsetStrategy: string
- impurity: string
- maxBins: integer
- maxDepth: integer
- maxIter: integer
- maxMemoryInMB: integer
- minInfoGain: double
- minInstancesPerNode: integer
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)
- predictionCol: string
- seed: double
- stepSize: double
- subsamplingRate: double

- featuresCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)
- predictionCol: string
- probabilityCol: string

- featuresCol: string
- linkPredictionCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)
- predictionCol: string

- binary: boolean
- inputCol: string
- numFeatures: integer
- outputCol: string

- inputCol: string
- outputCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)

- inputCols: array (of string)
- outputCols: array (of string)
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)

- inputCol: string
- labels: array (of string)
- outputCol: string

- inputCols: array (of string)
- outputCol: string

- featureIndex: integer
- featuresCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)
- predictionCol: string

- featuresCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)
- predictionCol: string

- featuresCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)
- predictionCol: string

- featuresCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)
- predictionCol: string
- rawPredictionCol: string
- threshold: double
- weightCol: double

- featuresCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)
- seed: double
- topicDistributionCol: string

- featuresCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)
- predictionCol: string
- probabilityCol: string
- rawPredictionCol: string
- threshold: double
- thresholds: array (of double)

- inputCol: string
- outputCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)

- inputCol: string
- outputCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)

- inputCol: string
- max: double
- min: double
- outputCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)

- featuresCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)
- predictionCol: string
- probabilityCol: string
- rawPredictionCol: string
- thresholds: array (of double)

- inputCol: string
- n: integer
- outputCol: string

- featuresCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)
- predictionCol: string
- probabilityCol: string
- rawPredictionCol: string
- thresholds: array (of double)

- inputCol: string
- outputCol: string
- p: double

- dropLast: boolean
- inputCol: string
- outputCol: string

- dropLast: boolean
- handleInvalid: string
- inputCols: array (of string)
- outputCols: array (of string)
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)

- featuresCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)
- predictionCol: string
- rawPredictionCol: string

- inputCol: string
- outputCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)

- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)

- degree: integer
- inputCol: string
- outputCol: string

- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)

- cacheNodeIds: boolean
- checkpointInterval: integer
- featuresCol: string
- featureSubsetStrategy: string
- impurity: string
- maxBins: integer
- maxDepth: integer
- maxMemoryInMB: integer
- minInfoGain: double
- minInstancesPerNode: integer
- numTrees: integer
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)
- predictionCol: string
- probabilityCol: string
- rawPredictionCol: string
- seed: double
- subsamplingRate: double
- thresholds: array (of double)

- cacheNodeIds: boolean
- checkpointInterval: integer
- featuresCol: string
- featureSubsetStrategy: string
- impurity: string
- maxBins: integer
- maxDepth: integer
- maxMemoryInMB: integer
- minInfoGain: double
- minInstancesPerNode: integer
- numTrees: integer
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)
- predictionCol: string
- seed: double
- subsamplingRate: double

- gaps: boolean
- inputCol: string
- minTokenLength: integer
- outputCol: string
- pattern: string
- toLowercase: boolean

- statement: string

- inputCol: string
- outputCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)

- caseSensitive: boolean
- inputCol: string
- locale: string
- outputCol: string
- stopWords: array (of string)

- handleInvalid: string
- inputCol: string
- outputCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)

- inputCol: string
- outputCol: string

- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)

- handleInvalid: string
- inputCols: array (of string)
- outputCol: string

- inputCol: string
- outputCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)

- handleInvalid: string
- inputCol: string
- size: integer

- indices: array (of integer)
- inputCol: string
- names: array (of string)
- outputCol: string

- inputCol: string
- outputCol: string
- parent: estimator (i.e., function(object*, object) as function(object*, object) as object*)