Retrieve the result of a validation/confrontation
Source:R/values.R
values-tbl_validation-method.Rd
Retrieve the result of a validation/confrontation.
Arguments
- x
tbl_validation()
, result of aconfront()
oftbl
with a rule set.- simplify
only use when
type
= "list" seevalidate::values
- drop
not used at the moment
- type
whether to return a list/matrix or to return a query on the database.
- sparse
whether to show the results as a sparse query (only fails and
NA
) or all results for each record.- ...
not used
Details
Since the validation is done on a database, there are multiple options for storing the result of the validation. The results show per record whether they are valid according to the validation rules supplied.
Use
compute
(seeconfront.tbl_sql()
) to store the result in the databaseUse
sparse
to only calculate "fails" and "missings"
Default type "tbl" is that everything is "lazy", so the query and/or storage has to
be done explicitly by the user.
The other types execute the query and retrieve the result into R. When this
creates memory problems, the tbl
option is to be preferred.
Results for type
:
tbl
: a dbplyr::tbl_dbi object, pointing to the databasematrix
: a R matrix, similar tovalidate::values()
.list
: a R list, similar tovalidate::values()
.data.frame
: the result oftbl
stored in adata.frame
.
See also
Other validation:
confront.tbl_sql()
,
tbl_validation-class
Examples
# create a table in a database
income <- data.frame(id = letters[1:2], age=c(12,35), salary = c(1000,NA))
con <- dbplyr::src_memdb()
tbl_income <- dplyr::copy_to(con, income, overwrite=TRUE)
print(tbl_income)
#> # Source: table<income> [?? x 3]
#> # Database: sqlite 3.38.5 [:memory:]
#> id age salary
#> <chr> <dbl> <dbl>
#> 1 a 12 1000
#> 2 b 35 NA
# Let's define a rule set and confront the table with it:
rules <- validator( is_adult = age >= 18
, has_income = salary > 0
, mean_age = mean(age,na.rm=TRUE) > 20
)
# and confront! (we have to use a key, because a db...)
cf <- confront(tbl_income, rules, key = "id")
print(cf)
#> Object of class 'tbl_validation'
#> Call:
#> confront.tbl_sql(tbl = dat, x = x, ref = ref, key = key, sparse = sparse)
#>
#> Confrontations: 3
#> Tbl : income (:memory:)
#> Key column : id
#> Sparse : FALSE
#> Fails : [??] (see `values`, `summary`)
#> Errors : 0
summary(cf)
#> name items npass nfail nNA warning error
#> is_adult is_adult 2 1 1 0 FALSE FALSE
#> has_income has_income 2 1 0 1 FALSE FALSE
#> mean_age mean_age 1 1 0 0 FALSE FALSE
#> expression
#> is_adult age - 18 >= -1e-08
#> has_income salary > 0
#> mean_age mean(age, na.rm = TRUE) > 20
# Values (i.e. validations on the table) can be retrieved like in `validate`
# with`type="matrix"` (simplify = TRUE)
values(cf, type = "matrix")
#> [[1]]
#> is_adult has_income
#> [1,] FALSE TRUE
#> [2,] TRUE NA
#>
#> [[2]]
#> mean_age
#> [1,] TRUE
#>
# But often this seems more handy:
values(cf, type = "tbl")
#> # Source: lazy query [?? x 4]
#> # Database: sqlite 3.38.5 [:memory:]
#> id is_adult has_income mean_age
#> <chr> <int> <int> <int>
#> 1 a 0 1 1
#> 2 b 1 NA 1
# We can see the sql code by using `show_query`:
show_query(cf)
#> <SQL>
#> SELECT `id`, CAST(`is_adult` AS BOOLEAN) AS `is_adult`, CAST(`has_income` AS BOOLEAN) AS `has_income`, CAST(`mean_age` AS BOOLEAN) AS `mean_age`
#> FROM (SELECT `id`, NULLIF(`is_adult`, -1) AS `is_adult`, NULLIF(`has_income`, -1) AS `has_income`, NULLIF(`mean_age`, -1) AS `mean_age`
#> FROM (SELECT `id`, MIN(`is_adult`) AS `is_adult`, MIN(`has_income`) AS `has_income`, MIN(`mean_age`) AS `mean_age`
#> FROM (SELECT `id`, CASE `rule` WHEN ('is_adult') THEN (COALESCE(1 - CAST(`fail` AS INTEGER), -1)) ELSE (1) END AS `is_adult`, CASE `rule` WHEN ('has_income') THEN (COALESCE(1 - CAST(`fail` AS INTEGER), -1)) ELSE (1) END AS `has_income`, CASE `rule` WHEN ('mean_age') THEN (COALESCE(1 - CAST(`fail` AS INTEGER), -1)) ELSE (1) END AS `mean_age`
#> FROM (SELECT `LHS`.`id` AS `id`, `rule`, `fail`
#> FROM (SELECT `id`
#> FROM `income`) AS `LHS`
#> LEFT JOIN (SELECT `id`, 'is_adult' AS `rule`, 1 AS `fail`
#> FROM (SELECT `id`, `age`
#> FROM `income`)
#> WHERE (`age` - 18.0 < -1e-08)
#> UNION ALL
#> SELECT `id`, 'is_adult' AS `rule`, NULL AS `fail`
#> FROM (SELECT `id`, `age`
#> FROM `income`)
#> WHERE (((`age`) IS NULL))
#> UNION ALL
#> SELECT `id`, 'has_income' AS `rule`, 1 AS `fail`
#> FROM (SELECT `id`, `salary`
#> FROM `income`)
#> WHERE (`salary` <= 0.0)
#> UNION ALL
#> SELECT `id`, 'has_income' AS `rule`, NULL AS `fail`
#> FROM (SELECT `id`, `salary`
#> FROM `income`)
#> WHERE (((`salary`) IS NULL))
#> UNION ALL
#> SELECT `id`, 'mean_age' AS `rule`, 1 AS `fail`
#> FROM (SELECT `id`, `age`
#> FROM (SELECT `id`, `age`, AVG(`age`) OVER () AS `q01`
#> FROM (SELECT `id`, `age`
#> FROM `income`))
#> WHERE (`q01` <= 20.0))
#> UNION ALL
#> SELECT `id`, 'mean_age' AS `rule`, NULL AS `fail`
#> FROM (SELECT `id`, `age`
#> FROM `income`)
#> WHERE (((`age`) IS NULL))) AS `RHS`
#> ON (`LHS`.`id` = `RHS`.`id`)
#> ))
#> GROUP BY `id`))
# identical
show_query(values(cf, type = "tbl"))
#> <SQL>
#> SELECT `id`, CAST(`is_adult` AS BOOLEAN) AS `is_adult`, CAST(`has_income` AS BOOLEAN) AS `has_income`, CAST(`mean_age` AS BOOLEAN) AS `mean_age`
#> FROM (SELECT `id`, NULLIF(`is_adult`, -1) AS `is_adult`, NULLIF(`has_income`, -1) AS `has_income`, NULLIF(`mean_age`, -1) AS `mean_age`
#> FROM (SELECT `id`, MIN(`is_adult`) AS `is_adult`, MIN(`has_income`) AS `has_income`, MIN(`mean_age`) AS `mean_age`
#> FROM (SELECT `id`, CASE `rule` WHEN ('is_adult') THEN (COALESCE(1 - CAST(`fail` AS INTEGER), -1)) ELSE (1) END AS `is_adult`, CASE `rule` WHEN ('has_income') THEN (COALESCE(1 - CAST(`fail` AS INTEGER), -1)) ELSE (1) END AS `has_income`, CASE `rule` WHEN ('mean_age') THEN (COALESCE(1 - CAST(`fail` AS INTEGER), -1)) ELSE (1) END AS `mean_age`
#> FROM (SELECT `LHS`.`id` AS `id`, `rule`, `fail`
#> FROM (SELECT `id`
#> FROM `income`) AS `LHS`
#> LEFT JOIN (SELECT `id`, 'is_adult' AS `rule`, 1 AS `fail`
#> FROM (SELECT `id`, `age`
#> FROM `income`)
#> WHERE (`age` - 18.0 < -1e-08)
#> UNION ALL
#> SELECT `id`, 'is_adult' AS `rule`, NULL AS `fail`
#> FROM (SELECT `id`, `age`
#> FROM `income`)
#> WHERE (((`age`) IS NULL))
#> UNION ALL
#> SELECT `id`, 'has_income' AS `rule`, 1 AS `fail`
#> FROM (SELECT `id`, `salary`
#> FROM `income`)
#> WHERE (`salary` <= 0.0)
#> UNION ALL
#> SELECT `id`, 'has_income' AS `rule`, NULL AS `fail`
#> FROM (SELECT `id`, `salary`
#> FROM `income`)
#> WHERE (((`salary`) IS NULL))
#> UNION ALL
#> SELECT `id`, 'mean_age' AS `rule`, 1 AS `fail`
#> FROM (SELECT `id`, `age`
#> FROM (SELECT `id`, `age`, AVG(`age`) OVER () AS `q01`
#> FROM (SELECT `id`, `age`
#> FROM `income`))
#> WHERE (`q01` <= 20.0))
#> UNION ALL
#> SELECT `id`, 'mean_age' AS `rule`, NULL AS `fail`
#> FROM (SELECT `id`, `age`
#> FROM `income`)
#> WHERE (((`age`) IS NULL))) AS `RHS`
#> ON (`LHS`.`id` = `RHS`.`id`)
#> ))
#> GROUP BY `id`))
# sparse results in db (that the default)
values(cf, type="tbl", sparse=TRUE)
#> # Source: lazy query [?? x 3]
#> # Database: sqlite 3.38.5 [:memory:]
#> id rule fail
#> <chr> <chr> <int>
#> 1 a is_adult 1
#> 2 b has_income NA
# or if you like data.frames
values(cf, type="data.frame", sparse=TRUE)
#> id rule fail
#> 1 a is_adult TRUE
#> 2 b has_income NA