Skip to contents

The goal of dcmodifydb is to apply modification rules specified with dcmodify on a database table, allowing for documented, reproducable data cleaning adjustments in a database.

dcmodify separates intent from execution: a user specifies what, why and how of an automatic data change and uses dcmodifydb to execute them on a tbl database table.

Installation

The development version from GitHub can be installed with:

# install.packages("devtools")
devtools::install_github("data-cleaning/dcmodifydb")

Example

library(DBI)
library(dcmodify)
library(dcmodifydb)

# silly modification rules
m <- modifier( if (cyl == 6)  gear <- 10
             , gear[cyl == 4] <- 0  # this R syntax works too :-)
             , if (gear == 3) cyl <- 2
             )

# setting up a table in the database
con <- dbConnect(RSQLite::SQLite())
dbWriteTable(con, "mtcars", mtcars[,c("cyl", "gear")])
tbl_mtcars <- dplyr::tbl(con, "mtcars")

# "Houston, we have a table"
head(tbl_mtcars)
#> # Source:   lazy query [?? x 2]
#> # Database: sqlite 3.37.2 []
#>     cyl  gear
#>   <dbl> <dbl>
#> 1     6     4
#> 2     6     4
#> 3     4     4
#> 4     6     3
#> 5     8     3
#> 6     6     3

# lets modify on a copy of the table...
tbl_m <- modify(tbl_mtcars, m, copy=TRUE)

# and gear has changed...
head(tbl_m)
#> # Source:   lazy query [?? x 2]
#> # Database: sqlite 3.37.2 []
#>     cyl  gear
#>   <dbl> <dbl>
#> 1     6    10
#> 2     6    10
#> 3     4     0
#> 4     6    10
#> 5     2     3
#> 6     6    10

dbDisconnect(con)

Documented rules

You can use YAML to store the modification rules: “example.yml”

rules:
- expr: if (age > 130) age = 130L
  name: M1
  label: 'Maximum age'
  description: |
    Human age is limited. (can use  "=")
    Cap the age at 130
- expr: is.na(age) <- age < 0
  name: M2
  label: 'Unknown age'
  description: |
    Negative Age, nah...
    (set to NA)
- expr: income[age < 12] <- 0
  name: M3
  label: 'No Child Labor'
  description: |
    Children should not work. (R syntax)
    Set income to zero for children.
- expr: "retired <- age > 67"
  name: M4
  label: 'Retired'
  description: |
    Derive a new variable...
- expr: if (age < 18) age_class = 'child' else age_class = 'adult'
  name: M5
  label: 'Age class'
  description: |
    Derive a new variable with if else

Let’s load the rules and apply them to a data set:

m <- modifier(.file = "example.yml")
print(m)
#> Object of class modifier with 5 elements:
#> M1: Maximum age
#>   if (age > 130) age = 130
#> 
#> M2: Unknown age
#>   is.na(age) <- age < 0
#> 
#> M3: No Child Labor
#>   income[age < 12] <- 0
#> 
#> M4: Retired
#>   retired <- age > 67
#> 
#> M5: Age class
#>   if (age < 18) age_class = "child" else age_class = "adult"
# setup the data
"age, income
  11,   2000
 150,    300
  25,   2000
 -10,   2000
" -> csv
income <- read.csv(text = csv, strip.white = TRUE)
dbWriteTable(con, "income", income)
tbl_income <- dplyr::tbl(con, "income")

# this is the table in the data base
tbl_income
#> # Source:   table<income> [?? x 2]
#> # Database: sqlite 3.37.2 []
#>     age income
#>   <int>  <int>
#> 1    11   2000
#> 2   150    300
#> 3    25   2000
#> 4   -10   2000

# and now after modification
modify(tbl_income, m, copy = FALSE) 
#> # Source:   table<income> [?? x 2]
#> # Database: sqlite 3.37.2 []
#>     age income retired age_class
#>   <int>  <int>   <int> <chr>    
#> 1    11      0       0 child    
#> 2   130    300       1 adult    
#> 3    25   2000       0 adult    
#> 4    NA   2000      NA <NA>

Generated sql can be written with dump_sql

dump_sql(m, tbl_income, file = "modify.sql")

modify.sql:

-- -------------------------------------
-- Generated with dcmodifydb, do not edit
-- dcmodify version: 0.1.9
-- dcmodifydb version: 0.3.0.9001
-- dplyr version: 1.0.8
-- dbplyr version: 2.1.1
-- from: 'example/example.yml'
-- date: 2022-03-11
-- -------------------------------------


ALTER TABLE `income`
ADD `retired` INT;

ALTER TABLE `income`
ADD `age_class` TEXT;

-- M1: Maximum age
-- Human age is limited. (can use  "=")
-- Cap the age at 130
-- 
-- R expression: if (age > 130) age = 130
UPDATE `income`
SET `age` = 130
WHERE `age` > 130.0;

-- M2: Unknown age
-- Negative Age, nah...
-- (set to NA)
-- 
-- R expression: is.na(age) <- age < 0
UPDATE `income`
SET `age` = NULL
WHERE `age` < 0.0;

-- M3: No Child Labor
-- Children should not work. (R syntax)
-- Set income to zero for children.
-- 
-- R expression: income[age < 12] <- 0
UPDATE `income`
SET `income` = 0.0
WHERE `age` < 12.0;

-- M4: Retired
-- Derive a new variable...
-- 
-- R expression: retired <- age > 67
UPDATE `income`
SET `retired` = `age` > 67.0
;

-- M5: Age class
-- Derive a new variable with if else
-- 
-- R expression: if (age < 18) age_class = "child" else age_class = "adult"
UPDATE `income`
SET `age_class` = 'child'
WHERE `age` < 18.0;

UPDATE `income`
SET `age_class` = 'adult'
WHERE NOT(`age` < 18.0);

Note: Modification rules can be written to yaml with as_yaml and export_yaml.

dcmodify::export_yaml(m, "cleaning_steps.yml")