About

This tutorial shows how to use datamodelr to supplement R package documentation with relational diagrams.

Examples are using data frames from Lahman package.

Extract meta data

There are > 20 data frames in the package. We can put them in a list dataframes:

lahman <- new.env()
table_names <- data(package="Lahman")$results[,"Item"]
data(list = table_names, package = "Lahman", envir = lahman)
dataframes <- as.list(lahman)
str(dataframes, max.level = 1)
## List of 28
##  $ Master             :'data.frame': 19105 obs. of  26 variables:
##  $ pitchingLabels     :'data.frame': 30 obs. of  2 variables:
##  $ Teams              :'data.frame': 2835 obs. of  48 variables:
##  $ Appearances        :'data.frame': 102761 obs. of  21 variables:
##  $ FieldingPost       :'data.frame': 12714 obs. of  17 variables:
##  $ fieldingLabels     :'data.frame': 18 obs. of  2 variables:
##  $ LahmanData         :'data.frame': 24 obs. of  5 variables:
##  $ Batting            :'data.frame': 102816 obs. of  22 variables:
##  $ AwardsSharePlayers :'data.frame': 6879 obs. of  7 variables:
##  $ SeriesPost         :'data.frame': 316 obs. of  9 variables:
##  $ AwardsPlayers      :'data.frame': 6158 obs. of  6 variables:
##  $ battingLabels      :'data.frame': 22 obs. of  2 variables:
##  $ Managers           :'data.frame': 3436 obs. of  10 variables:
##  $ AwardsManagers     :'data.frame': 179 obs. of  6 variables:
##  $ Pitching           :'data.frame': 44963 obs. of  30 variables:
##  $ BattingPost        :'data.frame': 13543 obs. of  22 variables:
##  $ PitchingPost       :'data.frame': 5271 obs. of  30 variables:
##  $ Fielding           :'data.frame': 136815 obs. of  18 variables:
##  $ AwardsShareManagers:'data.frame': 425 obs. of  7 variables:
##  $ FieldingOF         :'data.frame': 12028 obs. of  6 variables:
##  $ AllstarFull        :'data.frame': 5148 obs. of  8 variables:
##  $ TeamsHalf          :'data.frame': 52 obs. of  10 variables:
##  $ HallOfFame         :'data.frame': 4156 obs. of  9 variables:
##  $ Schools            :'data.frame': 1207 obs. of  5 variables:
##  $ ManagersHalf       :'data.frame': 93 obs. of  10 variables:
##  $ TeamsFranchises    :'data.frame': 120 obs. of  4 variables:
##  $ Salaries           :'data.frame': 26428 obs. of  5 variables:
##  $ CollegePlaying     :'data.frame': 17350 obs. of  3 variables:

datamodelr can create data model object from a data frame with columns: table, column, key, ref and ref_col (“meta data” structure). See ?as.data_model for more info.

We can create meta data directly from the Lahman data frames:

library(dplyr)
library(purrr)

meta_data <- 
  dataframes %>% 
  map(~data_frame(
    column = names(.),
    type = map_chr(., ~paste(class(.), collapse = ",") ),
    key = NA,
    ref = NA,
    ref_col = NA
  )) %>% 
  bind_rows(.id = "table") 

glimpse(meta_data)
## Observations: 350
## Variables: 6
## $ table   <chr> "Master", "Master", "Master", "Master", "Master", "Mas...
## $ column  <chr> "playerID", "birthYear", "birthMonth", "birthDay", "bi...
## $ type    <chr> "character", "integer", "integer", "integer", "charact...
## $ key     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ ref     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ ref_col <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...

Column descriptions

It is nice to have column descriptions in data model. Source: http://seanlahman.com/files/database/readme2016.txt, section 2.0

column_descriptions <-
  read.table("data/column_descriptions.csv", 
             header = TRUE, stringsAsFactors = FALSE, sep = ",")

meta_data <-
  meta_data %>% 
  left_join(column_descriptions, by = c("table", "column")) %>% 
  mutate(description = substring(description, 1, 30))

Use as.data_model to convert meta data to data model object:

library(datamodelr)
dm <- as.data_model(meta_data)
dm
## Data model object:
##   28 tables:  Master, pitchingLabels, Teams, Appearances ... 
##   350 columns
##   no references

Create diagram for selected tables:

selected_tables = c("Master", "Batting", "Pitching", "Fielding", "Teams")
g <- dm_create_graph(dm, focus = list(tables = selected_tables), 
                     col_attr = c("column", "type"))
dm_render_graph(g)

Add keys

We could use dm_set_key and dm_add_references to add keys and references directly to a data model object. But, it looks verbose when managing large number of tables and references. Sometimes it is easier to manipulate the meta data before coercing it to the data model object.

# simple key rules structure;

key_rules = list(
  list(
    table = "Master",
    column = "playerID"
  ),
  list(
    table = c("Batting", "Fielding", "Pitching"),
    column = c("playerID", "yearID", "teamID", "lgID")
  ) , 
  list(
    table = "Teams",
    column = c("yearID", "teamID", "lgID")
  )  
)

# use dplyr::mutate to update the key column
for(key_rule in key_rules) {
  meta_data <- 
    mutate(meta_data, 
      key = ifelse(table %in% key_rule$table & 
        column %in% key_rule$column, TRUE, key))
}

dm <- as.data_model(meta_data)

Use “keys_only” view type to render diagram with only key columns:

g <- dm_create_graph(dm, focus = list(tables = selected_tables),
                     view_type = "keys_only", col_attr = c("column", "type"))

dm_render_graph(g)

Add references

Mark relations with column ref (related table) and ref_cols referenced columns if related table has multiple columns in a key.

In this example the relations to “Master” and “Teams” are created from “Batting”, “Fielding” and “Pitching”:

ref_rules <- list(
  list(
    table = c("Batting", "Fielding", "Pitching"),
    column = "playerID",
    ref = "Master"
  ),
  list(
    table = c("Batting", "Fielding", "Pitching"),
    column = c("yearID", "teamID", "lgID"),
    ref = "Teams"
  )
)


# update meta data
for(ref_rule in ref_rules) {
  meta_data <- mutate( 
    meta_data, 
    ref = ifelse(table %in% ref_rule$table & column %in% ref_rule$column, 
                 ref_rule$ref, ref),
    ref_col = ref # same column names in referenced table
  )  
}

dm <- as.data_model(meta_data)
dm
## Data model object:
##   28 tables:  Master, pitchingLabels, Teams, Appearances ... 
##   350 columns
##   6 references
g <- dm_create_graph(
  dm,
  focus = list(tables = selected_tables),
  view_type = "keys_only",
  col_attr = c("column", "type")
)

dm_render_graph(g)

With all columns selected, the diagram would be like:

g <- dm_create_graph(
  dm,
  focus = list(tables = selected_tables),
  view_type = "all",
  col_attr = c("column", "type", "description")
)

dm_render_graph(g)

Table overview

Batting table:

DT::datatable(
  dm$columns %>% 
    filter(table == "Batting") %>% 
    mutate(
      column = ifelse(key, sprintf("<strong>%s</strong>", column), column)
    ) %>% 
    select(column, description, type, ref), 
  filter = "none",
  escape = -1,
  rownames = FALSE,
  options = list(
    pageLength = 100,
    dom = 't'
  )
)