This tutorial shows how to use datamodelr to supplement R package documentation with relational diagrams.
Examples are using data frames from Lahman package.
There are > 20 data frames in the package. We can put them in a list dataframes
:
lahman <- new.env()
table_names <- data(package="Lahman")$results[,"Item"]
data(list = table_names, package = "Lahman", envir = lahman)
dataframes <- as.list(lahman)
str(dataframes, max.level = 1)
## List of 28
## $ Master :'data.frame': 19105 obs. of 26 variables:
## $ pitchingLabels :'data.frame': 30 obs. of 2 variables:
## $ Teams :'data.frame': 2835 obs. of 48 variables:
## $ Appearances :'data.frame': 102761 obs. of 21 variables:
## $ FieldingPost :'data.frame': 12714 obs. of 17 variables:
## $ fieldingLabels :'data.frame': 18 obs. of 2 variables:
## $ LahmanData :'data.frame': 24 obs. of 5 variables:
## $ Batting :'data.frame': 102816 obs. of 22 variables:
## $ AwardsSharePlayers :'data.frame': 6879 obs. of 7 variables:
## $ SeriesPost :'data.frame': 316 obs. of 9 variables:
## $ AwardsPlayers :'data.frame': 6158 obs. of 6 variables:
## $ battingLabels :'data.frame': 22 obs. of 2 variables:
## $ Managers :'data.frame': 3436 obs. of 10 variables:
## $ AwardsManagers :'data.frame': 179 obs. of 6 variables:
## $ Pitching :'data.frame': 44963 obs. of 30 variables:
## $ BattingPost :'data.frame': 13543 obs. of 22 variables:
## $ PitchingPost :'data.frame': 5271 obs. of 30 variables:
## $ Fielding :'data.frame': 136815 obs. of 18 variables:
## $ AwardsShareManagers:'data.frame': 425 obs. of 7 variables:
## $ FieldingOF :'data.frame': 12028 obs. of 6 variables:
## $ AllstarFull :'data.frame': 5148 obs. of 8 variables:
## $ TeamsHalf :'data.frame': 52 obs. of 10 variables:
## $ HallOfFame :'data.frame': 4156 obs. of 9 variables:
## $ Schools :'data.frame': 1207 obs. of 5 variables:
## $ ManagersHalf :'data.frame': 93 obs. of 10 variables:
## $ TeamsFranchises :'data.frame': 120 obs. of 4 variables:
## $ Salaries :'data.frame': 26428 obs. of 5 variables:
## $ CollegePlaying :'data.frame': 17350 obs. of 3 variables:
datamodelr can create data model object from a data frame with columns: table
, column
, key
, ref
and ref_col
(“meta data” structure). See ?as.data_model
for more info.
We can create meta data directly from the Lahman data frames:
library(dplyr)
library(purrr)
meta_data <-
dataframes %>%
map(~data_frame(
column = names(.),
type = map_chr(., ~paste(class(.), collapse = ",") ),
key = NA,
ref = NA,
ref_col = NA
)) %>%
bind_rows(.id = "table")
glimpse(meta_data)
## Observations: 350
## Variables: 6
## $ table <chr> "Master", "Master", "Master", "Master", "Master", "Mas...
## $ column <chr> "playerID", "birthYear", "birthMonth", "birthDay", "bi...
## $ type <chr> "character", "integer", "integer", "integer", "charact...
## $ key <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ ref <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ ref_col <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
It is nice to have column descriptions in data model. Source: http://seanlahman.com/files/database/readme2016.txt, section 2.0
column_descriptions <-
read.table("data/column_descriptions.csv",
header = TRUE, stringsAsFactors = FALSE, sep = ",")
meta_data <-
meta_data %>%
left_join(column_descriptions, by = c("table", "column")) %>%
mutate(description = substring(description, 1, 30))
Use as.data_model
to convert meta data to data model object:
library(datamodelr)
dm <- as.data_model(meta_data)
dm
## Data model object:
## 28 tables: Master, pitchingLabels, Teams, Appearances ...
## 350 columns
## no references
Create diagram for selected tables:
selected_tables = c("Master", "Batting", "Pitching", "Fielding", "Teams")
g <- dm_create_graph(dm, focus = list(tables = selected_tables),
col_attr = c("column", "type"))
dm_render_graph(g)
We could use dm_set_key
and dm_add_references
to add keys and references directly to a data model object. But, it looks verbose when managing large number of tables and references. Sometimes it is easier to manipulate the meta data before coercing it to the data model object.
# simple key rules structure;
key_rules = list(
list(
table = "Master",
column = "playerID"
),
list(
table = c("Batting", "Fielding", "Pitching"),
column = c("playerID", "yearID", "teamID", "lgID")
) ,
list(
table = "Teams",
column = c("yearID", "teamID", "lgID")
)
)
# use dplyr::mutate to update the key column
for(key_rule in key_rules) {
meta_data <-
mutate(meta_data,
key = ifelse(table %in% key_rule$table &
column %in% key_rule$column, TRUE, key))
}
dm <- as.data_model(meta_data)
Use “keys_only” view type to render diagram with only key columns:
g <- dm_create_graph(dm, focus = list(tables = selected_tables),
view_type = "keys_only", col_attr = c("column", "type"))
dm_render_graph(g)
Mark relations with column ref
(related table) and ref_cols
referenced columns if related table has multiple columns in a key.
In this example the relations to “Master” and “Teams” are created from “Batting”, “Fielding” and “Pitching”:
ref_rules <- list(
list(
table = c("Batting", "Fielding", "Pitching"),
column = "playerID",
ref = "Master"
),
list(
table = c("Batting", "Fielding", "Pitching"),
column = c("yearID", "teamID", "lgID"),
ref = "Teams"
)
)
# update meta data
for(ref_rule in ref_rules) {
meta_data <- mutate(
meta_data,
ref = ifelse(table %in% ref_rule$table & column %in% ref_rule$column,
ref_rule$ref, ref),
ref_col = ref # same column names in referenced table
)
}
dm <- as.data_model(meta_data)
dm
## Data model object:
## 28 tables: Master, pitchingLabels, Teams, Appearances ...
## 350 columns
## 6 references
g <- dm_create_graph(
dm,
focus = list(tables = selected_tables),
view_type = "keys_only",
col_attr = c("column", "type")
)
dm_render_graph(g)
With all columns selected, the diagram would be like:
g <- dm_create_graph(
dm,
focus = list(tables = selected_tables),
view_type = "all",
col_attr = c("column", "type", "description")
)
dm_render_graph(g)
Batting table:
DT::datatable(
dm$columns %>%
filter(table == "Batting") %>%
mutate(
column = ifelse(key, sprintf("<strong>%s</strong>", column), column)
) %>%
select(column, description, type, ref),
filter = "none",
escape = -1,
rownames = FALSE,
options = list(
pageLength = 100,
dom = 't'
)
)