Skip to content

Commit

Permalink
remove merge_households from population and families functions
Browse files Browse the repository at this point in the history
  • Loading branch information
rafapereirabr committed Sep 12, 2024
1 parent 750e6e6 commit a63d657
Show file tree
Hide file tree
Showing 8 changed files with 51 additions and 49 deletions.
2 changes: 1 addition & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# censobr v0.3.29999 dev

* Major changes
* Some functions (`read_population`, `read_mortality`, `read_families`, `read_emigration`) now include a new parameter `merge_households` (logical) to indicate whether the function should merge household variables to the output data. Closes [#31](https://github.com/ipeaGIT/censobr/issues/31)
* Some functions (`read_mortality`, `read_emigration`) now include a new parameter `merge_households` (logical) to indicate whether the function should merge household variables to the output data. Partially closes [#31](https://github.com/ipeaGIT/censobr/issues/31)
* {censobr} now imports the {duckplyr} package, which is used for merging household data. Closes issue [#31](https://github.com/ipeaGIT/censobr/issues/31).
* New vignette showing how to work with larger-than-memory data. Closes [#42](https://github.com/ipeaGIT/censobr/issues/42). The vignette still needs to be expanded with more examples, though.

Expand Down
14 changes: 14 additions & 0 deletions R/merge_household.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,13 @@ merge_household_var <- function(df,
if (year == 1970) {
key_vars <- c('code_muni', 'code_state', 'abbrev_state','name_state',
'code_region', 'name_region', 'id_household')
key_key <- 'id_household'
}

if (year == 1980) {
key_vars <- c('code_muni', 'code_state', 'abbrev_state','name_state',
'code_region', 'name_region', 'V6', 'V601')
key_key <- 'V601'

# rename weight var
df_household <- dplyr::rename(df_household, 'V603_household' = 'V603')
Expand All @@ -37,19 +39,22 @@ merge_household_var <- function(df,
key_vars <- c('code_muni', 'code_state', 'abbrev_state','name_state',
'code_region', 'name_region', 'V0109')

key_key <- 'V0109'
# rename weight var
df_household <- dplyr::rename(df_household, 'V7300_household' = 'V7300')
}

if (year == 2000) {
key_vars <- c('code_muni', 'code_state', 'abbrev_state','name_state',
'code_region', 'name_region', 'code_weighting', 'V0300')
key_key <- 'V0300'
}

if (year == 2010) {
key_vars <- c('code_muni', 'code_state', 'abbrev_state','name_state',
'code_region', 'name_region', 'code_weighting', 'V0300')

key_key <- 'V0300'
# rename weight var
df_household <- dplyr::rename(df_household, 'V0010_household' = 'V0010') |>
dplyr::compute()
Expand All @@ -62,6 +67,15 @@ merge_household_var <- function(df,
df_household <- dplyr::select(df_household, -all_of(vars_to_drop)) |>
dplyr::compute()

# pre-filter the right-hand table, keeping only rows whose key appears in the left-hand table
# this improves performance a bit
df <- dplyr::compute(df)
key_values <- as.vector(unique(df$GetColumnByName(key_key)))
df_household <- dplyr::filter(df_household, get(key_key) %in% key_values) |>
dplyr::compute()

# nrow(df_household)
# [1] 6192332

# convert to duckdb
# df <- arrow::to_duckdb(df)
Expand Down
18 changes: 8 additions & 10 deletions R/read_families.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
#' @param year Numeric. Year of reference in the format `yyyy`. Defaults to `2000`.
#' @template columns
#' @template add_labels
#' @template merge_households
#' @template as_data_frame
#' @template showProgress
#' @template cache
Expand All @@ -24,7 +23,6 @@
read_families <- function(year = 2000,
columns = NULL,
add_labels = NULL,
merge_households = FALSE,
as_data_frame = FALSE,
showProgress = TRUE,
cache = TRUE){
Expand All @@ -33,7 +31,7 @@ read_families <- function(year = 2000,
checkmate::assert_numeric(year)
checkmate::assert_vector(columns, null.ok = TRUE)
checkmate::assert_logical(as_data_frame)
checkmate::assert_logical(merge_households)
# checkmate::assert_logical(merge_households)
checkmate::assert_string(add_labels, pattern = 'pt', null.ok = TRUE)

# data available for the years:
Expand All @@ -58,13 +56,13 @@ read_families <- function(year = 2000,
### read data
df <- arrow_open_dataset(local_file)

### merge household data
if (isTRUE(merge_households)) {
df <- merge_household_var(df,
year = year,
add_labels = add_labels,
showProgress)
}
# ### merge household data
# if (isTRUE(merge_households)) {
# df <- merge_household_var(df,
# year = year,
# add_labels = add_labels,
# showProgress)
# }

### Select
if (!is.null(columns)) { # columns <- c('V0002','V0011')
Expand Down
18 changes: 8 additions & 10 deletions R/read_population.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
#' @template year
#' @template columns
#' @template add_labels
#' @template merge_households
#' @template as_data_frame
#' @template showProgress
#' @template cache
Expand All @@ -24,7 +23,6 @@
read_population <- function(year = 2010,
columns = NULL,
add_labels = NULL,
merge_households = FALSE,
as_data_frame = FALSE,
showProgress = TRUE,
cache = TRUE){
Expand All @@ -33,7 +31,7 @@ read_population <- function(year = 2010,
checkmate::assert_numeric(year)
checkmate::assert_vector(columns, null.ok = TRUE)
checkmate::assert_logical(as_data_frame)
checkmate::assert_logical(merge_households)
# checkmate::assert_logical(merge_households)
checkmate::assert_string(add_labels, pattern = 'pt', null.ok = TRUE)

# data available for the years:
Expand All @@ -58,13 +56,13 @@ read_population <- function(year = 2010,
### read data
df <- arrow_open_dataset(local_file)

### merge household data
if (isTRUE(merge_households)) {
df <- merge_household_var(df,
year = year,
add_labels = add_labels,
showProgress = showProgress)
}
# ### merge household data
# if (isTRUE(merge_households)) {
# df <- merge_household_var(df,
# year = year,
# add_labels = add_labels,
# showProgress = showProgress)
# }

### Select
if (!is.null(columns)) { # columns <- c('V0002','V0011')
Expand Down
4 changes: 0 additions & 4 deletions man/read_families.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 0 additions & 4 deletions man/read_population.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 9 additions & 9 deletions tests/testthat/test_read_families.R
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,15 @@ test_that("read_families read", {

# Merge households vars -----------------------

test_that("families merge_households_vars", {

for(y in c(2000)){ # y = 2000
message(y)
df_hou <- read_households(year = y)
df_test <- tester(year = y, merge_households = TRUE)
testthat::expect_true( all(names(df_hou) %in% names(df_test)) )
}
})
# test_that("families merge_households_vars", {
#
# for(y in c(2000)){ # y = 2000
# message(y)
# df_hou <- read_households(year = y)
# df_test <- tester(year = y, merge_households = TRUE)
# testthat::expect_true( all(names(df_hou) %in% names(df_test)) )
# }
# })


# ERRORS and messages -----------------------
Expand Down
22 changes: 11 additions & 11 deletions tests/testthat/test_read_population.R
Original file line number Diff line number Diff line change
Expand Up @@ -131,17 +131,17 @@ test_that("read_population check totals", {

# Merge households vars -----------------------

test_that("population merge_households_vars", {

for(y in c(1970, 1980, 1991, 2000, 2010)){ # y = 2010
message(y)
df_hou <- censobr::read_households(year = y)
df_test <- tester(year = y,
merge_households = TRUE,
showProgress = FALSE)
testthat::expect_true( all(names(df_hou) %in% names(df_test)) )
}
})
# test_that("population merge_households_vars", {
#
# for(y in c(1970, 1980, 1991, 2000, 2010)){ # y = 2010
# message(y)
# df_hou <- censobr::read_households(year = y)
# df_test <- tester(year = y,
# merge_households = TRUE,
# showProgress = FALSE)
# testthat::expect_true( all(names(df_hou) %in% names(df_test)) )
# }
# })


# ERRORS and messages -----------------------
Expand Down

0 comments on commit a63d657

Please sign in to comment.