Add Factors to Data Based on Grouped Levels of an Existing Factor
add_grps.Rd
Add new factors to data based on grouped levels of an existing factor, using a key compatible with fct_collapse
.
Arguments
- data
a data frame, or a data frame extension (e.g. a
tibble
).
- .fct
the quoted name of an existing (ungrouped)
factor
.
- .key
a
list
of nested, named lists representing the groupings, each containing a series of named character vectors
.
- .sort
logical
, whether to sort levels of new factors; default TRUE
.
Value
A data frame, or a data frame extension (e.g. a tibble
), equivalent to
data
with the additional grouped factor(s).
Details
The .key
argument should be a series of named lists nested within an outer list. Each nested named list must
contain one or more named character vectors
representing the new factor groupings. The nested lists should be
structured for compatibility with fct_collapse()
in package forcats.
add_grps()
will add new, grouped factors to data
, one for each nested list and with the same name. Levels are
assigned to these new grouped factors using the name of whichever character vector, if any, contains the old
factor level. If none does, the original ungrouped factor level is used.
Various different groupings of a factor
may be conveniently added to data
using add_grps()
and the corresponding series of related binomial glm
s compared using comp_glm()
.
See also
comp_glm()
, fct_collapse()
, list()
.
Other factor-manip:
fct_to_num()
Examples
(d <- binom_data(levels = 6))
#> __________________________
#> Simulated Binomial Data: -
#>
#> # A tibble: 6 × 3
#> iv pn qn
#> * <fct> <int> <int>
#> 1 a 33 33
#> 2 b 35 31
#> 3 c 20 46
#> 4 d 21 45
#> 5 e 8 58
#> 6 f 9 57
## One grouped factor
(grp_key <- list(g = c("a", "c", "e"), h = c("b", "d", "f")))
#> $g
#> [1] "a" "c" "e"
#>
#> $h
#> [1] "b" "d" "f"
#>
d |> add_grps(iv, list(iv2 = grp_key))
#> __________________________
#> Simulated Binomial Data: -
#>
#> # A tibble: 6 × 4
#> iv iv2 pn qn
#> <fct> <fct> <int> <int>
#> 1 a g 33 33
#> 2 b h 35 31
#> 3 c g 20 46
#> 4 d h 21 45
#> 5 e g 8 58
#> 6 f h 9 57
## Several grouped factors
grp_key <- list(
iv2 = grp_key,
iv3 = list(i = c("a", "b", "c"), j = c("d", "e", "f")),
iv4 = list(k = c("a", "b"), l = c("c", "d"), m = c("e", "f"))
)
d |> add_grps(iv, grp_key)
#> __________________________
#> Simulated Binomial Data: -
#>
#> # A tibble: 6 × 6
#> iv iv2 iv3 iv4 pn qn
#> <fct> <fct> <fct> <fct> <int> <int>
#> 1 a g i k 33 33
#> 2 b h i k 35 31
#> 3 c g i l 20 46
#> 4 d h j l 21 45
#> 5 e g j m 8 58
#> 6 f h j m 9 57
## Cut out the middleman
list(
iv2 = list(g = c("a", "c", "e"), h = c("b", "d", "f")),
iv3 = list(i = c("a", "b", "c"), j = c("d", "e", "f")),
iv4 = list(k = c("a", "b"), l = c("c", "d"), m = c("e", "f"))
) |>
add_grps(d, iv, .key = _)
#> __________________________
#> Simulated Binomial Data: -
#>
#> # A tibble: 6 × 6
#> iv iv2 iv3 iv4 pn qn
#> <fct> <fct> <fct> <fct> <int> <int>
#> 1 a g i k 33 33
#> 2 b h i k 35 31
#> 3 c g i l 20 46
#> 4 d h j l 21 45
#> 5 e g j m 8 58
#> 6 f h j m 9 57
## Binomial data with month as explanatory variable, using dplyr and forcats package functions
(d <- binom_data(12, probs = rep_len(0.5, 12)) |>
mutate(across(iv, \(x) fct_recode(x, !!!setNames(letters[1:12], month.abb)))) |>
rename(month = "iv"))
#> __________________________
#> Simulated Binomial Data: -
#>
#> # A tibble: 12 × 3
#> month pn qn
#> <fct> <int> <int>
#> 1 Jan 31 35
#> 2 Feb 28 38
#> 3 Mar 35 31
#> 4 Apr 33 33
#> 5 May 35 31
#> 6 Jun 36 30
#> 7 Jul 38 28
#> 8 Aug 29 37
#> 9 Sep 25 41
#> 10 Oct 35 31
#> 11 Nov 38 28
#> 12 Dec 28 38
## Name three lists of different month groupings using lapply()
(grp_key <- list(
list(1:3, 4:6, 7:9, 10:12),
list(1:4, 5:8, 9:12),
list(c(1:3, 10:12), 4:9)
) |>
lapply(\(x) lapply(x, \(y) month.abb[y])) |>
lapply(\(x) setNames(x, paste0("group", seq_along(x)))) |>
(\(x) setNames(x, paste0("months", seq_along(x))))())
#> $months1
#> $months1$group1
#> [1] "Jan" "Feb" "Mar"
#>
#> $months1$group2
#> [1] "Apr" "May" "Jun"
#>
#> $months1$group3
#> [1] "Jul" "Aug" "Sep"
#>
#> $months1$group4
#> [1] "Oct" "Nov" "Dec"
#>
#>
#> $months2
#> $months2$group1
#> [1] "Jan" "Feb" "Mar" "Apr"
#>
#> $months2$group2
#> [1] "May" "Jun" "Jul" "Aug"
#>
#> $months2$group3
#> [1] "Sep" "Oct" "Nov" "Dec"
#>
#>
#> $months3
#> $months3$group1
#> [1] "Jan" "Feb" "Mar" "Oct" "Nov" "Dec"
#>
#> $months3$group2
#> [1] "Apr" "May" "Jun" "Jul" "Aug" "Sep"
#>
#>
add_grps(d, month, grp_key) ## Add the new year groups to data
#> __________________________
#> Simulated Binomial Data: -
#>
#> # A tibble: 12 × 6
#> month months1 months2 months3 pn qn
#> <fct> <fct> <fct> <fct> <int> <int>
#> 1 Jan group1 group1 group1 31 35
#> 2 Feb group1 group1 group1 28 38
#> 3 Mar group1 group1 group1 35 31
#> 4 Apr group2 group1 group2 33 33
#> 5 May group2 group2 group2 35 31
#> 6 Jun group2 group2 group2 36 30
#> 7 Jul group3 group2 group2 38 28
#> 8 Aug group3 group2 group2 29 37
#> 9 Sep group3 group3 group2 25 41
#> 10 Oct group4 group3 group1 35 31
#> 11 Nov group4 group3 group1 38 28
#> 12 Dec group4 group3 group1 28 38
## Example from fct_collapse() using gss_cat dataset from {forcats} package
fct_count(gss_cat$partyid)
#> # A tibble: 10 × 2
#> f n
#> <fct> <int>
#> 1 No answer 154
#> 2 Don't know 1
#> 3 Other party 393
#> 4 Strong republican 2314
#> 5 Not str republican 3032
#> 6 Ind,near rep 1791
#> 7 Independent 4119
#> 8 Ind,near dem 2499
#> 9 Not str democrat 3690
#> 10 Strong democrat 3490
grp_key <- list(
partyid2 = list(
missing = c("No answer", "Don't know"),
other = "Other party",
rep = c("Strong republican", "Not str republican"),
ind = c("Ind,near rep", "Independent", "Ind,near dem"),
dem = c("Not str democrat", "Strong democrat")
)
)
gss_cat |>
add_grps(partyid, grp_key) |>
_$partyid2 |> fct_count()
#> # A tibble: 5 × 2
#> f n
#> <fct> <int>
#> 1 dem 7180
#> 2 ind 8409
#> 3 missing 155
#> 4 other 393
#> 5 rep 5346
gss_cat |>
add_grps(partyid, grp_key, .sort = FALSE) |>
_$partyid2 |> fct_count()
#> # A tibble: 5 × 2
#> f n
#> <fct> <int>
#> 1 missing 155
#> 2 other 393
#> 3 rep 5346
#> 4 ind 8409
#> 5 dem 7180
rm(grp_key, d)