Remove Levels of Independent Variable where Dependent Variable All Success or All Failure
good_levels.Rd
good_levels()
identifies levels
of an independent variable for which values of a Bernoulli dependent
variable are neither all zero nor all one i.e., those for which \(0 < p < 1\).
drop_null()
drops all data with levels
of an independent variable for which a Bernoulli dependent
variable has values either all zero or all one i.e., those not identified by good_levels()
.
drop_zero()
drops all data with levels
of an independent variable for which a binomial dependent variable
has either all successes or all failures.
levels_data
returns the levels for all factors in data.
nlevels_data
returns the number of levels for all factors in data.
Usage
good_levels(.data, .dep_var, .ind_var)
drop_null(.data, .dep_var, .ind_var)
drop_zero(.data, .ind_var, .dep_var = cbind(.data$pn, .data$qn))
Arguments
- .data
a data frame, or a data frame extension (e.g. a
tibble
).
- .dep_var
<
data-masking
> quoted name of a Bernoulli dependent variable that should be numeric
with values of 0 and 1; or in the case of drop_zero()
, a binomial dependent variable, default cbind(.data$pn, .data$qn)
, representing the number of successes and failures respectively, see glm()
.
- .ind_var
<
data-masking
> quoted name of the independent variable, which may be a factor
, or a character vector
.
Value
good_levels()
returns a character vector comprising the levels
of .ind_var
for which the
corresponding values of .dep_var
are neither all zero nor all one. Both drop_null()
and
drop_zero()
return a data frame or a data frame extension e.g., a tibble
,
equivalent to .data, including only rows with levels of .ind_var
for which .dep_var
values are neither
all zero nor all one, or neither having all successes nor all failures respectively.
Details
For a Bernoulli trial dataset with a numeric dependent variable coded as 0 or 1, good_levels()
identifies levels
of an independent variable for which values of the dependent variable are
neither all zero nor all one i.e., \(0 < p < 1\).
For a similar dataset, drop_null()
drops all rows of data other than those with levels
of the
independent variable identified by good_levels()
. Unused factor levels are dropped from the independent
variable.
For a binomial dataset, drop_zero()
drops rows of data having either all successes and no failures, or no
successes and all failures.
See also
binom_contingency
and levels
.
Other levels_data:
levels_data()
Examples
d <- bernoulli_data(probs = c(0.8, 0.4, 0, 0.3, 0.6 ))
d |> binom_contingency(dv)
#> _____________________________
#> Binomial Contingency Table: -
#>
#> # A tibble: 5 × 3
#> iv pn qn
#> * <fct> <int> <int>
#> 1 a 50 16
#> 2 b 24 42
#> 3 c 0 66
#> 4 d 20 46
#> 5 e 50 16
d |> levels_data()
#> $iv
#> [1] "a" "b" "c" "d" "e"
#>
d |> good_levels(dv, iv)
#> [1] "a" "b" "d" "e"
d |> drop_null(dv, iv) |> levels_data()
#> $iv
#> [1] "a" "b" "d" "e"
#>
d |> drop_null(dv, iv) |> binom_contingency(dv)
#> _____________________________
#> Binomial Contingency Table: -
#>
#> # A tibble: 4 × 3
#> iv pn qn
#> * <fct> <int> <int>
#> 1 a 50 16
#> 2 b 24 42
#> 3 d 20 46
#> 4 e 50 16
d |> binom_contingency(dv) |> drop_zero(iv)
#> _____________________________
#> Binomial Contingency Table: -
#>
#> # A tibble: 4 × 3
#> iv pn qn
#> <fct> <int> <int>
#> 1 a 50 16
#> 2 b 24 42
#> 3 d 20 46
#> 4 e 50 16
identical(
d |> drop_null(dv, iv) |> binom_contingency(dv),
d |> binom_contingency(dv) |> drop_zero(iv)
)
#> [1] TRUE
d_ls <- map2(c(0.5, 0.4, 1, 1), c(0.1, 0, 0.6, 0), seq, length.out = 5) |>
lapply(\(x) bernoulli_data(probs = x)) |>
(\(x) setNames(x, paste0("data", seq_along(x))))()
d_ls |> lapply(\(d) d |> binom_contingency(dv))
#> $data1
#> _____________________________
#> Binomial Contingency Table: -
#>
#> # A tibble: 5 × 3
#> iv pn qn
#> * <fct> <int> <int>
#> 1 a 36 30
#> 2 b 29 37
#> 3 c 23 43
#> 4 d 10 56
#> 5 e 11 55
#>
#> $data2
#> _____________________________
#> Binomial Contingency Table: -
#>
#> # A tibble: 5 × 3
#> iv pn qn
#> * <fct> <int> <int>
#> 1 a 27 39
#> 2 b 24 42
#> 3 c 11 55
#> 4 d 5 61
#> 5 e 0 66
#>
#> $data3
#> _____________________________
#> Binomial Contingency Table: -
#>
#> # A tibble: 5 × 3
#> iv pn qn
#> * <fct> <int> <int>
#> 1 a 66 0
#> 2 b 59 7
#> 3 c 45 21
#> 4 d 41 25
#> 5 e 40 26
#>
#> $data4
#> _____________________________
#> Binomial Contingency Table: -
#>
#> # A tibble: 5 × 3
#> iv pn qn
#> * <fct> <int> <int>
#> 1 a 66 0
#> 2 b 56 10
#> 3 c 32 34
#> 4 d 9 57
#> 5 e 0 66
#>
d_ls |> lapply(levels_data)
#> $data1
#> $data1$iv
#> [1] "a" "b" "c" "d" "e"
#>
#>
#> $data2
#> $data2$iv
#> [1] "a" "b" "c" "d" "e"
#>
#>
#> $data3
#> $data3$iv
#> [1] "a" "b" "c" "d" "e"
#>
#>
#> $data4
#> $data4$iv
#> [1] "a" "b" "c" "d" "e"
#>
#>
d_ls |> lapply(\(d) d |> good_levels(dv, iv))
#> $data1
#> [1] "a" "b" "c" "d" "e"
#>
#> $data2
#> [1] "a" "b" "c" "d"
#>
#> $data3
#> [1] "b" "c" "d" "e"
#>
#> $data4
#> [1] "b" "c" "d"
#>
d_ls |> lapply(\(d) d |> drop_null(dv, iv) |> binom_contingency(dv))
#> $data1
#> _____________________________
#> Binomial Contingency Table: -
#>
#> # A tibble: 5 × 3
#> iv pn qn
#> * <fct> <int> <int>
#> 1 a 36 30
#> 2 b 29 37
#> 3 c 23 43
#> 4 d 10 56
#> 5 e 11 55
#>
#> $data2
#> _____________________________
#> Binomial Contingency Table: -
#>
#> # A tibble: 4 × 3
#> iv pn qn
#> * <fct> <int> <int>
#> 1 a 27 39
#> 2 b 24 42
#> 3 c 11 55
#> 4 d 5 61
#>
#> $data3
#> _____________________________
#> Binomial Contingency Table: -
#>
#> # A tibble: 4 × 3
#> iv pn qn
#> * <fct> <int> <int>
#> 1 b 59 7
#> 2 c 45 21
#> 3 d 41 25
#> 4 e 40 26
#>
#> $data4
#> _____________________________
#> Binomial Contingency Table: -
#>
#> # A tibble: 3 × 3
#> iv pn qn
#> * <fct> <int> <int>
#> 1 b 56 10
#> 2 c 32 34
#> 3 d 9 57
#>
d_ls |> lapply(\(d) d |> binom_contingency(dv) |> drop_zero(iv))
#> $data1
#> _____________________________
#> Binomial Contingency Table: -
#>
#> # A tibble: 5 × 3
#> iv pn qn
#> <fct> <int> <int>
#> 1 a 36 30
#> 2 b 29 37
#> 3 c 23 43
#> 4 d 10 56
#> 5 e 11 55
#>
#> $data2
#> _____________________________
#> Binomial Contingency Table: -
#>
#> # A tibble: 4 × 3
#> iv pn qn
#> <fct> <int> <int>
#> 1 a 27 39
#> 2 b 24 42
#> 3 c 11 55
#> 4 d 5 61
#>
#> $data3
#> _____________________________
#> Binomial Contingency Table: -
#>
#> # A tibble: 4 × 3
#> iv pn qn
#> <fct> <int> <int>
#> 1 b 59 7
#> 2 c 45 21
#> 3 d 41 25
#> 4 e 40 26
#>
#> $data4
#> _____________________________
#> Binomial Contingency Table: -
#>
#> # A tibble: 3 × 3
#> iv pn qn
#> <fct> <int> <int>
#> 1 b 56 10
#> 2 c 32 34
#> 3 d 9 57
#>
identical(
d_ls |> lapply(\(d) d |> drop_null(dv, iv) |> binom_contingency(dv)),
d_ls |> lapply(\(d) d |> binom_contingency(dv) |> drop_zero(iv))
)
#> [1] TRUE
rm(d, d_ls)
## Using gss_cat dataset from {forcats} package
gss_cat |> names()
#> [1] "year" "marital" "age" "race" "rincome" "partyid" "relig"
#> [8] "denom" "tvhours"
gss_cat |> levels_data()
#> $marital
#> [1] "No answer" "Never married" "Separated" "Divorced"
#> [5] "Widowed" "Married"
#>
#> $race
#> [1] "Other" "Black" "White" "Not applicable"
#>
#> $rincome
#> [1] "No answer" "Don't know" "Refused" "$25000 or more"
#> [5] "$20000 - 24999" "$15000 - 19999" "$10000 - 14999" "$8000 to 9999"
#> [9] "$7000 to 7999" "$6000 to 6999" "$5000 to 5999" "$4000 to 4999"
#> [13] "$3000 to 3999" "$1000 to 2999" "Lt $1000" "Not applicable"
#>
#> $partyid
#> [1] "No answer" "Don't know" "Other party"
#> [4] "Strong republican" "Not str republican" "Ind,near rep"
#> [7] "Independent" "Ind,near dem" "Not str democrat"
#> [10] "Strong democrat"
#>
#> $relig
#> [1] "No answer" "Don't know"
#> [3] "Inter-nondenominational" "Native american"
#> [5] "Christian" "Orthodox-christian"
#> [7] "Moslem/islam" "Other eastern"
#> [9] "Hinduism" "Buddhism"
#> [11] "Other" "None"
#> [13] "Jewish" "Catholic"
#> [15] "Protestant" "Not applicable"
#>
#> $denom
#> [1] "No answer" "Don't know" "No denomination"
#> [4] "Other" "Episcopal" "Presbyterian-dk wh"
#> [7] "Presbyterian, merged" "Other presbyterian" "United pres ch in us"
#> [10] "Presbyterian c in us" "Lutheran-dk which" "Evangelical luth"
#> [13] "Other lutheran" "Wi evan luth synod" "Lutheran-mo synod"
#> [16] "Luth ch in america" "Am lutheran" "Methodist-dk which"
#> [19] "Other methodist" "United methodist" "Afr meth ep zion"
#> [22] "Afr meth episcopal" "Baptist-dk which" "Other baptists"
#> [25] "Southern baptist" "Nat bapt conv usa" "Nat bapt conv of am"
#> [28] "Am bapt ch in usa" "Am baptist asso" "Not applicable"
#>
gss_cat |> nlevels_data()
#> marital race rincome partyid relig denom
#> 6 4 16 10 16 30