這份作業希望能夠讓你熟悉於利用 dprly 當中 select()
相關的 helper function 與 across()
。
這次的作業使用美國 Cencus Bureau 提供的普查資料,主要挑選人口變數與社經地位變數,並額外利用 2016 和 2020 總統大選選舉結果,練習不同資料表之間的串接,並查看可能影響選舉結果的變數。
### 這邊不要動
library(tidyverse)
##### import data
<- read_csv("data/Lab08/DEMOGRAPHIC AND HOUSING ESTIMATES.csv")
df_demo %>% head(1)
df_demo
<- read_csv("data/Lab08/SOCIAL CHARACTERISTICS.csv")
df_social %>% head(1)
df_social
<- read_csv("data/Lab08/df_vote_agg_2020.csv")
df_vote_agg_2020 %>% head(1) df_vote_agg_2020
#> # A tibble: 1 x 358
#> GEO_ID NAME DP05_0001E DP05_0001M DP05_0001PE DP05_0001PM DP05_0002E
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 id Geogr… Estimate!!S… Margin of E… Percent!!SE… Percent Marg… Estimate!!…
#> # … with 351 more variables: DP05_0002M <chr>, DP05_0002PE <chr>,
#> # DP05_0002PM <chr>, DP05_0003E <chr>, DP05_0003M <chr>, DP05_0003PE <chr>,
#> # DP05_0003PM <chr>, DP05_0004E <chr>, DP05_0004M <chr>, DP05_0004PE <chr>,
#> # DP05_0004PM <chr>, DP05_0005E <chr>, DP05_0005M <chr>, DP05_0005PE <chr>,
#> # DP05_0005PM <chr>, DP05_0006E <chr>, DP05_0006M <chr>, DP05_0006PE <chr>,
#> # DP05_0006PM <chr>, DP05_0007E <chr>, DP05_0007M <chr>, DP05_0007PE <chr>,
#> # DP05_0007PM <chr>, DP05_0008E <chr>, DP05_0008M <chr>, DP05_0008PE <chr>,
#> # DP05_0008PM <chr>, DP05_0009E <chr>, DP05_0009M <chr>, DP05_0009PE <chr>,
#> # DP05_0009PM <chr>, DP05_0010E <chr>, DP05_0010M <chr>, DP05_0010PE <chr>,
#> # DP05_0010PM <chr>, DP05_0011E <chr>, DP05_0011M <chr>, DP05_0011PE <chr>,
#> # DP05_0011PM <chr>, DP05_0012E <chr>, DP05_0012M <chr>, DP05_0012PE <chr>,
#> # DP05_0012PM <chr>, DP05_0013E <chr>, DP05_0013M <chr>, DP05_0013PE <chr>,
#> # DP05_0013PM <chr>, DP05_0014E <chr>, DP05_0014M <chr>, DP05_0014PE <chr>,
#> # DP05_0014PM <chr>, DP05_0015E <chr>, DP05_0015M <chr>, DP05_0015PE <chr>,
#> # DP05_0015PM <chr>, DP05_0016E <chr>, DP05_0016M <chr>, DP05_0016PE <chr>,
#> # DP05_0016PM <chr>, DP05_0017E <chr>, DP05_0017M <chr>, DP05_0017PE <chr>,
#> # DP05_0017PM <chr>, DP05_0018E <chr>, DP05_0018M <chr>, DP05_0018PE <chr>,
#> # DP05_0018PM <chr>, DP05_0019E <chr>, DP05_0019M <chr>, DP05_0019PE <chr>,
#> # DP05_0019PM <chr>, DP05_0020E <chr>, DP05_0020M <chr>, DP05_0020PE <chr>,
#> # DP05_0020PM <chr>, DP05_0021E <chr>, DP05_0021M <chr>, DP05_0021PE <chr>,
#> # DP05_0021PM <chr>, DP05_0022E <chr>, DP05_0022M <chr>, DP05_0022PE <chr>,
#> # DP05_0022PM <chr>, DP05_0023E <chr>, DP05_0023M <chr>, DP05_0023PE <chr>,
#> # DP05_0023PM <chr>, DP05_0024E <chr>, DP05_0024M <chr>, DP05_0024PE <chr>,
#> # DP05_0024PM <chr>, DP05_0025E <chr>, DP05_0025M <chr>, DP05_0025PE <chr>,
#> # DP05_0025PM <chr>, DP05_0026E <chr>, DP05_0026M <chr>, DP05_0026PE <chr>,
#> # DP05_0026PM <chr>, DP05_0027E <chr>, …
#> # A tibble: 1 x 614
#> GEO_ID NAME DP02_0001E DP02_0001M DP02_0001PE DP02_0001PM DP02_0002E
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 id Geogr… Estimate!!H… Margin of E… Percent!!HO… Percent Mar… Estimate!!H…
#> # … with 607 more variables: DP02_0002M <chr>, DP02_0002PE <chr>,
#> # DP02_0002PM <chr>, DP02_0003E <chr>, DP02_0003M <chr>, DP02_0003PE <chr>,
#> # DP02_0003PM <chr>, DP02_0004E <chr>, DP02_0004M <chr>, DP02_0004PE <chr>,
#> # DP02_0004PM <chr>, DP02_0005E <chr>, DP02_0005M <chr>, DP02_0005PE <chr>,
#> # DP02_0005PM <chr>, DP02_0006E <chr>, DP02_0006M <chr>, DP02_0006PE <chr>,
#> # DP02_0006PM <chr>, DP02_0007E <chr>, DP02_0007M <chr>, DP02_0007PE <chr>,
#> # DP02_0007PM <chr>, DP02_0008E <chr>, DP02_0008M <chr>, DP02_0008PE <chr>,
#> # DP02_0008PM <chr>, DP02_0009E <chr>, DP02_0009M <chr>, DP02_0009PE <chr>,
#> # DP02_0009PM <chr>, DP02_0010E <chr>, DP02_0010M <chr>, DP02_0010PE <chr>,
#> # DP02_0010PM <chr>, DP02_0011E <chr>, DP02_0011M <chr>, DP02_0011PE <chr>,
#> # DP02_0011PM <chr>, DP02_0012E <chr>, DP02_0012M <chr>, DP02_0012PE <chr>,
#> # DP02_0012PM <chr>, DP02_0013E <chr>, DP02_0013M <chr>, DP02_0013PE <chr>,
#> # DP02_0013PM <chr>, DP02_0014E <chr>, DP02_0014M <chr>, DP02_0014PE <chr>,
#> # DP02_0014PM <chr>, DP02_0015E <chr>, DP02_0015M <chr>, DP02_0015PE <chr>,
#> # DP02_0015PM <chr>, DP02_0016E <chr>, DP02_0016M <chr>, DP02_0016PE <chr>,
#> # DP02_0016PM <chr>, DP02_0017E <chr>, DP02_0017M <chr>, DP02_0017PE <chr>,
#> # DP02_0017PM <chr>, DP02_0018E <chr>, DP02_0018M <chr>, DP02_0018PE <chr>,
#> # DP02_0018PM <chr>, DP02_0019E <chr>, DP02_0019M <chr>, DP02_0019PE <chr>,
#> # DP02_0019PM <chr>, DP02_0020E <chr>, DP02_0020M <chr>, DP02_0020PE <chr>,
#> # DP02_0020PM <chr>, DP02_0021E <chr>, DP02_0021M <chr>, DP02_0021PE <chr>,
#> # DP02_0021PM <chr>, DP02_0022E <chr>, DP02_0022M <chr>, DP02_0022PE <chr>,
#> # DP02_0022PM <chr>, DP02_0023E <chr>, DP02_0023M <chr>, DP02_0023PE <chr>,
#> # DP02_0023PM <chr>, DP02_0024E <chr>, DP02_0024M <chr>, DP02_0024PE <chr>,
#> # DP02_0024PM <chr>, DP02_0025E <chr>, DP02_0025M <chr>, DP02_0025PE <chr>,
#> # DP02_0025PM <chr>, DP02_0026E <chr>, DP02_0026M <chr>, DP02_0026PE <chr>,
#> # DP02_0026PM <chr>, DP02_0027E <chr>, …
#> # A tibble: 1 x 9
#> state votes_gop votes_dem total_votes votes_gop_per votes_dem_per state_abbv
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 Alabama 1441168 849648 2323304 0.620 0.366 AL
#> # … with 2 more variables: votes_gop_per_2016 <dbl>, votes_dem_per_2016 <dbl>
從 df_demo
抓出男性人數、女性人數、男性佔比、女性佔比。
### your code
<- df_demo %>% select(matches("E")) %>% select(-matches("PE")) %>% select(GEO_ID:DP05_0004E)
df_gender <- df_gender %>% `colnames<-`(df_gender %>% head(1) %>% unlist() %>% unname() %>% str_to_lower() %>% str_remove_all("estimate!!") %>%
df_gender str_replace_all(" ", "-") %>% str_replace_all("!!", "_")) %>%
rename(pop = 3, pop_m = 4, pop_f= 5) %>% select(-6) %>%
filter(`geographic-area-name` !="Puerto Rico") %>%
slice(-1) %>% mutate(across(-matches("id|geographic"), ~as.numeric(.))) %>%
mutate(per_m = pop_m/pop, per_f = pop_f/pop)
%>% head(1) df_gender
#> # A tibble: 1 x 7
#> id `geographic-area-name` pop pop_m pop_f per_m per_f
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0400000US01 Alabama 4876250 2359355 2516895 0.484 0.516
從 df_demo
抓出投票人口、投票女性人口、投票男性人口、投票女性佔比、投票男性佔比。
### your code
<- df_demo %>% select(matches("E")) %>% select(-matches("PE")) %>% select(-(DP05_0001E:DP05_0085E))
df_vote <- df_vote %>% `colnames<-`(df_vote %>% head(1) %>% unlist() %>% unname() %>% str_to_lower() %>% str_remove_all("estimate!!") %>%
df_vote str_replace_all(" ", "-") %>% str_replace_all("!!", "_")) %>%
rename(unit_housing = 3, vote_all = 4, vote_m = 5, vote_f = 6) %>%
filter(`geographic-area-name` !="Puerto Rico") %>%
slice(-1) %>% mutate(across(-matches("id|geographic"), ~as.numeric(.))) %>%
mutate(per_vote_m = vote_m/vote_all, per_vote_f = vote_f/vote_all)
%>% head(1) df_vote
#> # A tibble: 1 x 8
#> id `geographic-area-na… unit_housing vote_all vote_m vote_f per_vote_m
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0400000U… Alabama 2255026 3685074 1749845 1.94e6 0.475
#> # … with 1 more variable: per_vote_f <dbl>
從 df_race
抓出種族相關變數,不用全抓,抓 one race, two or more races, one race - white, one race - asian, one race - black/african american, one race - native, one race - some other.
### your code
<- df_demo %>% select(matches("E")) %>% select(-matches("PE")) %>% select(-(DP05_0001E:DP05_0032E)) %>% select(GEO_ID:DP05_0085E) %>% select(1:5, 7:8, 14, 22, 27)
df_race <- df_race %>% `colnames<-`(df_race %>% head(1) %>% unlist() %>% unname() %>% str_to_lower() %>% str_remove_all("estimate!!") %>%
df_race str_replace_all(" ", "-") %>% str_replace_all("!!", "_")) %>%
rename(race_one = 4, race_two = 5, race_one_white = 6, race_one_black = 7, race_one_asian = 8,
race_one_native_pasific = 9, race_one_other = 10) %>% select(-3) %>%
filter(`geographic-area-name` !="Puerto Rico") %>%
slice(-1) %>% mutate(across(-matches("id|geographic"), ~as.numeric(.))) %>%
left_join(df_gender %>% select(1:3)) %>% mutate(across(is.numeric, ~(./pop))) %>%
select(-pop)
%>% head(1) df_race
#> # A tibble: 1 x 9
#> id `geographic-area-na… race_one race_two race_one_white race_one_black
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 0400000U… Alabama 0.981 0.0189 0.681 0.266
#> # … with 3 more variables: race_one_asian <dbl>, race_one_native_pasific <dbl>,
#> # race_one_other <dbl>
從 df_social
抓出教育相關變數,不用全抓,抓 25-years-and-over 人口, less-than-9th-grade, 9th-to-12th-grade,-no-diploma, high-school-graduate, some-college,-no-degree, associate’s-degree, bachelor’s-degree, graduate-or-professional-degree.
### your code
<- df_social %>% select(matches("E")) %>% select(-matches("PE")) %>% select(-(DP02_0001E:DP02_0058E)) %>% select(GEO_ID:DP02_0068E)
df_educational_attainment <- df_educational_attainment %>% `colnames<-`(df_educational_attainment %>% head(1) %>% unlist() %>% unname() %>% str_to_lower() %>% str_remove_all("estimate!!") %>%
df_educational_attainment str_replace_all(" ", "-") %>% str_replace_all("!!", "_")) %>%
rename(education = 3, education_js = 4, education_high = 5, education_high_gradu = 6, education_college = 7,
education_associate = 8, education_bachelor = 9, education_gradu_professional = 10) %>% select(-(11:12)) %>%
filter(`geographic-area-name` !="Puerto Rico") %>%
slice(-1) %>% mutate(across(-matches("id|geographic"), ~as.numeric(.))) %>%
mutate(across(matches("education_"), ~(./education)))
%>% head(1) df_educational_attainment
#> # A tibble: 1 x 10
#> id `geographic-are… education education_js education_high education_high_…
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 04000… Alabama 3320877 0.0431 0.0951 0.308
#> # … with 4 more variables: education_college <dbl>, education_associate <dbl>,
#> # education_bachelor <dbl>, education_gradu_professional <dbl>
把所有資料表串起來後畫圖,你的變數命名可能跟我不一樣!
### your code
<-
df_state_metric %>%
df_gender left_join(df_vote, by = c("id", "geographic-area-name")) %>%
left_join(df_race, by = c("id", "geographic-area-name")) %>%
left_join(df_educational_attainment, by = c("id", "geographic-area-name")) %>%
rename(state=2)
<- df_vote_agg_2020 %>% left_join(df_state_metric, by = "state") %>%
df_presidential_2020 mutate(gop_change = if_else(votes_gop_per > votes_gop_per_2016, "up", "down"),
dem_change = if_else(votes_dem_per > votes_dem_per_2016, "up", "down"))
##### Viz
%>%
df_presidential_2020 ggplot(aes(x = education_high_gradu, y = votes_gop_per, color = gop_change)) +
geom_point() +
theme_bw() +
scale_x_continuous(labels = scales::percent) +
scale_y_continuous(labels = scales::percent)
%>%
df_presidential_2020 ggplot(aes(x = per_m, y = votes_gop_per)) +
geom_point() +
theme_bw() +
scale_x_continuous(labels = scales::percent) +
scale_y_continuous(labels = scales::percent)
%>%
df_presidential_2020 ggplot(aes(x = race_one_asian, y = votes_gop_per)) +
geom_point() +
theme_bw() +
scale_x_continuous(labels = scales::percent) +
scale_y_continuous(labels = scales::percent)