作業目的: more dplyr

這份作業希望能夠讓你熟悉於利用 dprly 當中 select() 相關的 helper function 與 across()

這次的作業使用美國 Cencus Bureau 提供的普查資料,主要挑選人口變數與社經地位變數,並額外利用 2016 和 2020 總統大選選舉結果,練習不同資料表之間的串接,並查看可能影響選舉結果的變數。

作業: more dplyr

### 這邊不要動
library(tidyverse)

##### import data
df_demo <- read_csv("data/Lab08/DEMOGRAPHIC AND HOUSING ESTIMATES.csv")
df_demo %>% head(1)

df_social <- read_csv("data/Lab08/SOCIAL CHARACTERISTICS.csv")
df_social %>% head(1)

df_vote_agg_2020 <- read_csv("data/Lab08/df_vote_agg_2020.csv")
df_vote_agg_2020 %>% head(1)
#> # A tibble: 1 x 358
#>   GEO_ID NAME   DP05_0001E   DP05_0001M   DP05_0001PE  DP05_0001PM   DP05_0002E 
#>   <chr>  <chr>  <chr>        <chr>        <chr>        <chr>         <chr>      
#> 1 id     Geogr… Estimate!!S… Margin of E… Percent!!SE… Percent Marg… Estimate!!…
#> # … with 351 more variables: DP05_0002M <chr>, DP05_0002PE <chr>,
#> #   DP05_0002PM <chr>, DP05_0003E <chr>, DP05_0003M <chr>, DP05_0003PE <chr>,
#> #   DP05_0003PM <chr>, DP05_0004E <chr>, DP05_0004M <chr>, DP05_0004PE <chr>,
#> #   DP05_0004PM <chr>, DP05_0005E <chr>, DP05_0005M <chr>, DP05_0005PE <chr>,
#> #   DP05_0005PM <chr>, DP05_0006E <chr>, DP05_0006M <chr>, DP05_0006PE <chr>,
#> #   DP05_0006PM <chr>, DP05_0007E <chr>, DP05_0007M <chr>, DP05_0007PE <chr>,
#> #   DP05_0007PM <chr>, DP05_0008E <chr>, DP05_0008M <chr>, DP05_0008PE <chr>,
#> #   DP05_0008PM <chr>, DP05_0009E <chr>, DP05_0009M <chr>, DP05_0009PE <chr>,
#> #   DP05_0009PM <chr>, DP05_0010E <chr>, DP05_0010M <chr>, DP05_0010PE <chr>,
#> #   DP05_0010PM <chr>, DP05_0011E <chr>, DP05_0011M <chr>, DP05_0011PE <chr>,
#> #   DP05_0011PM <chr>, DP05_0012E <chr>, DP05_0012M <chr>, DP05_0012PE <chr>,
#> #   DP05_0012PM <chr>, DP05_0013E <chr>, DP05_0013M <chr>, DP05_0013PE <chr>,
#> #   DP05_0013PM <chr>, DP05_0014E <chr>, DP05_0014M <chr>, DP05_0014PE <chr>,
#> #   DP05_0014PM <chr>, DP05_0015E <chr>, DP05_0015M <chr>, DP05_0015PE <chr>,
#> #   DP05_0015PM <chr>, DP05_0016E <chr>, DP05_0016M <chr>, DP05_0016PE <chr>,
#> #   DP05_0016PM <chr>, DP05_0017E <chr>, DP05_0017M <chr>, DP05_0017PE <chr>,
#> #   DP05_0017PM <chr>, DP05_0018E <chr>, DP05_0018M <chr>, DP05_0018PE <chr>,
#> #   DP05_0018PM <chr>, DP05_0019E <chr>, DP05_0019M <chr>, DP05_0019PE <chr>,
#> #   DP05_0019PM <chr>, DP05_0020E <chr>, DP05_0020M <chr>, DP05_0020PE <chr>,
#> #   DP05_0020PM <chr>, DP05_0021E <chr>, DP05_0021M <chr>, DP05_0021PE <chr>,
#> #   DP05_0021PM <chr>, DP05_0022E <chr>, DP05_0022M <chr>, DP05_0022PE <chr>,
#> #   DP05_0022PM <chr>, DP05_0023E <chr>, DP05_0023M <chr>, DP05_0023PE <chr>,
#> #   DP05_0023PM <chr>, DP05_0024E <chr>, DP05_0024M <chr>, DP05_0024PE <chr>,
#> #   DP05_0024PM <chr>, DP05_0025E <chr>, DP05_0025M <chr>, DP05_0025PE <chr>,
#> #   DP05_0025PM <chr>, DP05_0026E <chr>, DP05_0026M <chr>, DP05_0026PE <chr>,
#> #   DP05_0026PM <chr>, DP05_0027E <chr>, …
#> # A tibble: 1 x 614
#>   GEO_ID NAME   DP02_0001E   DP02_0001M   DP02_0001PE  DP02_0001PM  DP02_0002E  
#>   <chr>  <chr>  <chr>        <chr>        <chr>        <chr>        <chr>       
#> 1 id     Geogr… Estimate!!H… Margin of E… Percent!!HO… Percent Mar… Estimate!!H…
#> # … with 607 more variables: DP02_0002M <chr>, DP02_0002PE <chr>,
#> #   DP02_0002PM <chr>, DP02_0003E <chr>, DP02_0003M <chr>, DP02_0003PE <chr>,
#> #   DP02_0003PM <chr>, DP02_0004E <chr>, DP02_0004M <chr>, DP02_0004PE <chr>,
#> #   DP02_0004PM <chr>, DP02_0005E <chr>, DP02_0005M <chr>, DP02_0005PE <chr>,
#> #   DP02_0005PM <chr>, DP02_0006E <chr>, DP02_0006M <chr>, DP02_0006PE <chr>,
#> #   DP02_0006PM <chr>, DP02_0007E <chr>, DP02_0007M <chr>, DP02_0007PE <chr>,
#> #   DP02_0007PM <chr>, DP02_0008E <chr>, DP02_0008M <chr>, DP02_0008PE <chr>,
#> #   DP02_0008PM <chr>, DP02_0009E <chr>, DP02_0009M <chr>, DP02_0009PE <chr>,
#> #   DP02_0009PM <chr>, DP02_0010E <chr>, DP02_0010M <chr>, DP02_0010PE <chr>,
#> #   DP02_0010PM <chr>, DP02_0011E <chr>, DP02_0011M <chr>, DP02_0011PE <chr>,
#> #   DP02_0011PM <chr>, DP02_0012E <chr>, DP02_0012M <chr>, DP02_0012PE <chr>,
#> #   DP02_0012PM <chr>, DP02_0013E <chr>, DP02_0013M <chr>, DP02_0013PE <chr>,
#> #   DP02_0013PM <chr>, DP02_0014E <chr>, DP02_0014M <chr>, DP02_0014PE <chr>,
#> #   DP02_0014PM <chr>, DP02_0015E <chr>, DP02_0015M <chr>, DP02_0015PE <chr>,
#> #   DP02_0015PM <chr>, DP02_0016E <chr>, DP02_0016M <chr>, DP02_0016PE <chr>,
#> #   DP02_0016PM <chr>, DP02_0017E <chr>, DP02_0017M <chr>, DP02_0017PE <chr>,
#> #   DP02_0017PM <chr>, DP02_0018E <chr>, DP02_0018M <chr>, DP02_0018PE <chr>,
#> #   DP02_0018PM <chr>, DP02_0019E <chr>, DP02_0019M <chr>, DP02_0019PE <chr>,
#> #   DP02_0019PM <chr>, DP02_0020E <chr>, DP02_0020M <chr>, DP02_0020PE <chr>,
#> #   DP02_0020PM <chr>, DP02_0021E <chr>, DP02_0021M <chr>, DP02_0021PE <chr>,
#> #   DP02_0021PM <chr>, DP02_0022E <chr>, DP02_0022M <chr>, DP02_0022PE <chr>,
#> #   DP02_0022PM <chr>, DP02_0023E <chr>, DP02_0023M <chr>, DP02_0023PE <chr>,
#> #   DP02_0023PM <chr>, DP02_0024E <chr>, DP02_0024M <chr>, DP02_0024PE <chr>,
#> #   DP02_0024PM <chr>, DP02_0025E <chr>, DP02_0025M <chr>, DP02_0025PE <chr>,
#> #   DP02_0025PM <chr>, DP02_0026E <chr>, DP02_0026M <chr>, DP02_0026PE <chr>,
#> #   DP02_0026PM <chr>, DP02_0027E <chr>, …
#> # A tibble: 1 x 9
#>   state   votes_gop votes_dem total_votes votes_gop_per votes_dem_per state_abbv
#>   <chr>       <dbl>     <dbl>       <dbl>         <dbl>         <dbl> <chr>     
#> 1 Alabama   1441168    849648     2323304         0.620         0.366 AL        
#> # … with 2 more variables: votes_gop_per_2016 <dbl>, votes_dem_per_2016 <dbl>

1. 幫我抓性別

df_demo 抓出男性人數、女性人數、男性佔比、女性佔比。

### your code
df_gender <- df_demo %>% select(matches("E")) %>% select(-matches("PE")) %>% select(GEO_ID:DP05_0004E)
df_gender <- df_gender %>% `colnames<-`(df_gender %>% head(1) %>% unlist() %>% unname() %>% str_to_lower() %>% str_remove_all("estimate!!") %>%
                             str_replace_all(" ", "-") %>% str_replace_all("!!", "_")) %>%
  rename(pop = 3, pop_m = 4, pop_f= 5) %>% select(-6) %>%
  filter(`geographic-area-name` !="Puerto Rico") %>%
  slice(-1) %>% mutate(across(-matches("id|geographic"), ~as.numeric(.))) %>%
  mutate(per_m = pop_m/pop, per_f = pop_f/pop)
df_gender %>% head(1)
#> # A tibble: 1 x 7
#>   id          `geographic-area-name`     pop   pop_m   pop_f per_m per_f
#>   <chr>       <chr>                    <dbl>   <dbl>   <dbl> <dbl> <dbl>
#> 1 0400000US01 Alabama                4876250 2359355 2516895 0.484 0.516

2. 幫我抓投票人口

df_demo 抓出投票人口、投票女性人口、投票男性人口、投票女性佔比、投票男性佔比。

### your code
df_vote <- df_demo %>% select(matches("E")) %>% select(-matches("PE")) %>% select(-(DP05_0001E:DP05_0085E))
df_vote <- df_vote %>% `colnames<-`(df_vote %>% head(1) %>% unlist() %>% unname() %>% str_to_lower() %>% str_remove_all("estimate!!") %>%
                                      str_replace_all(" ", "-") %>% str_replace_all("!!", "_")) %>%
  rename(unit_housing = 3, vote_all = 4, vote_m = 5, vote_f = 6) %>%
  filter(`geographic-area-name` !="Puerto Rico") %>%
  slice(-1) %>% mutate(across(-matches("id|geographic"), ~as.numeric(.))) %>%
  mutate(per_vote_m = vote_m/vote_all, per_vote_f = vote_f/vote_all)
df_vote %>% head(1)
#> # A tibble: 1 x 8
#>   id        `geographic-area-na… unit_housing vote_all  vote_m vote_f per_vote_m
#>   <chr>     <chr>                       <dbl>    <dbl>   <dbl>  <dbl>      <dbl>
#> 1 0400000U… Alabama                   2255026  3685074 1749845 1.94e6      0.475
#> # … with 1 more variable: per_vote_f <dbl>

3. 幫我抓種族人口

df_race 抓出種族相關變數,不用全抓,抓 one race, two or more races, one race - white, one race - asian, one race - black/african american, one race - native, one race - some other.

### your code
df_race <- df_demo %>% select(matches("E")) %>% select(-matches("PE")) %>% select(-(DP05_0001E:DP05_0032E)) %>% select(GEO_ID:DP05_0085E) %>% select(1:5, 7:8, 14, 22, 27)
df_race <- df_race %>% `colnames<-`(df_race %>% head(1) %>% unlist() %>% unname() %>% str_to_lower() %>% str_remove_all("estimate!!") %>%
                                    str_replace_all(" ", "-") %>% str_replace_all("!!", "_")) %>%
  rename(race_one = 4, race_two = 5, race_one_white = 6, race_one_black = 7, race_one_asian = 8,
         race_one_native_pasific = 9, race_one_other = 10) %>% select(-3) %>%
  filter(`geographic-area-name` !="Puerto Rico") %>%
  slice(-1) %>% mutate(across(-matches("id|geographic"), ~as.numeric(.))) %>%
  left_join(df_gender %>% select(1:3)) %>% mutate(across(is.numeric, ~(./pop))) %>%
  select(-pop)
df_race %>% head(1)
#> # A tibble: 1 x 9
#>   id        `geographic-area-na… race_one race_two race_one_white race_one_black
#>   <chr>     <chr>                   <dbl>    <dbl>          <dbl>          <dbl>
#> 1 0400000U… Alabama                 0.981   0.0189          0.681          0.266
#> # … with 3 more variables: race_one_asian <dbl>, race_one_native_pasific <dbl>,
#> #   race_one_other <dbl>

4. 幫我抓種族人口

df_social 抓出教育相關變數,不用全抓,抓 25-years-and-over 人口, less-than-9th-grade, 9th-to-12th-grade,-no-diploma, high-school-graduate, some-college,-no-degree, associate’s-degree, bachelor’s-degree, graduate-or-professional-degree.

### your code
df_educational_attainment <- df_social %>% select(matches("E")) %>% select(-matches("PE")) %>% select(-(DP02_0001E:DP02_0058E)) %>% select(GEO_ID:DP02_0068E)
df_educational_attainment <- df_educational_attainment %>% `colnames<-`(df_educational_attainment %>% head(1) %>% unlist() %>% unname() %>% str_to_lower() %>% str_remove_all("estimate!!") %>%
                                      str_replace_all(" ", "-") %>% str_replace_all("!!", "_")) %>%
  rename(education = 3, education_js = 4, education_high = 5, education_high_gradu = 6, education_college = 7, 
         education_associate = 8, education_bachelor = 9, education_gradu_professional = 10) %>% select(-(11:12)) %>%
  filter(`geographic-area-name` !="Puerto Rico") %>%
  slice(-1) %>% mutate(across(-matches("id|geographic"), ~as.numeric(.))) %>%
  mutate(across(matches("education_"), ~(./education)))
df_educational_attainment %>% head(1)
#> # A tibble: 1 x 10
#>   id     `geographic-are… education education_js education_high education_high_…
#>   <chr>  <chr>                <dbl>        <dbl>          <dbl>            <dbl>
#> 1 04000… Alabama            3320877       0.0431         0.0951            0.308
#> # … with 4 more variables: education_college <dbl>, education_associate <dbl>,
#> #   education_bachelor <dbl>, education_gradu_professional <dbl>

5. 串起來畫圖

把所有資料表串起來後畫圖,你的變數命名可能跟我不一樣!

### your code
df_state_metric <- 
  df_gender %>% 
  left_join(df_vote, by = c("id", "geographic-area-name")) %>%
  left_join(df_race, by = c("id", "geographic-area-name")) %>% 
  left_join(df_educational_attainment, by = c("id", "geographic-area-name")) %>% 
  rename(state=2)

df_presidential_2020 <- df_vote_agg_2020 %>% left_join(df_state_metric, by = "state") %>%
  mutate(gop_change = if_else(votes_gop_per > votes_gop_per_2016, "up", "down"),
         dem_change = if_else(votes_dem_per > votes_dem_per_2016, "up", "down"))

##### Viz
df_presidential_2020 %>%
  ggplot(aes(x = education_high_gradu, y = votes_gop_per, color = gop_change)) +
  geom_point() +
  theme_bw() +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::percent)

df_presidential_2020 %>%
  ggplot(aes(x = per_m, y = votes_gop_per)) +
  geom_point() +
  theme_bw() +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::percent)

df_presidential_2020 %>%
  ggplot(aes(x = race_one_asian, y = votes_gop_per)) +
  geom_point() +
  theme_bw() +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::percent)