class: center, middle, inverse, title-slide # Lab08_more-dplyr ## Lab06_dplyr-select-across ### 曾子軒 Dennis Tseng ### 台大新聞所 NTU Journalism ### 2021/04/27 --- <style type="text/css"> .remark-slide-content { padding: 1em 1em 1em 1em; font-size: 28px; } .my-one-page-font { padding: 1em 1em 1em 1em; font-size: 20px; /*xaringan::inf_mr()*/ } </style> # 今日重點 - 分組 - AS06 Preview - dplyr: more - Lab08 Practice --- class: inverse, center, middle # 分組 --- class: inverse, center, middle # [AS06](https://p4css.github.io/R4CSS_TA/AS06_Visualizing-Text-Data.html) --- # dplyr 的未竟之業 - select() - 懶人的福音,幫助你快速選 column - operator: `:`, `!`, `&`, `|`, `c()` - selection helpers: - specific columns: `everything()`, `last_col()` - matching patterns: `starts_with()`, `ends_with()`, `contains()`, `matches()`, `num_range()` - character vector: `all_of()`, `any_of()` - 搭配 function: `where()` --- ```r library(tidyverse) df_marriage <- read_csv("data/Lab04/109Q4_county_marriage.csv") %>% mutate(across(where(is.character), ~iconv(.,from = "BIG5", to = "UTF8"))) %>% slice(-1) %>% mutate(across(matches("MARRY"), ~as.integer(.))) %>% `colnames<-`(str_to_lower(colnames(.))) head(df_marriage, 3) ``` ``` ## # A tibble: 3 x 7 ## county_id county marry_cp_cnt marry_cnt marry_m_cnt marry_f_cnt info_time ## <chr> <chr> <int> <int> <int> <int> <chr> ## 1 65000 新北市 6529 580 189 391 109Y4S ## 2 63000 臺北市 3856 422 207 215 109Y4S ## 3 68000 桃園市 3986 370 107 263 109Y4S ``` --- # dplyr 的未竟之業 - select() - operator: `:`, `!`, `&`, `|`, `c()` ```r df_marriage %>% slice(1) ``` ``` ## # A tibble: 1 x 7 ## county_id county marry_cp_cnt marry_cnt marry_m_cnt marry_f_cnt info_time ## <chr> <chr> <int> <int> <int> <int> <chr> ## 1 65000 新北市 6529 580 189 391 109Y4S ``` ```r df_marriage %>% select(county:marry_cnt) %>% slice(1) ``` ``` ## # A tibble: 1 x 3 ## county marry_cp_cnt marry_cnt ## <chr> <int> <int> ## 1 新北市 6529 580 ``` ```r df_marriage %>% select(1:2, 4) %>% slice(1) ``` ``` ## # A tibble: 1 x 3 ## county_id county marry_cnt ## <chr> <chr> <int> ## 1 65000 新北市 580 ``` ```r df_marriage %>% select(!marry_cnt) %>% slice(1) ``` ``` ## # A tibble: 1 x 6 ## county_id county marry_cp_cnt marry_m_cnt marry_f_cnt info_time ## <chr> <chr> <int> <int> <int> <chr> ## 1 65000 新北市 6529 189 391 109Y4S ``` --- # dplyr 的未竟之業 - select() - specific columns: `everything()`, `last_col()` ```r df_marriage %>% select(info_time, everything()) %>% slice(1) ``` ``` ## # A tibble: 1 x 7 ## info_time county_id county marry_cp_cnt marry_cnt marry_m_cnt marry_f_cnt ## <chr> <chr> <chr> <int> <int> <int> <int> ## 1 109Y4S 65000 新北市 6529 580 189 391 ``` ```r df_marriage %>% select(-county_id, everything(), county_id) %>% slice(1) ``` ``` ## # A tibble: 1 x 7 ## county marry_cp_cnt marry_cnt marry_m_cnt marry_f_cnt info_time county_id ## <chr> <int> <int> <int> <int> <chr> <chr> ## 1 新北市 6529 580 189 391 109Y4S 65000 ``` --- # dplyr 的未竟之業 - select() - specific columns: `everything()`, `last_col()` ```r df_marriage %>% select(last_col()) %>% slice(1) ``` ``` ## # A tibble: 1 x 1 ## info_time ## <chr> ## 1 109Y4S ``` ```r df_marriage %>% select(1:last_col(1)) %>% slice(1) ``` ``` ## # A tibble: 1 x 6 ## county_id county marry_cp_cnt marry_cnt marry_m_cnt marry_f_cnt ## <chr> <chr> <int> <int> <int> <int> ## 1 65000 新北市 6529 580 189 391 ``` --- # dplyr 的未竟之業 - select() - matching patterns: `starts_with()`, `ends_with()`, `contains()`, `matches()`, `num_range()` ```r df_marriage %>% select(starts_with("marry")) %>% slice(1) ``` ``` ## # A tibble: 1 x 4 ## marry_cp_cnt marry_cnt marry_m_cnt marry_f_cnt ## <int> <int> <int> <int> ## 1 6529 580 189 391 ``` ```r df_marriage %>% select(starts_with(c("marry_cp", "county"))) %>% slice(1) ``` ``` ## # A tibble: 1 x 3 ## marry_cp_cnt county_id county ## <int> <chr> <chr> ## 1 6529 65000 新北市 ``` --- # dplyr 的未竟之業 - select() - matching patterns: `starts_with()`, `ends_with()`, `contains()`, `matches()`, `num_range()` ```r df_marriage %>% select(contains("marry")) %>% slice(1) ``` ``` ## # A tibble: 1 x 4 ## marry_cp_cnt marry_cnt marry_m_cnt marry_f_cnt ## <int> <int> <int> <int> ## 1 6529 580 189 391 ``` ```r df_marriage %>% select(contains("marry.*cnt")) %>% slice(1) ``` ``` ## # A tibble: 1 x 0 ``` --- # dplyr 的未竟之業 - select() - matching patterns: `starts_with()`, `ends_with()`, `contains()`, `matches()`, `num_range()` - 注意! `matches()` 放正規表示式 ```r df_marriage %>% select(matches("marry.*cnt")) %>% slice(1) ``` ``` ## # A tibble: 1 x 4 ## marry_cp_cnt marry_cnt marry_m_cnt marry_f_cnt ## <int> <int> <int> <int> ## 1 6529 580 189 391 ``` --- # dplyr 的未竟之業 - select() - selection helpers: - character vector: `all_of()`, `any_of()` ```r vars <- c("marry_m_cnt", "marry_f_cnt") vars2 <- c("marry_m_cnt", "marry_f_cnt", "divorce_m_cnt", "divorce_f_cnt") df_marriage %>% select(all_of(vars)) %>% slice(1) ``` ``` ## # A tibble: 1 x 2 ## marry_m_cnt marry_f_cnt ## <int> <int> ## 1 189 391 ``` ```r df_marriage %>% select(all_of(vars2)) %>% slice(1) ``` ``` ## Error: Can't subset columns that don't exist. ## x Columns `divorce_m_cnt` and `divorce_f_cnt` don't exist. ``` --- # dplyr 的未竟之業 - select() - selection helpers: - character vector: `all_of()`, `any_of()` ```r df_marriage %>% select(any_of(vars)) %>% slice(1) ``` ``` ## # A tibble: 1 x 2 ## marry_m_cnt marry_f_cnt ## <int> <int> ## 1 189 391 ``` ```r df_marriage %>% select(any_of(vars2)) %>% slice(1) ``` ``` ## # A tibble: 1 x 2 ## marry_m_cnt marry_f_cnt ## <int> <int> ## 1 189 391 ``` --- # dplyr 的未竟之業 - select() - selection helpers: - 搭配 function: `where()` - 通常跟 `across()` 一起使用 --- # dplyr 的未竟之業 - across() - 懶人的福音,幫助你對不同 column 使用 function - Apply a function (or functions) across multiple columns - 動詞裡面放 `across(.cols = everything(), .fns = NULL, ..., .names = NULL)` - 先選你要的欄位,接著指定函數 - 欄位部分可以活用上面的教的 selection 方法,函數可以使用完整的或匿名函數 --- # dplyr 的未竟之業 - across() ```r df_marriage %>% mutate(across(matches("marry_"), ~(./100))) %>% slice(1) ``` ``` ## # A tibble: 1 x 7 ## county_id county marry_cp_cnt marry_cnt marry_m_cnt marry_f_cnt info_time ## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> ## 1 65000 新北市 65.3 5.8 1.89 3.91 109Y4S ``` ```r df_marriage %>% summarise(across(where(is.numeric), ~sum(.))) ``` ``` ## # A tibble: 1 x 4 ## marry_cp_cnt marry_cnt marry_m_cnt marry_f_cnt ## <int> <int> <int> <int> ## 1 36607 3153 993 2160 ``` --- # Anonymous Function 匿名函數 - function - 平常寫函數 - 但為了方便也可以不要寫完整,一次性使用 - `.` 點點代表前面的變數/資料 - 匿名函數的形式 - `~ function(x){x + 5}` - `~ as.integer(.) + 5` --- # dplyr 的未竟之業 - across() ```r df_marriage %>% slice(1) ``` ``` ## # A tibble: 1 x 7 ## county_id county marry_cp_cnt marry_cnt marry_m_cnt marry_f_cnt info_time ## <chr> <chr> <int> <int> <int> <int> <chr> ## 1 65000 新北市 6529 580 189 391 109Y4S ``` ```r df_marriage %>% mutate(across(starts_with("county"), ~str_c(., "-bad"))) %>% slice(1) ``` ``` ## # A tibble: 1 x 7 ## county_id county marry_cp_cnt marry_cnt marry_m_cnt marry_f_cnt info_time ## <chr> <chr> <int> <int> <int> <int> <chr> ## 1 65000-bad 新北市-bad 6529 580 189 391 109Y4S ``` --- # dplyr 的未竟之業 - across() ```r df_marriage %>% mutate(across(matches("marry") & -matches("marry_cp"), ~(./marry_cnt))) %>% slice(1) ``` ``` ## # A tibble: 1 x 7 ## county_id county marry_cp_cnt marry_cnt marry_m_cnt marry_f_cnt info_time ## <chr> <chr> <int> <dbl> <dbl> <dbl> <chr> ## 1 65000 新北市 6529 1 189 391 109Y4S ``` ```r df_marriage %>% select(matches("marry_.*_cnt"), -starts_with("county"), matches("marry")) %>% mutate(across(matches("marry") & -matches("marry_cp"), ~(./marry_cnt))) %>% slice(1) ``` ``` ## # A tibble: 1 x 4 ## marry_cp_cnt marry_m_cnt marry_f_cnt marry_cnt ## <int> <dbl> <dbl> <dbl> ## 1 6529 0.326 0.674 1 ``` --- # dplyr 的未竟之業 - across() ```r df_marriage <- read_csv("data/Lab04/109Q4_county_marriage.csv") %>% mutate(across(where(is.character), ~iconv(.,from = "BIG5", to = "UTF8"))) %>% slice(-1) %>% mutate(across(matches("MARRY"), ~as.integer(.))) %>% `colnames<-`(str_to_lower(colnames(.))) ``` --- # dplyr 的未竟之業 - rowwise(), c_across() ```r df_marriage %>% rowwise() %>% mutate( marry_sum = sum(c_across(marry_cp_cnt:marry_f_cnt)), marry_mean = mean(c_across(marry_cp_cnt:marry_f_cnt)) ) %>% ungroup() ``` ``` ## # A tibble: 22 x 9 ## county_id county marry_cp_cnt marry_cnt marry_m_cnt marry_f_cnt info_time ## <chr> <chr> <int> <int> <int> <int> <chr> ## 1 65000 新北市 6529 580 189 391 109Y4S ## 2 63000 臺北市 3856 422 207 215 109Y4S ## 3 68000 桃園市 3986 370 107 263 109Y4S ## 4 66000 臺中市 4837 381 130 251 109Y4S ## 5 67000 臺南市 2706 185 57 128 109Y4S ## 6 64000 高雄市 4139 343 112 231 109Y4S ## 7 10002 宜蘭縣 725 62 11 51 109Y4S ## 8 10004 新竹縣 1003 71 13 58 109Y4S ## 9 10005 苗栗縣 859 75 19 56 109Y4S ## 10 10007 彰化縣 1882 161 32 129 109Y4S ## # … with 12 more rows, and 2 more variables: marry_sum <int>, marry_mean <dbl> ``` --- class: inverse, center, middle # [Lab08](https://p4css.github.io/R4CSS_TA/Lab08_Homework_more-dplyr.html)