AS08_Web-Scraping-HTML

題目說明

這份作業希望能夠讓你熟悉 Web Scraping 的流程。本次作業的案例為蘋果基金會的捐款平台。對於這樣的捐款平台，就資料新聞的角度來說，我們可以去看哪些人是常見的捐款者？甚至用文字探勘的方法去了解哪些類型的案子是常見的捐款案？就資料科學的角度來說，我們可以抽取出捐款新聞文字描述、記者姓名、捐款類別等來預估最後捐款的總數。

本作業是上述案例的第一步驟，就是獲取資料。請盡可能獲取所有蘋果基金會捐款案的近10頁資料。並且應可獲得每一筆捐款案的以下欄位，前為tibble的指定名稱，括號內為指定變數名稱。

df_case_list，包含捐款案id(case_id)、新聞標題(title)、捐款案時間(date)、捐款案狀態(status)、捐款總額(amount)、新聞連結(link)、捐款明細連結(link_detail)
df_case_donation，包含捐款案id(case_id)、捐款明細連結(link_detail)、捐款者(donator)、捐款金額(donation)、捐款時間(donate_date)
df_case_news，包含新聞連結(link)、新聞內容(text)、新聞發布時間(time)、記者或攝影師(reporter)、首圖圖說(caption)、配圖數量(n_figure)(這很難抓可以放棄)

作業要求

請去掉下方程式碼的註解(檢查部分) 以驗證你所抓到的資料未有重複
用橫向 bar chart 印出捐款次數最多的前十大單位或個體（注意，個體可能是以全形或半形逗號分隔。先不處理填錯名字的問題、也暫時不考慮如「XXX全家」的問題）
請用 glimpse() 分別呈現上述 tibble 的長相

計分方式

上述作業要求的驗證與 bar chart
df_case_list 爬蟲過程與內容，包含捐款案id(case_id)、新聞標題(title)、捐款案時間(date)、捐款案狀態(status)、捐款總額(amount)、新聞連結(link)、捐款明細連結(link_detail)
df_case_donation 爬蟲過程與內容，包含捐款案id(case_id)、捐款明細連結(link_detail)、捐款者(donator)、捐款金額(donation)、捐款時間(donate_date)
df_case_news 爬蟲過程與內容，包含新聞連結(link)、新聞內容(text)
bonus: df_case_news 抓到的欄位越多加分越多，你可以多抓新聞標題(title)、新聞發布時間(time)、記者或攝影師(reporter)、首圖配字(caption)、圖片數量(n_figure)，並 join df_case_list and df_case_news 再呈現結果

再次重申：圖片數量很難抓，不要花太多時間在上面，我也抓不到！

作答區 - 爬蟲程式碼

### your code
library(tidyverse)
# library(httr)
# library(rvest)
# 
# p_read_html <- possibly(read_html, otherwise = NULL)
# 
# ##### 抓案例
# df_case_list <- tibble()
# for(i in 1:10){
#   url <- str_c("https://tw.feature.appledaily.com/charity/projlist/", i)
#   html <- url %>% curl::curl(handle = curl::new_handle("useragent" = "Mozilla/5.0")) %>% p_read_html()
#   table <- html %>% html_table()
#   df_table <- 
#     table[[1]] %>% as_tibble() %>% slice(-1) %>% `colnames<-`(c("case_id", "title", "date", "status", "amount", "detail"))
#   
#   link <- html %>% html_nodes(".artcatdetails") %>% html_attr("href")
#   link_detail <- html %>% html_nodes(".details") %>% html_attr("href")
#   df_case_list_tmp <- df_table %>% bind_cols(tibble(link = link, link_detail = link_detail))
#   
#   df_case_list <- df_case_list %>% bind_rows(df_case_list_tmp)
#   print(i)
#   Sys.sleep(20)
# }
# 
# df_case_list
# df_case_list %>% distinct(case_id)
# df_case_list %>% write_rds("data/AS08/df_case_list.rds")
# df_case_list %>% write_csv("data/AS08/df_case_list.csv")
# 
# ##### 抓捐款清單
# df_case_donation <- tibble()
# index_now <- 1
# index_length <- ceiling(dim(df_case_list)[1]/10)
# 
# for(i in index_now:index_length){
#   j = i*10-9
#   k = j+9
#   url = df_case_list %>% select(link_detail) %>% slice(j:k) %>% pull() 
#   
#   html = url %>% map(function(x){x %>% curl::curl(handle = curl::new_handle("useragent" = "Mozilla/5.0")) %>% p_read_html()}) %>% 
#     set_names(url) %>% compact()
#   html_index <- html %>% 
#     map(function(x){x %>% html_nodes(".addellis-f") %>% html_text() %>% '['(1)}) %>%
#     map_lgl(function(x){!is.na(x)})
#   
#   html_f <- html[html_index]
#   
#   if(length(html_f)==0) {index_now = index_now + 1;print(str_c("all links are dead: article ",j, " to article ", k));next}
#   
#   table_raw <- html_f %>% map(function(x){x %>% html_table()})
#   df_case_donation_tmp <- table_raw %>% map(function(x){x %>% '[['(2) %>% slice(-1)}) %>% bind_rows(.id = "link_detail") %>% as_tibble() %>%
#     `colnames<-`(c("link_detail","donator_order", "donator", "donation", "donate_date"))
#     
#   df_case_donation <- df_case_donation %>% bind_rows(df_case_donation_tmp)
#   
#   print(str_c("finished article ", j, " to article ", k))
#   index_now = index_now + 1
#   Sys.sleep(20)
#   closeAllConnections()
#   gc()
# }
# 
# df_case_donation
# df_case_donation %>% distinct(link_detail)
# df_case_donation %>% count(link_detail)
# df_case_donation %>% 
#   mutate(case_id = str_remove(link_detail, "https://tw.feature.appledaily.com/charity/projdetail/")) %>%
#   write_rds("data/AS08/df_case_donation.rds")
# df_case_donation %>% 
#   mutate(case_id = str_remove(link_detail, "https://tw.feature.appledaily.com/charity/projdetail/")) %>% 
#   write_csv("data/AS08/df_case_donation.csv")
# 
# 
# ##### 抓新聞內容
# df_case_news <- tibble()
# index_now <- 1
# index_length <- ceiling(dim(df_case_list)[1]/10)
# 
# for(i in index_now:index_length){
#   j = i*10-9
#   k = j+9
#   i=1
#   url = df_case_list %>% select(link) %>% slice(j:k) %>% pull() 
#   
#   html = url %>% map(function(x){x %>% p_read_html()}) %>% 
#     set_names(url) %>% compact()
#   html_index <- html %>% 
#     map(function(x){x %>% html_nodes(".text_medium") %>% html_text() %>% '['(1)}) %>%
#     map_lgl(function(x){!is.na(x)})
#   
#   html_f <- html[html_index]
#   
#   if(length(html_f)==0) {index_now = index_now + 1;print(str_c("all links are dead: article ",j, " to article ", k));next}
#   
#   page_title    <- html_f %>% map(function(x){x %>% html_nodes(".text_medium") %>% html_text()}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
#   page_text    <- html_f %>% map(function(x){x %>% html_nodes(".tw-max_width") %>% html_text() %>% str_c(collapse = "::::::")}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
#   page_reporter    <- html_f %>% map(function(x){x %>% html_nodes(".tw-max_width:nth-child(5)") %>% html_text() %>% `[`(1)}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
#   page_time    <- html_f %>% map(function(x){x %>% html_nodes(".timestamp") %>% html_text() %>% `[`(1)}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
#   page_caption    <- html_f %>% map(function(x){x %>% html_nodes(".box--margin-top-lg") %>% html_text()}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
#   
#   df_case_news_tmp <- tibble(link = names(page_title), text = unlist(page_text), reporter = unlist(page_reporter),
#                              title = unlist(page_title), time = unlist(page_time), caption = unlist(page_caption))
#   
#   df_case_news <- df_case_news %>% bind_rows(df_case_news_tmp)
#   print(str_c("finished article ", j, " to article ", k))
#   index_now = index_now + 1
#   Sys.sleep(20)
#   closeAllConnections()
#   gc()
# }
# 
# df_case_news
# df_case_news %>% distinct(link)
# df_case_news %>% count(link)
# df_case_news %>% write_rds("data/AS08/df_case_news.rds")
# df_case_news %>% write_csv("data/AS08/df_case_news.csv")
# 
# df_case_news <- read_csv("data/AS08/df_case_news.csv")
# df_case_donation <- read_csv("data/AS08/df_case_donation.csv")
# df_case_list <- read_csv("data/AS08/df_case_list.csv")

作答區 - 作業要求檢查

這邊的 code 請去掉 comment 後執行喔！可以用來確認結果！

### your code
df_case_list <- read_csv("data/AS08/df_case_list.csv")
df_case_donation <- read_csv("data/AS08/df_case_donation.csv")
df_case_news <- read_csv("data/AS08/df_case_news.csv")

## 檢查
df_case_list %>% summarise(n_distinct(case_id))

#> # A tibble: 1 x 1
#>   `n_distinct(case_id)`
#>                   <int>
#> 1                   200

df_case_list %>% summarise(n_distinct(link))

#> # A tibble: 1 x 1
#>   `n_distinct(link)`
#>                <int>
#> 1                200

df_case_list %>% summarise(n_distinct(link_detail))

#> # A tibble: 1 x 1
#>   `n_distinct(link_detail)`
#>                       <int>
#> 1                       200

df_case_donation %>% summarise(n_distinct(link_detail))

#> # A tibble: 1 x 1
#>   `n_distinct(link_detail)`
#>                       <int>
#> 1                       200

df_case_news %>% summarise(n_distinct(link))

#> # A tibble: 1 x 1
#>   `n_distinct(link)`
#>                <int>
#> 1                199

## bar chart
df_donator_top <- df_case_donation %>% 
  mutate(donator = str_split(donator, ",|，|、")) %>%
  unnest(c(donator)) %>% 
  mutate(donator = if_else(str_detect(donator, "善心|匿名|不具名|無"), "匿名", donator)) %>%
  count(donator, sort = T) %>% head(20)

df_donator_top %>% slice(1:10) %>% mutate(donator = fct_reorder(donator, n)) %>%
  ggplot(aes(donator, n)) + geom_col() +
  coord_flip() +
  theme_bw() +
  guides(fill = FALSE) +
  labs(x= "捐款人",y= "捐款次數", title = "捐款芳名錄與捐款次數", caption = "資料：蘋果日報基金會近兩百項案件") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black")) +
  theme(text = element_text(family = "Noto Sans CJK TC Medium"))

## 看長相
df_case_list %>% glimpse()

#> Rows: 200
#> Columns: 8
#> $ case_id     <chr> "A5131", "A5130", "A5129", "A5128", "A5127", "A5126", "A51…
#> $ title       <chr> "大三男罹血癌　單親母嘆夫已癌逝「老天殘忍」", "婦咬牙作工兼顧癌夫　「為小孩苦往肚裡吞」", "粗工翁腦出血中風…
#> $ date        <date> 2021-05-07, 2021-05-06, 2021-05-05, 2021-05-04, 2021-05-0…
#> $ status      <chr> "未結案", "未結案", "未結案", "未結案", "未結案", "未結案", "未結案", "未結案", "未…
#> $ amount      <dbl> 18365, 122899, 201968, 282653, 498089, 654573, 784644, 809…
#> $ detail      <chr> "明細", "明細", "明細", "明細", "明細", "明細", "明細", "明細", "明細", "明細"…
#> $ link        <chr> "https://tw.appledaily.com/headline/20210507/VRXKZCX76BDTN…
#> $ link_detail <chr> "https://tw.feature.appledaily.com/charity/projdetail/A513…

df_case_donation %>% glimpse()

#> Rows: 278,365
#> Columns: 6
#> $ link_detail   <chr> "https://tw.feature.appledaily.com/charity/projdetail/A5…
#> $ donator_order <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
#> $ donator       <chr> "張力升", "張元銘", "張元瑄", "陳維全", "劉麗珍", "張凱名", "李鎂鷹", "藍予宏", …
#> $ donation      <dbl> 10000, 1500, 1500, 1000, 1000, 1000, 500, 500, 300, 300,…
#> $ donate_date   <date> 2021-05-06, 2021-05-07, 2021-05-07, 2021-05-07, 2021-05…
#> $ case_id       <chr> "A5131", "A5131", "A5131", "A5131", "A5131", "A5131", "A…

df_case_news %>% glimpse()

#> Rows: 199
#> Columns: 6
#> $ link     <chr> "https://tw.appledaily.com/headline/20210507/VRXKZCX76BDTNK7Y…
#> $ text     <chr> "「先生是在6年前因口腔癌走的，沒想到今年1月小兒子小文又罹血癌，老天對我很殘忍。」45歲單親媽阿慧（黃之慧）提到22歲、…
#> $ reporter <chr> "報導．攝影／韓旭爾", "報導‧攝影∕張嘉恬", "報導‧攝影／向高彬", "報導．攝影／韓旭爾", "報導‧攝影／向高…
#> $ title    <chr> "大三男罹血癌　單親母嘆夫已癌逝「老天殘忍」", "婦咬牙作工兼顧癌夫　「為小孩苦往肚裡吞」", "粗工翁腦出血中風倒　「…
#> $ time     <chr> "更新時間： 2021/05/07 03:00", "更新時間： 2021/05/06 03:00", "更新時間： 20…
#> $ caption  <chr> "小文（中）罹癌後身體虛弱，媽媽阿慧（左）幫他沖泡營養品補充體力。右為阿慧83歲婆婆。", "阿鴻罹癌無力工作，妻子阿莉（…

## 加分題
df_case_list %>% left_join(df_case_news) %>% count(is.na(reporter))

#> # A tibble: 2 x 2
#>   `is.na(reporter)`     n
#> * <lgl>             <int>
#> 1 FALSE               190
#> 2 TRUE                 10

df_case_list %>% left_join(df_case_news) %>% count(is.na(time))

#> # A tibble: 2 x 2
#>   `is.na(time)`     n
#> * <lgl>         <int>
#> 1 FALSE           197
#> 2 TRUE              3

df_case_list %>% left_join(df_case_news) %>%
  select(case_id, title, date, time, reporter, caption) %>% glimpse()

#> Rows: 200
#> Columns: 6
#> $ case_id  <chr> "A5131", "A5130", "A5129", "A5128", "A5127", "A5126", "A5125"…
#> $ title    <chr> "大三男罹血癌　單親母嘆夫已癌逝「老天殘忍」", "婦咬牙作工兼顧癌夫　「為小孩苦往肚裡吞」", "粗工翁腦出血中風倒　「…
#> $ date     <date> 2021-05-07, 2021-05-06, 2021-05-05, 2021-05-04, 2021-05-03, …
#> $ time     <chr> "更新時間： 2021/05/07 03:00", "更新時間： 2021/05/06 03:00", "更新時間： 20…
#> $ reporter <chr> "報導．攝影／韓旭爾", "報導‧攝影∕張嘉恬", "報導‧攝影／向高彬", "報導．攝影／韓旭爾", "報導‧攝影／向高…
#> $ caption  <chr> "小文（中）罹癌後身體虛弱，媽媽阿慧（左）幫他沖泡營養品補充體力。右為阿慧83歲婆婆。", "阿鴻罹癌無力工作，妻子阿莉（…

AS08_Web-Scraping-HTML_ref

曾子軒 Teaching Assistant

2021/05/11

題目說明

作業要求

計分方式

作答區 - 爬蟲程式碼

作答區 - 作業要求檢查