作業目的: Web Scraping HTML

這份作業希望能夠讓你熟悉 Web Scraping 的流程。

作業: Web Scraping HTML

嘖嘖 - 目錄頁面

請幫我從 page = 1 到 page = 5,抓取 5 頁嘖嘖的目錄頁面。抓取欄位包含標題(title)、連結(title_link)、日程(day)、提案人(author)、提案人連結(author_link)、類別(cattext),並額外增加一個欄位代表現在的頁面(page)。

# Build `df_zec_main`: one row per project found on the Zeczec category
# listing pages. The live scraping code is kept below but commented out; the
# only statements that actually run are the `library()` call and the two lines
# at the bottom that read the cached result from an .rds file and preview it.
library(tidyverse)
# Extra packages for the commented-out scraper (read_html()/html_nodes() below
# presumably come from rvest; httr and clipr are not referenced in this section).
# library(rvest)
# library(httr)
# library(clipr)
# 
# ### Test: fetch a single listing page and try each CSS selector once
# ### NOTE(review): the test URL uses page=100, not one of the pages 1-5 scraped below.
# url_test = "https://www.zeczec.com/categories?page=100"
# html_test = url_test %>% read_html()
# 
# main_title <- html_test %>% html_nodes(".mb0.b") %>% html_text()
# main_title_link <- html_test %>% html_nodes(".h-100 .db") %>% html_attr("href")
# main_money <- html_test %>% html_nodes(".fr") %>% html_text()
# main_day <- html_test %>% html_nodes(".b+ .f7") %>% html_text()
# main_author <- html_test %>% html_nodes(".f7 a") %>% html_text()
# main_author_link <- html_test %>% html_nodes(".f7 a") %>% html_attr("href")
# ### NOTE(review): main_money is assigned a second time here with the same selector as above.
# main_money <- html_test %>% html_nodes(".fr") %>% html_text()
# main_cattext <- html_test %>% html_nodes(".h-100 .db+ .f7") %>% html_text()
# 
# ### Page counter; it is advanced by hand at the end of every loop iteration.
# loop_now <- 1
# 
# ### Accumulator: the rows of each scraped page are appended to this tibble.
# df_zec_main <- tibble()
# ### NOTE(review): the loop variable `i` is never used; the page number comes
# ### from `loop_now`, so the loop is only correct because loop_now starts at 1
# ### and is incremented exactly once per iteration.
# for (i in loop_now:5) {
#   url = str_c("https://www.zeczec.com/categories?page=", loop_now)
#   ### Fetch through a curl connection that sends a browser-like user agent.
#   html = url %>% curl::curl(handle = curl::new_handle("useragent" = "Mozilla/5.0")) %>% read_html()
#   ### One character vector per output column, each taken from a CSS selector.
#   main_title <- html %>% html_nodes(".mb0.b") %>% html_text()
#   main_title_link <- html %>% html_nodes(".h-100 .db") %>% html_attr("href")
#   main_money <- html %>% html_nodes(".fr") %>% html_text()
#   main_day <- html %>% html_nodes(".b+ .f7") %>% html_text()
#   main_author <- html %>% html_nodes(".f7 a") %>% html_text()
#   main_author_link <- html %>% html_nodes(".f7 a") %>% html_attr("href")
#   main_cattext <- html %>% html_nodes(".h-100 .db+ .f7") %>% html_text()
# 
#   ### NOTE(review): rep(loop_now, 12) hard-codes 12 projects per page; a page
#   ### with a different number of matches would make the column lengths disagree.
#   df_zec_main_tmp <- tibble(title = main_title,
#                             title_link = main_title_link,
#                             money = main_money,
#                             day = main_day,
#                             author = main_author,
#                             author_link = main_author_link,
#                             cattext = main_cattext,
#                             page = rep(loop_now, 12))
#   df_zec_main <- df_zec_main %>% bind_rows(df_zec_main_tmp)
#   print(str_c("finished page = ", loop_now))
#   loop_now = loop_now + 1
#   ### Pause 10 seconds between page requests.
#   Sys.sleep(10)
# }
# ### Preview the scraped table and cache it; the active code below reads this file.
# df_zec_main %>% glimpse()
# df_zec_main %>% write_rds("data/Lab09/df_zec_main_template.rds")
# Load the cached scrape result instead of hitting the website again, then
# print a column-wise preview of it.
df_zec_main <- read_rds("data/Lab09/df_zec_main_template.rds")
df_zec_main %>% glimpse()
#> Rows: 60
#> Columns: 8
#> $ title       <chr> "你,和那些你沒說的。  魏辰哲 個展", "《Lexio Original》一款獨具匠心的韓式麻將|韓國設計", …
#> $ title_link  <chr> "/projects/javiswei-solo-exhibition", "/projects/lexio-ori…
#> $ money       <chr> "\nNT$33,600\n", "\nNT$27,560\n", "\nNT$173,890\n", "\nNT$…
#> $ day         <chr> "timelapse剩下 52 天\n", "timelapse剩下 32 天\n", "timelapse剩下 5…
#> $ author      <chr> "Javis wei", "Amodas", "Suzzi", "LaserPecker Taiwan", "Hor…
#> $ author_link <chr> "/users/wei-chen-che", "/users/amodas", "/users/suzzidesgi…
#> $ cattext     <chr> "\n藝術 By\nJavis wei\n", "\n遊戲 By\nAmodas\n", "\n設計 By\nSuz…
#> $ page        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2…

嘖嘖 - 提案頁面

承接上題,請幫我抓下上面的所有提案,應該有 60 則。抓取欄位包含連結(title_link)、內文(page_text)、支持者(page_backers)、時程(page_dayrange)、類別文字(page_cattext)、金錢相關(page_money_goal)、專案內容/留言/常見問答數量(page_meta)、專案文字(page_projecttext)、產品文字(page_producttext)。

# Build `df_zec_page`: one row per project page linked from `df_zec_main`.
# The live scraping code is kept below but commented out; the only statements
# that actually run are the two lines at the bottom that read the cached
# result from an .rds file and preview it.
#
# ### Wrapper around read_html() that yields NULL instead of raising an error.
# p_read_html <- possibly(read_html, otherwise = NULL)
# 
# ### Links are processed in batches of 10 rows of df_zec_main.
# ### loop_page_complete is the first row index of the current batch and
# ### loop_page_final is the number of batches.
# loop_page_complete <- 1
# loop_page_final <- ceiling(dim(df_zec_main)[1]/10)
# df_zec_page <- tibble()
# 
# for (i in 1:loop_page_final) {#loop_long
#   ### Row range j:k of the current batch.
#   ### NOTE(review): k = j + 9 can run past the last row when the row count is
#   ### not a multiple of 10 -- TODO confirm how the row subsetting behaves then.
#   j=loop_page_complete
#   k=j+9
#   
#   ### Build absolute URLs, fetch each one with a browser-like user agent,
#   ### name the results by their title_link, and drop the fetches that
#   ### returned NULL (compact()).
#   html = df_zec_main[j:k,] %>% pull(title_link) %>% str_c("https://www.zeczec.com", .) %>%
#     map(function(x){x %>% curl::curl(handle = curl::new_handle("useragent" = "Mozilla/5.0")) %>% p_read_html()}) %>% 
#     set_names(pull(df_zec_main[j:k,"title_link"])) %>% compact()
#   
#   ### TRUE for pages whose first ".js-backers-count" node has non-NA text,
#   ### i.e. pages that contain a backers counter at all.
#   html_index <- html %>% 
#     map(function(x){x %>% html_nodes(".js-backers-count") %>% html_text() %>% '['(1)}) %>%
#     map_lgl(function(x){!is.na(x)})
#   
#   ### Keep only the pages that passed the check above.
#   html_f <- html[html_index]
#   
#   ### Nothing usable in this batch: advance to the next batch, report, and skip.
#   if(length(html_f)==0) {loop_page_complete = loop_page_complete + 10;print(str_c("all links are dead: ",loop_page_complete-10));next}
#   
#   ### meta data, including author, posting time, etc.
#   ### Each field is extracted per page with a CSS selector; a zero-length
#   ### result is replaced by the string "empty".
#   page_text    <- html_f %>% map(function(x){x %>% html_nodes(".gray.mv3") %>% html_text()}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
#   page_backers <- html_f %>% map(function(x){x %>% html_nodes(".js-backers-count") %>% html_text()}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
#   ### Only the first ".mb2" match is kept for the schedule text.
#   page_dayrange  <- html_f %>% map(function(x){x %>% html_nodes(".mb2") %>% html_text() %>% `[`(1) %>% as.character()}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
#   page_cattext   <- html_f %>% map(function(x){x %>% html_nodes(".mt3 .f6.gray") %>% html_text()}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
#   page_money_goal  <- html_f %>% map(function(x){x %>% html_nodes(".relative.items-center") %>% html_text()}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
#   
#   ### All ".near-black" matches are joined into one string with "::::::" as separator.
#   page_meta   <- html_f %>% map(function(x){x %>% html_nodes(".near-black") %>% html_text() %>% str_c(collapse = "::::::")}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
#   # page_meta_update   <- page_meta %>% map(function(x){x %>% `[`(1)}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
#   # page_meta_comments   <- html_f %>% map(function(x){x %>% `[`(2)}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
#   # page_meta_qa   <- html_f %>% map(function(x){x %>% `[`(3)}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
#   page_projecttext   <- html_f %>% map(function(x){x %>% html_nodes(".ph3.w-70-l") %>% html_text()}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
#   ### All ".o-60" matches are joined into one string with "::::::" as separator.
#   page_producttext   <- html_f %>% map(function(x){x %>% html_nodes(".o-60") %>% html_text() %>% str_c(collapse = "::::::")}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
#   Sys.sleep(5)
#   ### One row per kept page; title_link comes from the list names set above.
#   ### NOTE(review): unlist() only lines up with title_link if every selector
#   ### produced exactly one string per page -- presumably true for these
#   ### selectors; verify if the site markup changes.
#   df_zec_page_tmp <- tibble(title_link = names(page_text), page_text = unlist(page_text), 
#                             page_backers = unlist(page_backers), page_dayrange = unlist(page_dayrange), page_cattext = unlist(page_cattext), 
#                             page_money_goal = unlist(page_money_goal), page_meta = unlist(page_meta),
#                             page_projecttext = unlist(page_projecttext), page_producttext = unlist(page_producttext))
#   
#   ### Append this batch, report progress, and move on to the next 10 rows.
#   df_zec_page <- df_zec_page %>% bind_rows(df_zec_page_tmp)
#   print(str_c("finished page = ", loop_page_complete))
#   loop_page_complete = loop_page_complete + 10
#   Sys.sleep(10)
# }
# ### Clean-up after the loop: close open connections and run garbage collection.
# closeAllConnections()
# gc()
# Sys.sleep(10)
# ### Preview the scraped table and cache it; the active code below reads this file.
# df_zec_page %>% glimpse()
# df_zec_page %>% write_rds("data/Lab09/df_zec_page_template.rds")
# Load the cached scrape result instead of hitting the website again, then
# print a column-wise preview of it.
df_zec_page <- read_rds("data/Lab09/df_zec_page_template.rds")
df_zec_page %>% glimpse()
#> Rows: 59
#> Columns: 9
#> $ title_link       <chr> "/projects/javiswei-solo-exhibition", "/projects/lexi…
#> $ page_text        <chr> "你,強勢的代名詞。每當任何人被帶入,都會像是被硬生生揪到自己面前,然後隨即被一座封閉的圍牆包圍。我迷戀說…
#> $ page_backers     <chr> "28", "18", "390", "1022", "11", "198", "1184", "69",…
#> $ page_dayrange    <chr> "\n時程\n2021/05/05 20:00 – 2021/06/30 23:59\n", "\n時程\…
#> $ page_cattext     <chr> "\n台灣\n\\\n群眾集資\n\\\n藝術\n", "\n海外\n\\\n預購式專案\n\\\n遊戲\…
#> $ page_money_goal  <chr> "\n\n\n112%\n\n112%\n\nNT$33,600\n\n目標 NT$30,000\n\n\…
#> $ page_meta        <chr> "\n專案內容\n::::::\n留言\n0\n::::::\n常見問答\n3\n", "\n專案內容\n…
#> $ page_projecttext <chr> "\n\n\n\n於國立臺灣師範大學美術系畢業後的首次藝術創作個展。用藝術作品探討\"你\"這個詞彙,並衍…
#> $ page_producttext <chr> "", "\nNT$1,175\n\n\nSOLD OUT\n\n已被贊助\n10\n次\n\n【嘖嘖獨享…