這份作業希望能夠讓你熟悉 Web Scraping 的流程。
請幫我從 page = 1 到 page = 5,抓取 5 頁嘖嘖的目錄頁面。抓取欄位包含標題(title)、連結(title_link)、日程(day)、提案人(author)、提案人連結(author_link)、類別(cattext),並額外增加一個欄位代表現在的頁面(page)。
library(tidyverse)
# library(rvest)
# library(httr)
# library(clipr)
#
# ### 測試
# url_test = "https://www.zeczec.com/categories?page=100"
# html_test = url_test %>% read_html()
#
# main_title <- html_test %>% html_nodes(".mb0.b") %>% html_text()
# main_title_link <- html_test %>% html_nodes(".h-100 .db") %>% html_attr("href")
# main_money <- html_test %>% html_nodes(".fr") %>% html_text()
# main_day <- html_test %>% html_nodes(".b+ .f7") %>% html_text()
# main_author <- html_test %>% html_nodes(".f7 a") %>% html_text()
# main_author_link <- html_test %>% html_nodes(".f7 a") %>% html_attr("href")
# main_money <- html_test %>% html_nodes(".fr") %>% html_text()
# main_cattext <- html_test %>% html_nodes(".h-100 .db+ .f7") %>% html_text()
#
# loop_now <- 1
#
# df_zec_main <- tibble()
# for (i in loop_now:5) {
# url = str_c("https://www.zeczec.com/categories?page=", loop_now)
# html = url %>% curl::curl(handle = curl::new_handle("useragent" = "Mozilla/5.0")) %>% read_html()
# main_title <- html %>% html_nodes(".mb0.b") %>% html_text()
# main_title_link <- html %>% html_nodes(".h-100 .db") %>% html_attr("href")
# main_money <- html %>% html_nodes(".fr") %>% html_text()
# main_day <- html %>% html_nodes(".b+ .f7") %>% html_text()
# main_author <- html %>% html_nodes(".f7 a") %>% html_text()
# main_author_link <- html %>% html_nodes(".f7 a") %>% html_attr("href")
# main_cattext <- html %>% html_nodes(".h-100 .db+ .f7") %>% html_text()
#
# df_zec_main_tmp <- tibble(title = main_title,
# title_link = main_title_link,
# money = main_money,
# day = main_day,
# author = main_author,
# author_link = main_author_link,
# cattext = main_cattext,
# page = rep(loop_now, 12))
# df_zec_main <- df_zec_main %>% bind_rows(df_zec_main_tmp)
# print(str_c("finished page = ", loop_now))
# loop_now = loop_now + 1
# Sys.sleep(10)
# }
# df_zec_main %>% glimpse()
# df_zec_main %>% write_rds("data/Lab09/df_zec_main_template.rds")
# Load the cached listing-page data from disk (the live crawl that wrote this
# file is commented out above), then print a compact column overview.
df_zec_main <- read_rds("data/Lab09/df_zec_main_template.rds")
df_zec_main %>% glimpse()
#> Rows: 60
#> Columns: 8
#> $ title <chr> "你,和那些你沒說的。 魏辰哲 個展", "《Lexio Original》一款獨具匠心的韓式麻將|韓國設計", …
#> $ title_link <chr> "/projects/javiswei-solo-exhibition", "/projects/lexio-ori…
#> $ money <chr> "\nNT$33,600\n", "\nNT$27,560\n", "\nNT$173,890\n", "\nNT$…
#> $ day <chr> "timelapse剩下 52 天\n", "timelapse剩下 32 天\n", "timelapse剩下 5…
#> $ author <chr> "Javis wei", "Amodas", "Suzzi", "LaserPecker Taiwan", "Hor…
#> $ author_link <chr> "/users/wei-chen-che", "/users/amodas", "/users/suzzidesgi…
#> $ cattext <chr> "\n藝術 By\nJavis wei\n", "\n遊戲 By\nAmodas\n", "\n設計 By\nSuz…
#> $ page <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2…
承接上題,請幫我抓下上面的所有提案,應該有 60 則。抓取欄位包含連結(title_link)、內文(page_text)、支持者(page_backers)、時程(page_dayrange)、類別文字(page_cattext)、金錢相關(page_money_goal)、專案內容/留言/常見問答數量(page_meta)、專案文字(page_projecttext)、產品文字(page_producttext)。
# p_read_html <- possibly(read_html, otherwise = NULL)
#
# loop_page_complete <- 1
# loop_page_final <- ceiling(dim(df_zec_main)[1]/10)
# df_zec_page <- tibble()
#
# for (i in 1:loop_page_final) {#loop_long
# j=loop_page_complete
# k=j+9
#
# html = df_zec_main[j:k,] %>% pull(title_link) %>% str_c("https://www.zeczec.com", .) %>%
# map(function(x){x %>% curl::curl(handle = curl::new_handle("useragent" = "Mozilla/5.0")) %>% p_read_html()}) %>%
# set_names(pull(df_zec_main[j:k,"title_link"])) %>% compact()
#
# html_index <- html %>%
# map(function(x){x %>% html_nodes(".js-backers-count") %>% html_text() %>% '['(1)}) %>%
# map_lgl(function(x){!is.na(x)})
#
# html_f <- html[html_index]
#
# if(length(html_f)==0) {loop_page_complete = loop_page_complete + 10;print(str_c("all links are dead: ",loop_page_complete-10));next}
#
# ### meta data 包含作者、發文時間等
# page_text <- html_f %>% map(function(x){x %>% html_nodes(".gray.mv3") %>% html_text()}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
# page_backers <- html_f %>% map(function(x){x %>% html_nodes(".js-backers-count") %>% html_text()}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
# page_dayrange <- html_f %>% map(function(x){x %>% html_nodes(".mb2") %>% html_text() %>% `[`(1) %>% as.character()}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
# page_cattext <- html_f %>% map(function(x){x %>% html_nodes(".mt3 .f6.gray") %>% html_text()}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
# page_money_goal <- html_f %>% map(function(x){x %>% html_nodes(".relative.items-center") %>% html_text()}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
#
# page_meta <- html_f %>% map(function(x){x %>% html_nodes(".near-black") %>% html_text() %>% str_c(collapse = "::::::")}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
# # page_meta_update <- page_meta %>% map(function(x){x %>% `[`(1)}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
# # page_meta_comments <- html_f %>% map(function(x){x %>% `[`(2)}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
# # page_meta_qa <- html_f %>% map(function(x){x %>% `[`(3)}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
# page_projecttext <- html_f %>% map(function(x){x %>% html_nodes(".ph3.w-70-l") %>% html_text()}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
# page_producttext <- html_f %>% map(function(x){x %>% html_nodes(".o-60") %>% html_text() %>% str_c(collapse = "::::::")}) %>% map(function(x){if(length(x)==0) x = "empty" else(x)})
# Sys.sleep(5)
# df_zec_page_tmp <- tibble(title_link = names(page_text), page_text = unlist(page_text),
# page_backers = unlist(page_backers), page_dayrange = unlist(page_dayrange), page_cattext = unlist(page_cattext),
# page_money_goal = unlist(page_money_goal), page_meta = unlist(page_meta),
# page_projecttext = unlist(page_projecttext), page_producttext = unlist(page_producttext))
#
# df_zec_page <- df_zec_page %>% bind_rows(df_zec_page_tmp)
# print(str_c("finished page = ", loop_page_complete))
# loop_page_complete = loop_page_complete + 10
# Sys.sleep(10)
# }
# closeAllConnections()
# gc()
# Sys.sleep(10)
# df_zec_page %>% glimpse()
# df_zec_page %>% write_rds("data/Lab09/df_zec_page_template.rds")
# Load the cached per-project page data from disk (the live crawl that wrote
# this file is commented out above), then print a compact column overview.
df_zec_page <- read_rds("data/Lab09/df_zec_page_template.rds")
df_zec_page %>% glimpse()
#> Rows: 59
#> Columns: 9
#> $ title_link <chr> "/projects/javiswei-solo-exhibition", "/projects/lexi…
#> $ page_text <chr> "你,強勢的代名詞。每當任何人被帶入,都會像是被硬生生揪到自己面前,然後隨即被一座封閉的圍牆包圍。我迷戀說…
#> $ page_backers <chr> "28", "18", "390", "1022", "11", "198", "1184", "69",…
#> $ page_dayrange <chr> "\n時程\n2021/05/05 20:00 – 2021/06/30 23:59\n", "\n時程\…
#> $ page_cattext <chr> "\n台灣\n\\\n群眾集資\n\\\n藝術\n", "\n海外\n\\\n預購式專案\n\\\n遊戲\…
#> $ page_money_goal <chr> "\n\n\n112%\n\n112%\n\nNT$33,600\n\n目標 NT$30,000\n\n\…
#> $ page_meta <chr> "\n專案內容\n::::::\n留言\n0\n::::::\n常見問答\n3\n", "\n專案內容\n…
#> $ page_projecttext <chr> "\n\n\n\n於國立臺灣師範大學美術系畢業後的首次藝術創作個展。用藝術作品探討\"你\"這個詞彙,並衍…
#> $ page_producttext <chr> "", "\nNT$1,175\n\n\nSOLD OUT\n\n已被贊助\n10\n次\n\n【嘖嘖獨享…