Chapter 16 Scraping
16.1 Complete Code
撰寫爬蟲時需要載入許多不同的函式庫,其中包括用於 HTTP 請求的 httr,以及用於解析 JSON 數據的 jsonlite。httr 庫是 R 語言中用於發送 HTTP 請求和處理 HTTP 響應的函式庫,它提供了一組簡單易用的函數,可以讓使用者方便地設置 HTTP 請求的各種參數,如 URL、HTTP 方法、HTTP 頭、HTTP 主體等,並處理 HTTP 響應的內容和狀態碼等。jsonlite 庫是 R 語言中用於解析和生成 JSON 數據的函式庫,它提供了 fromJSON() 函數,可以將 JSON 字符串轉換為 R 物件,並提供 toJSON() 函數,可以將 R 物件轉換為 JSON 字符串。這個函式庫通常用於處理 API 回應數據中的 JSON 格式數據。
# Complete scraper: download the first 10 result pages and stack their
# listings into one tibble.
library(tidyverse) # tibble(), str_c(), bind_rows(), %>%
library(httr)      # GET(), add_headers(), content()
library(jsonlite)  # fromJSON()

# Accumulator for the rows of every page.
all.df <- tibble()
# Value sent as the "referer" request header.
# NOTE(review): left blank in this copy of the chapter -- fill in before running.
refer_url <- ""

for (p in 1:10) {
  # Compose the URL of page `p`.
  # NOTE(review): the base URL is blank in this copy; the trailing query
  # string mirrors the page-URL pattern used later in this chapter -- confirm.
  url <- str_c("", p, "&mode=s&jobsource=2018indexpoc")

  # Request the page with the referer header, read the body as text and
  # parse the JSON into an R list.
  res <- GET(url, add_headers("referer" = refer_url)) %>%
    content("text") %>%
    fromJSON()

  # Drop the two nested columns by assigning NULL so the remaining columns
  # can be row-bound.
  res$data$list$tags <- NULL
  res$data$list$link <- NULL

  all.df <- bind_rows(all.df, res$data$list)
}

# Number of distinct jobNo values collected.
all.df$jobNo %>% unique %>% length
16.2 Step-by-Step
16.2.1 Get the first pages
add_headers() 函數設置了一個 HTTP header,用於識別 HTTP 請求的來源。
這裡設置的是名為「Referer」的 HTTP header。這個 header 的作用是告訴 104 人力銀行網站,訪問這個頁面的用戶是從哪個網頁轉跳過來的,也就是告訴網站當前 HTTP 請求的來源。具體來說,這裡設置的「Referer」值為下方程式碼中傳入 add_headers() 的網址字串。
# Data URL of the 1st results page
url1 <- ""
# Data URL of the 2nd results page
url2 <- ""
# Data URL of the 3rd results page
url3 <- ""

# Request the 2nd page (url2) with a "Referer" header attached, then parse
# the JSON text of the response into an R object.
res <- GET(url2, config = add_headers("Referer" = ""))
res1 <- fromJSON(content(res, "text"))

# Request url2 a second time without any extra header and parse it as well.
result2 <- GET(url2) %>%
  content("text") %>%
  fromJSON()

# The data.frame of interest is stored under data$list; it is taken from
# res1, i.e. the response of the request that carried the Referer header.
# NOTE(review): res1 and result2 both come from url2 -- confirm whether the
# second, header-less request is still needed.
df2 <- res1$data$list
16.2.2 Get the first page by modifying url
# Guessed data URL of the 1st page (obtained by modifying the page URL)
url1 <- ""
# Fetch the 1st page, read the response body as text and parse the JSON
result1 <- GET(url1) %>%
  content("text") %>%
  fromJSON()
# Pull out the data.frame stored under data$list
df1 <- result1$data$list
16.2.4 Drop out hierarchical variables
Keep the numeric and character columns, and drop the list or data.frame columns by assigning NULL to them.
16.2.5 Dropping hierarchical variables by dplyr way
# Page 1: take the listing and remove the nested columns `tags` and `link`
df1 <- select(result1$data$list, -tags, -link)
# Page 2: same treatment
df2 <- select(result2$data$list, -tags, -link)
# Stack both pages into a single data frame
all.df <- bind_rows(df1, df2)
16.2.6 Finding out the last page number
# Read the total page count reported in the 1st page's response
last_page_num <- result1$data$totalPage
# Check that the last page is reachable: rebuild its URL by inserting the
# page number between the (blank) base URL and the fixed query-string tail
url.last_page <- paste0("", last_page_num, "&mode=s&jobsource=2018indexpoc")
# Download the last page and parse its JSON body
result.last_page <- GET(url.last_page) %>%
  content("text") %>%
  fromJSON()
16.2.8 combine all data.frame
# Download every result page of the query and combine them into `all.df`.

# The 1st url of the query
url1 <- ""
# Fetch and parse the 1st page
result1 <- fromJSON(content(GET(url1), "text"))
# Total number of pages, as reported by the 1st page's response
last_page_num <- result1$data$totalPage
# Start the result with page 1, dropping the nested columns link and tags
all.df <- select(result1$data$list, -link, -tags)

# Page 1 is already in all.df, so only pages 2..last_page_num are fetched.
# seq_len(n)[-1] is empty when there is a single page, unlike 2:n.
for (p in seq_len(last_page_num)[-1]) {
  url <- paste0("", p, "&mode=s&jobsource=2018indexpoc")
  result <- fromJSON(content(GET(url), "text"))
  # Drop the same nested columns as for page 1; select() without any
  # selector would return a zero-column data frame and lose the page's data.
  temp.df <- select(result$data$list, -link, -tags)
  all.df <- bind_rows(all.df, temp.df)
  # Progress: page number and cumulative row count
  print(paste(p, nrow(all.df)))
}