AS05_Web-Scraping-JSON

作業目的

這份作業希望能夠讓你熟悉 Web Scraping 的流程，這週的重點會著重在 JSON。

A. scrape 104 & compare salary（50分）

接續老師錄製的影片，請到 104 人力銀行，爬取「軟體工程」和「資料科學」兩種職業的搜尋結果，扣除面議的薪資不計，繪製圖表比較這 2 種職業的薪資差異，請自行思考適合的圖表類型與指標。

你的程式碼應該包含 3 個部分：爬資料、清理資料（如薪資與年資等欄位）、視覺化。有些流程可能需要借助你的主觀判斷，請盡量搭配文字適度解釋你處理資料的過程。

### your code
library(httr)
library(rvest)
library(tidyverse)
library(jsonlite)

### Scrape "data science" postings
# The two scraping loops below are disabled: they were run once and their
# results cached as .rds files, which the live code at the end of this
# section reads back. Each loop requests pages 1-10 of the 104 search JSON
# endpoint with browser-like User-Agent / Accept / Referer headers, appends
# each page's `data$list` to an accumulator tibble, prints the page number,
# and sleeps 10 seconds between requests.
# The URL-encoded keyword in this first loop decodes to "資料科學".
# df_ds <- tibble()
# for(i in 1:10){
#   url_ds <- str_c("https://www.104.com.tw/jobs/search/list?ro=0&kwop=7&keyword=%E8%B3%87%E6%96%99%E7%A7%91%E5%AD%B8&expansionType=area%2Cspec%2Ccom%2Cjob%2Cwf%2Cwktm&order=15&asc=0&page=", i, "&mode=s&jobsource=2018indexpoc")
#   json_ds <- url_ds %>% GET(add_headers('User-Agent' = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
#                                         'Accept' = 'application/json, text/javascript, */*; q=0.01',
#                                         'Referer' = 'https://www.104.com.tw')) %>% content("text") %>%fromJSON()
#   df_tmp <- json_ds$data$list %>% as_tibble()
# 
#   df_ds <- df_ds %>% bind_rows(df_tmp)
#   print(i)
#   Sys.sleep(10)
# }
# 
# ### Scrape "software engineering" postings
# Same loop as above; the URL-encoded keyword here decodes to "軟體工程".
# df_se <- tibble()
# for(i in 1:10){
#   url_se <- str_c("https://www.104.com.tw/jobs/search/list?ro=0&kwop=7&keyword=%E8%BB%9F%E9%AB%94%E5%B7%A5%E7%A8%8B&expansionType=area%2Cspec%2Ccom%2Cjob%2Cwf%2Cwktm&order=15&asc=0&page=", i, "&mode=s&jobsource=2018indexpoc")
#   json_se <- url_se %>% GET(add_headers('User-Agent' = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
#                                         'Accept' = 'application/json, text/javascript, */*; q=0.01',
#                                         'Referer' = 'https://www.104.com.tw')) %>% content("text") %>%fromJSON()
#   df_tmp <- json_se$data$list %>% as_tibble()
# 
#   df_se <- df_se %>% bind_rows(df_tmp)
#   print(i)
#   Sys.sleep(10)
# }
# 
# Cache both result tables so the analysis can be re-run without re-scraping.
# df_ds %>% write_rds("data/AS05/df_ds.rds")
# df_se %>% write_rds("data/AS05/df_se.rds")
# Load the cached scrape results (paths are relative to the working directory).
df_ds <- read_rds("data/AS05/df_ds.rds")
df_se <- read_rds("data/AS05/df_se.rds")

### Sanity check: eyeball a random sample of the rows each keyword filter keeps
# Keyword patterns. Two typos from the earlier version are fixed here:
#   - "[Ss]ofware" could never match "Software";
#   - "[Da]ata" matched "Data" but not lowercase "data".
# NOTE(review): the printed jobNameSnippet values carry
# <em class='b-txt--highlight'> markup around matched keywords, so a
# multi-word English pattern can still miss titles whose words are wrapped
# separately -- confirm whether the markup should be stripped first.
pattern_se <- "軟體|工程|[Ss]oftware [Ee]ngineer"
pattern_ds_name <- "資料科學|資料分析|[Dd]ata [Ss]cientist"
pattern_ds_desc <- "資料科學|資料分析"

# Keep a posting when either its title or its description mentions a keyword.
df_se_matched <- df_se %>%
  filter(
    str_detect(jobNameSnippet, pattern_se) |
      str_detect(description, pattern_se)
  )
df_se_matched %>% select(jobNameSnippet) %>% sample_n(10)
df_se_matched %>% select(description) %>% sample_n(10)

df_ds_matched <- df_ds %>%
  filter(
    str_detect(jobNameSnippet, pattern_ds_name) |
      str_detect(description, pattern_ds_desc)
  )
df_ds_matched %>% select(jobNameSnippet) %>% sample_n(10)
df_ds_matched %>% select(description) %>% sample_n(10)

### Clean the data

# Turn a keyword-filtered 104 result table into a monthly-salary table.
#
# Steps: keep only the salary/period columns, drop negotiable-salary
# postings ("待遇面議"), convert hourly ("時薪", 8 h x 22 days) and yearly
# ("年薪", / 12) figures to a monthly basis, replace a sentinel upper bound
# with `salaryLow + open_ended_bonus`, take the midpoint of the range, and
# collapse the experience requirement into two levels.
#
# The sentinel is detected BEFORE the unit conversions. Previously it was
# checked afterwards, so an hourly or yearly row whose upper bound was the
# sentinel had already been rescaled, no longer equalled 9999999, and kept a
# huge salaryHigh that inflated salary_mean.
# NOTE(review): 9999999 is presumably the site's marker for an open-ended
# "X or above" range -- confirm against the raw salaryDesc values.
#
# @param df               Tibble with salaryDesc, salaryLow, salaryHigh and
#                         periodDesc columns.
# @param type_label       Value written to the `type` column.
# @param open_ended_high  Sentinel value of salaryHigh to replace.
# @param open_ended_bonus Amount added to the monthly salaryLow to stand in
#                         for a sentinel upper bound.
# @return Tibble of the salary/period columns plus `salary_mean` and `type`.
clean_salary <- function(df, type_label,
                         open_ended_high = 9999999L,
                         open_ended_bonus = 40000L) {
  df %>%
    select(matches("salary|period")) %>%
    filter(!str_detect(salaryDesc, "待遇面議")) %>%
    mutate(
      salaryLow = as.integer(salaryLow),
      salaryHigh = as.integer(salaryHigh)
    ) %>%
    # Remember which rows carry the sentinel before any rescaling touches it.
    mutate(.open_ended = salaryHigh == open_ended_high) %>%
    mutate(
      salaryLow = if_else(str_detect(salaryDesc, "時薪"),
                          as.integer(salaryLow * 8 * 22), salaryLow),
      salaryHigh = if_else(str_detect(salaryDesc, "時薪"),
                           as.integer(salaryHigh * 8 * 22), salaryHigh)
    ) %>%
    mutate(
      salaryLow = if_else(str_detect(salaryDesc, "年薪"),
                          as.integer(salaryLow / 12), salaryLow),
      salaryHigh = if_else(str_detect(salaryDesc, "年薪"),
                           as.integer(salaryHigh / 12), salaryHigh)
    ) %>%
    mutate(
      salaryHigh = if_else(.open_ended,
                           as.integer(salaryLow + open_ended_bonus),
                           salaryHigh)
    ) %>%
    select(-.open_ended) %>%
    mutate(salary_mean = (salaryLow + salaryHigh) / 2) %>%
    mutate(
      periodDesc = if_else(str_detect(periodDesc, "不拘"), "經歷不拘", "要求年資")
    ) %>%
    mutate(type = type_label)
}

# Software engineering: keep postings whose title or description mentions the
# keywords, then drop QA / software-testing roles. The description pattern
# used to end in "ata scientist" (copy-paste slip) and the title pattern
# spelled "Sofware"; both now use the intended "Software Engineer".
df_se_clean <- df_se %>%
  filter(
    str_detect(jobNameSnippet, "軟體|工程|[Ss]oftware [Ee]ngineer") |
      str_detect(description, "軟體|工程|[Ss]oftware [Ee]ngineer")
  ) %>%
  filter(!str_detect(jobNameSnippet, "QA|軟體測試")) %>%
  clean_salary("軟體工程")

# Data science: "[Da]ata" is corrected to "[Dd]ata" so lowercase "data
# scientist" titles also match.
df_ds_clean <- df_ds %>%
  filter(
    str_detect(jobNameSnippet, "資料科學|資料分析|[Dd]ata [Ss]cientist") |
      str_detect(description, "資料科學|資料分析")
  ) %>%
  clean_salary("資料科學")

### Plot
# Box plots of the midpoint monthly salary for each job type, with one facet
# row per experience requirement. coord_flip() lays the boxes out
# horizontally, and the salary axis is labelled in thousands ("k").
df_se_clean %>%
  bind_rows(df_ds_clean) %>%
  ggplot(aes(x = type, y = salary_mean, fill = type)) +
  geom_boxplot() +
  coord_flip() +
  facet_wrap(periodDesc ~ ., nrow = 2) +
  scale_y_continuous(labels = scales::number_format(suffix = "k", scale = 1e-3)) +
  theme_bw() +
  # The fill legend repeats the axis labels, so hide it. `fill = FALSE` is
  # deprecated in ggplot2; "none" is the supported spelling.
  guides(fill = "none") +
  labs(
    x = "職缺類型",
    y = "平均月薪",
    title = "104人力銀行資料科學與軟體工程職缺的薪資分佈",
    caption = "資料：各爬取約 200 筆後剔除無關者"
  ) +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    # A CJK-capable font is needed so the Chinese labels render.
    text = element_text(family = "Noto Sans CJK TC Medium")
  )

#> # A tibble: 10 × 1
#>    jobNameSnippet                                                               
#>    <chr>                                                                        
#>  1 <em class='b-txt--highlight'>Software</em> <em class='b-txt--highlight'>Engi…
#>  2 <em class='b-txt--highlight'>軟體工程師</em>                                 
#>  3 高級<em class='b-txt--highlight'>軟體工程師</em>                             
#>  4 嵌入式 Linux 系統<em class='b-txt--highlight'>軟體工程師</em> Embedded Linux…
#>  5 <em class='b-txt--highlight'>Software</em> <em class='b-txt--highlight'>Engi…
#>  6 <em class='b-txt--highlight'>軟體工程師</em>                                 
#>  7 <em class='b-txt--highlight'>軟體工程師</em>                                 
#>  8 <em class='b-txt--highlight'>軟體工程師</em>                                 
#>  9 <em class='b-txt--highlight'>軟體工程師</em>                                 
#> 10 <em class='b-txt--highlight'>軟體工程師</em>                                 
#> # A tibble: 10 × 1
#>    description                                                                  
#>    <chr>                                                                        
#>  1 "工作內容：\r\n作為產品工程設計團隊的一員，[[[軟體工程]]]師，須與大家一起驅… 
#>  2 "1. 熟悉現有原廠[[[軟體]]]中的範例並支援客戶使用。\n2. 影像處理[[[軟體]]]使… 
#>  3 "1. 負責[[[軟體]]]編寫，與部門同事共同完成產品開發及測試。\n2. 熟悉Raspberry…
#>  4 "《工作說明》\n主要職責在於開發電力監控系統平台，使客戶端安裝之監控設備持續… 
#>  5 "1.熟悉TCP/IP網路\n2.監控[[[軟體]]]及客戶端應用程式開發設計\n3.熟悉撰寫Java… 
#>  6 "1.  [[[軟體]]]與使用者介面設計\r\n2. 軟硬體控制與整合\r\n3. 自動化/光學檢測…
#>  7 "1.自動化設備程式設計及維護 \n2.設計修改變更作業 \n3.技術文件製作\n4.客戶端… 
#>  8 "1.負責維護行業別專屬套裝化[[[軟體]]]系統。\r\n2.負責客製化[[[軟體]]]之設計… 
#>  9 "我們正在積極尋找具有熱情的[[[軟體工程]]]師，\n我們提供學習時間及發揮空間。\…
#> 10 "1. 公司官網專案開發 \r\n2. 官網與電子商務系統整合開發 \r\n3. 公司官網程式維…
#> # A tibble: 10 × 1
#>    jobNameSnippet                                                               
#>    <chr>                                                                        
#>  1 <em class='b-txt--highlight'>資料科學</em>工程師（Data Science Scala/Java En…
#>  2 AI及數據平台規劃師                                                           
#>  3 【企業管理諮詢服務】<em class='b-txt--highlight'>資料科學</em>家 Data Scient…
#>  4 V行銷<em class='b-txt--highlight'>資料科學</em>家                            
#>  5 台灣奧美集團 Experience - Senior UX Designer 資深使用者經驗設計師            
#>  6 【數據】專案管理師 (數數發中心,DDT)                                          
#>  7 資深人工智慧<em class='b-txt--highlight'>資料科學</em>家_台達研究院(台北)    
#>  8 專任研究助理                                                                 
#>  9 專案管理師 PM/Project Management -數據科技(數數發中心, DDT)                  
#> 10 <em class='b-txt--highlight'>資料科學</em>分析師（Data Science R&amp;D）- <e…
#> # A tibble: 10 × 1
#>    description                                                                  
#>    <chr>                                                                        
#>  1 "展、布局全球，引領世界進入直播新媒體世代。\n\n歡迎對以下工作內容有興趣的 Sr…
#>  2 "入生醫產業、具訊號處理、[[[資料科學]]]需求的產業。\n\n工作內容：\n1. 基礎8… 
#>  3 "1.人工智慧演算法設計與開發  \n2.影像處理演算法設計與開發 \n    至少完成過一…
#>  4 "[我們的團隊介紹與文化特色]\n\n- 我們是集團於中部研發中心的[[[資料科學]]]團… 
#>  5 "地產數據做大數據分析及推測 )\r\n\r\n我們目前要建置一個不動產新創平台，供專… 
#>  6 "* Role Description *\nWe are looking for NLP Data Scientists who are intere…
#>  7 "1. [[[資料]]]分析軟體模組開發、系統整合與測試。 2. 針對[[[資料科學]]]應用主…
#>  8 "必要項目：\n  、對維護高品質代碼有所堅持\n  、對高齡照護產業有興趣\n  、有… 
#>  9 "TenMax 是由一群充滿熱情、活力與專業的數位行銷專家、軟體工程師與[[[資料科學]…
#> 10 "\n2. 你擅長在快速變動的環境下，權衡上層擬定的策略目標，綜合各面向的考量與其…

B. scrape google trend（50分）

(1) scraping（30分）

請到 Google 每日搜尋趨勢，以抓取 JSON 的方式，爬下最近一週的搜尋趨勢結果，這裡的一週沒有規定具體日期，從你做作業當天為基礎即可。

結果部分請加入日期欄位，先印出前 10 個列，接著印出各個日期的筆數（參考程式碼：df %>% count(date)）。

提示： a. API url 的長相類似這樣 - “https://trends.google.com.tw/trends/api/”，後面還有東西，可以去找一下
b. 可以用迴圈寫，並且在迴圈中更新 API url 的日期
c. 直接用 fromJSON() 會出事，因為 Google 很壞，它故意在結果的前 5 個字塞入不相干的東西阻礙你，建議你可以用 str_sub() 抓取正確的字串再轉換成 JSON

### your code
# The scraping loop below is disabled: it was run once and its result cached
# to data/AS05/df_res.rds, which the live code after it reads back.
# For each of 7 iterations it requests the daily-trends JSON for one `ed=`
# date, drops the first 5 characters of the response body (the non-JSON
# prefix) before parsing, takes the first element of
# `default$trendingSearchesDays$trendingSearches`, tags it with the date,
# appends it to df_res, and sleeps 10 seconds.
# NOTE(review): date_start is a plain number, so `date_start+1-i` is numeric
# subtraction rather than calendar arithmetic; a window that crosses a month
# boundary would produce a non-existent date. Fix before re-enabling.
# NOTE(review): `i <- index_now` overrides the loop variable on every pass,
# and index_now is advanced only at the end of a pass -- apparently so the
# loop can be resumed from index_now after a failed request; confirm.
# date_start <- 20220427
# index_now <- 1
# i <- index_now
# df_res <- tibble()
# for (i in index_now:7) {
#   i <- index_now
#   url <- str_c("https://trends.google.com.tw/trends/api/dailytrends?hl=zh-TW&tz=-480&geo=TW&ns=15&ed=", (date_start+1-i))
#   res <- GET(url)
#   raw <- res %>% content("text")
#   res_json <- str_sub(raw, 6, str_length(raw)) %>% fromJSON()
#   # res_json %>% str()
#   df_res_tmp <- res_json$default$trendingSearchesDays$trendingSearches[[1]] %>% as_tibble() %>%
#     mutate(date = date_start+1-i)
#   df_res <- df_res %>% bind_rows(df_res_tmp)
#   message(i)
#   index_now <- index_now + 1
#   Sys.sleep(10)
# }
# df_res %>% write_rds("data/AS05/df_res.rds")
# Load the cached result, then print the first 10 rows and the row count per date.
df_res <- read_rds("data/AS05/df_res.rds")
df_res %>% head(10)
df_res %>% count(date)

#> # A tibble: 10 × 7
#>    title$query    $exploreLink    formattedTraffic relatedQueries image$newsUrl 
#>    <chr>          <chr>           <chr>            <list>         <chr>         
#>  1 取消實聯制     /trends/explor… 20萬+            <df [17 × 2]>  https://www.e…
#>  2 推特           /trends/explor… 5萬+             <df [5 × 2]>   https://www.c…
#>  3 軟性封城       /trends/explor… 2萬+             <df [3 × 2]>   https://www.e…
#>  4 實聯制         /trends/explor… 2萬+             <df [0 × 0]>   https://udn.c…
#>  5 林襄           /trends/explor… 2萬+             <df [1 × 2]>   https://tw.ne…
#>  6 確診者隔離天數 /trends/explor… 1萬+             <df [1 × 2]>   https://www.e…
#>  7 桂冠出版社     /trends/explor… 1萬+             <df [0 × 0]>   https://tw.ne…
#>  8 台南市衛生局   /trends/explor… 1萬+             <df [1 × 2]>   https://udn.c…
#>  9 籃網           /trends/explor… 5000+            <df [0 × 0]>   https://udn.c…
#> 10 蔡京京         /trends/explor… 5000+            <df [0 × 0]>   https://www.e…
#> # … with 3 more variables: articles <list>, shareUrl <chr>, date <dbl>
#> # A tibble: 7 × 2
#>       date     n
#>      <dbl> <int>
#> 1 20220421    17
#> 2 20220422    20
#> 3 20220423    20
#> 4 20220424    20
#> 5 20220425    20
#> 6 20220426    20
#> 7 20220427    20

(2) cleaning（20分）

這個資料的原始結構有點複雜，請嘗試列出 id, date, query, formattedTraffic, title, source, url, snippet 等欄位。其中，id 和 date 是自行加入的欄位，其他都是上一小題就能抓到的結果。這題比較難，所以配分低一些。

提示： a. id: 可以利用 mutate(id = row_number())
b. 原始的 dataframe 不是一般常見的 dataframe，有些 column 本身是 list，又被稱為 nested dataframe，處理起來很麻煩，建議可以用 $
c. 會用到 2 次 left_join()

# Flatten the nested trends table to one row per (search term, news article).
# `id` is the row position of the search term in df_res, stored as character
# so it lines up with the ids that bind_rows(.id = "id") generates for the
# `articles` list column.
# NOTE(review): this assumes the `articles` list is unnamed, so the generated
# ids are positions ("1", "2", ...); the printed ids suggest so -- confirm.
# Both joins name their key explicitly: relying on the implicit natural join
# prints a message and would silently change keys if the tables ever shared
# another column name.
df_res$title %>%
  as_tibble() %>%
  mutate(id = as.character(row_number())) %>%
  # One row per article, tagged with the id of the search term it belongs to.
  left_join(
    df_res$articles %>%
      bind_rows(.id = "id") %>%
      as_tibble(),
    by = "id"
  ) %>%
  # Bring back the per-term fields that live at the top level of df_res.
  left_join(
    df_res %>%
      select(formattedTraffic, date) %>%
      mutate(id = as.character(row_number())),
    by = "id"
  ) %>%
  select(id, date, query, formattedTraffic, title, source, url, snippet) %>%
  arrange(desc(date))

#> # A tibble: 863 × 8
#>    id        date query  formattedTraffic title     source  url       snippet   
#>    <chr>    <dbl> <chr>  <chr>            <chr>     <chr>   <chr>     <chr>     
#>  1 21    20220427 食藥署 10萬+            快篩實名… ETtoday https://… 指揮中心… 
#>  2 21    20220427 食藥署 10萬+            家用快篩… ELLE …  https://… 在本土Cov…
#>  3 21    20220427 食藥署 10萬+            懶人包／… udn 元… https://… 本土疫情… 
#>  4 21    20220427 食藥署 10萬+            COVID-19… Heho健… https://… 因應確診… 
#>  5 21    20220427 食藥署 10萬+            懶人包／… Yahoo…  https://… 三、販售… 
#>  6 21    20220427 食藥署 10萬+            黑髮漂色… 健康醫… https://… 東方人的… 
#>  7 21    20220427 食藥署 10萬+            快篩實名… 遠見雜… https://… 指揮中心… 
#>  8 22    20220427 陳昱瑋 10萬+            勾惡「陳… 台灣蘋… https://… 網紅連千… 
#>  9 22    20220427 陳昱瑋 10萬+            勾惡幫主… ETtoda… https://… YouTube頻…
#> 10 22    20220427 陳昱瑋 10萬+            遭連千毅… 三立新… https://… 網紅直播… 
#> # … with 853 more rows

AS05_Web-Scraping-JSON_ref

曾子軒 Teaching Assistant

2022/04/28

作業目的

A. scrape 104 & compare salary（50分）

B. scrape google trend（50分）

(1) scraping（30分）

(2) cleaning（20分）