# 進修深造 - 爬取多頁的Python語法練習 @ TS的野獸派部落格

# 我的環境是使用Anaconda3

# 範例是博客來書店

# 爬取的關鍵字是"大數據"

# 因為博客來的查詢結果有多個分頁,所以請複製有分頁的結果傳輸

# 博客來的網站取值是用get的方式

# -----------------------------------------------------------------------------------------------------------------------------------------------

# 使用 requests, BeautifulSoup 和 pandas 的套件

import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Inclusive range of search-result pages to fetch. The original author notes
# that Books.com.tw limits crawlers and refuses access beyond about 28 pages,
# so larger ranges should be fetched in batches or with a longer delay.
FIRST_PAGE = 1
LAST_PAGE = 20
REQUEST_DELAY_SECONDS = 1    # pause between consecutive requests
REQUEST_TIMEOUT_SECONDS = 10  # do not wait forever on a stalled request

# Search-result URL for the keyword "大數據" (percent-encoded in the path).
# The site takes its query via GET; {page} is replaced by the page number.
SEARCH_URL = (
    "https://search.books.com.tw/search/query/cat/all/key/"
    "%E5%A4%A7%E6%95%B8%E6%93%9A/sort/1/page/{page}/v/0/"
)

# Column order of the per-page and combined DataFrames.
COLUMNS = ['書名', '價格', '出版商', '類型']


def _parse_page(html):
    """Parse one search-result page into a DataFrame.

    Args:
        html: HTML text of one search-result page.

    Returns:
        A DataFrame with the columns 書名 (title), 價格 (price),
        出版商 (publisher) and 類型 (category). Values are matched up by
        their position on the page; shorter columns are padded with NaN.

    Raises:
        KeyError: if a matched cover image has no ``alt`` attribute or a
            matched publisher link has no ``title`` attribute.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Titles are taken from the alt text of the cover images.
    books = [cover['alt'] for cover in soup.select("img[class='itemcov']")]

    # The page ends with some recommended books that are not query results,
    # so only the first len(books) price spans are considered.
    prices = []
    for price in soup.select("span[class='price']")[:len(books)]:
        bolds = price.select('b')
        if len(bolds) == 1:      # price only
            prices.append(bolds[0].string)
        elif len(bolds) == 2:    # discount rate followed by the price
            prices.append(bolds[1].string)
        else:
            # Unrecognised layout: keep a placeholder so that the following
            # prices stay on the same row as their books.
            prices.append(None)

    mid_publishs = [
        link['title'] for link in soup.select("a[rel='mid_publish']")
    ]
    # .string is None when the span does not hold a single text node; the
    # None is kept so the remaining categories do not shift up a row.
    cats = [span.string for span in soup.select("span[class='cat']")]

    # Wrapping each list in a Series lets pandas align columns of unequal
    # length by index (padding with NaN) instead of raising ValueError.
    return pd.DataFrame(
        {
            '書名': pd.Series(books, dtype=object),
            '價格': pd.Series(prices, dtype=object),
            '出版商': pd.Series(mid_publishs, dtype=object),
            '類型': pd.Series(cats, dtype=object),
        },
        columns=COLUMNS,
    )


# Collect one DataFrame per page and concatenate once at the end;
# Series.append / DataFrame.append no longer exist in pandas 2.x.
frames = []
for page in range(FIRST_PAGE, LAST_PAGE + 1):
    if page > FIRST_PAGE:
        # The site limits crawlers, so pause between requests.
        time.sleep(REQUEST_DELAY_SECONDS)

    urls = SEARCH_URL.format(page=page)
    try:
        res = requests.get(urls, timeout=REQUEST_TIMEOUT_SECONDS)
        res.raise_for_status()
    except requests.RequestException as err:
        # Report the failure and move on, so the pages already collected are
        # kept and an error page is never parsed as if it were results.
        print("page {} skipped: {}".format(page, err))
        continue

    # df holds the books scraped from the current page.
    df = _parse_page(res.text)
    if not df.empty:
        frames.append(df)

# dfs is the combined data set of every page that was fetched.
if frames:
    dfs = pd.concat(frames, ignore_index=True)
else:
    dfs = pd.DataFrame(columns=COLUMNS)