Python 爬虫需要用到的一些东西,主要是 urllib、bs4 和 re 库。

urllib 获取数据

简单的请求:

import urllib.request

# Plain GET: fetch the page and print the decoded body.
resp = urllib.request.urlopen("http://www.baidu.com")
print(resp.read().decode("utf-8"))

简单的 post 请求:

import urllib.request
import urllib.parse

# POST: urlencode the form fields, then encode to bytes —
# urlopen requires a bytes payload for `data`.
payload = urllib.parse.urlencode({"hello": "world"}).encode("utf-8")
resp = urllib.request.urlopen("http://httpbin.org/post", data=payload)
print(resp.read().decode("utf-8"))

超时处理:

import socket
import urllib.error
import urllib.request

# Timeout demo: 0.01 s is deliberately too short, so urlopen raises.
try:
    response = urllib.request.urlopen("http://httpbin.org/get", timeout=0.01)
    print(response.read().decode("utf-8"))
except urllib.error.URLError as e:
    # The original printed "time out" for ANY URLError (DNS failure,
    # refused connection, ...). Only report a timeout when the wrapped
    # reason really is one; re-raise everything else.
    if isinstance(e.reason, socket.timeout):
        print("time out")
    else:
        raise

构造请求头 get:

# GET with an explicit User-Agent so the site does not reject
# urllib's default client string.
url = "https://www.douban.com"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/74.0"
}

request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
print(response.read().decode("utf-8"))

构造请求头 post:

# POST with both a custom header and a form payload.
url = "https://httpbin.org/post"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/74.0"
}
data = urllib.parse.urlencode({"name": "eric"}).encode("utf-8")

request = urllib.request.Request(url=url, data=data, headers=headers, method="POST")
response = urllib.request.urlopen(request)

bs4 解析数据

创建对象:

# Read a locally saved page and build the parse tree.
# NOTE: requires `from bs4 import BeautifulSoup`.
# The original opened the file and never closed it; the context
# manager guarantees the handle is released.
with open("./baidu.html", "rb") as file:
    html = file.read()
bs = BeautifulSoup(html, "html.parser")

获取 Tag(HTML 标签)内容:

# --- Inspect Tag objects ---

# First occurrence of each tag in the document.
print(bs.title)
print(bs.head)
print(bs.a)

# Tag metadata: tag name and attribute dict.
print(bs.head.name)
print(bs.a.attrs)
print(bs.a["class"])

# Mutate attributes. The original `print(bs.a["class"] = "newclass")`
# was a SyntaxError: assignment is a statement, not an expression,
# so it cannot appear inside a call. Assign first, then print.
bs.a["class"] = "newclass"
print(bs.a["class"])
del bs.a["class"]

# Text directly inside a tag.
print(bs.title.string)

# BeautifulSoup itself is a special Tag representing the whole document.
print(bs.name)
print(bs.a.attrs)

搜索文档树:

# Search the tree by literal tag name, then by a regex applied
# to the tag names.
for query in ("a", re.compile(".a")):
    print(bs.find_all(query))

# Predicate filter: find_all keeps the tags for which this returns True.
def name_is_exists(tag):
    """Return True when *tag* carries a ``name`` attribute."""
    return tag.has_attr("name")

# Run the predicate filter over the whole document tree.
print(bs.find_all(name_is_exists))

# Keyword filters: match tags by attribute value.
# The original `bs.find_all("id"="head")` was a SyntaxError —
# keyword argument names must be bare identifiers, not strings.
t_list = bs.find_all(id="head")
# `class` is a Python keyword, so bs4 spells the filter `class_`;
# True matches any tag that has the attribute at all.
t_list = bs.find_all(class_=True)

for item in t_list:
    print(item)

# Match nodes by their string content.
# `text=` was renamed to `string=` in Beautiful Soup 4.4.0; the old
# spelling still works but is deprecated.
t_list = bs.find_all(string="hao123")
t_list = bs.find_all(string=["hao123", "贴吧"])

for item in t_list:
    print(item)

CSS 选择器:

# CSS selectors via select() — each query returns a list of matches.
for selector in (
    "title",            # by tag name
    ".mnav",            # by class
    "#u1",              # by id
    "a[class='bri']",   # by attribute value
    "head > title",     # direct child
    ".mnav ~ .bri",     # following siblings
):
    print(bs.select(selector))

re 正则表达式匹配

查找:

# re.search: first match anywhere in the string (or None).
# re.findall: every non-overlapping match, as a list.
for result in (
    re.search("asd", "Aasd"),
    re.findall("a", "shhcasfha"),
    re.findall("[A-Z]", "SHCHhh"),
):
    print(result)

替换:

# Replace every "a" with "A" and print the result.
replaced = re.sub("a", "A", "ajsjajjs")
print(replaced)