Python 爬虫笔记
Python 爬虫需要用到的一些东西,主要是 urllib、bs4 和 re 库。
urllib 获取数据
简单的请求:
import urllib.request

# Simplest possible GET: open the URL, read the raw bytes, decode, print.
response = urllib.request.urlopen("http://www.baidu.com")
html_bytes = response.read()
print(html_bytes.decode("utf-8"))
简单的 post 请求:
import urllib.request
import urllib.parse

# POST: urlencode the form fields, then encode the string to bytes —
# urlopen sends a POST whenever a `data` body is supplied.
form = urllib.parse.urlencode({"hello": "world"})
data = bytes(form, encoding="utf-8")
response = urllib.request.urlopen("http://httpbin.org/post", data=data)
print(response.read().decode("utf-8"))
超时处理:
import urllib.request
import urllib.error  # originally relied on urllib.request importing this implicitly
import socket

# Demonstrate the `timeout` parameter: 0.01 s is far too short, so the
# request raises URLError wrapping a socket timeout.
try:
    response = urllib.request.urlopen("http://httpbin.org/get", timeout=0.01)
    print(response.read().decode("utf-8"))
except urllib.error.URLError as e:
    # URLError also covers DNS failures and refused connections; only a
    # genuine socket timeout should be reported as "time out".
    if isinstance(e.reason, socket.timeout):
        print("time out")
    else:
        raise
构造请求头 get:
# GET with a browser-like User-Agent header: build a Request object first
# instead of passing the URL straight to urlopen.
target = "https://www.douban.com"
ua_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/74.0"
}
req = urllib.request.Request(url=target, headers=ua_headers)
response = urllib.request.urlopen(req)
print(response.read().decode("utf-8"))
构造请求头 post:
# POST with custom headers: a Request carrying a body plus an explicit
# method="POST".
url = "https://httpbin.org/post"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/74.0"
}
payload = urllib.parse.urlencode({"name": "eric"})
data = bytes(payload, encoding="utf-8")
req = urllib.request.Request(url=url, data=data, headers=headers, method="POST")
response = urllib.request.urlopen(req)
bs4 解析数据
创建对象:
# Parse a locally saved page with the stdlib html.parser backend.
# Requires `from bs4 import BeautifulSoup` (bs4 is a third-party package).
# The original left the file handle open; a context manager guarantees it
# is closed even if read() raises.
with open("./baidu.html", "rb") as file:
    html = file.read()
bs = BeautifulSoup(html, "html.parser")
获取 Tag(HTML 标签)内容:
# --- Access tags directly as attributes of the soup ---
print(bs.title)
print(bs.head)
print(bs.a)  # first <a> in the document
# --- Tag metadata ---
print(bs.head.name)
print(bs.a.attrs)
print(bs.a["class"])
# --- Mutate nodes ---
# The original wrote `print(bs.a["class"] = "newclass")`, which is a
# SyntaxError: assignment is a statement and cannot appear inside a call.
# Assign first, then print the new value.
bs.a["class"] = "newclass"
print(bs.a["class"])
del bs.a["class"]
# --- Inner text of a tag ---
print(bs.title.string)
# BeautifulSoup itself is a special Tag object representing the whole document.
print(bs.name)
print(bs.a.attrs)
搜索文档树:
def name_is_exists(tag):
    """Predicate filter: True for tags that carry a `name` attribute."""
    return tag.has_attr("name")

# find_all accepts several filter kinds — a literal tag name, a compiled
# regular expression matched against tag names, or a callable predicate.
for query in ("a", re.compile(".a"), name_is_exists):
    t_list = bs.find_all(query)
    print(t_list)
# --- Keyword-argument filters ---
# The original wrote `bs.find_all("id"="head")`, a SyntaxError: keyword
# names are identifiers, not string literals.
t_list = bs.find_all(id="head")
t_list = bs.find_all(class_=True)  # class_ because `class` is reserved
for item in t_list:
    print(item)
# --- Filter on the tag's text content ---
# NOTE(review): bs4 >= 4.4 prefers the name `string=`; `text=` still works.
t_list = bs.find_all(text="hao123")
t_list = bs.find_all(text=["hao123", "贴吧"])
for item in t_list:
    print(item)
CSS 选择器:
# The select() API takes CSS selectors and returns a list of matches.
selectors = (
    "title",            # by tag name
    ".mnav",            # by class
    "#u1",              # by id
    "a[class='bri']",   # by attribute value
    "head > title",     # direct child
    ".mnav ~ .bri",     # following sibling
)
for css in selectors:
    print(bs.select(css))
re 正则表达式匹配
查找:
# search scans the whole string and returns the first match (or None).
first_match = re.search("asd", "Aasd")
print(first_match)
# findall returns every non-overlapping match as a list of strings.
print(re.findall("a", "shhcasfha"))
# Character classes work the same way: every uppercase letter, in order.
print(re.findall("[A-Z]", "SHCHhh"))
替换:
# Replace every "a" with "A"; sub substitutes all matches, not just the first.
pattern = re.compile("a")
print(pattern.sub("A", "ajsjajjs"))