Fork me on GitHub

python爬虫

错误

1、使用 urllib 模块的 request.urlopen 报错

简单的爬虫

1
2
3
4
5
6
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# FIX: the original first line was '#usr/bin/python', which is just a plain
# comment — a shebang needs the '#!' prefix and an absolute path for the
# script to be directly executable.

# Minimal crawler: fetch the Douban movie homepage and print the raw HTML.
from urllib import request

# Raises URLError (or, on macOS builds without root certificates, an SSL
# certificate verification error — the subject of this article).
response = request.urlopen('https://movie.douban.com/')
content = response.read().decode('utf-8')  # Douban serves UTF-8 pages
print(content)

报错

mac python 3.7

解决

Mac 上安装的 Python 二进制版本默认没有配置根证书,导致对 HTTPS 站点的 SSL 证书验证失败
解决办法:手动创建一个跳过 SSL 证书验证的上下文

1
2
3
4
5
6

import ssl
from urllib import request

# Build an SSL context that skips certificate verification, working around
# macOS Python builds that ship without root certificates installed.
# NOTE(review): ssl._create_unverified_context is a private helper; for
# anything beyond a quick script, installing certificates is the real fix.
context = ssl._create_unverified_context()

# Fetch the page through the relaxed context and decode the UTF-8 body.
url = 'https://movie.douban.com/'
response = request.urlopen(url, context=context)
content = response.read().decode('utf-8')

爬虫实例

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from urllib import request
# from mysqlclient import *
import re
import ssl

context = ssl._create_unverified_context()

class MovieTop(object):
    """Scrape the Douban Top-250 movie chart and save it to a text file.

    Workflow: get_movie_info() pages through the chart 25 movies at a time,
    parses each page with a regex, and write_text() dumps the collected
    rows to self.file_path.
    """

    def __init__(self):
        self.start = 0                 # offset of the next page to fetch (25 per page)
        self.param = '&filter='        # query-string suffix Douban expects
        # Browser-like User-Agent so the request is not rejected outright.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}
        self.movie_list = []           # parsed rows, 12 string fields each
        self.file_path = './test.txt'  # output file used by write_text()

    def get_page(self):
        """Fetch the next chart page.

        On success, advances self.start by 25 and returns the decoded HTML;
        on failure, prints the reason and returns None.
        """
        try:
            url = ('https://movie.douban.com/top250?start='
                   + str(self.start) + str(self.param))
            req = request.Request(url, headers=self.headers)
            # `context` is the module-level unverified SSL context.
            response = request.urlopen(req, context=context)
            page = response.read().decode('utf-8')
            # FIX: '//' keeps the page number an int — '/' is true division
            # in Python 3 and printed page numbers like '1.0'.
            page_num = self.start // 25 + 1
            print('正在抓取第' + str(page_num) + '页数据。。。')
            self.start += 25
            return page
        except request.URLError as e:
            if hasattr(e, 'reason'):
                # FIX: e.reason may be an OSError object, not a str —
                # direct concatenation raised TypeError instead of
                # printing the failure reason.
                print('抓取失败,失败原因:' + str(e.reason))

    def get_movie_info(self):
        """Scrape every page of the chart and append rows to self.movie_list."""
        pattern = re.compile(u'<em.*?class="">(.*?)</em>.*?'
                             + u'<span.*?class="title">(.*?)</span>.*?'
                             + u'<span.*?class="title">&nbsp;/&nbsp;(.*?)</span>.*?'
                             + u'<span.*?class="other">&nbsp;/&nbsp;(.*?)</span>.*?'
                             + u'<p.*?class="">.*?导演:(.*?)&nbsp;&nbsp;&nbsp;主演:(.*?)...<br>'
                             + u'(.*?)&nbsp;/&nbsp;(.*?)&nbsp;/&nbsp;(.*?)</p>.*?'
                             + u'<span.*?class="rating_num".*?property="v:average">(.*?)</span>.*?'
                             + u'<span>(.*?)人评价</span>.*?'
                             + u'<span.*?class="inq">(.*?)</span>.*?</p>.*?', re.S)

        # 10 pages: offsets 0, 25, ..., 225.
        while self.start <= 225:
            page = self.get_page()
            # FIX: get_page() returns None on network failure; the original
            # passed None straight into re.findall and crashed. Stop instead.
            if page is None:
                break
            for movie in re.findall(pattern, page):
                self.movie_list.append([
                    movie[0],                      # rank
                    movie[1],                      # title
                    movie[2],                      # foreign title
                    movie[3],                      # alias
                    movie[4].strip(),              # director
                    movie[5].strip().rstrip('/'),  # cast
                    movie[6].lstrip(),             # year
                    movie[7].lstrip(),             # country/region
                    movie[8].strip(),              # genre
                    movie[9],                      # rating
                    movie[10],                     # vote count
                    movie[11]])                    # short review

    def write_text(self):
        """Write every collected movie to self.file_path as UTF-8 text."""
        print('开始向文件写入数据。。。')
        try:
            # FIX: 'with' guarantees the file is closed even if a write
            # fails (the original used a manual open/close pair).
            with open(self.file_path, 'w', encoding='utf-8') as file_top:
                for movie in self.movie_list:
                    file_top.write('电影排名:' + movie[0] + '\r\n')
                    file_top.write('电影名称:' + movie[1] + '\r\n')
                    file_top.write('外文名称:' + movie[2] + '\r\n')
                    file_top.write('电影别名:' + movie[3] + '\r\n')
                    file_top.write('导演姓名:' + movie[4] + '\r\n')
                    file_top.write('主演名称:' + movie[5] + '\r\n')
                    file_top.write('上映年份:' + movie[6] + '\r\n')
                    file_top.write('制作国家/地区:' + movie[7] + '\r\n')
                    file_top.write('电影类别:' + movie[8] + '\r\n')
                    file_top.write('电影评分:' + movie[9] + '\r\n')
                    file_top.write('参评人数:' + movie[10] + '\r\n')
                    file_top.write('简短影评:' + movie[11] + '\r\n\n')
        except Exception as e:
            # Best-effort: report the failure but do not abort the run.
            print(e)

    def main(self):
        """Entry point: scrape all pages, then persist the results."""
        print("开始抓取数据")
        self.get_movie_info()
        self.write_text()
        print('数据抓取完毕')

# Script entry point: scrape the Top-250 chart and write it to disk.
if __name__ == '__main__':
    MovieTop().main()
-------------本文结束感谢阅读-------------