最近在做一个项目,需要爬取800支股票的股吧评论数据,时间范围是2014年4月1日至今。因为数据量实在太大,所以没有选择xpath这种对网页结构要求比较高的方法,而是选择用re(正则表达式)匹配文本来获取数据。用re写爬虫时,需要重点关注网页源代码的结构以及正则表达式的写法。
获取网址
需要先获取股吧每一页上每一条帖子的网址
每一条帖子在列表页中的网页源码大致如下,其中 <a href="/news,002389,911081289.html" title="可以买了。其实利好挺多的"> 里 href 的值就是需要爬取的帖子网址的一部分(相对路径),需要与主网址拼接后才能访问。
<span class="l5 a5">03-10 09:13</span>
</div>
<div class="articleh normal_post">
<span class="l1 a1">342</span>
<span class="l2 a2">0</span>
<span class="l3 a3"><a href="/news,002389,911081289.html" title="可以买了。其实利好挺多的">可以买了。其实利好挺多的</a></span>
<span class="l4 a4"><a href="http://i.eastmoney.com/6390094229493668" data-popper="6390094229493668" data-poptype="1" target="_blank"><font>zhaodong1961</font></a><input type="hidden" value="0" /></span>
先确定匹配规则:pattern = re.compile('<div class="articleh normal_post">.*?href="(.*?)".*?</div>', re.S|re.M)。re.compile 会编译正则表达式,生成一个正则表达式(Pattern)对象,供 re.findall 使用。该正则表达式表示:在 <div class="articleh normal_post"> 到下一个 </div> 之间,取出第一个 href="..." 引号里的网址。
.*? 表示进行非贪婪匹配:. 匹配任意单个字符,* 表示重复任意多次,紧跟其后的 ? 是非贪婪匹配符,要求正则匹配得越少越好。
.匹配除\n以外的任何字符,即匹配一行。行以\n区分。如果不使用re.S,则只在每一行内匹配,如果一行没有,就换下一行重新开始,不会跨行。而使用re.S,正则表达式会将字符串作为一个整体,将\n当做普通字符加入到字符串中,在整体中进行匹配。
找到网址后,需进一步处理:筛除问答帖,并与主网址拼接形成完整的网址。像下面这种悬赏帖就要筛除掉,它的源代码如下:<span class="l3 a3">悬赏<a href="http://ask.eastmoney.com/detail.html?qid=394572104317014016" title="【¥180.00】内外因素共振,医疗ETF好戏刚开始?">。它的 href 是以 http 开头的完整网址(并且指向 ask 子站),而普通帖子的 href 是相对路径,利用这一点(代码中同时检查 http 和 ask)就可以筛掉。
def getpost(h):
    """Extract the absolute URLs of ordinary posts from one list page.

    Args:
        h: HTML source of a guba list page (str). ``None`` or an empty
            string is treated as a page without posts.

    Returns:
        A list of absolute post URLs, in page order. Links whose ``href``
        contains ``http`` or ``ask`` (the Q&A / bounty posts) are skipped.
    """
    if not h:
        return []
    # re.S lets ``.`` span newlines, because one post entry covers several
    # lines of HTML between its opening <div> and the next </div>.
    pattern = re.compile(
        r'<div class="articleh normal_post">.*?href="(.*?)".*?</div>',
        re.S | re.M,
    )
    base = "http://guba.eastmoney.com"
    # Ordinary posts use a site-relative href; it is appended to the base URL.
    return [
        base + href
        for href in pattern.findall(h)
        if 'http' not in href and 'ask' not in href
    ]
获取评论
与获取网址大同小异,还相对简单一点。
def postcontent(ll, sheet, n, headers):
    """Download each post in ``ll`` and append time, title and body to ``sheet``.

    Relies on the module-level names ``requests``, ``workbook``, ``e1`` and
    ``e2``.

    Args:
        ll: Absolute post URLs belonging to one list page.
        sheet: Worksheet receiving one row per post. It must provide
            ``append(iterable)``, as openpyxl worksheets do.
        n: Index of the list page being processed. Kept so existing callers
            keep working; rows are appended after the last used row, so it is
            no longer needed to compute a row number.
        headers: HTTP headers passed to ``requests.get``.

    Returns:
        ``1`` as soon as a post published before the cutoff timestamp has
        been handled (the caller should stop crawling), otherwise ``None``.
    """
    cutoff = '2014-03-31 24:00:00'
    content_re = re.compile('.*post_content":"(.*?)","post_abstract.*')
    title_re = re.compile('.*"post_title":"(.*?)","post_content.*')
    time_re = re.compile('.*post_publish_time":"(.*?)","post_last_time.*')

    try:
        for url in ll:
            try:
                page = requests.get(url, headers=headers, timeout=5).text
            except requests.RequestException:
                print('e1')
                print(url)
                e1.append(url)
                # Skip the post: falling through would parse the text left
                # over from the previous post and store it a second time.
                continue

            time_m = time_re.search(page)
            title_m = title_re.search(page)
            content_m = content_re.search(page)

            stored = False
            if time_m and title_m and content_m:
                try:
                    # Appending (instead of computing row = m + len(ll) * n)
                    # keeps rows contiguous even when list pages yield
                    # different numbers of posts.
                    sheet.append(
                        [time_m.group(1), title_m.group(1), content_m.group(1)]
                    )
                    stored = True
                except Exception:
                    # Best effort: a value the worksheet refuses to store
                    # must not abort the rest of the page.
                    stored = False
            if not stored:
                print('e2')
                print(url)
                e2.append(url)

            # Compare only when a timestamp was actually found; a post that
            # could not be parsed must not be mistaken for "older than the
            # cutoff" and end the crawl. Both strings are formatted as
            # "YYYY-MM-DD HH:MM:SS", so string order is chronological order.
            if time_m and time_m.group(1) < cutoff:
                print(time_m.group(1))
                print(cutoff)
                print('end')
                return 1
    finally:
        # Persist once per list page rather than once per post; rewriting the
        # whole workbook after every post grows more expensive as it fills.
        if ll:
            workbook.save(r'C:\Users\lokiii\Desktop\002416.xlsx')
    return None
完整代码
import requests
import re
from lxml import etree
import xlwt
import time
from bs4 import BeautifulSoup
import random
import openpyxl
# Excel output: a new workbook with a sheet titled "test" created at index 0.
workbook = openpyxl.Workbook()
sheet = workbook.create_sheet(title="test", index=0)

# Failure logs (lists of URLs) shared with the crawling code below:
# e1, e2 and e3 collect post URLs, e collects list-page URLs.
e1, e2, e3, e = [], [], [], []

# Request headers sent with every HTTP request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
}
def getpost(h):
    """Extract the absolute URLs of ordinary posts from one list page.

    Args:
        h: HTML source of a guba list page (str). ``None`` or an empty
            string is treated as a page without posts.

    Returns:
        A list of absolute post URLs, in page order. Links whose ``href``
        contains ``http`` or ``ask`` (the Q&A / bounty posts) are skipped.
    """
    if not h:
        return []
    # re.S lets ``.`` span newlines, because one post entry covers several
    # lines of HTML between its opening <div> and the next </div>.
    pattern = re.compile(
        r'<div class="articleh normal_post">.*?href="(.*?)".*?</div>',
        re.S | re.M,
    )
    base = "http://guba.eastmoney.com"
    # Ordinary posts use a site-relative href; it is appended to the base URL.
    return [
        base + href
        for href in pattern.findall(h)
        if 'http' not in href and 'ask' not in href
    ]
def postcontent(ll, sheet, n, headers):
    """Download each post in ``ll`` and append time, title and body to ``sheet``.

    Relies on the module-level names ``requests``, ``workbook``, ``e1`` and
    ``e2``.

    Args:
        ll: Absolute post URLs belonging to one list page.
        sheet: Worksheet receiving one row per post. It must provide
            ``append(iterable)``, as openpyxl worksheets do.
        n: Index of the list page being processed. Kept so existing callers
            keep working; rows are appended after the last used row, so it is
            no longer needed to compute a row number.
        headers: HTTP headers passed to ``requests.get``.

    Returns:
        ``1`` as soon as a post published before the cutoff timestamp has
        been handled (the caller should stop crawling), otherwise ``None``.
    """
    cutoff = '2014-03-31 24:00:00'
    content_re = re.compile('.*post_content":"(.*?)","post_abstract.*')
    title_re = re.compile('.*"post_title":"(.*?)","post_content.*')
    time_re = re.compile('.*post_publish_time":"(.*?)","post_last_time.*')

    try:
        for url in ll:
            try:
                page = requests.get(url, headers=headers, timeout=5).text
            except requests.RequestException:
                print('e1')
                print(url)
                e1.append(url)
                # Skip the post: falling through would parse the text left
                # over from the previous post and store it a second time.
                continue

            time_m = time_re.search(page)
            title_m = title_re.search(page)
            content_m = content_re.search(page)

            stored = False
            if time_m and title_m and content_m:
                try:
                    # Appending (instead of computing row = m + len(ll) * n)
                    # keeps rows contiguous even when list pages yield
                    # different numbers of posts.
                    sheet.append(
                        [time_m.group(1), title_m.group(1), content_m.group(1)]
                    )
                    stored = True
                except Exception:
                    # Best effort: a value the worksheet refuses to store
                    # must not abort the rest of the page.
                    stored = False
            if not stored:
                print('e2')
                print(url)
                e2.append(url)

            # Compare only when a timestamp was actually found; a post that
            # could not be parsed must not be mistaken for "older than the
            # cutoff" and end the crawl. Both strings are formatted as
            # "YYYY-MM-DD HH:MM:SS", so string order is chronological order.
            if time_m and time_m.group(1) < cutoff:
                print(time_m.group(1))
                print(cutoff)
                print('end')
                return 1
    finally:
        # Persist once per list page rather than once per post; rewriting the
        # whole workbook after every post grows more expensive as it fills.
        if ll:
            workbook.save(r'C:\Users\lokiii\Desktop\002416.xlsx')
    return None
# Crawl driver: walk list pages 322..647 of stock 002416, collect the post
# URLs found on each page and store every post in the worksheet. The crawl
# stops early once postcontent() reports a post older than its cutoff.
list_prefix = "http://guba.eastmoney.com/list,002416,f_"
list_suffix = ".html"
page_urls = [
    list_prefix + str(page_no) + list_suffix for page_no in range(322, 648)
]

start = time.time()
print(start)
try:
    # n is the zero-based index of the list page, as postcontent() expects.
    for n, hh in enumerate(page_urls):
        time.sleep(random.random())  # pause for a random time in [0, 1) s
        try:
            try:
                h = requests.get(hh, headers=headers, timeout=5).text
            except requests.RequestException:
                # One retry; a second failure is handled by the outer except.
                print('wrong')
                h = requests.get(hh, headers=headers, timeout=5).text
            l = getpost(h)
            print(n)
            status = postcontent(l, sheet, n, headers)
        except Exception:
            # Best effort: remember the failed list page and carry on.
            # KeyboardInterrupt is deliberately not caught here, so the
            # crawl can still be stopped with Ctrl-C.
            print(hh)
            print('error2')
            e.append(hh)
            continue
        if status == 1:
            break
finally:
    # Report and save even when the crawl is interrupted part-way.
    end = time.time()
    print('finish:', end - start)
    print('e1:\n', e1)
    print('\ne2:\n', e2)
    print('\ne3:\n', e3)
    print('\ne:\n', e)
    workbook.save(r'C:\Users\lokiii\Desktop\002416.xlsx')