Scraping Guba (Eastmoney Stock Forum) Comments with Python

I have recently been working on a project that requires scraping Guba comment data for 800 stocks, covering April 1, 2014 to the present. Because the data volume is so large, I ruled out XPath, which is fairly demanding about page structure, and instead used re to extract the data by matching the raw text. With a regex-based crawler, the two things to get right are the page source and the regular expressions that match against it.

Getting the post URLs

The first step is to collect the URL of every post on each page of the board.

The page source for each post looks roughly like the snippet below. In the fragment a href="/news,002389,911081289.html" title="可以买了。其实利好挺多的", the address inside href is the relative part of the post URL we need to crawl.

                                   <span class="l5 a5">03-10 09:13</span>
                                </div>
                                <div class="articleh normal_post">
                                    <span class="l1 a1">342</span>
                                    <span class="l2 a2">0</span>
                                    <span class="l3 a3"><a href="/news,002389,911081289.html" title="可以买了。其实利好挺多的">可以买了。其实利好挺多的</a></span>
                                        <span class="l4 a4"><a href="http://i.eastmoney.com/6390094229493668"  data-popper="6390094229493668" data-poptype="1" target="_blank"><font>zhaodong1961</font></a><input type="hidden" value="0" /></span>

First, define the matching rule: pattern = re.compile('<div class="articleh normal_post">.*?href="(.*?)".*?</div>', re.S|re.M). This compiles the regular expression into a Pattern object that re.findall can use. The expression captures the URL inside href="..." between a <div class="articleh normal_post"> tag and the next </div>.

Here .*? is a non-greedy match: the ? qualifier tells the engine to match as little text as possible.

On its own, . matches any character except \n, so matching stays within a single line (lines are separated by \n). Without re.S, the regex is applied line by line: if one line has no match, the engine starts over on the next line and never matches across a line break. With re.S, the string is treated as a whole, \n becomes an ordinary character, and the match can span multiple lines.
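To see the effect of re.S concretely, here is a minimal sketch against a shortened stand-in for the real page source:

import re

html = '<div class="articleh normal_post">\n<a href="/news,002389,911081289.html">link</a>\n</div>'
pattern = '<div class="articleh normal_post">.*?href="(.*?)"'

# Without re.S the .*? cannot cross the \n after the opening div, so nothing matches.
print(re.findall(pattern, html))        # []
# With re.S the dot also matches \n and the pattern spans the line break.
print(re.findall(pattern, html, re.S))  # ['/news,002389,911081289.html']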

Once the URLs are extracted, they need further processing: filter out the Q&A posts, then join each relative URL to the site's base URL to form a complete address. Bounty posts like the one below must be dropped. Their source looks like span class="l3 a3">悬赏<a href="http://ask.eastmoney.com/detail.html?qid=394572104317014016" title="【¥180.00】内外因素共振,医疗ETF好戏刚开始?". Unlike an ordinary post, the href already contains http, and that is exactly what the filter keys on.

def getpost(h):
    # Capture the href value of every post link inside an "articleh normal_post" div.
    pattern = re.compile('<div class="articleh normal_post">.*?href="(.*?)".*?</div>', re.S|re.M)
    content = re.findall(pattern,h)
    s="http://guba.eastmoney.com"
    l=[]
    for c in content:
        # Q&A/bounty posts link to ask.eastmoney.com with an absolute http URL; skip them.
        if (c.find('http')==-1 and c.find('ask')==-1):
            l.append(s+c)
    return l
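To sanity-check getpost, you can fetch a single list page and look at what it extracts. This is only a hypothetical smoke test; it assumes the requests import and the headers dict from the complete code further down:

page = requests.get("http://guba.eastmoney.com/list,002416,f_1.html",
                    headers=headers, timeout=5).text
urls = getpost(page)
print(len(urls))   # posts found on this page
print(urls[:3])    # first few complete post URLs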

Getting the comments

This works much the same way as getting the URLs, and is in fact a bit simpler.
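Each post page embeds its data as key-value pairs of the form "post_title":"...","post_content":"...","post_publish_time":"...", which is what the three patterns in the function below pull out. A minimal sketch of the idea against a made-up fragment (the field values here are invented):

import re

fragment = ('"post_title":"标题","post_content":"正文","post_abstract":"...",'
            '"post_publish_time":"2020-03-10 09:13:21","post_last_time":"..."')

p  = re.compile('.*post_content":"(.*?)","post_abstract.*')
p1 = re.compile('.*"post_title":"(.*?)","post_content.*')
p3 = re.compile('.*post_publish_time":"(.*?)","post_last_time.*')

print(re.findall(p1, fragment))  # ['标题']
print(re.findall(p,  fragment))  # ['正文']
print(re.findall(p3, fragment))  # ['2020-03-10 09:13:21']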

def postcontent(ll,sheet,n,headers):
    m=1
    k=len(ll)
    # Cut-off timestamp: stop once a post is older than this. It is kept in a
    # one-element list so it compares directly with re.findall's result.
    b=['2014-03-31 24:00:00']
    p = re.compile('.*post_content":"(.*?)","post_abstract.*')
    p1 = re.compile('.*"post_title":"(.*?)","post_content.*')
    p3 = re.compile('.*post_publish_time":"(.*?)","post_last_time.*')
    for lll in ll:
        try:
            l = (requests.get(lll,headers=headers, timeout=5)).text
        except Exception:
            print('e1')
            print(lll)
            e1.append(lll)
            continue  # skip this post instead of re-parsing the previous response
        try:
            content = re.findall(p,l)
            title = re.findall(p1,l)
            ptime = re.findall(p3,l)  # named ptime so it does not shadow the time module
            sheet.cell(row=m+k*n, column=1, value=ptime[0])
            sheet.cell(row=m+k*n, column=2, value=title[0])
            sheet.cell(row=m+k*n, column=3, value=content[0])
        except Exception:
            print('e2')
            print(lll)
            e2.append(lll)
            ptime = None  # extraction failed; skip the cut-off check below
        m=m+1
        # Save after every post so a crash costs at most one row.
        workbook.save(r'C:\Users\lokiii\Desktop\002416.xlsx')

        try:
            # Timestamps are fixed-width "YYYY-MM-DD HH:MM:SS" strings, so
            # lexicographic order equals chronological order.
            if ptime and ptime < b:
                print(ptime)
                print(b)
                print('end')
                return 1
        except Exception:
            print(lll)
            print('error1')
            e3.append(lll)
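Comparing timestamps as raw strings works here only because the format is fixed-width. A more defensive variant (not what the code above does) would parse them into datetime objects first; a sketch, assuming the same cut-off of April 1, 2014:

from datetime import datetime

cutoff = datetime(2014, 4, 1)  # equivalent to '2014-03-31 24:00:00'
posted = datetime.strptime('2020-03-10 09:13:21', '%Y-%m-%d %H:%M:%S')
if posted < cutoff:
    print('end')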

Complete code

import requests
import re
import time
import random
import openpyxl


# Excel setup
workbook = openpyxl.Workbook()  # create a new workbook
sheet = workbook.create_sheet(index=0, title="test")


# URLs that failed at each stage, printed at the end of the run.
e1=[]  # request failures inside postcontent
e2=[]  # extraction/write failures inside postcontent
e3=[]  # cut-off check failures inside postcontent
e=[]   # list-page failures in the main loop


headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}


def getpost(h):
    # Capture the href value of every post link inside an "articleh normal_post" div.
    pattern = re.compile('<div class="articleh normal_post">.*?href="(.*?)".*?</div>', re.S|re.M)
    content = re.findall(pattern,h)
    s="http://guba.eastmoney.com"
    l=[]
    for c in content:
        # Q&A/bounty posts link to ask.eastmoney.com with an absolute http URL; skip them.
        if (c.find('http')==-1 and c.find('ask')==-1):
            l.append(s+c)
    return l

def postcontent(ll,sheet,n,headers):
    m=1
    k=len(ll)
    # Cut-off timestamp: stop once a post is older than this. It is kept in a
    # one-element list so it compares directly with re.findall's result.
    b=['2014-03-31 24:00:00']
    #b=['2020-02-21 24:00:00']
    p = re.compile('.*post_content":"(.*?)","post_abstract.*')
    p1 = re.compile('.*"post_title":"(.*?)","post_content.*')
    p3 = re.compile('.*post_publish_time":"(.*?)","post_last_time.*')
    for lll in ll:
        try:
            l = (requests.get(lll,headers=headers, timeout=5)).text
        except Exception:
            print('e1')
            print(lll)
            e1.append(lll)
            continue  # skip this post instead of re-parsing the previous response
        try:
            content = re.findall(p,l)
            title = re.findall(p1,l)
            ptime = re.findall(p3,l)  # named ptime so it does not shadow the time module
            sheet.cell(row=m+k*n, column=1, value=ptime[0])
            sheet.cell(row=m+k*n, column=2, value=title[0])
            sheet.cell(row=m+k*n, column=3, value=content[0])
        except Exception:
            print('e2')
            print(lll)
            e2.append(lll)
            ptime = None  # extraction failed; skip the cut-off check below
        m=m+1
        # Save after every post so a crash costs at most one row.
        workbook.save(r'C:\Users\lokiii\Desktop\002416.xlsx')

        try:
            # Timestamps are fixed-width "YYYY-MM-DD HH:MM:SS" strings, so
            # lexicographic order equals chronological order.
            if ptime and ptime < b:
                print(ptime)
                print(b)
                print('end')
                return 1
        except Exception:
            print(lll)
            print('error1')
            e3.append(lll)



st = time.time()
print(st)
# Build the list-page URLs for stock 002416, pages 322 through 647.
s = "http://guba.eastmoney.com/list,002416,f_"
k = ".html"
ll = [s + str(i) + k for i in range(322, 648)]
n=0
en=time.time()
print(en)


start = time.time()

print(start)
for hh in ll:
    time.sleep(random.random())  # pause for a random 0-1 second, interval [0, 1)
    try:
        # Retry the list page once before giving up on it.
        try:
            h = requests.get(hh,headers=headers, timeout=5).text
        except Exception:
            print('wrong')
            h = requests.get(hh,headers=headers, timeout=5).text
        l = getpost(h)
        print(n)
        stop = postcontent(l,sheet,n,headers)
        if stop == 1:
            break
    except Exception:
        print(hh)
        print('error2')
        e.append(hh)
    n=n+1
    

end=time.time()    
print('finish:',end-start)
print('e1:\n',e1)
print('\ne2:\n',e2)
print('\ne3:\n',e3)
print('\ne:\n',e)
workbook.save(r'C:\Users\lokiii\Desktop\002416.xlsx')