Download the source snippets of the block-level elements for Baidu results 1-10 and store them locally; later, compute metrics such as page-1 keyword count, page-1 impressions, and rank quality score from them. Why save the source locally first? If you only need one site's rank, the page request and the regex match for its rank id can live inside a single multithreaded function. But once you want more dimensions, say competitors' rankings, rank quality scores, the types of URLs shown, and so on, cramming everything into one function is likely to tangle the queues and hang the program. So the job is split into several .py scripts, each implementing one specific task.
The Baidu scraper: fetch the source of the block-level elements for Baidu positions 1-10:
#coding:utf-8
'''
Baidu rank checker, multithreaded version with proxy support
'''
import StringIO,pycurl,time,random,re,os,csv,urllib
from threading import Thread,Lock
from Queue import Queue
from bs4 import BeautifulSoup as bs

csvfile = open('serp_html.csv','wb') # holds the keywords and the SERP source snippets
bdjd_dict = {}

'''pick a random proxy IP (re-reads the file each call so fresh proxies are picked up)'''
def ip():
    daili_list = [] # proxy IP pool
    for x in open('hege_daili.txt'):
        daili_list.append(x.strip())
    return random.choice(daili_list)

'''drop a proxy IP from the proxy file after it has failed to connect 10 times'''
def daili_delete(ip):
    dailifile = open('daili_beifen.txt','w')
    for line in open('hege_daili.txt'):
        line = line.strip()
        if ip not in line:
            dailifile.write(line+"\n")
    dailifile.close()
    os.system("mv daili_beifen.txt hege_daili.txt")

# Baidu nodes
bdjd_list = ["www.baidu.com","180.97.33.107","115.239.210.27","180.97.33.108","180.97.33.107","180.97.33.107","180.97.33.108","220.181.111.188","220.181.111.188","180.97.33.107","180.97.33.107","115.239.211.112","180.97.33.108","180.97.33.108","180.97.33.108","180.97.33.108","180.97.33.108","115.239.211.112","180.97.33.108","115.239.211.112","115.239.210.27","180.97.33.108","115.239.211.112","115.239.210.27","180.97.33.108","115.239.210.27","61.135.169.125","115.239.211.112","115.239.210.27","180.97.33.107","180.97.33.107","180.97.33.108","115.239.210.27","180.97.33.107","61.135.169.121","115.239.210.27","61.135.169.121","61.135.169.125","115.239.211.112","115.239.210.27","61.135.169.125","112.80.248.73","61.135.169.121","112.80.248.74","112.80.248.73","61.135.169.125","180.97.33.108","115.239.210.27","61.135.169.125","61.135.169.125","112.80.248.74","112.80.248.74","61.135.169.121","115.239.210.27","61.135.169.125","111.13.100.92","111.13.100.92","111.13.100.91","111.13.100.91","115.239.211.112","111.13.100.92","111.13.100.91","111.13.100.92","115.239.211.112","115.239.210.27","115.239.211.112","115.239.210.27","115.239.210.27","115.239.210.27","115.239.210.27"]
#bdjd_list = ["www.baidu.com"]

# pick a random UA
def getUA():
    uaList = ['Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
              'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
              'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
              'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)',
              'Mozilla/5.0+(Windows+NT+6.1;+rv:11.0)+Gecko/20100101+Firefox/11.0',
              'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+SV1)',
              'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+GTB7.1;+.NET+CLR+2.0.50727)',
              'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+KB974489)',
              'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
              'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36']
    headers = random.choice(uaList)
    return headers

# pick a random Baidu node
def getBDJD(bdjd_str):
    bdjd_list = bdjd_str.split(',')
    bdjd = random.choice(bdjd_list)
    return bdjd

'''
Fetch the Baidu SERP source. To go through a proxy, add an ip parameter,
i.e. is_index(url,headers,ip), and uncomment the c.setopt(c.PROXY,ip) line.
'''
def is_index(url,headers):
    while 1:
        try:
            c = pycurl.Curl()
            c.setopt(pycurl.MAXREDIRS,5)
            c.setopt(pycurl.REFERER, url)
            c.setopt(pycurl.FOLLOWLOCATION, True)
            c.setopt(pycurl.CONNECTTIMEOUT, 120)
            c.setopt(pycurl.TIMEOUT,120)
            c.setopt(pycurl.ENCODING,'gzip,deflate')
            #c.setopt(c.PROXY,ip) # uncomment to use a proxy
            c.fp = StringIO.StringIO()
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.HTTPHEADER,headers)
            c.setopt(c.WRITEFUNCTION, c.fp.write)
            c.perform()
            #code = c.getinfo(c.HTTP_CODE) # the response status code
            html = c.fp.getvalue()
            if '="https://verify.baidu.com' in html:
                print 'Captcha triggered, pausing 20 minutes'
                time.sleep(1200)
                continue
            else:
                return html
        except Exception, what:
            information = 'Error: %s' % what
            return str(information)

# regex extraction helper
def search(req,line):
    text = re.search(req,line)
    if text:
        data = text.group(1)
    else:
        data = 'no'
    return data

'''keywords to check go into url_list; they live in the file "word", one per line'''
url_list = []
for line in open('word'):
    word = line.strip()
    url_list.append(word)

'''multithreaded fetcher'''
class Fetcher:
    def __init__(self,threads):
        self.lock = Lock()   # thread lock
        self.q_req = Queue() # task queue
        self.q_ans = Queue() # done queue
        self.threads = threads
        for i in range(threads):
            t = Thread(target=self.threadget) # the job each thread runs
            t.setDaemon(True) # whether child threads exit with the main thread;
                              # must be set before start(), defaults to False
            t.start()         # launch the thread
        self.running = 0      # number of threads currently working

    def __del__(self): # wait for both queues on teardown
        time.sleep(0.5)
        self.q_req.join() # block until the queue has been fully processed
        self.q_ans.join()

    # number of tasks still in flight; 0 means everything has finished
    def taskleft(self):
        return self.q_req.qsize()+self.q_ans.qsize()+self.running

    def push(self,req):
        self.q_req.put(req)

    def pop(self):
        return self.q_ans.get()

    # the per-thread job
    def threadget(self):
        while True:
            line = self.q_req.get()
            word = line.strip()
            '''
            With "with" there is no need to call acquire() and release()
            explicitly. Lock here while bumping self.running so no other
            thread can modify the counter concurrently; the lock is
            released automatically when the with-block ends.
            '''
            with self.lock:
                self.running += 1
            '''build the request headers; adjust them to your own'''
            headers = [
                "Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Encoding:gzip, deflate, sdch",
                "Accept-Language:zh-CN,zh;q=0.8,en;q=0.6",
                "Cache-Control:max-age=0",
                "Connection:keep-alive",
                "Cookie:BAIDUID=EC6ED338982C9DE1ED39972F1B4E5530:FG=1; BIDUPSID=EC6ED338982C9DE1ED39972F1B4E5530; PSTM=1434515748; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a01845311744; BDUSS=35KeW10a3pvNXdNMjQyVnhLUHFoYzZUSW9EVUF-ZXE1bUNuTXFFa0hTVU1tYWhWQVFBQUFBJCQAAAAAAAAAAAEAAADLTBsKYTYzMTM4MTcwMgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAwMgVUMDIFVZ; ispeed_lsm=2; BD_HOME=1; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BAIDUVERIFY=4606BC10EB2C2AB6D4B79FFA477238E38C5C6D295106222D2F75D4F8E9C766D5115D55C4E44133ACFE00D4768C6C1507E06B4C7D6221795299EE03FDE7CB2D46430000:1434541766:3f9b93890a7a5ed0; BDRCVFR[ltbVPlNi2ac]=mbxnW11j9Dfmh7GuZR8mvqV; BD_UPN=123253; sug=3; sugstore=1; ORIGIN=0; bdime=20100; H_PS_645EC=4093eiyri7wdn3v3qfviJa%2FFLXYwKxu%2FIF0wtL7d7pZ9maSmOTmtgqORRMlMRo7E; BD_CK_SAM=1; BDSVRTM=59; H_PS_PSSID=14795_1433_14602_14509_14444_14734_10812_12868_14622_14871_12723_14485_14919_14903_11664_13937_13189_10632",
                "Host:www.baidu.com",
                "RA-Sid:7739A016-20140918-030243-3adabf-48f828",
                "RA-Ver:2.10.4",
                "User-Agent:%s" % getUA()
            ]
            bdjd_str = ','.join(bdjd_list)
            #newip = ip()              # uncomment to route through a proxy
            #bdjd = getBDJD(bdjd_str)  # uncomment to use the Baidu nodes, and build the url with bdjd
            url = 'https://www.baidu.com/s?wd=%s' % urllib.quote_plus(word)
            html = is_index(url,headers)
            soup = bs(html, 'html.parser')
            '''extract the block-level elements for Baidu positions 1-10'''
            b_tags = soup.find_all('div', {'class': 'result c-container '})
            if len(b_tags) == 0:
                #print 'Current IP %s is dead!' % newip
                with self.lock:         # bookkeeping must still happen here,
                    self.running -= 1   # or taskleft() never reaches 0
                self.q_req.task_done()
                continue
            '''dump the position 1-10 blocks into serp_html.csv; the page-1 keyword
               count, impressions, rank quality score etc. are computed from it later'''
            for tag in b_tags:
                newline = str(tag)
                number = search(r'id="(\d+)"',newline)
                #urldiv = search(r'<span class="g">(.*?)</span>',newline) # the <span> holding the domain
                data = []
                data.append(word)
                data.append(newline)
                writer = csv.writer(csvfile,dialect='excel')
                writer.writerow(data)
            print '>> fetched: %s, %s results returned' % (word,len(b_tags))
            #self.q_ans.put((req,ans)) # push the finished task onto the done queue for the main program
            self.q_ans.put(word)
            with self.lock:
                self.running -= 1
            self.q_req.task_done() # tell the queue one task has completed
            time.sleep(0.1) # don't spam

if __name__ == "__main__":
    f = Fetcher(threads=10)  # 10 worker threads
    for url in url_list:
        f.push(url)          # push every keyword onto the download queue
    while f.taskleft():      # while tasks remain
        f.pop()              # pull results off the done queue

'''
# If a Baidu node has timed out more than 10 times, drop it from the node
# list; this snippet goes inside Fetcher:
if 'Error' in html:
    print html
    if 'Connection refused' in html:
        # count timeouts per node; after 10 timeouts, remove the node
        if bdjd_dict.has_key(bdjd):
            bdjd_dict[bdjd] += 1
            print 'node %s: %s timeouts' % (bdjd,bdjd_dict[bdjd])
            if int(bdjd_dict[bdjd]) >= 10:
                bdjd_list.remove(bdjd)
                print 'node %s removed' % bdjd
        else:
            bdjd_dict[bdjd] = 1
        continue
'''
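One loose end in the listing above: daili_delete() is defined but never called. A minimal sketch of how the proxy path could be wired together once is_index() takes the ip parameter (per its docstring), using a hypothetical fail_count dict and MAX_FAILS threshold, both my additions, not part of the original:

# Sketch only: assumes is_index(url,headers,ip) with c.setopt(c.PROXY,ip) uncommented.
fail_count = {}  # hypothetical per-proxy failure counter
MAX_FAILS = 10

def fetch_with_proxy(url, headers):
    while 1:
        newip = ip()                          # random proxy from hege_daili.txt
        html = is_index(url, headers, newip)
        if 'Error' not in html:
            return html
        fail_count[newip] = fail_count.get(newip, 0) + 1
        if fail_count[newip] >= MAX_FAILS:    # 10 strikes and the proxy is dropped
            daili_delete(newip)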
Run output:
Keywords that return fewer than 10 results hit special SERP layouts such as Baidu Baike, Wenku, or Aladdin blocks. The source snippets end up in serp_html.csv.
Read serp_html.csv and print the raw ranking data: keyword, rank, site, Baidu URL, Baidu title, so the required stats can be computed from it later:
#coding:utf-8
'''
Extract the raw Baidu ranking data, from which page-1 keyword counts, page-1
impressions, rank quality scores, competitor coverage etc. are computed
'''
import csv,re

# regex extraction helper
def search(req,line):
    text = re.search(req,line)
    if text:
        data = text.group(1)
    else:
        data = 'no'
    return data

csvfile = open('serp_html.csv','rb')
reader = csv.reader(csvfile)

'''print the SERP data: keyword, rank, ranking site, Baidu url (it has to be
resolved before you get the real url), title'''
for line in reader:
    word = line[0]
    html = line[1]
    number = search(r'id="(\d+)"',html)
    domain = search(r'<span class="g">(.*?)/.*</span>',html)
    bdurl = search(r'href="(https://www.baidu.com/link\?url=[^"]*?)"',html)
    title = search(r'"title":"([^"]*?)"',html)
    print '%s,%s,%s,%s,%s' % (word,number,domain,bdurl,title)
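The Baidu url column is a https://www.baidu.com/link?url=... redirect; Baidu normally answers it with an HTTP 302 pointing at the real landing page. A minimal sketch of resolving it with urllib2 (resolve_bdurl is my name for the helper; some SERP entries redirect via JavaScript instead and won't resolve this way):

import urllib2

def resolve_bdurl(bdurl, timeout=10):
    '''follow the redirect chain; geturl() returns the final URL'''
    try:
        return urllib2.urlopen(bdurl, timeout=timeout).geturl()
    except Exception:
        return 'no'

# usage: resolve_bdurl('https://www.baidu.com/link?url=...') -> real landing URL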
Output, ready to be loaded into a database:
Then process the raw ranking data into whatever metrics you need, for example:
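As one concrete example of that processing, a sketch that rolls the raw rows up into the three metrics named at the top for a single tracked site. It assumes the printed rows were redirected into a file rank_raw.csv; my_domain and the position weights in ctr_weight are illustrative placeholders, not values from this article:

#coding:utf-8
import csv

my_domain = 'www.example.com'  # placeholder for the site being tracked
ctr_weight = {1:0.40,2:0.17,3:0.11,4:0.08,5:0.06,
              6:0.04,7:0.03,8:0.03,9:0.02,10:0.02}  # assumed position weights

page1_words = set()  # keywords where my_domain ranks on page 1
impressions = 0      # number of page-1 listings for my_domain
quality = 0.0        # position-weighted rank quality score

for row in csv.reader(open('rank_raw.csv','rb')):
    word, rank, domain = row[0], row[1], row[2]
    if my_domain in domain and rank.isdigit():
        page1_words.add(word)
        impressions += 1
        quality += ctr_weight.get(int(rank), 0)

print 'page-1 keywords: %s' % len(page1_words)
print 'page-1 impressions: %s' % impressions
print 'rank quality score: %.2f' % quality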
Also, scraping Baidu at scale needs a proxy verification loop: every 2 minutes, re-verify the proxies and write the usable ones to hege_daili.txt; the Baidu scraper then reads hege_daili.txt and picks proxies at random. The main verification program (代理验证.py):
#coding:utf-8
#! /usr/bin/env python
# Verify proxy IPs and save the working ones to 'hege_daili.txt'
import urllib2,zlib,json
import re
import sys
import chardet
import threading
import time
import urllib,os

rawProxyList = []
checkedProxyList = []

print "Fetching http proxies >>>>>>>>>>>>>>>>>>"

# regex extraction helper
def search(req,line):
    text = re.search(req,line)
    if text:
        data = text.group(1)
    else:
        data = 'no'
    return data

# proxy source: a paid third-party proxy API, or your own proxy scraper on a timer
# fetched proxies are saved locally, one 192.168.0.01:8088 style entry per line
dailitxt = open('alldaili.txt','w')
url1 = 'proxy API endpoint'  # fill in your proxy provider's API url

# handle gzip-compressed responses so the page isn't garbled
request = urllib2.Request(url1)
request.add_header('Accept-encoding', 'gzip')
opener = urllib2.build_opener()
response = opener.open(request)
html = response.read()
gzipped = response.headers.get('Content-Encoding')
if gzipped:
    html = zlib.decompress(html, 16+zlib.MAX_WBITS)
dailitxt.write(html)
dailitxt.close()

# read the proxy file into rawProxyList
for ip in open('alldaili.txt'):
    ip = ip.strip()
    rawProxyList.append(ip)
number = len(rawProxyList)
print "%s proxies fetched" % number

# proxy verification class
class ProxyCheck(threading.Thread):
    def __init__(self,proxyList):
        threading.Thread.__init__(self)
        self.proxyList = proxyList
        self.timeout = 10
        self.testUrl = "https://www.baidu.com/"
        self.testStr = "030173"

    def checkProxy(self):
        cookies = urllib2.HTTPCookieProcessor()
        for proxy in self.proxyList:
            proxyHandler = urllib2.ProxyHandler({"http" : r'http://%s' % proxy})
            opener = urllib2.build_opener(cookies,proxyHandler)
            opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20100101 Firefox/15.0.1')]
            t1 = time.time()
            try:
                req = opener.open(self.testUrl, timeout=self.timeout)
                result = req.read()
                timeused = time.time() - t1
                pos = result.find(self.testStr)
                title = re.search(r'<title>(.*?)</title>',urllib2.urlopen('https://www.baidu.com').read())
                if title:
                    baidu_title = title.group(1)
                else:
                    baidu_title = 'page unreachable'
                print "checking %s, speed %s, Baidu title: %s" % (proxy,timeused,baidu_title)
                if (pos > -1) and (timeused < 5) and baidu_title == '百度一下,你就知道':
                    checkedProxyList.append((proxy,timeused))
                else:
                    continue
            except Exception,e:
                print e.message
                continue

    def sort(self):
        checkedProxyList.sort(cmp=lambda x,y:cmp(x[1],y[1])) # fastest first

    def run(self):
        self.checkProxy()
        self.sort()

print "Verifying http proxies >>>>>>>>>>>>>>>>>>>>>>>"

if __name__ == "__main__":
    checkThreads = []
    # spin up 35 checker threads; split the fetched proxies into 35 near-equal
    # chunks (integer ceiling division), one chunk per thread
    for i in range(35):
        t = ProxyCheck(rawProxyList[((len(rawProxyList)+34)/35) * i:((len(rawProxyList)+34)/35) * (i+1)])
        checkThreads.append(t)
    for i in range(len(checkThreads)):
        checkThreads[i].start()
    for i in range(len(checkThreads)):
        checkThreads[i].join()
    print "\n"
    print "....................... %s proxies total, %s passed ......................." % (len(rawProxyList),len(checkedProxyList))
    # write qualified IPs to the local file; 'w+' truncates first, so repeated
    # runs don't accumulate duplicate IPs
    f = open("hege_daili.txt",'w+')
    for proxy in checkedProxyList:
        print "qualified: %s\t%s" % (proxy[0],proxy[1])
        f.write(proxy[0]+"\n")
    f.close()
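A side note on the slicing arithmetic in the thread setup: (len(rawProxyList)+34)/35 is Python 2 integer ceiling division, giving a chunk size that spreads the proxies across at most 35 near-equal slices. A quick illustration with 100 fake proxies:

# 100 proxies -> chunk size (100+34)/35 = 3 under Python 2 integer division,
# so the threads get slices [0:3], [3:6], ... and every proxy is covered.
proxies = range(100)
chunk = (len(proxies) + 34) / 35
parts = [proxies[chunk*i:chunk*(i+1)] for i in range(35)]
assert sum(len(p) for p in parts) == len(proxies)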
The verification loop (cycle.py) calls the verification program and re-verifies every 2 minutes:
#coding:utf-8
'''
Scheduled task: every 2 minutes, fetch the latest proxies and re-verify them
'''
import os,time

while 1:
    os.system("python 代理验证.py")
    time.sleep(120)
When collecting data, run the proxy verification loop (cycle.py) and the Baidu scraper at the same time; once the source is downloaded locally, process it into the raw ranking data, then compute whatever metrics you need from that.
Baidu's anti-scraping is fairly aggressive. Different nodes apparently even serve different encodings, some GBK, some UTF-8, and plenty of freshly bought proxies return 403 on the very first request, blacklisted long ago. Scraping Baidu through proxies costs money and nerves for little gain; if budget allows, a server with 258 IPs is the better way to scrape.
360's anti-scraping, by contrast, is weak: adapt the program above for 360 and it can run around the clock; a few hundred thousand queries a day is no problem.
Article from GOGO闯, WeChat public account: 流量贩子