Querying Baidu keyword rankings in Python with multi-threaded proxies

Download the HTML snippets of the block-level elements in positions 1-10 of Baidu's search results and save them locally, then later compute metrics such as first-page keyword count, first-page impressions, and ranking quality score from them. Why save the source locally first? If you only need the rankings of a single site, the page request and the regex match on the ranking id can live together in one multi-threaded function. But once you want more dimensions, such as competitors' rankings, a ranking quality score, or the type of URL shown, stuffing everything into one function easily scrambles the queues and makes the program hang. So the work is split into several .py files, each one implementing a single task.

The Baidu scraping program: fetch the source of the block-level elements for positions 1-10:

#coding:utf-8
'''
Baidu ranking checker, multi-threaded proxy version
'''
import StringIO,pycurl,time,random,re,os,csv,urllib
from threading import Thread,Lock
from Queue import Queue
from bs4 import BeautifulSoup as bs
 
csvfile = open('serp_html.csv','wb')    #file that stores each keyword and its SERP source snippets
bdjd_dict = {}
daili_list = [] #stores proxy IPs
 
'''Pick a random proxy IP'''
def ip():
    for x in open('hege_daili.txt'):
        x = x.strip()
        daili_list.append(x)
    newip = random.choice(daili_list)
    return newip
 
'''If a proxy IP fails to connect more than 10 times, remove it from the proxy file'''
def daili_delete(ip):
    dailifile = open('daili_beifen.txt','w')
    for line in open('hege_daili.txt'):
        line = line.strip()
        if ip not in line:
            dailifile.write(line+"\n")
    os.system("mv daili_beifen.txt hege_daili.txt")
 
#Baidu nodes (server IPs)
bdjd_list = ["www.baidu.com","180.97.33.107","115.239.210.27","180.97.33.108","180.97.33.107","180.97.33.107","180.97.33.108","220.181.111.188","220.181.111.188","180.97.33.107","180.97.33.107","115.239.211.112","180.97.33.108","180.97.33.108","180.97.33.108","180.97.33.108","180.97.33.108","115.239.211.112","180.97.33.108","115.239.211.112","115.239.210.27","180.97.33.108","115.239.211.112","115.239.210.27","180.97.33.108","115.239.210.27","61.135.169.125","115.239.211.112","115.239.210.27","180.97.33.107","180.97.33.107","180.97.33.108","115.239.210.27","180.97.33.107","61.135.169.121","115.239.210.27","61.135.169.121","61.135.169.125","115.239.211.112","115.239.210.27","61.135.169.125","112.80.248.73","61.135.169.121","112.80.248.74","112.80.248.73","61.135.169.125","180.97.33.108","115.239.210.27","61.135.169.125","61.135.169.125","112.80.248.74","112.80.248.74","61.135.169.121","115.239.210.27","61.135.169.125","111.13.100.92","111.13.100.92","111.13.100.91","111.13.100.91","115.239.211.112","111.13.100.92","111.13.100.91","111.13.100.92","115.239.211.112","115.239.210.27","115.239.211.112","115.239.210.27","115.239.210.27","115.239.210.27","115.239.210.27"]
#bdjd_list = ["www.baidu.com"]
 
#Define user agents
def getUA():
    uaList = ['Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
    'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
    'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
    'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)',
    'Mozilla/5.0+(Windows+NT+6.1;+rv:11.0)+Gecko/20100101+Firefox/11.0',
    'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+SV1)',
    'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+GTB7.1;+.NET+CLR+2.0.50727)',
    'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+KB974489)',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36'
    ]
    headers = random.choice(uaList)
    return headers
 
#Pick a random Baidu node
def getBDJD(bdjd_str):
    bdjd_list = bdjd_str.split(',')
    bdjd = random.choice(bdjd_list)
    return bdjd
 
'''
Fetch the Baidu page source. To use a proxy, add an ip parameter, i.e. is_index(url,headers,ip), and uncomment the c.setopt(c.PROXY,ip) line
'''
def is_index(url,headers):
    while 1:
        try:
            c = pycurl.Curl()
            c.setopt(pycurl.MAXREDIRS,5)
            c.setopt(pycurl.REFERER, url)
            c.setopt(pycurl.FOLLOWLOCATION, True)
            c.setopt(pycurl.CONNECTTIMEOUT, 120)
            c.setopt(pycurl.TIMEOUT,120)
            c.setopt(pycurl.ENCODING,'gzip,deflate')
            #c.setopt(c.PROXY,ip)       #uncomment this line to use a proxy
            c.fp = StringIO.StringIO()
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.HTTPHEADER,headers)
            c.setopt(c.WRITEFUNCTION, c.fp.write)
            c.perform()
            #code = c.getinfo(c.HTTP_CODE)  #returns the HTTP status code
            html = c.fp.getvalue()
            if '="https://verify.baidu.com' in html:
                print 'CAPTCHA page detected, pausing 20 minutes'
                time.sleep(1200)
                continue
            else:
                return html
        except Exception, what:
            information = 'Error: %s' % what
            return str(information)
 
#Regex extraction helper
def search(req,line):
    text = re.search(req,line)
    if text:
        data = text.group(1)
    else:
        data = 'no'
    return data
 
'''Keywords to check are loaded into url_list; they live in the file "word", one per line'''
url_list = []
for line in open('word'):
    word = line.strip()
    url_list.append(word)
 
'''Multi-threaded fetcher'''
class Fetcher:
    def __init__(self,threads):
        self.lock = Lock() #thread lock
        self.q_req = Queue() #task queue
        self.q_ans = Queue() #completed queue
        self.threads = threads
        self.running = 0 #number of threads currently working; initialised before the workers start
        for i in range(threads):
            t = Thread(target=self.threadget) #the job each worker thread runs
            t.setDaemon(True) #daemon threads exit together with the main thread;
                              #must be set before start(). Defaults to False
            t.start() #start the thread
 
    def __del__(self): #wait for both queues to drain on teardown
        time.sleep(0.5)
        self.q_req.join() #block until every queued task has been processed
        self.q_ans.join()
 
    #number of tasks still in flight; 0 means everything has finished
    def taskleft(self):
        return self.q_req.qsize()+self.q_ans.qsize()+self.running
 
    def push(self,req):
        self.q_req.put(req)
 
    def pop(self):
        return self.q_ans.get()
 
    #the work each thread performs, driven by items from the request queue
    def threadget(self):
        while True:
            line = self.q_req.get()
            word = line.strip()
 
            '''
            The with statement calls Lock.acquire()/release() for us, so there is
            no need to do it explicitly. While the lock is held, self.running is
            incremented to mark one more working thread; holding the lock keeps
            other threads from changing the counter at the same time, and it is
            released automatically when the with block ends.
            '''
 
            with self.lock:
                self.running += 1
 
            '''Build the request headers; edit them to match your own setup'''
            headers = [
                "Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Encoding:gzip, deflate, sdch",
                "Accept-Language:zh-CN,zh;q=0.8,en;q=0.6",
                "Cache-Control:max-age=0",
                "Connection:keep-alive",
                "Cookie:BAIDUID=EC6ED338982C9DE1ED39972F1B4E5530:FG=1; BIDUPSID=EC6ED338982C9DE1ED39972F1B4E5530; PSTM=1434515748; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a01845311744; BDUSS=35KeW10a3pvNXdNMjQyVnhLUHFoYzZUSW9EVUF-ZXE1bUNuTXFFa0hTVU1tYWhWQVFBQUFBJCQAAAAAAAAAAAEAAADLTBsKYTYzMTM4MTcwMgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAwMgVUMDIFVZ; ispeed_lsm=2; BD_HOME=1; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BAIDUVERIFY=4606BC10EB2C2AB6D4B79FFA477238E38C5C6D295106222D2F75D4F8E9C766D5115D55C4E44133ACFE00D4768C6C1507E06B4C7D6221795299EE03FDE7CB2D46430000:1434541766:3f9b93890a7a5ed0; BDRCVFR[ltbVPlNi2ac]=mbxnW11j9Dfmh7GuZR8mvqV; BD_UPN=123253; sug=3; sugstore=1; ORIGIN=0; bdime=20100; H_PS_645EC=4093eiyri7wdn3v3qfviJa%2FFLXYwKxu%2FIF0wtL7d7pZ9maSmOTmtgqORRMlMRo7E; BD_CK_SAM=1; BDSVRTM=59; H_PS_PSSID=14795_1433_14602_14509_14444_14734_10812_12868_14622_14871_12723_14485_14919_14903_11664_13937_13189_10632",
                "Host:www.baidu.com",
                "RA-Sid:7739A016-20140918-030243-3adabf-48f828",
                "RA-Ver:2.10.4",
                "User-Agent:%s" % getUA()
            ]    
 
            bdjd_str = ','.join(bdjd_list)
 
            #newip = ip()   #uncomment this line to use a proxy
            #bdjd = getBDJD(bdjd_str)   #uncomment to use a Baidu node, and build the url with bdjd
            url = 'https://www.baidu.com/s?wd=%s' % urllib.quote_plus(word)
 
            html = is_index(url,headers)
            soup = bs(html)
            '''Extract the block-level elements for Baidu positions 1-10'''
            b_tags = soup.find_all('div', {'class': 'result c-container '})    
 
            if len(b_tags) == 0:
                #print 'Current IP %s is dead!' % newip
                with self.lock:
                    self.running -= 1
                self.q_req.task_done() #still mark the task done, otherwise taskleft() never reaches 0
                continue
 
            '''Save the source of each of Baidu's 1-10 block elements to serp_html.csv,
            for later computing first-page keyword count, impressions, ranking quality score, etc.'''
            for tag in b_tags:      #use a new name so the queue item held in "line" is not overwritten
                newline = str(tag)
                number = search(r'id="(\d+)"',newline)
                #urldiv = search(r'<span class="g">(.*?)</span>',newline)    #the <span> that holds the domain

                data = []
                data.append(word)
                data.append(newline)
                with self.lock:     #one writer at a time, so rows from different threads do not interleave
                    writer = csv.writer(csvfile,dialect='excel')
                    writer.writerow(data)
 
            print '>> fetched %s, %s results returned' % (word,len(b_tags))
 
            #self.q_ans.put((req,ans)) # push the finished task onto the completed queue, to be read back in main
            self.q_ans.put(line)
            with self.lock:
                self.running -= 1
            self.q_req.task_done() # task_done() signals the queue that
                                   # one piece of work has been completed
            time.sleep(0.1) # don't spam
 
if __name__ == "__main__":
    f = Fetcher(threads=10) #use 10 threads
    for url in url_list:
        f.push(url)         #push every keyword onto the download queue
    while f.taskleft():     #while there are still unfinished tasks
        f.pop()   #take results off the completed queue
 
'''
# If a Baidu node times out more than 10 times, remove it from the node list; add this snippet inside Fetcher
if 'Error' in html:
    print html
    if 'Connection refused' in html:
        #record the node that timed out; once a node has timed out 10 times, drop it from the node list
        if bdjd_dict.has_key(bdjd):
            bdjd_dict[bdjd] += 1
            print 'node %s has timed out %s times' % (bdjd,bdjd_dict[bdjd])
            if int(bdjd_dict[bdjd]) >= 10:
                bdjd_list.remove(bdjd)
                print "node %s removed" % bdjd
        else:
            bdjd_dict[bdjd] = 1
    continue
'''

Run result:
[Screenshot: ranking data]
Keywords that return fewer than 10 results hit special SERP layouts such as Baidu Baike, Wenku, or Aladdin blocks. The source snippets are stored in serp_html.csv.
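To see which keywords hit those special layouts, here is a minimal sketch that counts the stored blocks per keyword in serp_html.csv and flags anything under 10; the threshold and output format are only illustrative:

#coding:utf-8
'''Sketch: count stored result blocks per keyword to flag special SERP layouts'''
import csv
from collections import defaultdict

block_count = defaultdict(int)   #keyword -> number of stored result blocks
for line in csv.reader(open('serp_html.csv','rb')):
    if line:
        block_count[line[0]] += 1

for word,n in block_count.items():
    if n < 10:
        print '%s,%s' % (word,n)   #likely a Baike/Wenku/Aladdin style SERP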

Read serp_html.csv and output the raw Baidu ranking data: keyword, position, site, Baidu url, Baidu title, so that whatever stats you need can be computed from it later.

#coding:utf-8
'''
Extract the raw Baidu ranking data, used to compute first-page keyword count, first-page impressions, ranking quality score, competitor data, etc.
'''
 
import csv,re
 
def search(req,line):
    text = re.search(req,line)
    if text:
        data = text.group(1)
    else:
        data = 'no'
    return data
 
csvfile = open('serp_html.csv','rb')
reader = csv.reader(csvfile)
 
'''Output the SERP data: current keyword, position, ranking site, Baidu url (must be resolved to get the real url), title'''
for line in reader:
    word = line[0]
    html = line[1]
 
    number = search(r'id="(\d+)"',html)
    domain = search(r'<span class="g">(.*?)/.*</span>',html)
    bdurl = search(r'href="(https://www.baidu.com/link\?url=[^"]*?)"',html)
    title = search(r'"title":"([^"]*?)"',html)
 
    print '%s,%s,%s,%s,%s' % (word,number,domain,bdurl,title)

Output, loaded into the database:
[Screenshot: database import]
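How you load these rows is up to you; as a minimal sketch, assuming a local SQLite file rank.db and a table baidu_rank (both names are made up for this example, the original workflow uses its own database):

#coding:utf-8
'''Sketch: load the parsed ranking rows into SQLite (rank.db and baidu_rank are hypothetical names)'''
import csv,re,sqlite3

def search(req,line):
    text = re.search(req,line)
    if text:
        return text.group(1)
    return 'no'

conn = sqlite3.connect('rank.db')
conn.text_factory = str   #store the raw byte strings from the CSV as-is
conn.execute('CREATE TABLE IF NOT EXISTS baidu_rank (word TEXT, position TEXT, domain TEXT, bdurl TEXT, title TEXT)')

for line in csv.reader(open('serp_html.csv','rb')):
    if len(line) < 2:
        continue
    word,html = line[0],line[1]
    row = (word,
           search(r'id="(\d+)"',html),
           search(r'<span class="g">(.*?)/.*</span>',html),
           search(r'href="(https://www.baidu.com/link\?url=[^"]*?)"',html),
           search(r'"title":"([^"]*?)"',html))
    conn.execute('INSERT INTO baidu_rank VALUES (?,?,?,?,?)',row)

conn.commit()
conn.close()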
Then process the raw ranking data to compute whatever you need, for example:
[Screenshot: ranking data]
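As one example of that post-processing, a minimal sketch that computes the first-page keyword count per domain (how many of the queried words each site ranks for in positions 1-10); the regex mirrors the extraction script above and the aggregation itself is only an illustration:

#coding:utf-8
'''Sketch: aggregate the raw rows into a first-page keyword count per domain'''
import csv,re
from collections import defaultdict

def search(req,line):
    text = re.search(req,line)
    if text:
        return text.group(1)
    return 'no'

ranked_words = defaultdict(set)   #domain -> set of keywords it ranks for on page 1

for line in csv.reader(open('serp_html.csv','rb')):
    if len(line) < 2:
        continue
    word,html = line[0],line[1]
    domain = search(r'<span class="g">(.*?)/.*</span>',html)
    if domain != 'no':
        ranked_words[domain].add(word)

#domains ordered by first-page keyword count, highest first
for domain,words in sorted(ranked_words.items(),key=lambda x:len(x[1]),reverse=True):
    print '%s,%s' % (domain,len(words))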
Also, scraping Baidu at volume needs a proxy-validation loop: every 2 minutes the proxies are re-checked and the usable ones are written to hege_daili.txt, which the Baidu scraper then reads and picks from at random. The main proxy-validation program (代理验证.py) is as follows:

#! /usr/bin/env python
#coding:utf-8
# Check proxy IPs and save the working ones to 'hege_daili.txt'
 
import urllib2,zlib,json
import re
import sys
import chardet
import threading
import time
import urllib,os
 
rawProxyList = []
checkedProxyList = []
 
print "Fetching http proxies >>>>>>>>>>>>>>>>>>"
 
#Regex extraction helper
def search(req,line):
    text = re.search(req,line)
    if text:
        data = text.group(1)
    else:
        data = 'no'
    return data
 
#Proxy source: a paid third-party proxy API, or your own proxy scraper run on a schedule
#Fetch the proxy IPs and save them locally, one per line in the form 192.168.0.1:8088
 
dailitxt = open('alldaili.txt','w')
url1 = 'proxy IP API url'   #placeholder: put your provider's API endpoint here
 
#handle gzip-compressed responses so the content does not come back garbled
request = urllib2.Request(url1)
request.add_header('Accept-encoding', 'gzip')
opener = urllib2.build_opener()
response = opener.open(request)
html = response.read()
gzipped = response.headers.get('Content-Encoding')
if gzipped:
    html = zlib.decompress(html, 16+zlib.MAX_WBITS)
dailitxt.write(html)
 
dailitxt.close()
 
#Read the proxy IP file into rawProxyList
for ip in open('alldaili.txt'):
    ip = ip.strip()
    rawProxyList.append(ip)
number = len(rawProxyList)
print "Fetched %s proxies" % number
 
#Proxy-checking class
class ProxyCheck(threading.Thread):
    def __init__(self,proxyList):
        threading.Thread.__init__(self)
        self.proxyList = proxyList
        self.timeout = 10
        self.testUrl = "https://www.baidu.com/"
        self.testStr = "030173"
 
    def checkProxy(self):
        cookies = urllib2.HTTPCookieProcessor()
        for proxy in self.proxyList:
            proxyHandler = urllib2.ProxyHandler({"http" : r'https://%s' % proxy})
            opener = urllib2.build_opener(cookies,proxyHandler)
            opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20100101 Firefox/15.0.1')]
            t1 = time.time()
 
            try:
                req = opener.open(self.testUrl, timeout=self.timeout)
                result = req.read()
                timeused = time.time() - t1
                pos = result.find(self.testStr)
 
                title = re.search(r'<title>(.*?)</title>',result)   #check the title of the page fetched through the proxy
                if title:
                    baidu_title = title.group(1)
                else:
                    baidu_title = 'page could not be opened'

                print "checking %s, response time %s, Baidu title: %s" % (proxy,timeused,baidu_title)
 
                if (pos > -1) and (timeused < 5) and baidu_title == '百度一下,你就知道':
                    checkedProxyList.append((proxy,timeused))
                else:
                    continue
 
            except Exception,e:
                print e.message
                continue
 
    def sort(self):
        checkedProxyList.sort(cmp=lambda x,y:cmp(x[1],y[1]))   #sort in place by response time
 
    def run(self):
        self.checkProxy()
        self.sort()
 
print "Checking http proxies >>>>>>>>>>>>>>>>>>>>>>>"
if __name__ == "__main__":
    getThreads = []
    checkThreads = []
 
    #start 35 checker threads: split the fetched proxies into 35 chunks, one per thread
    for i in range(35):
        t = ProxyCheck(rawProxyList[((len(rawProxyList)+34)/35) * i:((len(rawProxyList)+34)/35) * (i+1)])
        checkThreads.append(t)
 
    for i in range(len(checkThreads)):
        checkThreads[i].start()
 
    for i in range(len(checkThreads)):
        checkThreads[i].join()
 
    print "\n"
    print "....................... %s proxies in total, %s passed the check ......................." % (len(rawProxyList),len(checkedProxyList))
 
    #write the qualified IPs to the local file; truncate it first so IPs are not duplicated
    f= open("hege_daili.txt",'w+')
    for proxy in checkedProxyList:
        print "qualified: %s\t%s" % (proxy[0],proxy[1])
        f.write(proxy[0]+"\n")
    f.close()

The proxy-validation loop (cycle.py) calls the validation program and re-runs it every 2 minutes:

#coding:utf-8
'''
Scheduled task: every 2 minutes, fetch the latest proxies and re-check them
'''
 
import os,time
 
while 1:
    os.system("python 代理验证.py")
    time.sleep(120)

While collecting data, run cycle.py and the Baidu scraping program at the same time; once the SERP source is on disk, process it into the raw ranking data and then compute whatever metrics you need.
Baidu's anti-scraping is quite aggressive: different nodes even seem to return different encodings (some gbk, some utf-8), and many freshly bought proxies get a 403 on their first request because they are already blacklisted. Scraping Baidu through proxies costs a lot of effort for little return; if money is no object, a multi-IP (258IP) server is the better way to scrape.
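Because of that encoding mismatch, it can help to normalize each fetched page to utf-8 before parsing. A minimal sketch using chardet (already imported by the proxy checker above); whether your nodes actually mix gbk and utf-8 is the assumption here, and to_utf8 is a made-up helper name:

#coding:utf-8
'''Sketch: normalize a fetched SERP to utf-8 before parsing, since node encodings differ'''
import chardet

def to_utf8(html):
    guess = chardet.detect(html)             #e.g. {'encoding': 'GB2312', 'confidence': 0.99}
    encoding = guess['encoding'] or 'utf-8'
    try:
        return html.decode(encoding,'ignore').encode('utf-8')
    except LookupError:
        return html                          #unknown codec: keep the raw bytes

Call it on the return value of is_index() before handing the html to BeautifulSoup.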
360's anti-scraping is much weaker: adapt the program above for 360 and it can run around the clock, several hundred thousand queries a day without trouble.

Article by GOGO闯, WeChat official account: 流量贩子
