[Python] There's always an SEO tool that fits you; don't believe it? Come take a look!
One script covers several SEO needs: Baidu PC rankings (pc), mobile rankings (wap), PC related-search terms (xgss), and PC index/inclusion checks (shoulu):
#coding:utf-8
import requests,re,time,sys,json,datetime,urllib
import multiprocessing
from lxml import etree

'''
The script prompts for two inputs in order:
1. query client (pc / wap / xgss / shoulu)
2. path to the keyword file, one keyword per line
Run: python baidu_rank.py
'''

outfile = open('result','w')
client = raw_input("1. Query type: PC ranking (pc), mobile ranking (wap), PC related searches (xgss), PC index check (shoulu)?\n")
wordfile = raw_input("2. Path to the keyword file:\n")
reload(sys)
sys.setdefaultencoding('utf-8')
current_date = time.strftime('%Y-%m-%d',time.localtime(time.time()))

# return the first capture group of req matched against html, or 'no' on no match
def search(req,html):
    text = re.search(req,html)
    if text:
        data = text.group(1)
    else:
        data = 'no'
    return data

# count the characters left in content after stripping punctuation and tags
def number(content):
    text = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,::。?、~@#¥%……&*()“”《》]+".decode("utf8"), "".decode("utf8"),content)  # strip Chinese and English punctuation
    text2 = re.sub('<[^>]*?>','',text)  # strip all HTML tags
    words_number = len(text2)
    return int(words_number)

def getHTml(url,client):
    host = search('^([^/]*?)/',re.sub(r'(https|http)://','',url))
    if client == 'pc' or client == 'shoulu':
        headers = {
            "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding":"gzip, deflate, sdch",
            "Accept-Language":"zh-CN,zh;q=0.8,en;q=0.6",
            "Cache-Control":"no-cache",
            "Connection":"keep-alive",
            #"Cookie":"__cfduid=df26a7c536a0301ccf36481a14f53b4a81469608715; BIDUPSID=E9B0B6A35D4ABC6ED4891FCC0FD085BD; PSTM=1474352745; lsv=globalTjs_97273d6-wwwTcss_8eba1c3-routejs_6ede3cf-activityControllerjs_b6f8c66-wwwBcss_eabc62a-framejs_902a6d8-globalBjs_2d41ef9-sugjs_97bfd68-wwwjs_8d1160b; MSA_WH=1433_772; BAIDUID=E9B0B6A35D4ABC6ED4891FCC0FD085BD:FG=1; plus_cv=1::m:2a9fb36a; H_WISE_SIDS=107504_106305_100040_100100_109550_104341_107937_108437_109700_109794_107961_108453_109737_109558_109506_110022_107895_107917_109683_109588_110072_107318_107300_107242_100457; BDUSS=XNNMTJlWEdDdzFPdU1nSzVEZ1REYn4tNWNwZk94NVducXpaaThjWjE4bU1TQXRZQVFBQUFBJCQAAAAAAAAAAAEAAADLTBsKYTYzMTM4MTcwMgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIy741eMu-NXQ; BDRCVFR[ltbVPlNi2ac]=mk3SLVN4HKm; BDRCVFR[C0p6oIjvx-c]=mbxnW11j9Dfmh7GuZR8mvqV; BDRCVFR[uLXjBGr0i56]=mbxnW11j9Dfmh7GuZR8mvqV; rsv_jmp_slow=1474644236473; sug=3; sugstore=1; ORIGIN=0; bdime=21110; H_PS_645EC=60efFRJ1dM8ial205oBcDuRmtLgH3Q6NaRzxDuIkbMkGVXNSHmXBfW0GZL4l5pnj; BD_UPN=123253; BD_CK_SAM=1; BDSVRTM=110; H_PS_PSSID=17947",
            "Host":host,
            "Pragma":"no-cache",
            "Upgrade-Insecure-Requests":"1",
            "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
        }
    elif client == 'wap':
        headers = {
            "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding":"gzip, deflate, sdch",
            "Accept-Language":"zh-CN,zh;q=0.8,en;q=0.6",
            "Cache-Control":"no-cache",
            "Connection":"keep-alive",
            #"Cookie":"__cfduid=df26a7c536a0301ccf36481a14f53b4a81469608715; BIDUPSID=E9B0B6A35D4ABC6ED4891FCC0FD085BD; PSTM=1474352745; plus_cv=1::m:2a9fb36a; BDUSS=XNNMTJlWEdDdzFPdU1nSzVEZ1REYn4tNWNwZk94NVducXpaaThjWjE4bU1TQXRZQVFBQUFBJCQAAAAAAAAAAAEAAADLTBsKYTYzMTM4MTcwMgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIy741eMu-NXQ; BDRCVFR[ltbVPlNi2ac]=mk3SLVN4HKm; BDRCVFR[C0p6oIjvx-c]=mbxnW11j9Dfmh7GuZR8mvqV; BDRCVFR[uLXjBGr0i56]=mbxnW11j9Dfmh7GuZR8mvqV; lsv=globalTjs_97273d6-wwwTcss_bf2b167-routejs_6ede3cf-activityControllerjs_b6f8c66-wwwBcss_9f22dd4-framejs_38dd0ce-globalBjs_1c30bc8-sugjs_e1176fe-wwwjs_9f21ca8; H_WISE_SIDS=102065_100040_109672_102432_107851_109607_104340_106264_110031_108437_109699_107960_108453_109738_110201_110022_107896_109683_109668_109588_108013_107320_107242; MSA_WH=1433_216; MSA_PBT=92; MSA_ZOOM=1000; BAIDUID=8ADD01F376F3A0D29ED11B9D017537E9:FG=1; wpr=0; BDICON=10123156",
            "Host":host,
            "Pragma":"no-cache",
            "Upgrade-Insecure-Requests":"1",
            "User-Agent":"Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3",
        }
    elif client == 'xgss':
        headers = {
            "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding":"gzip, deflate, sdch",
            "Accept-Language":"zh-CN,zh;q=0.8,en;q=0.6",
            "Cache-Control":"no-cache",
            "Connection":"keep-alive",
            #"Cookie":"BAIDUID=9D18417A1E66FC499DB4DDDCA3CB2914:FG=1; PSTM=1482157958; BIDUPSID=9D18417A1E66FC499DB4DDDCA3CB2914; BDUSS=kRqbnhGeDBaSH5HelVKfktVT1NpQk1HcndFclB-VzEzdExFLVBpNFJoemhnb0ZZSVFBQUFBJCQAAAAAAAAAAAEAAADZAEZLU8ewz99TAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOH1WVjh9VlYS; ispeed_lsm=0; BDSFRCVID=bI-sJeCCxG3G87biS4_bj3Pvv3CwjeFp3e4p3J; H_BDCLCKID_SF=JJF8oDDhJKvbfP0kKJo5MbLt2frXetJyaR3thpQbWJ5TMC_wh4RcLtCt0UcbQUrO5Dbz0l7wQl38ShPC-tnZX68h5tTgJPKO5m8j2Cns3l02V-jIe-t2ynQDDxJma4RMW20jWl7mWU5jVKFljTu2j5c0eUbX-I6E2I6yQnT8HJOoDDvPyDc5y4LdLp7xJM-OJKr0bKb7KJF5V-QCDUbbM4LzKab-tn8eWJQ2QJ8BJC0MMI3P; pgv_pvi=1762722816; pgv_si=s4606412800; BDRCVFR[ltbVPlNi2ac]=mk3SLVN4HKm; BD_UPN=123253; sug=3; sugstore=0; ORIGIN=0; bdime=20100; H_PS_645EC=59ceduqR707UyvU6gKhGPNOVRJs1nF13nLKncP7DqEfiKlajBDdAj48fqg%2BC3hFY; BDRCVFR[FYP17ZXncD_]=mk3SLVN4HKm; BD_CK_SAM=1; PSINO=5; BDSVRTM=79; H_PS_PSSID=",
            "Host":"www.baidu.com",
            "Pragma":"no-cache",
            "Upgrade-Insecure-Requests":"1",
            "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36",
        }
    else:
        print 'Invalid client parameter!'
    # Abuyun proxy server
    proxyHost = "proxy.abuyun.com"
    proxyPort = "9010"
    # proxy tunnel credentials
    proxyUser = "HJQ044255HV9182P"
    proxyPass = "0B43E0B5352C5033"
    proxyMeta = "https://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host" : proxyHost,
        "port" : proxyPort,
        "user" : proxyUser,
        "pass" : proxyPass,
    }
    # note: `proxies` is built but never passed to requests.get below;
    # add proxies=proxies to the call to actually route through the proxy
    proxies = {
        "http" : proxyMeta,
        "https" : proxyMeta,
    }
    html = requests.get(url,headers=headers,timeout=30)
    code = html.encoding
    return html.content

# format a unix timestamp as a readable date string
def date(timeStamp):
    timeArray = time.localtime(timeStamp)
    otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
    return otherStyleTime

def getContent(word,client):
    # Baidu PC rankings, via the tn=json SERP endpoint
    if client == 'pc':
        pcurl = 'https://www.baidu.com/s?q=&tn=json&ct=2097152&si=&ie=utf-8&cl=3&wd=%s&rn=10' % word
        print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ start crawl %s @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' % pcurl
        html = getHTml(pcurl,client)
        html_dict = json.loads(html)
        for tag in html_dict['feed']['entry']:
            if tag.has_key('title'):
                title = tag['title']
                url = tag['url']
                rank = tag['pn']
                snap_time = date(tag['time'])  # renamed from `time` so the time module is not shadowed
                outfile.write('%s,%s,%s,%s,%s\n' % (word,rank,url,title,snap_time))
                print rank,url
        return 1
    # Baidu mobile rankings, parsed out of the HTML result page
    elif client == 'wap':
        wapurl = 'https://m.baidu.com/s?pn=0&usm=2&word=%s&sa=np' % word
        print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ start crawl %s @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' % wapurl
        html = getHTml(wapurl,client)
        tree = etree.HTML(html)
        div = tree.xpath('//*[@id="results"]/div')  # extract result blocks via XPath
        for line in div:
            line_html = etree.tostring(line)  # serialize each result node back to an HTML string
            print line_html
            title = re.sub('<[^>]*?>','',search(r'<h3 class="c-title[^>]*?>([\s\S]*?)</h3>',line_html))
            rank = search(r'order="(\d+)"',line_html)
            # the display URL sits in a different wrapper depending on result type,
            # so fall through a series of patterns until one matches
            domain = search(r'<div class="c-showurl c-line-clamp1"><span[^>]*?>(.*?)</span>',line_html)
            if domain == 'no':
                domain = search(r'<div class="c-showurl">(.*?)\s+\d+k</div>',line_html)
            if domain == 'no':
                domain = search(r'<span class="c-color-url">(.*?)</span>',line_html)
            if domain == 'no':
                domain = search(r'<div class="c-color-url">(.*?)</div>',line_html)
            if domain == 'no':
                domain = search('<span class="site">(.*?)</span>',line_html)
            if domain == 'no':
                domain = search(r'<div class="c-showurl c-line-clamp1">(.*?) \d+k<span',line_html)
            if domain == 'no':
                domain = 'special result'  # a SERP special feature with no display URL
            print rank,domain
            outfile.write('%s,%s,%s\n' % (word,rank,domain))
        return 1
    # Baidu PC related-search terms
    elif client == 'xgss':
        print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ start crawl %s @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' % word
        url = 'https://www.baidu.com/s?wd=%s&tn=baidurs2top' % word
        try:
            html = getHTml(url,client)
            for i in html.split(','):
                print i
                outfile.write('"%s","%s"\n' % (word,i))
        except:
            print 'Error'
    # Baidu PC index (inclusion) check
    elif client == "shoulu":
        print "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ start crawl %s @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" % word
        url = "https://www.baidu.com/s?wd=%s&pn=0&rn=1&tn=json" % word
        html = getHTml(url,client)
        html_dict = json.loads(html)
        if html_dict['feed']['entry'] == [{}]:
            include = "not indexed"
        else:
            line = html_dict['feed']['entry'][0]
            link = line["url"]
            snap_time = line["time"]  # renamed from `date` so the date() helper is not shadowed
            include = snap_time
        print url,include
        outfile.write("%s,%s\n" % (url,include))
    else:
        return 'Error'

words = open(wordfile).readlines()
pool = multiprocessing.Pool(processes=3)  # query 3 keywords in parallel
for word in words:
    word = word.strip()
    pool.apply_async(getContent, (word,client))
pool.close()
pool.join()
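If you only need the PC-ranking piece, the core trick is Baidu's tn=json SERP endpoint, which returns the results page as JSON instead of HTML. Below is a minimal standalone sketch of that single query (it runs under Python 2 or 3; pc_rank is a hypothetical helper name, and the snippet assumes the endpoint still responds in the {'feed': {'entry': [...]}} shape the full script parses, which was true when this post was written but may have changed):

# coding: utf-8
# Minimal sketch: query Baidu's tn=json SERP endpoint for one keyword.
# Assumes the response is still shaped as {'feed': {'entry': [...]}};
# shown for illustration, not guaranteed against current Baidu behavior.
import json
import requests

def pc_rank(word):
    url = ('https://www.baidu.com/s?q=&tn=json&ct=2097152'
           '&si=&ie=utf-8&cl=3&wd=%s&rn=10' % word)
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/52.0.2743.116 Safari/537.36",
    }
    resp = requests.get(url, headers=headers, timeout=30)
    results = []
    for entry in json.loads(resp.content).get('feed', {}).get('entry', []):
        if 'title' in entry:  # Baidu pads the list with an empty dict
            results.append((entry['pn'], entry['url'], entry['title']))
    return results

if __name__ == '__main__':
    for rank, link, title in pc_rank('seo'):  # 'seo' is an arbitrary example keyword
        print('%s  %s' % (rank, link))

The shoulu mode in the full script uses the same endpoint with rn=1 to check whether a single URL is indexed.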
Source: GOGO闯, www.kaopuseo.com