SEO tools: querying Baidu PC rankings, mobile rankings, PC related-search terms, and PC index data

[Python] One of these SEO tools is bound to suit you; click in and see for yourself!
One script covers several SEO needs:

  • Batch-query Baidu PC rankings, mobile rankings, and PC index data, extracting the rank number and index date from the SERP (standalone Python 3 sketches of the core queries follow the notes at the end of this post)
  • Keyword mining in Python, by crawling Baidu's related-search endpoint
  • #coding:utf-8
    
    import requests,re,time,sys,json,datetime,urllib
    import multiprocessing
    from lxml import etree
    
    '''
    Run the script and answer the two prompts in order:

    1. Query client (pc / wap / xgss / shoulu)
    2. Path to the keyword file

    case: python baidu_rank.py (then answer the prompts)
    '''
    
    outfile = open('result','w')  # all results are written here as comma-separated lines
    
    
    client = raw_input("1、请输入查询方式:PC排名(pc)、移动排名(wap)、PC相关搜索(xgss)、PC索引(shoulu)?\n")
    wordfile = raw_input("2、请输入载入文件路径:\n")
    
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    current_date = time.strftime('%Y-%m-%d',time.localtime(time.time()))
    
    
    
    def search(req,html):
        # return the first capture group of req in html, or the string 'no' when there is no match
        text = re.search(req,html)
        if text:
            data = text.group(1)
        else:
            data = 'no'
        return data
    
    def number(content):
        text = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,::。?、~@#¥%……&*()“”《》]+".decode("utf8"), "".decode("utf8"),content)  # strip Chinese and English punctuation
        text2 = re.sub('<[^>]*?>','',text)  # strip all HTML tags
        words_number = len(text2)
        return int(words_number)
    
    def getHTml(url,client):

        host = search('^([^/]*?)/',re.sub(r'(https|http)://','',url))  # extract the hostname for the Host header
    
        if client == 'pc' or client == 'shoulu':
            headers = {
                "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Encoding":"gzip, deflate, sdch",
                "Accept-Language":"zh-CN,zh;q=0.8,en;q=0.6",
                "Cache-Control":"no-cache",
                "Connection":"keep-alive",
                #"Cookie":"__cfduid=df26a7c536a0301ccf36481a14f53b4a81469608715; BIDUPSID=E9B0B6A35D4ABC6ED4891FCC0FD085BD; PSTM=1474352745; lsv=globalTjs_97273d6-wwwTcss_8eba1c3-routejs_6ede3cf-activityControllerjs_b6f8c66-wwwBcss_eabc62a-framejs_902a6d8-globalBjs_2d41ef9-sugjs_97bfd68-wwwjs_8d1160b; MSA_WH=1433_772; BAIDUID=E9B0B6A35D4ABC6ED4891FCC0FD085BD:FG=1; plus_cv=1::m:2a9fb36a; H_WISE_SIDS=107504_106305_100040_100100_109550_104341_107937_108437_109700_109794_107961_108453_109737_109558_109506_110022_107895_107917_109683_109588_110072_107318_107300_107242_100457; BDUSS=XNNMTJlWEdDdzFPdU1nSzVEZ1REYn4tNWNwZk94NVducXpaaThjWjE4bU1TQXRZQVFBQUFBJCQAAAAAAAAAAAEAAADLTBsKYTYzMTM4MTcwMgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIy741eMu-NXQ; BDRCVFR[ltbVPlNi2ac]=mk3SLVN4HKm; BDRCVFR[C0p6oIjvx-c]=mbxnW11j9Dfmh7GuZR8mvqV; BDRCVFR[uLXjBGr0i56]=mbxnW11j9Dfmh7GuZR8mvqV; rsv_jmp_slow=1474644236473; sug=3; sugstore=1; ORIGIN=0; bdime=21110; H_PS_645EC=60efFRJ1dM8ial205oBcDuRmtLgH3Q6NaRzxDuIkbMkGVXNSHmXBfW0GZL4l5pnj; BD_UPN=123253; BD_CK_SAM=1; BDSVRTM=110; H_PS_PSSID=17947",
                "Host":host,
                "Pragma":"no-cache",
                "Upgrade-Insecure-Requests":"1",
                "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
            }
    
        elif client == 'wap':
            headers = {
                "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Encoding":"gzip, deflate, sdch",
                "Accept-Language":"zh-CN,zh;q=0.8,en;q=0.6",
                "Cache-Control":"no-cache",
                "Connection":"keep-alive",
                #"Cookie":"__cfduid=df26a7c536a0301ccf36481a14f53b4a81469608715; BIDUPSID=E9B0B6A35D4ABC6ED4891FCC0FD085BD; PSTM=1474352745; plus_cv=1::m:2a9fb36a; BDUSS=XNNMTJlWEdDdzFPdU1nSzVEZ1REYn4tNWNwZk94NVducXpaaThjWjE4bU1TQXRZQVFBQUFBJCQAAAAAAAAAAAEAAADLTBsKYTYzMTM4MTcwMgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIy741eMu-NXQ; BDRCVFR[ltbVPlNi2ac]=mk3SLVN4HKm; BDRCVFR[C0p6oIjvx-c]=mbxnW11j9Dfmh7GuZR8mvqV; BDRCVFR[uLXjBGr0i56]=mbxnW11j9Dfmh7GuZR8mvqV; lsv=globalTjs_97273d6-wwwTcss_bf2b167-routejs_6ede3cf-activityControllerjs_b6f8c66-wwwBcss_9f22dd4-framejs_38dd0ce-globalBjs_1c30bc8-sugjs_e1176fe-wwwjs_9f21ca8; H_WISE_SIDS=102065_100040_109672_102432_107851_109607_104340_106264_110031_108437_109699_107960_108453_109738_110201_110022_107896_109683_109668_109588_108013_107320_107242; MSA_WH=1433_216; MSA_PBT=92; MSA_ZOOM=1000; BAIDUID=8ADD01F376F3A0D29ED11B9D017537E9:FG=1; wpr=0; BDICON=10123156",
                "Host":host,
                "Pragma":"no-cache",
                "Upgrade-Insecure-Requests":"1",
                "User-Agent":"Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3",
            }
    
        elif client == 'xgss':
             headers = {
                "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Encoding":"gzip, deflate, sdch",
                "Accept-Language":"zh-CN,zh;q=0.8,en;q=0.6",
                "Cache-Control":"no-cache",
                "Connection":"keep-alive",
                #"Cookie":"BAIDUID=9D18417A1E66FC499DB4DDDCA3CB2914:FG=1; PSTM=1482157958; BIDUPSID=9D18417A1E66FC499DB4DDDCA3CB2914; BDUSS=kRqbnhGeDBaSH5HelVKfktVT1NpQk1HcndFclB-VzEzdExFLVBpNFJoemhnb0ZZSVFBQUFBJCQAAAAAAAAAAAEAAADZAEZLU8ewz99TAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOH1WVjh9VlYS; ispeed_lsm=0; BDSFRCVID=bI-sJeCCxG3G87biS4_bj3Pvv3CwjeFp3e4p3J; H_BDCLCKID_SF=JJF8oDDhJKvbfP0kKJo5MbLt2frXetJyaR3thpQbWJ5TMC_wh4RcLtCt0UcbQUrO5Dbz0l7wQl38ShPC-tnZX68h5tTgJPKO5m8j2Cns3l02V-jIe-t2ynQDDxJma4RMW20jWl7mWU5jVKFljTu2j5c0eUbX-I6E2I6yQnT8HJOoDDvPyDc5y4LdLp7xJM-OJKr0bKb7KJF5V-QCDUbbM4LzKab-tn8eWJQ2QJ8BJC0MMI3P; pgv_pvi=1762722816; pgv_si=s4606412800; BDRCVFR[ltbVPlNi2ac]=mk3SLVN4HKm; BD_UPN=123253; sug=3; sugstore=0; ORIGIN=0; bdime=20100; H_PS_645EC=59ceduqR707UyvU6gKhGPNOVRJs1nF13nLKncP7DqEfiKlajBDdAj48fqg%2BC3hFY; BDRCVFR[FYP17ZXncD_]=mk3SLVN4HKm; BD_CK_SAM=1; PSINO=5; BDSVRTM=79; H_PS_PSSID=",
                "Host":"www.baidu.com",
                "Pragma":"no-cache",
                "Upgrade-Insecure-Requests":"1",
                "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36",
            }
    
        else:
            raise ValueError('Invalid query client parameter: %s' % client)
    
    
        # Proxy server
        proxyHost = "proxy.abuyun.com"
        proxyPort = "9010"

        # Proxy tunnel credentials
        proxyUser = "HJQ044255HV9182P"
        proxyPass = "0B43E0B5352C5033"
    
        proxyMeta = "https://%(user)s:%(pass)s@%(host)s:%(port)s" % {
          "host" : proxyHost,
          "port" : proxyPort,
          "user" : proxyUser,
          "pass" : proxyPass,
        }
    
        proxies = {
            "http"  : proxyMeta,
            "https" : proxyMeta,
        }
    
    
        html = requests.get(url,headers=headers,proxies=proxies,timeout=30)  # route the request through the proxy tunnel
        return html.content
    
    def date(timeStamp):
        # format a Unix timestamp as 'YYYY-MM-DD HH:MM:SS'
        timeArray = time.localtime(timeStamp)
        otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
        return otherStyleTime
    
    def getContent(word,client):
        # Query Baidu PC rankings via the JSON SERP endpoint
        if client == 'pc':
            pcurl = 'https://www.baidu.com/s?q=&tn=json&ct=2097152&si=&ie=utf-8&cl=3&wd=%s&rn=10' % word
            print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ start crawl %s @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' % pcurl
            html = getHTml(pcurl,client)
    
            html_dict = json.loads(html)
            for tag in html_dict['feed']['entry']:
                if 'title' in tag:  # skip empty placeholder entries
                    title = tag['title']
                    url = tag['url']
                    rank = tag['pn']
                    pub_time = date(tag['time'])  # format the entry's Unix timestamp
                    outfile.write('%s,%s,%s,%s,%s\n' % (word,rank,url,title,pub_time))
                    print rank,url
            return 1
        # Query Baidu mobile rankings
        elif client == 'wap':
            wapurl = 'https://m.baidu.com/s?pn=0&usm=2&word=%s&sa=np' % word
            print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ start crawl %s @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' % wapurl
            html = getHTml(wapurl,client)
    
            tree = etree.HTML(html)
            div = tree.xpath('//*[@id="results"]/div')  # pull each result block out via XPath
            for line in div:
                line_html = etree.tostring(line)    # serialize the node back to an HTML string
                print line_html
    
                title = re.sub('<[^>]*?>','',search(r'<h3 class="c-title[^>]*?>([\s\S]*?)</h3>',line_html))
                rank = search(r'order="(\d+)"',line_html)
                # Baidu renders the visible URL with several templates; try each in turn
                domain_patterns = [
                    r'<div class="c-showurl c-line-clamp1"><span[^>]*?>(.*?)</span>',
                    r'<div class="c-showurl">(.*?)\s+\d+k</div>',
                    r'<span class="c-color-url">(.*?)</span>',
                    r'<div class="c-color-url">(.*?)</div>',
                    r'<span class="site">(.*?)</span>',
                    r'<div class="c-showurl c-line-clamp1">(.*?) \d+k<span',
                ]
                domain = 'no'
                for pattern in domain_patterns:
                    domain = search(pattern,line_html)
                    if domain != 'no':
                        break
                if domain == 'no':
                    domain = 'SERP feature'  # no visible URL: a special-format result block
                print rank,domain
                outfile.write('%s,%s,%s\n' % (word,rank,domain))
            return 1
        # Fetch Baidu PC related-search terms
        elif client == 'xgss':
            print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ start crawl %s @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' % word
            url = 'https://www.baidu.com/s?wd=%s&tn=baidurs2top' % word
            try:
                html = getHTml(url,client)
                for i in html.split(','):  # baidurs2top returns a comma-separated list of terms
                    print i
                    outfile.write('"%s","%s"\n' % (word,i))
            except Exception as e:
                print 'Error:', e
        # Query Baidu PC index data (whether a URL is indexed, and when)
        elif client == "shoulu":
            print "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ start crawl %s @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" % word
            url = "https://www.baidu.com/s?wd=%s&pn=0&rn=1&tn=json" % word
            html = getHTml(url,client)
            html_dict = json.loads(html)
            if html_dict['feed']['entry'] == [{}]:
                include = "not indexed"
            else:
                line = html_dict['feed']['entry'][0]
                link = line["url"]
                index_date = line["time"]  # Unix timestamp of the indexed copy

                include = index_date
    
            print url,include
    
            outfile.write("%s,%s\n" % (url,include))
    
        else:
            return 'Error'
    
    
    
    words = open(wordfile).readlines()
    pool = multiprocessing.Pool(processes=3)  # 3 worker processes by default
    for word in words:
        word = word.strip()
        pool.apply_async(getContent, (word, client))
    pool.close()
    pool.join()
    
  • Run the script, then enter at the prompts which part to execute (e.g. pc for PC rankings, xgss for related searches, etc.) and the path to the file holding the URLs or keywords; a sample session is shown after this list
  • The proxy defaults to Abuyun's dynamic proxy (a quick tunnel check is sketched after this list)
  • Multiprocessing is used by default with 3 worker processes; adjust the count by changing the processes value in pool = multiprocessing.Pool(processes=3)
  • Source: GOGO闯, www.kaopuseo.com
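A typical session looks something like this (keyword.txt is an illustrative file name; results are written to a file named result in the working directory):

    $ python baidu_rank.py
    1. Query mode: PC rank (pc), mobile rank (wap), PC related searches (xgss), PC index (shoulu)?
    pc
    2. Path to the keyword file:
    keyword.txt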
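The script itself is Python 2. As a reference for Python 3 users, here is a minimal standalone sketch of the core PC-rank query, under the assumption that Baidu's tn=json SERP endpoint still returns the same feed/entry JSON structure the script relies on; the function name baidu_pc_rank is only for illustration:

    # -*- coding: utf-8 -*-
    # Python 3 sketch of the PC-rank query; assumes the tn=json endpoint
    # still returns {"feed": {"entry": [...]}} as the script above expects.
    import time

    import requests

    def baidu_pc_rank(word):
        url = ('https://www.baidu.com/s?q=&tn=json&ct=2097152'
               '&si=&ie=utf-8&cl=3&wd=%s&rn=10' % word)
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) '
                                 'AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/52.0.2743.116 Safari/537.36'}
        resp = requests.get(url, headers=headers, timeout=30)
        for tag in resp.json()['feed']['entry']:
            if 'title' in tag:  # skip empty placeholder entries
                pub = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(tag['time']))
                print(word, tag['pn'], tag['url'], tag['title'], pub)

    if __name__ == '__main__':
        baidu_pc_rank('seo')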
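The xgss branch relies on the tn=baidurs2top endpoint, which returns related-search suggestions as bare comma-separated text rather than an HTML page. A Python 3 sketch under the same assumption that the endpoint still behaves this way:

    import requests

    def related_searches(word):
        # tn=baidurs2top answers with plain comma-separated terms
        url = 'https://www.baidu.com/s?wd=%s&tn=baidurs2top' % word
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) '
                                 'AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/52.0.2743.116 Safari/537.36'}
        resp = requests.get(url, headers=headers, timeout=30)
        return [term for term in resp.text.split(',') if term]

    print(related_searches('seo'))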
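Before a long crawl it is worth confirming that the Abuyun tunnel answers at all. A minimal check reusing the script's host, port, and credentials (httpbin.org is only a neutral echo service and not part of the original script; each request through a dynamic tunnel should report a different exit IP):

    import requests

    proxy = "http://HJQ044255HV9182P:0B43E0B5352C5033@proxy.abuyun.com:9010"
    proxies = {"http": proxy, "https": proxy}

    # httpbin echoes back the IP address the request arrived from
    print(requests.get("https://httpbin.org/ip", proxies=proxies, timeout=10).text)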
