[Python] There's always an SEO tool that fits you; don't believe it? Come take a look!
One script covers several SEO needs: Baidu PC rankings (pc), mobile rankings (wap), PC related-search terms (xgss), and PC index/inclusion checks (shoulu):
#coding:utf-8
import requests,re,time,sys,json,datetime,urllib
import multiprocessing
from lxml import etree

'''
The script prompts for two inputs in order:
1. query client (pc / wap / xgss / shoulu)
2. path to the keyword file, one keyword per line
Run: python baidu_rank.py
'''

outfile = open('result','w')
client = raw_input("1. Query type: PC ranking (pc), mobile ranking (wap), PC related searches (xgss), PC index check (shoulu)?\n")
wordfile = raw_input("2. Path to the keyword file:\n")
reload(sys)
sys.setdefaultencoding('utf-8')
current_date = time.strftime('%Y-%m-%d',time.localtime(time.time()))

# return the first capture group of req matched against html, or 'no' on no match
def search(req,html):
    text = re.search(req,html)
    if text:
        data = text.group(1)
    else:
        data = 'no'
    return data

# count the characters left in content after stripping punctuation and tags
def number(content):
    text = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,::。?、~@#¥%……&*()“”《》]+".decode("utf8"), "".decode("utf8"),content)  # strip Chinese and English punctuation
    text2 = re.sub('<[^>]*?>','',text)  # strip all HTML tags
    words_number = len(text2)
    return int(words_number)

def getHTml(url,client):
    host = search('^([^/]*?)/',re.sub(r'(https|http)://','',url))
    if client == 'pc' or client == 'shoulu':
        headers = {
            "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding":"gzip, deflate, sdch",
            "Accept-Language":"zh-CN,zh;q=0.8,en;q=0.6",
            "Cache-Control":"no-cache",
            "Connection":"keep-alive",
            #"Cookie":"__cfduid=df26a7c536a0301ccf36481a14f53b4a81469608715; BIDUPSID=E9B0B6A35D4ABC6ED4891FCC0FD085BD; PSTM=1474352745; lsv=globalTjs_97273d6-wwwTcss_8eba1c3-routejs_6ede3cf-activityControllerjs_b6f8c66-wwwBcss_eabc62a-framejs_902a6d8-globalBjs_2d41ef9-sugjs_97bfd68-wwwjs_8d1160b; MSA_WH=1433_772; BAIDUID=E9B0B6A35D4ABC6ED4891FCC0FD085BD:FG=1; plus_cv=1::m:2a9fb36a; H_WISE_SIDS=107504_106305_100040_100100_109550_104341_107937_108437_109700_109794_107961_108453_109737_109558_109506_110022_107895_107917_109683_109588_110072_107318_107300_107242_100457; BDUSS=XNNMTJlWEdDdzFPdU1nSzVEZ1REYn4tNWNwZk94NVducXpaaThjWjE4bU1TQXRZQVFBQUFBJCQAAAAAAAAAAAEAAADLTBsKYTYzMTM4MTcwMgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIy741eMu-NXQ; BDRCVFR[ltbVPlNi2ac]=mk3SLVN4HKm; BDRCVFR[C0p6oIjvx-c]=mbxnW11j9Dfmh7GuZR8mvqV; BDRCVFR[uLXjBGr0i56]=mbxnW11j9Dfmh7GuZR8mvqV; rsv_jmp_slow=1474644236473; sug=3; sugstore=1; ORIGIN=0; bdime=21110; H_PS_645EC=60efFRJ1dM8ial205oBcDuRmtLgH3Q6NaRzxDuIkbMkGVXNSHmXBfW0GZL4l5pnj; BD_UPN=123253; BD_CK_SAM=1; BDSVRTM=110; H_PS_PSSID=17947",
            "Host":host,
            "Pragma":"no-cache",
            "Upgrade-Insecure-Requests":"1",
            "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
        }
    elif client == 'wap':
        headers = {
            "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding":"gzip, deflate, sdch",
            "Accept-Language":"zh-CN,zh;q=0.8,en;q=0.6",
            "Cache-Control":"no-cache",
            "Connection":"keep-alive",
            #"Cookie":"__cfduid=df26a7c536a0301ccf36481a14f53b4a81469608715; BIDUPSID=E9B0B6A35D4ABC6ED4891FCC0FD085BD; PSTM=1474352745; plus_cv=1::m:2a9fb36a; BDUSS=XNNMTJlWEdDdzFPdU1nSzVEZ1REYn4tNWNwZk94NVducXpaaThjWjE4bU1TQXRZQVFBQUFBJCQAAAAAAAAAAAEAAADLTBsKYTYzMTM4MTcwMgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIy741eMu-NXQ; BDRCVFR[ltbVPlNi2ac]=mk3SLVN4HKm; BDRCVFR[C0p6oIjvx-c]=mbxnW11j9Dfmh7GuZR8mvqV; BDRCVFR[uLXjBGr0i56]=mbxnW11j9Dfmh7GuZR8mvqV; lsv=globalTjs_97273d6-wwwTcss_bf2b167-routejs_6ede3cf-activityControllerjs_b6f8c66-wwwBcss_9f22dd4-framejs_38dd0ce-globalBjs_1c30bc8-sugjs_e1176fe-wwwjs_9f21ca8; H_WISE_SIDS=102065_100040_109672_102432_107851_109607_104340_106264_110031_108437_109699_107960_108453_109738_110201_110022_107896_109683_109668_109588_108013_107320_107242; MSA_WH=1433_216; MSA_PBT=92; MSA_ZOOM=1000; BAIDUID=8ADD01F376F3A0D29ED11B9D017537E9:FG=1; wpr=0; BDICON=10123156",
            "Host":host,
            "Pragma":"no-cache",
            "Upgrade-Insecure-Requests":"1",
            "User-Agent":"Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3",
        }
    elif client == 'xgss':
        headers = {
            "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding":"gzip, deflate, sdch",
            "Accept-Language":"zh-CN,zh;q=0.8,en;q=0.6",
            "Cache-Control":"no-cache",
            "Connection":"keep-alive",
            #"Cookie":"BAIDUID=9D18417A1E66FC499DB4DDDCA3CB2914:FG=1; PSTM=1482157958; BIDUPSID=9D18417A1E66FC499DB4DDDCA3CB2914; BDUSS=kRqbnhGeDBaSH5HelVKfktVT1NpQk1HcndFclB-VzEzdExFLVBpNFJoemhnb0ZZSVFBQUFBJCQAAAAAAAAAAAEAAADZAEZLU8ewz99TAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOH1WVjh9VlYS; ispeed_lsm=0; BDSFRCVID=bI-sJeCCxG3G87biS4_bj3Pvv3CwjeFp3e4p3J; H_BDCLCKID_SF=JJF8oDDhJKvbfP0kKJo5MbLt2frXetJyaR3thpQbWJ5TMC_wh4RcLtCt0UcbQUrO5Dbz0l7wQl38ShPC-tnZX68h5tTgJPKO5m8j2Cns3l02V-jIe-t2ynQDDxJma4RMW20jWl7mWU5jVKFljTu2j5c0eUbX-I6E2I6yQnT8HJOoDDvPyDc5y4LdLp7xJM-OJKr0bKb7KJF5V-QCDUbbM4LzKab-tn8eWJQ2QJ8BJC0MMI3P; pgv_pvi=1762722816; pgv_si=s4606412800; BDRCVFR[ltbVPlNi2ac]=mk3SLVN4HKm; BD_UPN=123253; sug=3; sugstore=0; ORIGIN=0; bdime=20100; H_PS_645EC=59ceduqR707UyvU6gKhGPNOVRJs1nF13nLKncP7DqEfiKlajBDdAj48fqg%2BC3hFY; BDRCVFR[FYP17ZXncD_]=mk3SLVN4HKm; BD_CK_SAM=1; PSINO=5; BDSVRTM=79; H_PS_PSSID=",
            "Host":"www.baidu.com",
            "Pragma":"no-cache",
            "Upgrade-Insecure-Requests":"1",
            "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36",
        }
    else:
        print 'Invalid client parameter!'
    # Abuyun proxy server
    proxyHost = "proxy.abuyun.com"
    proxyPort = "9010"
    # proxy tunnel credentials
    proxyUser = "HJQ044255HV9182P"
    proxyPass = "0B43E0B5352C5033"
    proxyMeta = "https://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host" : proxyHost,
        "port" : proxyPort,
        "user" : proxyUser,
        "pass" : proxyPass,
    }
    # note: `proxies` is built but never passed to requests.get below;
    # add proxies=proxies to the call to actually route through the proxy
    proxies = {
        "http" : proxyMeta,
        "https" : proxyMeta,
    }
    html = requests.get(url,headers=headers,timeout=30)
    code = html.encoding
    return html.content

# format a unix timestamp as a readable date string
def date(timeStamp):
    timeArray = time.localtime(timeStamp)
    otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
    return otherStyleTime

def getContent(word,client):
    # Baidu PC rankings, via the tn=json SERP endpoint
    if client == 'pc':
        pcurl = 'https://www.baidu.com/s?q=&tn=json&ct=2097152&si=&ie=utf-8&cl=3&wd=%s&rn=10' % word
        print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ start crawl %s @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' % pcurl
        html = getHTml(pcurl,client)
        html_dict = json.loads(html)
        for tag in html_dict['feed']['entry']:
            if tag.has_key('title'):
                title = tag['title']
                url = tag['url']
                rank = tag['pn']
                snap_time = date(tag['time'])  # renamed from `time` so the time module is not shadowed
                outfile.write('%s,%s,%s,%s,%s\n' % (word,rank,url,title,snap_time))
                print rank,url
        return 1
    # Baidu mobile rankings, parsed out of the HTML result page
    elif client == 'wap':
        wapurl = 'https://m.baidu.com/s?pn=0&usm=2&word=%s&sa=np' % word
        print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ start crawl %s @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' % wapurl
        html = getHTml(wapurl,client)
        tree = etree.HTML(html)
        div = tree.xpath('//*[@id="results"]/div')  # extract result blocks via XPath
        for line in div:
            line_html = etree.tostring(line)  # serialize each result node back to an HTML string
            print line_html
            title = re.sub('<[^>]*?>','',search(r'<h3 class="c-title[^>]*?>([\s\S]*?)</h3>',line_html))
            rank = search(r'order="(\d+)"',line_html)
            # the display URL sits in a different wrapper depending on result type,
            # so fall through a series of patterns until one matches
            domain = search(r'<div class="c-showurl c-line-clamp1"><span[^>]*?>(.*?)</span>',line_html)
            if domain == 'no':
                domain = search(r'<div class="c-showurl">(.*?)\s+\d+k</div>',line_html)
            if domain == 'no':
                domain = search(r'<span class="c-color-url">(.*?)</span>',line_html)
            if domain == 'no':
                domain = search(r'<div class="c-color-url">(.*?)</div>',line_html)
            if domain == 'no':
                domain = search('<span class="site">(.*?)</span>',line_html)
            if domain == 'no':
                domain = search(r'<div class="c-showurl c-line-clamp1">(.*?) \d+k<span',line_html)
            if domain == 'no':
                domain = 'special result'  # a SERP special feature with no display URL
            print rank,domain
            outfile.write('%s,%s,%s\n' % (word,rank,domain))
        return 1
    # Baidu PC related-search terms
    elif client == 'xgss':
        print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ start crawl %s @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@' % word
        url = 'https://www.baidu.com/s?wd=%s&tn=baidurs2top' % word
        try:
            html = getHTml(url,client)
            for i in html.split(','):
                print i
                outfile.write('"%s","%s"\n' % (word,i))
        except:
            print 'Error'
    # Baidu PC index (inclusion) check
    elif client == "shoulu":
        print "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ start crawl %s @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" % word
        url = "https://www.baidu.com/s?wd=%s&pn=0&rn=1&tn=json" % word
        html = getHTml(url,client)
        html_dict = json.loads(html)
        if html_dict['feed']['entry'] == [{}]:
            include = "not indexed"
        else:
            line = html_dict['feed']['entry'][0]
            link = line["url"]
            snap_time = line["time"]  # renamed from `date` so the date() helper is not shadowed
            include = snap_time
        print url,include
        outfile.write("%s,%s\n" % (url,include))
    else:
        return 'Error'

words = open(wordfile).readlines()
pool = multiprocessing.Pool(processes=3)  # query 3 keywords in parallel
for word in words:
    word = word.strip()
    pool.apply_async(getContent, (word,client))
pool.close()
pool.join()
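If you only need the PC-ranking piece, the core trick is Baidu's tn=json SERP endpoint, which returns the results page as JSON instead of HTML. Below is a minimal standalone sketch of that single query (it runs under Python 2 or 3; pc_rank is a hypothetical helper name, and the snippet assumes the endpoint still responds in the {'feed': {'entry': [...]}} shape the full script parses, which was true when this post was written but may have changed):

# coding: utf-8
# Minimal sketch: query Baidu's tn=json SERP endpoint for one keyword.
# Assumes the response is still shaped as {'feed': {'entry': [...]}};
# shown for illustration, not guaranteed against current Baidu behavior.
import json
import requests

def pc_rank(word):
    url = ('https://www.baidu.com/s?q=&tn=json&ct=2097152'
           '&si=&ie=utf-8&cl=3&wd=%s&rn=10' % word)
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/52.0.2743.116 Safari/537.36",
    }
    resp = requests.get(url, headers=headers, timeout=30)
    results = []
    for entry in json.loads(resp.content).get('feed', {}).get('entry', []):
        if 'title' in entry:  # Baidu pads the list with an empty dict
            results.append((entry['pn'], entry['url'], entry['title']))
    return results

if __name__ == '__main__':
    for rank, link, title in pc_rank('seo'):  # 'seo' is an arbitrary example keyword
        print('%s  %s' % (rank, link))

The shoulu mode in the full script uses the same endpoint with rn=1 to check whether a single URL is indexed.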
Source: GOGO闯, www.kaopuseo.com