【seo工具】关键词挖掘:获取关键词数据是比较基础的部分,渠道很多,这次我们来采集凤巢的关键词数据;对关键词的分析以及使用才是重点,但我们先解决基础部分:
#coding:utf-8
"""Scrape keyword data from Baidu Fengchao's keyword-research (kr/word) API.

For every seed keyword in word.txt (one per line) the script POSTs up to 4
result pages to the Fengchao endpoint and appends each returned keyword with
its total / PC / mobile search volume to fengchao.csv.

NOTE(review): the Cookie, userid and token values below are tied to one
Fengchao account and expire — register a (free) Fengchao account, capture
the request, and substitute your own values before running.
"""
import pycurl, StringIO, json, urllib, re, time, random, csv

# Endpoint that answers with the keyword JSON payload.
URL = 'https://fengchao.baidu.com/nirvana/request.ajax?path=jupiter/GET/kr/word&reqid=1473662256226_43'

# Pool of User-Agent strings rotated across requests.
UA_LIST = [
    'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
    'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
    'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
    'Mozilla/5.0 (Windows NT 6.1; rv:44.0) Gecko/20100101 Firefox/44.0',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
]

# Pattern pulling (keyword, total, pc, mobile) tuples out of the JSON body.
# Hoisted and compiled once instead of re-built on every findall() call.
WORD_RE = re.compile(r'rd":"(.*?)","pv":(\d+),"pvPc":(\d+),"pvWise":(\d+),')


def getUA():
    """Return one randomly chosen User-Agent string."""
    return random.choice(UA_LIST)


def build_headers():
    """Build the request-header list for a single request.

    BUG FIX: the original built this list once at import time, so getUA()
    was evaluated exactly once and every request reused the same User-Agent;
    constructing the list per request makes the rotation real.
    """
    return [
        'Accept:*/*',
        'Accept-Encoding:gzip, deflate',
        'Accept-Language:zh-CN,zh;q=0.8',
        'Connection:keep-alive',
        'Content-Type:application/x-www-form-urlencoded',
        # Account-specific cookie: replace with your own captured value.
        'Cookie:FC-FE-TERMINUS=fc_terminus_user; PSTM=1470278993; BIDUPSID=68D179B9795C9500BE7ECCE65F4DABDE; __cfduid=d76a2eae0d2d244e95526665c082a83c21470281708; BAIDUID=D845C1483B574B75268F3B55DD7C3E99:FG=1; BDUSS=RQQkxEOE5XNVZEdlBjTnpiTVQwdHI1YX5IdDJnQkJ-UnBvMEMtRmpuTjFqUDFYQUFBQUFBJCQAAAAAAAAAAAEAAAABgNQ2Qmlnd2F5c2VvAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAHX~1Vd1~9VXUT; H_PS_PSSID=1457_18280_21097_20856_20732; SFSSID=1854195357ed9983fd81f60449bb8f68; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a02236744899; uc_login_unique=2fd154d0e97cc43a168b297ce0a3b280; __cas__st__3=0bafc4a741efb26d56acf2af8ec6b681db29020e1105f6d9b48086a98f6689d9cd346297babc34f158f94392; __cas__id__3=21291948; __cas__rn__=223674489; SAMPLING_USER_ID=21291948',
        'Host:fengchao.baidu.com',
        'Origin:https://fengchao.baidu.com',
        'Referer:https://fengchao.baidu.com/nirvana/main.html?userid=21291948',
        'User-Agent: %s' % getUA(),
    ]


def fetch(query, page_no):
    """POST one (keyword, page) request; return (http_code, body).

    BUG FIX: the original never called c.close(), leaking one curl handle
    (socket + buffers) per request; the try/finally guarantees release even
    when perform() raises.
    """
    data = urllib.urlencode({
        'params': '{"entry":"kr_station","query":"%s","querytype":1,"pageNo":%d,"pageSize":300}' % (query, page_no),
        'source': '',
        'path': 'jupiter/GET/kr/word',
        'userid': 21291948,
        'token': '0bafc4a741efb26d56acf2af8ec6b681db29020e1105f6d9b48086a56f6689d9cd346297babc34f158f94392',
        'Name': '',
    })
    buf = StringIO.StringIO()
    c = pycurl.Curl()
    try:
        c.setopt(pycurl.FOLLOWLOCATION, True)   # follow redirects automatically
        c.setopt(pycurl.MAXREDIRS, 5)           # ...but at most 5 hops
        c.setopt(pycurl.CONNECTTIMEOUT, 60)     # connect timeout (seconds)
        c.setopt(pycurl.TIMEOUT, 120)           # total transfer timeout (seconds)
        # Some servers return gzip even when not requested; let curl inflate.
        c.setopt(pycurl.ENCODING, 'gzip,deflate')
        c.setopt(pycurl.URL, URL)
        c.setopt(pycurl.HTTPHEADER, build_headers())
        c.setopt(pycurl.POST, 1)
        c.setopt(pycurl.POSTFIELDS, data)
        c.setopt(pycurl.WRITEFUNCTION, buf.write)
        c.perform()
        return c.getinfo(pycurl.HTTP_CODE), buf.getvalue()
    finally:
        c.close()


def main():
    """Read word.txt, query Fengchao, append result rows to fengchao.csv."""
    out = open('fengchao.csv', 'a')
    # FIX: use csv.writer (the csv module was imported but never used) so
    # keywords containing commas or quotes are escaped correctly;
    # lineterminator='\n' matches the original hand-written output.
    writer = csv.writer(out, lineterminator='\n')
    writer.writerow(['关键词', 'Total', 'PC', 'Mobile'])
    try:
        with open('word.txt') as f:  # FIX: original leaked this handle
            queries = [line.strip() for line in f]
        for query in queries:
            for page_no in range(1, 5):
                time.sleep(1)  # throttle to ~1 request/second
                try:
                    code, html = fetch(query, page_no)
                except pycurl.error as exc:
                    # Best-effort: log and move on instead of aborting the run.
                    print('request failed for %s page %d: %s' % (query, page_no, exc))
                    continue
                if code != 200:
                    # FIX: the original read HTTP_CODE but never checked it.
                    print('HTTP %d for %s page %d, skipping' % (code, query, page_no))
                    continue
                for word, total, pc, mobile in WORD_RE.findall(html):
                    print('%s %s %s %s' % (word, total, pc, mobile))
                    writer.writerow([word, total, pc, mobile])
    finally:
        out.close()  # FIX: close the output file even if a request raises


if __name__ == '__main__':
    main()
脚本中很多地方是需要修改成自己的,注册一个凤巢的账号(免费的),抓包修改信息即可;
脚本写好已有一段时间,现在能否直接运行未经重新验证,重点在于思路清晰,可供参考!
Python是万金油,可以利用到黑帽SEO方向去。