Python批量挖掘百度凤巢关键词数据

【SEO工具】关键词挖掘:获取关键词数据是比较基础的部分,渠道也很多,这次我们来采集凤巢的关键词数据。对关键词的分析和使用才是重点,这里先解决基础部分:

  • 每次使用脚本前,需先登录百度凤巢后台,并抓包获取cookies数据
  • 找到返回json关键词数据的链接作为请求对象
  • 构造header信息,假装你是个人类
  • 构造data信息,用作post
  • 使用pycurl模块发送请求;返回的json数据本可用json模块解析,这边直接用正则提取
  • 写入到csv文件当中,就酱了
    #coding:utf-8
    
    import pycurl,StringIO,json,urllib,re,time,random,csv
    
    # Endpoint that returns the keyword data as JSON; it was located by
    # capturing the Fengchao back-end's traffic and is the target of every
    # POST made by the request loop further down.
    # NOTE(review): the reqid value comes from one captured session — it is
    # unclear whether the server validates it; confirm before reuse.
    url = 'http://fengchao.baidu.com/nirvana/request.ajax?path=jupiter/GET/kr/word&reqid=1473662256226_43'
    
    # filename = raw_input('input your filename\t')
    #轮换ua
    def getUA():
        """Return one User-Agent string picked at random from a fixed pool."""
        candidates = (
            'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
            'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
            'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
            'Mozilla/5.0 (Windows NT 6.1; rv:44.0) Gecko/20100101 Firefox/44.0',
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        )
        # Every call draws independently, so repeated calls may return the same entry.
        return random.choice(candidates)
    
    #构造头部信息
    # Request headers sent with every POST, in the 'Name:value' string form
    # that pycurl's HTTPHEADER option takes. They reproduce what a logged-in
    # browser session sends to the Fengchao back-end.
    headers = [
    	'Accept:*/*',
    	'Accept-Encoding:gzip, deflate',
    	'Accept-Language:zh-CN,zh;q=0.8',
    	'Connection:keep-alive',
    	# 'Content-Length:857',
    	'Content-Type:application/x-www-form-urlencoded',
    	# 'Cookie:-----your own cookie--------',
    	# The cookie below is a captured login session (it embeds the account
    	# id 21291948 also used in Referer); replace it with your own capture.
    	'Cookie:FC-FE-TERMINUS=fc_terminus_user; PSTM=1470278993; BIDUPSID=68D179B9795C9500BE7ECCE65F4DABDE; __cfduid=d76a2eae0d2d244e95526665c082a83c21470281708; BAIDUID=D845C1483B574B75268F3B55DD7C3E99:FG=1; BDUSS=RQQkxEOE5XNVZEdlBjTnpiTVQwdHI1YX5IdDJnQkJ-UnBvMEMtRmpuTjFqUDFYQUFBQUFBJCQAAAAAAAAAAAEAAAABgNQ2Qmlnd2F5c2VvAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAHX~1Vd1~9VXUT; H_PS_PSSID=1457_18280_21097_20856_20732; SFSSID=1854195357ed9983fd81f60449bb8f68; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a02236744899; uc_login_unique=2fd154d0e97cc43a168b297ce0a3b280; __cas__st__3=0bafc4a741efb26d56acf2af8ec6b681db29020e1105f6d9b48086a98f6689d9cd346297babc34f158f94392; __cas__id__3=21291948; __cas__rn__=223674489; SAMPLING_USER_ID=21291948',
    	'Host:fengchao.baidu.com',
    	'Origin:http://fengchao.baidu.com',
    	'Referer:http://fengchao.baidu.com/nirvana/main.html?userid=21291948',
    	#'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
    	# getUA() runs once, when this list is built, so a single randomly
    	# chosen User-Agent is reused for every request of the run.
    	'User-Agent: %s' %getUA(),
    ]
    
    
    # Scrape keyword volumes for every seed word in word.txt and append them
    # to fengchao.csv as rows of: keyword, total pv, PC pv, mobile pv.

    # Pulls (keyword, pv, pvPc, pvWise) tuples straight out of the raw response
    # text; compiled once because it is applied to every page.
    row_pattern = re.compile(r'rd":"(.*?)","pv":(\d+),"pvPc":(\d+),"pvWise":(\d+),')

    # Read the seed words up front so word.txt is closed before any network
    # I/O starts; blank lines are skipped instead of being sent as empty queries.
    with open('word.txt') as word_file:
        queries = [line.strip() for line in word_file if line.strip()]

    with open('fengchao.csv', 'a') as fengchao:
        # The file is opened for append, so only emit the header when it is
        # new/empty. Seek to the end explicitly (whence=2) first: the position
        # reported right after opening in append mode is not reliable.
        fengchao.seek(0, 2)
        if fengchao.tell() == 0:
            fengchao.write('关键词,Total,PC,Mobile\n')
        # csv.writer quotes keywords that contain commas or quotes;
        # lineterminator='\n' keeps the line ending the script always produced.
        writer = csv.writer(fengchao, lineterminator='\n')

        for query in queries:
            for pagelink in range(1, 5):
                # Build the POST body. json.dumps() quotes and escapes the
                # query so a quote or backslash in it cannot break the JSON
                # (assumes word.txt is UTF-8 encoded — TODO confirm).
                data = urllib.urlencode({
                    'params': '{"entry":"kr_station","query":%s,"querytype":1,"pageNo":%d,"pageSize":300}' % (json.dumps(query, ensure_ascii=False), pagelink),
                    'source': '',
                    'path': 'jupiter/GET/kr/word',
                    'userid': 21291948,
                    'token': '0bafc4a741efb26d56acf2af8ec6b681db29020e1105f6d9b48086a56f6689d9cd346297babc34f158f94392',
                    #'eventId':'1471855302096_47',
                    #'reqId':'1471855302096_30',
                    'Name': '',
                })
                time.sleep(1)  # pause between requests

                response = StringIO.StringIO()  # in-memory buffer for the response body
                c = pycurl.Curl()
                try:
                    c.setopt(pycurl.FOLLOWLOCATION, True)      # follow redirects
                    c.setopt(pycurl.MAXREDIRS, 5)              # at most 5 redirects
                    c.setopt(pycurl.CONNECTTIMEOUT, 60)        # connect timeout (seconds)
                    c.setopt(pycurl.TIMEOUT, 120)              # whole-transfer timeout (seconds)
                    c.setopt(pycurl.ENCODING, 'gzip,deflate')  # decode compressed responses
                    # c.setopt(c.PROXY,ip)	# proxy
                    c.setopt(pycurl.URL, url)
                    c.setopt(pycurl.HTTPHEADER, headers)
                    c.setopt(pycurl.POST, 1)
                    c.setopt(pycurl.POSTFIELDS, data)
                    c.setopt(c.WRITEFUNCTION, response.write)  # collect the body in the buffer
                    c.perform()
                    code = c.getinfo(c.HTTP_CODE)
                finally:
                    # Release the handle even when perform() raises; one handle
                    # is created per request, so leaking them adds up quickly.
                    c.close()

                if code != 200:
                    # A non-200 reply carries no keyword data; report it and move on.
                    print('HTTP %s for %r page %d, skipped' % (code, query, pagelink))
                    continue

                html = response.getvalue()
                for word, total, pc, mobile in row_pattern.findall(html):
                    print('%s %s %s %s' % (word, total, pc, mobile))
                    writer.writerow([word, total, pc, mobile])
    

    脚本中很多地方是需要修改成自己的,注册一个凤巢的账号(免费的),抓包修改信息即可;
    脚本写了有一段时间了,现在还能不能用不确定,思路清晰就好!

    发表评论

    电子邮件地址不会被公开。