A small trick for scraping sites that require login. The principle: sites keep you logged in through cookies, so replaying the cookie from a logged-in browser session logs the script in too.
I spend most of my time on Baidu SEO, but I also do e-commerce, in particular B2B on the Alibaba.com international site. To cover a few needs at work I wrote some small scripts that scrape data from the Alibaba international site backend. Backing them up here.
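The whole trick in one place: log in once in the browser, copy the Cookie request header from the Network tab of DevTools, and replay it. A minimal sketch, with a deliberately truncated URL (the real endpoints appear in the scripts below):

#encoding=utf-8
# Minimal sketch of cookie-replay login. Paste the raw Cookie header value
# copied from a logged-in browser session; the URL here is illustrative only.
import requests

COOKIE = 'your cookie here'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36',
    'Cookie': COOKIE,  # sending the full header preserves every session cookie
}
resp = requests.get('https://hz-mydata.alibaba.com/self/.json?...', headers=headers, timeout=10)
print resp.status_code  # 200 plus a JSON body means the session was accepted

The scripts below pass the same string via cookies={'cookie': ...} instead; putting it in the Cookie header, as above, is the more faithful replay, since it reproduces the browser request as-is.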
Scraping my keywords
#encoding=utf-8
import requests

op_csv = open('myword.csv', 'a')
op_csv.write('keyword,impressions,clicks\n')

for page in range(1, 1001):
    # the ctoken (and probably the whole URL) is session-specific -- replace with your own
    url = 'https://hz-mydata.alibaba.com/self/.json?action=CommonAction&iName=getKeywordsAndHasP4P&ctoken=cypr24i30ehf&statisticsType=month&orderBy=sumShowCnt&orderModel=desc&pageSize=10&pageNO=%d' % page
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'}
    cookies = {'cookie': 'your cookie here'}
    html = requests.post(url=url, cookies=cookies, headers=headers, timeout=10)
    rows = html.json()['value']['keywords']['data']
    for row in rows:
        print row['keyword'], row['sumShowCnt'], row['sumClickCnt']
        op_csv.write('%s,%s,%s\n' % (row['keyword'], row['sumShowCnt'], row['sumClickCnt']))
Scraping visitor keywords
#encoding=utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import requests

op_csv = open('visitword.csv', 'a')

for page in range(1, 301):
    # startDate/endDate bound the report window; ctoken is session-specific
    url = 'https://hz-mydata.alibaba.com/self/.json?action=CommonAction&iName=getVisitors&ctoken=cypr24i30ehf&pageSize=10&pageNO=%d&startDate=2016-03-26&endDate=2016-04-24' % page
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'}
    cookies = {'cookie': 'your cookie here'}
    html = requests.post(url=url, cookies=cookies, headers=headers, timeout=10)
    for row in html.json()['value']['data']:
        print row['serKeywords']
        op_csv.write('%s\n' % row['serKeywords'])
Scraping inquiry keywords
#encoding=utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import requests

op_csv_write = open('p4p.csv', 'a')
op_csv_write.write('keyword,p4pViews,p4pClicks,views,clicks\n')

for num in range(1, 20):
    url = 'https://hz-mydata.alibaba.com/self/.json?action=CommonAction&iName=getEffectiveProductsAndStats&ctoken=d1uvlnsn7bj3&statisticsType=month&pageNO=%d' % num
    cookies = {'Cookie': 'your cookie here'}
    html = requests.get(url=url, cookies=cookies)
    products = html.json()['value']['products']['data']
    for product in products:
        # each product carries its per-keyword effect stats
        for n in product['keywordEffect']:
            row = (n['keyword'], n['p4pViews'], n['p4pClicks'], n['views'], n['clicks'])
            print '%s,%s,%s,%s,%s' % row
            op_csv_write.write('%s,%s,%s,%s,%s\n' % row)
Scraping hot search keywords
#encoding=utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import requests
import json

cook = {'cookie': 'your cookie here'}

# columns: keyword, seller competition, showcase count, then monthly search
# volume; the month fields sort lexically (10, 11, 1, 2, ...), not chronologically
fields = ['keywords', 'company_cnt', 'showwin_cnt',
          'srh_pv_last_10mon', 'srh_pv_last_11mon', 'srh_pv_last_1mon',
          'srh_pv_last_2mon', 'srh_pv_last_3mon', 'srh_pv_last_4mon',
          'srh_pv_last_5mon', 'srh_pv_last_6mon', 'srh_pv_last_7mon',
          'srh_pv_last_8mon', 'srh_pv_last_9mon', 'srh_pv_this_mon']

op_csv_write = open('aliword.csv', 'a')
op_csv_write.write(','.join(fields) + '\n')  # header now matches the columns written below

for page in xrange(1, 501):
    # seed keyword is "hair"; ctoken is session-specific
    url = 'https://hz.my.data.alibaba.com/industry/.json?action=CommonAction&iName=searchKeywords&0.6011645244434476&ctoken=1ek_faad2506u&keywords=hair&orderBy=srh_pv_this_mon&orderModel=desc&pageSize=10&pageNO=%d' % page
    jsDict = json.loads(requests.get(url, cookies=cook).content)
    for each in jsDict['value']['data']:
        line = ','.join(str(each[k]) for k in fields)
        print line
        op_csv_write.write(line + '\n')
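The original script also imported Pool from multiprocessing.dummy without using it, which suggests the page fetches were meant to run in parallel. A sketch of that idea against the same endpoint; the pool size and function name are my own choices:

#encoding=utf-8
# Sketch: parallel page fetches with a thread pool (multiprocessing.dummy).
# Endpoint and cookie handling match the script above; names are illustrative.
import requests
import json
from multiprocessing.dummy import Pool

cook = {'cookie': 'your cookie here'}
URL_TMPL = ('https://hz.my.data.alibaba.com/industry/.json?action=CommonAction'
            '&iName=searchKeywords&ctoken=1ek_faad2506u&keywords=hair'
            '&orderBy=srh_pv_this_mon&orderModel=desc&pageSize=10&pageNO=%d')

def fetch_page(page):
    # returns the list of keyword rows for one result page
    html = requests.get(URL_TMPL % page, cookies=cook, timeout=10).content
    return json.loads(html)['value']['data']

pool = Pool(8)  # 8 worker threads; keep this modest to avoid tripping rate limits
for rows in pool.map(fetch_page, xrange(1, 501)):
    for each in rows:
        print each['keywords']
pool.close()
pool.join()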
Scraping P4P bid keywords
#encoding=utf-8
import requests
import json

op_word_csv = open('okey.csv', 'a')
op_word_csv.write('keyword,status,search volume\n')

for n in xrange(1, 190):
    # the json= parameter is URL-encoded JSON ({"count":50,"date":"7",...});
    # only the page number changes between requests
    url = 'https://www2.alibaba.com/asyGetAdKeyword.do?_t=1440205849059&cmd=showTable&ctoken=dnb2amfj9a86&json=%7B%22count%22%3A50%2C%22date%22%3A%227%22%2C%22delayShow%22%3Afalse%2C%22page%22%3A' + str(n) + '%2C%22recStrategy%22%3A1%2C%22recType%22%3A%22recommend%22%2C%22sort%22%3A%22asc%22%2C%22sortKey%22%3A%22keyword%22%2C%22tagId%22%3A%2250191900149%22%7D&_csrf_token_=14z1d7pfbefjg'
    cook = {'cookie': 'your cookie here'}  # the original used the key 'cookies', which sends a cookie literally named "cookies"
    html = requests.post(url=url, cookies=cook).content
    for i in json.loads(html)['keywords']:
        word = i['adKeywordDO']['word']
        status = i['adKeywordDO']['status']
        search = i['search']
        print '%s,%s,%s' % (word, status, search)
        op_word_csv.write('%s,%s,%s\n' % (word, status, search))
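Splicing the page number into that pre-encoded blob is brittle. A small refactor sketch, assuming the parameters decoded from the query string above are complete: in Python 2, json.dumps plus urllib.quote rebuilds the same json= value.

#encoding=utf-8
# Sketch: build the json= query parameter instead of editing encoded text.
# Parameter values are copied from the decoded query string above.
import json
import urllib

def build_keyword_url(page):
    params = {'count': 50, 'date': '7', 'delayShow': False, 'page': page,
              'recStrategy': 1, 'recType': 'recommend', 'sort': 'asc',
              'sortKey': 'keyword', 'tagId': '50191900149'}
    # separators=(',', ':') drops the spaces json.dumps inserts by default,
    # and safe='' makes quote() percent-encode every reserved character
    encoded = urllib.quote(json.dumps(params, separators=(',', ':')), safe='')
    return ('https://www2.alibaba.com/asyGetAdKeyword.do?_t=1440205849059'
            '&cmd=showTable&ctoken=dnb2amfj9a86&json=' + encoded +
            '&_csrf_token_=14z1d7pfbefjg')

print build_keyword_url(1)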
Congratulations on reading all the way through. Friendly tip: Baidu Fengchao can be scraped the same way!
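To make that concrete, here is the pattern from this post boiled down to one reusable helper. The URL below is a placeholder, not a real Fengchao endpoint; find the actual JSON request in DevTools the same way as for Alibaba.

#encoding=utf-8
# Generic version of the pattern used throughout this post: a paginated JSON
# endpoint plus a logged-in cookie. The example URL is a placeholder.
import requests

def fetch_json_pages(url_tmpl, cookie, pages):
    # yields parsed JSON for each page of a paginated backend endpoint
    headers = {'user-agent': 'Mozilla/5.0', 'Cookie': cookie}
    for page in pages:
        resp = requests.get(url_tmpl % page, headers=headers, timeout=10)
        yield resp.json()

# usage (placeholder URL):
# for data in fetch_json_pages('https://example.com/report.json?page=%d',
#                              'your cookie here', xrange(1, 10)):
#     print data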