自制小脚本for alibaba

应对采集需要登录的网站小技巧,原理:通过cookie保持登录的机制来实现登录。
old photo
虽说一直折腾百度seo,但是也有做电商,特别是B2B阿里巴巴国际站,为了满足个人工作上的一些需求:自制了一些小脚本,采集阿里国际站后台数据:备份一下..
采集我的词

#encoding=utf-8

import requests,re,time

op_csv=open('myword.csv','a')
op_csv.write('关键词,曝光量,点击量\n')
for i in range(1,1001):
        #url估计也要换下
	url='https://hz-mydata.alibaba.com/self/.json?action=CommonAction&iName=getKeywordsAndHasP4P&ctoken=cypr24i30ehf&statisticsType=month&orderBy=sumShowCnt&orderModel=desc&pageSize=10&pageNO=%d'%i 
	headers={'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'}
	cookies={'cookie':'你的cookie'}
	html =requests.post(url=url,cookies=cookies,headers=headers,timeout=10)
	# print requests['QUERY_STRING']

	c=html.json()
	b=c['value']['keywords']['data']
	for i in b:
		print i['keyword'],i['sumShowCnt'],i['sumClickCnt']
		op_csv.write('%s,%s,%s\n'%(i['keyword'],i['sumShowCnt'],i['sumClickCnt']))

采集访客词

#encoding=utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import requests,re
op_csv=open('visitword.csv','a')

for i in range(1,301):

	url='https://hz-mydata.alibaba.com/self/.json?action=CommonAction&iName=getVisitors&ctoken=cypr24i30ehf&pageSize=10&pageNO=%d&startDate=2016-03-26&endDate=2016-04-24'%i

	headers={'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'}
	cookies={'cookie':'你的cookie'}
	html=requests.post(url=url,cookies=cookies,headers=headers,timeout=10)
	c=html.json()
	b=c['value']['data']
	for i in b:
		print i['serKeywords']
		op_csv.write('%s\n'%i['serKeywords'])

采集询盘词

#encoding=utf-8
import sys
reload(sys)
sys.setdefaultencoding( "utf-8" ) 
import requests

op_csv_write=open('p4p.csv','a')
op_csv_write.write('关键词,p4pViews,p4pClicks,views,clicks\n')
for num in range(1,20):
    url='http://hz-mydata.alibaba.com/self/.json?action=CommonAction&iName=getEffectiveProductsAndStats&ctoken=d1uvlnsn7bj3&statisticsType=month&pageNO=%d'%num
    cookies={'Cookie':'你的cookie'}
    html=requests.get(url=url,cookies=cookies)
    datajson=html.json()
    # print datajson
    cc=datajson['value']
    # print cc
    a=cc['products']
    b=a['data']
    # print b
    for i in b:
        f=i['keywordEffect']
        for n in f:
            a0=n['keyword']
            a1=n['p4pViews']
            a2=n['p4pClicks']
            a3=n['views']
            a4=n['clicks']
            print a0,a1,a2,a3,a4
            op_csv_write.write('%s,%s,%s,%s,%s\n'%(a0,a1,a2,a3,a4))

采集热门搜索词

#encoding=utf-8
import requests
import json
from lxml import etree
from multiprocessing.dummy import Pool

import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )

cook={'cookie':'你的cookie'}

# url='http://www2.alibaba.com/manage_ad_keyword.htm'
# html=requests.get(url).content
# print html
op_csv_write=open('aliword.csv','a')
op_csv_write.write('关键词,卖家竞争度,橱窗数,搜索热度7月\n')
for i in xrange(1,501):
    # url='http://hz.my.data.alibaba.com/industry/.json?action=CommonAction&iName=searchKeywords&ctoken=15m0g7t10hi17&keywords=hair&orderBy=srh_pv_this_mon&orderModel=desc&pageNO=%d'%i
    url='http://hz.my.data.alibaba.com/industry/.json?action=CommonAction&iName=searchKeywords&0.6011645244434476&ctoken=1ek_faad2506u&keywords=hair&orderBy=srh_pv_this_mon&orderModel=desc&pageSize=10&pageNO=%d'%i
    html=requests.get(url,cookies=cook).content
    # print html

    jsDict=json.loads(html) #jscontent是需要解析的js代码
    # print jsDict
    jsData=jsDict['value']
    # print jsData
    comments=jsData['data']
    for each in comments:
        # print each
        print '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n'%(each['keywords'],each['company_cnt'],each['showwin_cnt'],each['srh_pv_last_10mon'],each['srh_pv_last_11mon'],each['srh_pv_last_1mon'],each['srh_pv_last_2mon'],each['srh_pv_last_3mon'],each['srh_pv_last_4mon'],each['srh_pv_last_5mon'],each['srh_pv_last_6mon'],each['srh_pv_last_7mon'],each['srh_pv_last_8mon'],each['srh_pv_last_9mon'],each['srh_pv_this_mon'])
        op_csv_write.write('%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n'%(each['keywords'],each['company_cnt'],each['showwin_cnt'],each['srh_pv_last_10mon'],each['srh_pv_last_11mon'],each['srh_pv_last_1mon'],each['srh_pv_last_2mon'],each['srh_pv_last_3mon'],each['srh_pv_last_4mon'],each['srh_pv_last_5mon'],each['srh_pv_last_6mon'],each['srh_pv_last_7mon'],each['srh_pv_last_8mon'],each['srh_pv_last_9mon'],each['srh_pv_this_mon']))

采集p4p竞价词

#encoding=utf-8
import requests
import json

# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')

op_word_csv=open('okey.csv','a')
op_word_csv.write('关键词,状态,搜索量\n')
for n in xrange(1,190):

    k=str(n)
    # print type(c)


    url='http://www2.alibaba.com/asyGetAdKeyword.do?_t=1440205849059&cmd=showTable&ctoken=dnb2amfj9a86&json=%7B%22count%22%3A50%2C%22date%22%3A%227%22%2C%22delayShow%22%3Afalse%2C%22page%22%3A'+k+'%2C%22recStrategy%22%3A1%2C%22recType%22%3A%22recommend%22%2C%22sort%22%3A%22asc%22%2C%22sortKey%22%3A%22keyword%22%2C%22tagId%22%3A%2250191900149%22%7D&_csrf_token_=14z1d7pfbefjg'

    cook={'cookies':'你的cookie'}

    html=requests.post(url=url,cookies=cook).content
    # print html

    json1=json.loads(html)
    # print json1
    jsdata=json1['keywords']
    # print jsdata

    for i in jsdata:
        # print i
        b=i['adKeywordDO']
        c=b['status']
        e=b['word']
        f=i['search']
        print '%s,%s,%s\n'%(e,c,f)
        op_word_csv.write('%s,%s,%s\n'%(e,c,f))

恭喜你很认真的看完了,友情提示:采集百度凤巢也可以这样玩!

发表评论

电子邮件地址不会被公开。