python与seo需求

黄昏

  • 披上神装去打怪刷副本,站着撸,尤其像龙之谷这样需要大量刷副本和巢穴的游戏
  • 再如打王者荣耀,咬牙切齿,不会因为铭文差一点攻击而无法击杀对面英雄
  • 而seo这样无节操的工作,想要快速提出需求,解决问题,提升流量,懂一门脚本语言会更加如鱼得水
  • 当然seo圈很多鲜活的例子证明,只会火车头以及cms也能年入几十万,油费还是很够的
  • 最后学啥语言好,当然是python,理由是什么,学python还需理由?我也不知道在说啥


  • 无细分,不需求,记录一下工作中python在seo中的应用!或者说懂技术的seo能折腾点啥

    遍历同目录下所有xml文件并提取字段,拼接url

    #encoding=utf-8
    import os,re
    
    def panduan(num):
        """Return ``num`` as a string, prefixed with '0' when it is below 10.

        ``num`` may be an int or a numeric string; it is converted with
        ``int()`` first, so any existing leading zeros are dropped before the
        prefix is applied.
        """
        value = int(num)
        text = str(value)
        return text if value >= 10 else '0' + text
    
    
    # Compiled once up front: the pattern is the same for every file.
    # Each match yields (ID, mulu, AddTimes) for one record.
    record_pattern = re.compile(r'<ID>(.*?)</ID>[\s\S]*?<mulu>(.*?)</mulu>[\s\S]*?<AddTimes>(.*?)</AddTimes>')

    for xml in os.listdir('.'):
        # Match on the extension only, so names such as "a.xml.bak" are skipped.
        if not xml.endswith('.xml'):
            continue
        # Read through a context manager so the handle is closed right away.
        with open(xml) as xml_file:
            f = xml_file.read()
        for ID, mulu, add_time in record_pattern.findall(f):
            # Keep the part before the first space and split it on '/'.
            # NOTE(review): assumes AddTimes looks like "YYYY/M/D hh:mm:ss" - confirm.
            years, months, days = add_time.split(' ')[0].split('/')[:3]
            # Zero-pad month and day to build a YYYYMMDD path segment.
            shijianchuo = years + panduan(months) + panduan(days)
            # Format first, then print: identical output on Python 2, and avoids
            # applying % to the result of print() on Python 3.
            print('http://www.xxxxxx.com/%s/%s/ems%s.html' % (mulu, shijianchuo, ID))
    

    定期更新采集

    #coding:utf-8
    import urllib2,re,pycurl,StringIO,sys,lxml,requests,time
    from bs4 import BeautifulSoup
    # Date stamp appended to the status messages printed below.
    str_time = time.strftime('%Y-%m-%d', time.localtime())
    url = 'http://www.pincai.com/sitemap/group1.htm'
    html = requests.get(url).content
    soup = BeautifulSoup(html, "lxml")
    # Captures (href, link text) for each <li> link in the serialized page.
    link_pattern = re.compile(r'<li>.*?href="(.*?)" target="_blank">(.*?)</a></li>')

    # Opening in append mode first creates url.txt when it does not exist yet,
    # so the read below cannot fail on a first run. The context manager makes
    # sure newly written URLs are flushed and the handle is closed.
    with open('url.txt', 'a') as op_txt:
        # URLs recorded by earlier runs, one per line.
        with open('url.txt') as f:
            zidian = set(line.strip() for line in f)

        for url_data, title in link_pattern.findall(str(soup)):
            url_data = url_data.strip()
            # Only links whose text mentions 2017 are of interest.
            if '2017' not in title:
                continue
            print('%s %s' % (title, url_data))
            if url_data in zidian:
                print(u'没有更新' + str_time)
                continue
            print(u'成功更新' + str_time)
            op_txt.write('%s\n' % url_data)
            # Remember it so a link repeated on the same page is written once.
            zidian.add(url_data)
    
    
    # url="http://www.kanzhun.com/k-news/"
    # html=urllib2.urlopen(url).read()
    # #print html
    # for urllist in re.findall('<li><a href="(.*?)">(.*?)</a></li>',html):
    #     #print urllist[0],urllist[1]
    #     if '春节' in urllist[1]:
    #         print urllist[1],urllist[0]
    
    

    v1版本加介词for

    # -*- coding: utf-8 -*-
    #build by bigwayseo.com
    import time
    import sys
    reload(sys)
    sys.setdefaultencoding('utf8')
    
    # Prefix words that NaiveFilter.filter combines with a keyword
    # (as "<prefix> <keyword>") when searching a phrase.
    l=['iphone','samsung','sony','galaxy','xperia']
    # Output file opened in append mode; NaiveFilter.filter writes one
    # processed phrase per line to it.
    op_txt=open('done.txt','a')
    
    class NaiveFilter():
        """Insert the preposition "for" in front of known keywords in a phrase.

        Keywords are loaded with ``parse``; ``filter`` rewrites one phrase and
        appends the result to the module-level ``op_txt`` file. The module-level
        list ``l`` supplies prefix words: when "<prefix> <keyword>" occurs in
        the phrase, "for" is placed in front of the prefix, not between the
        prefix and the keyword.

        This is Python 2 code (it relies on the ``unicode`` builtin).
        """

        def __init__(self):
            self.keywords = set([])

        def parse(self, path):
            """Load keywords from ``path``, one per line, lower-cased.

            Blank lines are ignored: an empty keyword is a substring of every
            phrase and would make ``filter`` insert "for " between all
            characters.
            """
            with open(path) as keyword_file:
                for keyword in keyword_file:
                    keyword = keyword.strip().encode('utf-8').lower()
                    if keyword:
                        self.keywords.add(keyword)

        def filter(self, message):
            """Rewrite ``message`` with "for" before the first matching keyword.

            The phrase is lower-cased. The first keyword (in set iteration
            order) that occurs in it without already being preceded by "for "
            is rewritten, the new phrase is printed, and no further keywords
            are tried. The phrase, rewritten or not, is always written to
            ``op_txt`` followed by a newline. Returns None.
            """
            message = unicode(message).lower()
            for k in self.keywords:
                replss = r"for %s" % k
                if k not in message or replss in message:
                    continue

                # Prefer the longer "<prefix> <keyword>" form when present.
                # Every prefix in ``l`` is tried; previously the loop stopped
                # after the first entry, so only 'iphone' was ever checked.
                target = k
                for i in l:
                    c = i + ' ' + k
                    if c in message:
                        target = c
                        break

                f = r'for %s' % target
                if f in message:
                    # Already reads "for <prefix> <keyword>"; leave it alone.
                    continue

                # Replace the whole matched span so the prefix is not repeated
                # (avoids output such as "iphone for iphone x").
                message = message.replace(target, f)
                print(message)
                break

            op_txt.write('%s\n' % message)
    
    
    if __name__ == '__main__':
        f = NaiveFilter()
        # brands.txt holds the sensitive / unwanted words (the keywords).
        f.parse("brands.txt")
        # word.txt is the list of phrases to be filtered, one per line.
        with open('word.txt') as word_file:
            a = [i.strip() for i in word_file]
        for phrase in a:
            f.filter(phrase)
        # Close the shared output file so everything written is flushed.
        op_txt.close()
    

    下载sitemap的压缩文件

    #encoding=utf-8
    import requests 
    print("downloading with requests")
    for num in range(2, 1018):
        url = 'http://www.xxxxx.com/s/baidu_sitemap%d.txt.gz' % num
        r = requests.get(url)
        # Do not save error pages (404/500 bodies) as if they were archives.
        if r.status_code != 200:
            print("skipped %s (HTTP %d)" % (url, r.status_code))
            continue
        # Raw string keeps the backslashes literal on every Python version.
        # NOTE(review): the URL ends in .txt.gz but the file is saved with a
        # .zip extension - confirm whether that name is intended.
        with open(r"C:\Users\Administrator\Desktop\sitemap\sitemap%d.zip" % num, "wb") as code:
            code.write(r.content)
    

    合并日志文件

    #coding=utf-8
     
    import os
    import sys
    import glob
     
    def dirTxtToLargeTxt(dir,outputFileName):
        '''Append the contents of every *.txt file in ``dir`` to ``outputFileName``.

        Files are processed in sorted name order and copied byte-for-byte.
        The output file is opened in append mode, so existing content is kept.
        If the output file itself is a *.txt inside ``dir`` it is skipped
        rather than copied into itself.

        Returns False (after printing a message) when ``dir`` is not a
        directory, True otherwise.
        '''
        if not os.path.isdir(dir):
            print("传入的参数有错%s不是一个目录" % dir)
            return False

        outputPath = os.path.normcase(os.path.abspath(outputFileName))

        # Binary append matches the binary reads below, so no newline
        # translation or bytes/str mismatch can occur.
        with open(outputFileName, "ab") as outputFile:
            for txtFile in sorted(glob.glob(os.path.join(dir, "*.txt"))):
                # Never feed the output file back into itself.
                if os.path.normcase(os.path.abspath(txtFile)) == outputPath:
                    continue
                print(txtFile)
                with open(txtFile, "rb") as inputFile:
                    for line in inputFile:
                        outputFile.write(line)
        return True
     
    if __name__ =="__main__":
        # Expect two arguments: the source directory and the output file name.
        if len(sys.argv) < 3:
            print("Usage:%s dir outputFileName" % sys.argv[0])
            # Non-zero status so callers can detect the usage error.
            sys.exit(1)
        # Exit status reflects whether the merge ran (False means the first
        # argument was not a directory).
        sys.exit(0 if dirTxtToLargeTxt(sys.argv[1], sys.argv[2]) else 1)
    

    重命名一个目录下所有文件夹下的文件名

    #encoding=utf-8
    import os,sys
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    # path = 'C:\Users\Administrator\Desktop\image\\'
    
    # Raw string keeps the backslashes literal on every Python version.
    root = r'C:\Users\Administrator\Desktop\image'

    for i in os.listdir(root):
        # Folder path with a trailing backslash, e.g. ...\image\<folder>\
        img_dir = '%s\\%s\\' % (root, i)
        # Plain files directly under the root cannot be listed; skip them
        # instead of crashing in os.listdir below.
        if not os.path.isdir(img_dir):
            continue
        # Files are renamed to "<folder>(<n>).jpg", numbered from 1 per folder.
        # NOTE(review): every file gets a .jpg extension regardless of its
        # original one, and an existing target name is not checked - confirm.
        for f, n in enumerate(os.listdir(img_dir), 1):
            pic_name = n.decode('gbk')  # names are decoded as GBK before use
            new_name = img_dir + i + '(%s).jpg' % f
            path = img_dir + pic_name
            print(path)
            os.rename(path, new_name)
            print(u"重命名成功")
    

    关键词去重

    # Collect every stripped line of wen1.txt for fast membership tests.
    # Only membership is needed, so a set replaces the dict-with-counter, and
    # the context manager closes the file.
    with open('wen1.txt') as wen1_file:
        wen1_dict = set(wen1_line.strip() for wen1_line in wen1_file)

    # Print each of "1".."9" that does not appear as a line in wen1.txt.
    for i in range(1, 10):
        i = str(i)
        if i not in wen1_dict:
            print(i)
    

    待续…

    《python与seo需求》上有1条评论

    发表评论

    电子邮件地址不会被公开。