无细分,不需求。这里记录一下工作中 Python 在 SEO 中的应用,或者说,懂技术的 SEO 能折腾点啥。
遍历同目录下所有xml文件并提取字段,拼接url
#encoding=utf-8
import os,re


# One record: <ID>, then <mulu>, then <AddTimes>, in that order.
# Compiled once at import time instead of once per file.
RECORD_RE = re.compile(r'<ID>(.*?)</ID>[\s\S]*?<mulu>(.*?)</mulu>[\s\S]*?<AddTimes>(.*?)</AddTimes>')
URL_TEMPLATE = 'https://www.xxxxxx.com/%s/%s/ems%s.html'


def panduan(num):
    """Return ``num`` as a string, prefixed with one '0' when it is below 10.

    ``num`` is converted with ``int`` first, so 7, '7' and '07' all give '07'.
    """
    num = int(num)
    if num < 10:
        return '0' + str(num)
    return str(num)


def extract_urls(text):
    """Yield one URL for every <ID>/<mulu>/<AddTimes> record in ``text``.

    The AddTimes value is split on the first space and then on '/'; the three
    leading parts are used as year, month and day (month and day are
    zero-padded through ``panduan``). A value with fewer than three
    '/'-separated parts raises IndexError, as the original script did.
    """
    for ID, mulu, add_times in RECORD_RE.findall(text):
        date_parts = add_times.split(' ')[0].split('/')
        years, months, days = date_parts[0], date_parts[1], date_parts[2]
        shijianchuo = years + panduan(months) + panduan(days)
        yield URL_TEMPLATE % (mulu, shijianchuo, ID)


def main():
    """Print the URL of every record in every .xml file of the current directory."""
    for xml in os.listdir('.'):
        # endswith() instead of a substring test, so 'a.xml.bak' is not parsed.
        if not xml.endswith('.xml'):
            continue
        # The context manager closes the handle even if parsing fails.
        with open(xml) as f:
            text = f.read()
        for url in extract_urls(text):
            print(url)


if __name__ == '__main__':
    main()
定期更新采集
#coding:utf-8
import urllib2,re,pycurl,StringIO,sys,lxml,requests,time
from bs4 import BeautifulSoup


SITEMAP_URL = 'https://www.pincai.com/sitemap/group1.htm'
URL_FILE = 'url.txt'
# Captures (href, link text) of every <li> entry on the sitemap page.
LINK_RE = re.compile(r'<li>.*?href="(.*?)" target="_blank">(.*?)</a></li>')


def main():
    """Append sitemap links not yet listed in url.txt to url.txt.

    Only links whose title contains '2017' are considered. For each of them
    the title and URL are printed, followed by a "no update" or "updated"
    message carrying today's date.

    NOTE(review): the script was recovered from a single collapsed line; the
    known-URL check is assumed to be nested under the '2017' filter -- confirm.
    """
    str_time = time.strftime('%Y-%m-%d', time.localtime())
    html = requests.get(SITEMAP_URL).content
    soup = BeautifulSoup(html, "lxml")

    # Opening in append mode first guarantees url.txt exists before it is
    # read; both handles are closed by the context managers.
    with open(URL_FILE, 'a') as op_txt:
        with open(URL_FILE) as f:
            zidian = set(line.strip() for line in f)

        for url_data, title in LINK_RE.findall(str(soup)):
            url_data = url_data.strip()
            if '2017' not in title:
                continue
            print('%s %s' % (title, url_data))
            if url_data in zidian:
                print(u'没有更新' + str_time)
                continue
            print(u'成功更新' + str_time)
            op_txt.write('%s\n' % url_data)
            # Remember it so a link repeated on the page is written only once.
            zidian.add(url_data)


if __name__ == '__main__':
    main()

# Alternative source, kept for reference (disabled in the original script):
# url="https://www.kanzhun.com/k-news/"
# html=urllib2.urlopen(url).read()
# #print html
# for urllist in re.findall('<li><a href="(.*?)">(.*?)</a></li>',html):
#     #print urllist[0],urllist[1]
#     if '春节' in urllist[1]:
#         print urllist[1],urllist[0]
v1 版本:给关键词加上介词 for
# -*- coding: utf-8 -*-
#build by bigwayseo.com
import time
import sys
reload(sys)
sys.setdefaultencoding('utf8')

# Brand names that may directly precede a keyword in a message.
l=['iphone','samsung','sony','galaxy','xperia']
# Every filtered message is appended here; the __main__ section closes it.
op_txt=open('done.txt','a')


class NaiveFilter():
    """Insert the preposition 'for' in front of known keywords in a message."""

    def __init__(self):
        self.keywords = set()

    def parse(self, path):
        """Load keywords from ``path``, one per line, stripped and lower-cased.

        Blank lines are skipped: an empty keyword is a substring of every
        message and would make ``filter`` insert 'for ' between all characters.
        """
        with open(path) as keyword_file:
            for keyword in keyword_file:
                keyword = keyword.strip().encode('utf-8').lower()
                if keyword:
                    self.keywords.add(keyword)

    def filter(self, message):
        """Rewrite ``message`` with 'for ' before a keyword and append it to done.txt.

        The message is lower-cased. The first keyword (in set iteration order)
        that occurs in the message without already appearing as 'for <keyword>'
        is rewritten, the result is printed, and no further keyword is tried.
        The message, rewritten or not, is always written to ``op_txt``.

        NOTE(review): this logic was recovered from a single collapsed line and
        is kept as reconstructed. The brand loop stops after its first item, so
        only l[0] is ever checked, and the brand branch replaces the keyword
        rather than '<brand> <keyword>' -- confirm both are intended.
        """
        message = unicode(message).lower()
        for k in self.keywords:
            replss = r"for %s" % k
            if k not in message or replss in message:
                continue
            handled = False
            for i in l:
                c = i + ' ' + k
                if c not in message:
                    message = message.replace(k, replss)
                else:
                    message = message.replace(k, r'for %s' % c)
                print(message)
                handled = True
                break
            if handled:
                break
        op_txt.write('%s\n' % message)


if __name__ == '__main__':
    f = NaiveFilter()
    f.parse("brands.txt")  # brands.txt holds the sensitive or unwanted words
    # word.txt is the word list to be filtered
    with open('word.txt') as word_file:
        a = [i.strip() for i in word_file]
    try:
        for word in a:
            f.filter(word)
    finally:
        # Flush and release done.txt even if a message fails to process.
        op_txt.close()
下载sitemap的压缩文件
#encoding=utf-8
import requests


URL_PATTERN = 'https://www.xxxxx.com/s/baidu_sitemap%d.txt.gz'
# Raw string: the backslashes are literal path separators. Written as a plain
# literal, '\U' is a syntax error on Python 3.
# NOTE(review): the payload is requested as .txt.gz but saved with a .zip
# extension; the name is kept as in the original script -- confirm it is wanted.
SAVE_PATTERN = r"C:\Users\Administrator\Desktop\sitemap\sitemap%d.zip"


def main():
    """Download sitemap archives number 2 through 1017 to the desktop folder."""
    print("downloading with requests")
    for num in range(2,1018):
        url = URL_PATTERN % num
        r = requests.get(url)
        if r.status_code != 200:
            # Do not store an error page as if it were an archive.
            print("skipped %s (HTTP %d)" % (url, r.status_code))
            continue
        with open(SAVE_PATTERN % num, "wb") as code:
            code.write(r.content)


if __name__ == '__main__':
    main()
合并日志文件
#coding=utf-8
import os
import sys
import glob


def dirTxtToLargeTxt(dir,outputFileName):
    '''Append the content of every *.txt file in ``dir`` to ``outputFileName``.

    Args:
        dir: directory that is searched (non-recursively) for ``*.txt`` files.
        outputFileName: file the content is appended to; created if missing.

    Returns:
        False (after printing a message) when ``dir`` is not a directory,
        True otherwise.
    '''
    # Bail out when dir is not a directory.
    if not os.path.isdir(dir):
        print("传入的参数有错%s不是一个目录" % dir)
        return False

    outputPath = os.path.normcase(os.path.abspath(outputFileName))
    # Binary append: the inputs are read as bytes, so the output must not
    # apply text-mode newline translation (and must accept bytes on Python 3).
    with open(outputFileName, "ab") as outputFile:
        for txtFile in glob.glob(os.path.join(dir, "*.txt")):
            # The output may itself live in dir; reading it while appending
            # to it would copy the merged data back into itself.
            if os.path.normcase(os.path.abspath(txtFile)) == outputPath:
                continue
            print(txtFile)
            with open(txtFile, "rb") as inputFile:
                for line in inputFile:
                    outputFile.write(line)
    return True


if __name__ =="__main__":
    if len(sys.argv) < 3:
        print("Usage:%s dir outputFileName" % sys.argv[0])
        sys.exit()
    dirTxtToLargeTxt(sys.argv[1],sys.argv[2])
批量重命名一个目录下所有子文件夹中的文件
#encoding=utf-8
import os,sys
reload(sys)
sys.setdefaultencoding('utf-8')

# Raw string: the backslashes are literal path separators. Written as a plain
# literal, '\U' is a syntax error on Python 3.
IMAGE_ROOT = r'C:\Users\Administrator\Desktop\image'


def main():
    """Rename every file in each sub-folder of IMAGE_ROOT to '<folder>(<n>).jpg'.

    ``n`` starts at 1 in every sub-folder and follows os.listdir() order.

    NOTE(review): the script was recovered from a single collapsed line; the
    success message is assumed to be printed once per renamed file -- confirm.
    """
    for i in os.listdir(IMAGE_ROOT):
        img_dir = IMAGE_ROOT + '\\' + i + '\\'
        # Plain files directly under the root have nothing to list.
        if not os.path.isdir(img_dir):
            continue
        f = 1
        for n in os.listdir(img_dir):
            # presumably listdir returns GBK-encoded byte names on the
            # author's Windows setup -- verify before running elsewhere
            pic_name = n.decode('gbk')
            new_name = img_dir + i + '(%s).jpg' % f
            path = img_dir + pic_name
            print(path)
            os.rename(path, new_name)
            f += 1
            print(u"重命名成功")


if __name__ == '__main__':
    main()
关键词去重
def load_keywords(path):
    """Return the set of whitespace-stripped lines of the file at ``path``."""
    # A set is enough: the original dict only ever answered membership tests.
    with open(path) as wen1_file:
        return set(wen1_line.strip() for wen1_line in wen1_file)


def find_missing(keywords, candidates):
    """Return, as strings and in order, the ``candidates`` absent from ``keywords``."""
    return [str(i) for i in candidates if str(i) not in keywords]


if __name__ == '__main__':
    # Print each number from 1 to 9 that does not appear as a line of wen1.txt.
    wen1_set = load_keywords('wen1.txt')
    for i in find_missing(wen1_set, range(1, 10)):
        print(i)
待续…
合格的 SEOer 需要具备多方面的知识积累。