seo小需求python实现

带好装备去打怪刷副本,起码不被秒;继续python与seo那点事
黄昏
遍历同目录下所有xml文件并提取字段,拼接url

#encoding=utf-8
import os,re

def panduan(num):
	"""Zero-pad *num* to at least two digits.

	Accepts an int or a numeric string and returns a string,
	e.g. 5 -> '05', '7' -> '07', 12 -> '12'. Used to build the
	yyyymmdd part of the article URL.
	"""
	# str.zfill covers both branches of the original if/else in one
	# call, and also pads negative values correctly ('-5' stays '-5'
	# instead of becoming '0-5').
	return str(int(num)).zfill(2)


# Scan the current directory for *.xml files, extract the <ID>, <mulu>
# and <AddTimes> fields from each, and print the rebuilt article URL.
# Pattern is compiled once instead of once per file.
pattern = re.compile(
	r'<ID>(.*?)</ID>[\s\S]*?<mulu>(.*?)</mulu>[\s\S]*?<AddTimes>(.*?)</AddTimes>')
for xml in os.listdir('.'):
	# endswith avoids false hits such as 'backup.xml.old'
	if not xml.endswith('.xml'):
		continue
	with open(xml) as fh:  # close the handle instead of leaking it
		content = fh.read()
	for ID, mulu, addtime in pattern.findall(content):
		# AddTimes looks like 'YYYY/M/D hh:mm:ss' -> keep the date part
		date_parts = addtime.split(' ')[0].split('/')
		shijianchuo = date_parts[0] + panduan(date_parts[1]) + panduan(date_parts[2])
		# parenthesised print works under both Python 2 and 3; the
		# original `print ('fmt')%(args)` form is a TypeError on Py3
		print('http://www.studyems.com/%s/%s/ems%s.html' % (mulu, shijianchuo, ID))

定期更新采集

#coding:utf-8
import urllib2,re,pycurl,StringIO,sys,lxml,requests,time
from bs4 import BeautifulSoup
# Append newly published 2017 article URLs from the pincai sitemap to
# url.txt, skipping URLs that are already recorded there.
str_time = time.strftime('%Y-%m-%d', time.localtime())
# Opened in append mode first, so the read below never fails on a
# missing file (same ordering as before).
op_txt = open('url.txt', 'a')
url = 'http://www.pincai.com/sitemap/group1.htm'
html = requests.get(url).content
soup = BeautifulSoup(html, "lxml")

# URLs already collected. A set gives O(1) membership tests; the
# original dict only ever used its keys (the counter value was unused),
# and dict.has_key() no longer exists on Python 3.
seen = set()
with open('url.txt') as f:
    for line in f:
        seen.add(line.strip())

link_re = re.compile(r'<li>.*?href="(.*?)" target="_blank">(.*?)</a></li>')
for url_data, title in link_re.findall(str(soup)):
    url_data = url_data.strip()
    if '2017' not in title:
        continue
    print('%s %s' % (title, url_data))
    if url_data in seen:
        print(u'没有更新' + str_time)
        continue
    print(u'成功更新' + str_time)
    op_txt.write('%s\n' % url_data)

op_txt.close()  # flush the appended URLs; the original leaked the handle


# url="http://www.kanzhun.com/k-news/"
# html=urllib2.urlopen(url).read()
# #print html
# for urllist in re.findall('<li><a href="(.*?)">(.*?)</a></li>',html):
#     #print urllist[0],urllist[1]
#     if '春节' in urllist[1]:
#         print urllist[1],urllist[0]

v1版本加介词for

# -*- coding: utf-8 -*-
#build by bigwayseo.com
import time
import sys
# Python 2 hack: re-import sys to restore setdefaultencoding (deleted by
# site.py at startup) so implicit str<->unicode conversions use UTF-8
# instead of ASCII. Not available / not needed on Python 3.
reload(sys)
sys.setdefaultencoding('utf8')

# Brand words that may precede a keyword; read by NaiveFilter.filter.
l=['iphone','samsung','sony','galaxy','xperia']
# Output file for the rewritten phrases (append mode, never closed).
op_txt=open('done.txt','a')

class NaiveFilter():
    """Rewrite keywords in a message as 'for <keyword>' phrases.

    Keywords are loaded from a file via parse(). filter() rewrites the
    first matching keyword in a message and appends the result to
    done.txt through the module-level file object `op_txt`; it also
    reads the module-level brand list `l`.
    """

    def __init__(self):
        # Lower-cased, UTF-8 encoded keywords to look for.
        self.keywords = set([])

    def parse(self, path):
        """Load one keyword per line from *path* into self.keywords."""
        for keyword in open(path):
            self.keywords.add(keyword.strip().encode('utf-8').lower())
        # print self.keywords

    def filter(self, message):
        """Insert 'for' before the first matching keyword in *message*.

        If '<brand> <keyword>' occurs in the message, the keyword is
        replaced with 'for <brand> <keyword>'; otherwise with
        'for <keyword>'. The (possibly unchanged) message is written to
        op_txt either way.
        """
        # String flag instead of a bool: 'no' means a rewrite happened
        # and the outer keyword loop should stop.
        apple='yes'
        message = unicode(message).lower()
        for k in self.keywords:
            replss=r"for %s"%k
            # Skip keywords that are already preceded by 'for'.
            if k in message and replss not in message:
                for i in l:
                    c=i+' '+k

                    if c not in message:
                        message=message.replace(k, replss)

                    else:
                        f=r'for %s'%c
                        message=message.replace(k, f)
                    print message
                    apple='no'
                    # NOTE(review): this break is unconditional, so only
                    # the first brand in `l` ('iphone') is ever tested —
                    # confirm whether the remaining brands were meant to
                    # be checked too.
                    break


            if apple=='no':
                break

        op_txt.write('%s\n'%message)
        # print message


if __name__ == '__main__':
    nf = NaiveFilter()
    # brands.txt holds the sensitive/unwanted keywords to match.
    nf.parse("brands.txt")
    # word.txt holds the phrases to run through the filter; use a
    # context manager so the handle is closed (the original leaked it).
    with open('word.txt') as fh:
        words = [line.strip() for line in fh]
    # Iterate the list directly instead of indexing via range(len(...)),
    # and avoid shadowing the builtin-looking short name `f`.
    for word in words:
        nf.filter(word)

待续、、、

发表评论

电子邮件地址不会被公开。