Cleaning a keyword library with Python packs in quite a lot, so take your time with it. The scripts below cover many practical Python idioms, ideas, and patterns; if you learn to extrapolate from them, they will already be a big help for building SEO tooling. You stop depending entirely on engineers and can go both wider and deeper in SEO thinking and SEO technique.
1. Create first.py
# coding:utf-8
'''Step 1: filter the Fengchao (Baidu Phoenix Nest) keyword export. Drop keywords that
contain spaces, keep only those with searches > 5 that contain the root word, and write
the result to fcword.txt.
Pre-process the Fengchao expansion export as needed (remove words totally unrelated to
the root word), then save it as two columns: "word,searches".
inputfile == Fengchao keyword export (csv)
cigen    == root word the keywords must contain, usually the name of the channel going live
'''
import csv,sys

inputfile,cigen = sys.argv[1:3]
csv.field_size_limit(sys.maxsize)
reader = csv.reader(file(inputfile,'rb'))
csvfile = open('fcword.txt','wb')
word_list = []

print ">>> Filtering Fengchao keywords ......................................"
for line in reader:            # read each Fengchao keyword and its search volume
    try:
        word = line[0]
        searches = line[1]
        # keep words without spaces or '的' that contain the channel root word and have searches > 5
        if ' ' not in word and '的' not in word and cigen in word and int(searches) > 5:
            word_list.append((word,searches))
    except:
        continue
print ">>> Keyword filtering finished ......................................"

print ">>> Writing the filtered keywords to fcword.txt ......................................"
writer = csv.writer(csvfile,dialect='excel')
for word,searches in set(word_list):   # de-duplicate before writing
    writer.writerow([word,searches])
print ">>> fcword.txt written ......................................"
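first.py takes two positional arguments. A usage sketch, assuming the Fengchao export is saved as fengchao.csv and the channel root word is 网络营销 (both values are just examples):

python first.py fengchao.csv 网络营销

This produces fcword.txt with one "word,searches" row per keyword that survived the filter.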
2. Create two.py
#coding:utf-8
'''Step 2: fetch the live (published) titles and URLs from the site database.
Originally this pulled only entries containing the root word; after the bug fix noted
below, the queries pull all channels.'''
import MySQLdb,sys,csv

reload(sys)                            # reload sys so setdefaultencoding is available
sys.setdefaultencoding("utf-8")        # avoid the usual ASCII encode/decode errors

cigen,py = sys.argv[1:3]               # py is accepted for consistency with the calling script
csvfile1 = open('sqldata1.csv','wb')   # topic (zhuanti) pages
csvfile2 = open('sqldata2.csv','wb')   # category pages
csvfile3 = open('sqldata3.csv','wb')   # detail (article) pages

# open the database connection (fill in the {ip}/{user}/{password}/{database} placeholders)
db = MySQLdb.connect("{ip}","{user}","{password}","{database}",charset="utf8")
# get an operation cursor with cursor()
cursor = db.cursor()

## bug fix: widen the match scope to all channels
sql_z = 'select title,url from v9_zhuanti;'
sql_l = 'select catname,url from v9_category;'
sql_d = 'select title,url from v9_news'

print ">>> Fetching %s channel keyword data from MySQL ......................................" % cigen

print ">>> Fetching topic keywords ......................................"
cursor.execute(sql_z)
results = cursor.fetchall()
nz = 0
writer = csv.writer(csvfile1,dialect='excel')
for row in results:
    zhuanti_name = row[0]
    zhuanti_url = row[1]
    nz += 1
    writer.writerow([zhuanti_name,'https://www.domain.com/%s' % zhuanti_url])

print ">>> Fetching category keywords ......................................"
cursor.execute(sql_l)
results = cursor.fetchall()
nl = 0
writer = csv.writer(csvfile2,dialect='excel')
for row in results:
    lanmu_name = row[0]
    lanmu_url = row[1]
    nl += 1
    writer.writerow([lanmu_name,'https://www.domain.com/%s' % lanmu_url])

print ">>> Fetching detail-page keywords ......................................"
cursor.execute(sql_d)
results = cursor.fetchall()
nd = 0
writer = csv.writer(csvfile3,dialect='excel')
for row in results:
    detail_name = row[0]
    detail_url = row[1]
    nd += 1
    writer.writerow([detail_name,'https://www.domain.com/%s' % detail_url])

print ">>> Categories fetched: %s" % str(nl)
print ">>> Topics fetched: %s" % str(nz)
print ">>> Details fetched: %s" % str(nd)

# close the database connection
db.close()
print ">>> MySQL connection closed ......................................"
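Before running two.py, replace the {ip}/{user}/{password}/{database} placeholders in MySQLdb.connect with your own connection details, and www.domain.com with your own domain. A usage sketch with example arguments (root word and its pinyin):

python two.py 网络营销 wangluoyingxiao

It writes sqldata1.csv, sqldata2.csv and sqldata3.csv (topic, category and detail pages), which step 3 merges into hebing.csv.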
3. Create three.py
#coding:utf-8
'''Step 3: compare the Fengchao keywords with the live keywords. Exact matches go to
pipei_word.txt (to be de-duplicated in step 4); unmatched keywords go to nopipei_word.txt
(to be segmented and checked for relevance so a suitable landing page can be picked).'''
import csv,os,sys

inputfile,cigen,py = sys.argv[1:4]
os.system("python first.py %s %s" % (inputfile,cigen))   # step 1: filter the Fengchao keywords
os.system("python two.py %s %s" % (cigen,py))            # step 2: export the live keywords
os.system("cat sqldata*.csv > hebing.csv")               # merge the three exports
os.system("rm sqldata*.csv")

csv.field_size_limit(sys.maxsize)
f = open('fcword.txt','r')
p = open('pipei_word.txt','w')        # exact matches
np = open('nopipei_word.txt','w')     # unmatched keywords

print ">>> Comparing Fengchao keywords with the live keywords ......................................"
w = 0
n = 0
for term in f:                                   # read the filtered Fengchao keywords
    term = term.strip()
    word = term.split(',')[0]
    searches = term.split(',')[1]
    panding = 'no'                               # match flag; set to the word itself on an exact match
    reader = csv.reader(file('hebing.csv','rb'))
    for line in reader:
        word_name = line[0]
        word_url = line[1]
        if word == word_name:                    # exact match -> pipei_word.txt
            panding = word
            p.write("%s,%s,%s\n" % (panding,searches,word_url))
            n += 1
        else:
            continue
    if panding == 'no':                          # no match -> nopipei_word.txt
        np.write("%s,%s\n" % (word,searches))
        w += 1

os.system("rm hebing.csv")
print ">>> Comparison finished ......................................"
print ">>> Exact matches: %s" % str(n)
print ">>> Unmatched keywords: %s" % str(w)
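three.py chains steps 1 and 2, so it takes all three arguments. A usage sketch with the same example values as above:

python three.py fengchao.csv 网络营销 wangluoyingxiao

When it finishes, pipei_word.txt holds "word,searches,url" rows for the exact matches and nopipei_word.txt holds "word,searches" rows for the rest.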
4. Create four.py
#coding:utf-8
'''Step 4: de-duplicate the exact matches in pipei_word.txt. When one keyword matched
several pages, keep the URL whose article body has the most characters.'''
import sys,os,pycurl,StringIO,random,re,threading
from bs4 import BeautifulSoup as bs

inputfile,cigen,py = sys.argv[1:4]
os.system("python three.py %s %s %s" % (inputfile,cigen,py))   # run step 3 first

f = open('pipei_word.txt','r')
fd = open('pipei_word_1.txt','w')

def getUA():
    # pick a random user agent for each request
    uaList = [
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
        'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)',
        'Mozilla/5.0+(Windows+NT+6.1;+rv:11.0)+Gecko/20100101+Firefox/11.0',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+SV1)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+GTB7.1;+.NET+CLR+2.0.50727)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+KB974489)',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36'
    ]
    ua = random.choice(uaList)
    return ua

headers = [
    "Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding:gzip, deflate, sdch",
    "Accept-Language:zh-CN,zh;q=0.8,en;q=0.6",
    "Connection:keep-alive",
    "Host:www.domain.com",
    "RA-Sid:7739A016-20140918-030243-3adabf-48f828",
    "RA-Ver:2.8.9",
    "User-Agent:%s" % getUA()
]

def getHtml(url,headers):
    # fetch a page with pycurl, retrying up to 10 times
    x = 0
    while x < 10:
        x += 1
        try:
            c = pycurl.Curl()
            c.setopt(pycurl.MAXREDIRS,5)
            c.setopt(pycurl.REFERER, url)
            c.setopt(pycurl.FOLLOWLOCATION, True)
            c.setopt(pycurl.CONNECTTIMEOUT, 60)
            c.setopt(pycurl.TIMEOUT,120)
            c.setopt(pycurl.ENCODING,'gzip,deflate')
            #c.setopt(c.PROXY,ip)
            c.fp = StringIO.StringIO()
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.HTTPHEADER,headers)
            c.setopt(c.WRITEFUNCTION, c.fp.write)
            c.perform()
            #code = c.getinfo(c.HTTP_CODE)  # would return the status code
            content = c.fp.getvalue()
            # infoencode = chardet.detect(content).get('encoding','utf-8')
            # html = content.decode(infoencode,'ignore').encode(code)
            return content
        except:
            print "Request failed, retrying >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
            continue

url_list = []
for line in f:
    line = line.strip()
    url_list.append(line)

print ">>> De-duplicating keywords matched to several pages; keeping the URL with the longest article body ......................................"

# crawler worker thread (adapted from a generic template)
class getPic(threading.Thread):
    def __init__(self,url_list):
        threading.Thread.__init__(self)
        self.url_list = url_list
        self.timeout = 5

    # the actual work happens here; adapt as needed
    def downloadimg(self):
        for line in self.url_list:
            url = line.split(',')[2]
            html = getHtml(url,headers)
            if html and '<div id="Article">' in html:
                # character count of the article body with html tags stripped
                number = len(re.sub('<[^>]*?>','',str((bs(html)).find('div',{'id':'Article'}))))
                newline = '%s,%s' % (line,number)
                fd.write('%s\n' % newline)
            else:
                newline = '%s,%s' % (line,'0')
                fd.write('%s\n' % newline)

    def run(self):
        self.downloadimg()

if __name__ == "__main__":
    getPicThreads = []
    # start 5 threads, each working on one fifth of url_list
    for i in range(5):
        t = getPic(url_list[((len(url_list)+4)/5) * i:((len(url_list)+4)/5) * (i+1)])
        getPicThreads.append(t)
    for i in range(len(getPicThreads)):
        getPicThreads[i].start()
    for i in range(len(getPicThreads)):
        getPicThreads[i].join()

print ">>> Done"
print ">>> Merging result files"
os.system("rm pipei_word.txt")
os.system("mv pipei_word_1.txt pipei_word.txt")
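Note that four.py only appends the article length as a fourth column; the actual "keep the longest page per keyword" selection still has to be applied to pipei_word.txt. A minimal sketch of that last step, assuming the rows are "word,searches,url,length" and writing to a hypothetical pipei_word_final.txt:

#coding:utf-8
# for each keyword, keep the row whose article length is largest
best = {}
for line in open('pipei_word.txt'):
    line = line.strip()
    if not line:
        continue
    word, searches, url, length = line.split(',')[:4]
    if word not in best or int(length) > int(best[word].split(',')[3]):
        best[word] = line

out = open('pipei_word_final.txt','w')
for line in best.values():
    out.write('%s\n' % line)
out.close()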
5. Create five.py
#coding:utf-8
'''Step 5:
1. Run the remaining unmatched Fengchao keywords through the on-site search, extract and
compute the number of search results, the full-word recall count and the main-word recall
count, and use these metrics to decide whether each keyword should get a new topic (list)
page or be matched to the title of a similar detail page.
PS: the number of newly created topic pages needs to be capped.
'''
import sys,os,time,pycurl,StringIO,random,re,threading,urllib

cigen,py = sys.argv[1:3]
f = open('nopipei_word.txt','r')
zt = open('新增列表词.txt','w')      # keywords that should get a new topic/list page
wjg = open('无结果词.txt','w')       # keywords with no usable search results
xgt = open('detail匹配词.txt','w')   # keywords matched to an existing detail page

def getUA():
    # pick a random user agent for each request
    uaList = [
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
        'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)',
        'Mozilla/5.0+(Windows+NT+6.1;+rv:11.0)+Gecko/20100101+Firefox/11.0',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+SV1)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+GTB7.1;+.NET+CLR+2.0.50727)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+KB974489)',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36'
    ]
    ua = random.choice(uaList)
    return ua

headers = [
    "Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding:gzip, deflate, sdch",
    "Accept-Language:zh-CN,zh;q=0.8,en;q=0.6",
    "Connection:keep-alive",
    "Host:www.domain.com",
    "RA-Sid:7739A016-20140918-030243-3adabf-48f828",
    "RA-Ver:2.8.9",
    "User-Agent:%s" % getUA()
]

def getHtml(url,headers):
    # fetch a page with pycurl, retrying up to 10 times
    x = 0
    while x < 10:
        x += 1
        try:
            c = pycurl.Curl()
            c.setopt(pycurl.MAXREDIRS,5)
            c.setopt(pycurl.REFERER, url)
            c.setopt(pycurl.FOLLOWLOCATION, True)
            c.setopt(pycurl.CONNECTTIMEOUT, 60)
            c.setopt(pycurl.TIMEOUT,120)
            c.setopt(pycurl.ENCODING,'gzip,deflate')
            #c.setopt(c.PROXY,ip)
            c.fp = StringIO.StringIO()
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.HTTPHEADER,headers)
            c.setopt(c.WRITEFUNCTION, c.fp.write)
            c.perform()
            #code = c.getinfo(c.HTTP_CODE)  # would return the status code
            content = c.fp.getvalue()
            # infoencode = chardet.detect(content).get('encoding','utf-8')
            # html = content.decode(infoencode,'ignore').encode(code)
            return content
        except:
            print "Request failed, retrying >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
            time.sleep(3)
            continue

def search(req,html):
    # return the first regex capture group, or 'no' when there is no match
    text = re.search(req,html)
    if text:
        data = text.group(1)
    else:
        data = 'no'
    return data

print ">>> Building the on-site search urls ......................................"
url_list = []
for line in f:
    line = line.strip()
    url = '{query_url}=%s' % urllib.quote(line.split(',')[0])   # fill in your site-search url placeholder
    newline = '%s,%s' % (line,url)
    url_list.append(newline)
print ">>> Done"
print ">>> Deciding whether each keyword gets a new topic page or is matched to a similar detail-page title ......................................"

detail_word_list = []

# crawler worker thread (adapted from a generic template)
class getPic(threading.Thread):
    def __init__(self,url_list):
        threading.Thread.__init__(self)
        self.url_list = url_list
        self.timeout = 5

    # the actual work happens here; adapt as needed
    def downloadimg(self):
        for line in self.url_list:
            word = line.split(',')[0]
            searches = line.split(',')[1]
            url = line.split(',')[2]
            html = getHtml(url,headers)
            if not html:                         # skip keywords whose search page could not be fetched
                continue
            #jieguo = re.sub('\(.*?\)|&[^;]*?;','',search('<b>分词结果:</b>(.*?)<hr>',html)).replace('-','')
            panding = re.sub(cigen,'',word)      # the keyword with the root word removed
            n = 0                                # main-word recall: result titles containing the remainder
            m = 0                                # full-word recall: result titles containing the whole keyword
            title_list = re.findall('<a[^>]*?>(.*?)</a>',html)   # titles of the search results
            for title in title_list:
                # e.g. for '网络营销策划书' with root word '网络营销', count titles containing the
                # whole keyword and titles containing the remainder '策划书'
                if word in title:
                    m += 1
                if panding in title:
                    n += 1
            # recall ratio of the full keyword (computed but not used further below)
            if m == 0 or n == 0:
                ratio = '0'
            else:
                ratio = str(format(float(int(m))/float(int(n)),'.0%')).replace('%','')
            number = search('b>结果数量:</b>(\d+)&',html)   # number of search results for the keyword
            if number == 'no':                   # guard: treat a missing result count as zero
                number = '0'
            if int(number) >= 10 and n > 5:      # condition for creating a topic page
                zt.write("%s,%s\n" % (word,searches))
                #print word,searches,number
            else:
                if number == '0':                # keywords without search results
                    #print word,searches,number
                    wjg.write("%s,%s\n" % (word,searches))
                else:
                    if int(searches) > 70 and int(number) >= 10:   # >=10 results and searches > 70 also get a topic page
                        zt.write("%s,%s\n" % (word,searches))
                        #print word,searches,number
                    else:
                        # detail page whose title should be changed to target this keyword
                        detail = search(r"href='(https://www.domain.com/%s/.*?)'" % py,html)
                        if detail not in detail_word_list:
                            #print word,searches,number,detail
                            if detail != 'no':
                                #print word,searches,number
                                xgt.write("%s,%s,%s\n" % (word,searches,detail))
                                #print word,searches,detail,number
                                detail_word_list.append(detail)
                            else:
                                wjg.write("%s,%s\n" % (word,searches))

    def run(self):
        self.downloadimg()

if __name__ == "__main__":
    getPicThreads = []
    # start 3 threads, each working on one third of url_list
    for i in range(3):
        t = getPic(url_list[((len(url_list)+2)/3) * i:((len(url_list)+2)/3) * (i+1)])
        getPicThreads.append(t)
    for i in range(len(getPicThreads)):
        getPicThreads[i].start()
    for i in range(len(getPicThreads)):
        getPicThreads[i].join()
    print '>>> Done'
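five.py is normally launched from six.py, but it can also be run on its own once nopipei_word.txt exists and the {query_url} placeholder has been replaced with your site-search URL. A usage sketch with the example root word and pinyin used earlier:

python five.py 网络营销 wangluoyingxiao

It splits the unmatched keywords into 新增列表词.txt (new topic/list pages), detail匹配词.txt (existing detail pages whose titles should be adjusted) and 无结果词.txt (no usable result).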
6. Create six.py
#coding:utf-8
'''Step 6: main entry point of the keyword-cleaning pipeline. Runs all sub-scripts and summarises the result files.'''
import sys,os

inputfile,cigen,py = sys.argv[1:4]
os.system("python four.py %s %s %s" % (inputfile,cigen,py))   # steps 1-4: filter, export, match, de-duplicate
os.system("python five.py %s %s" % (cigen,py))                # step 5: classify the unmatched keywords

print ">>> Statistics:"
print "New topic/list keywords:"
os.system("cat 新增列表词.txt|wc -l")
print "Detail-page match keywords:"
os.system("cat detail匹配词.txt|wc -l")
print "No-result keywords:"
os.system("cat 无结果词.txt|wc -l")
os.system("rm nopipei_word.txt")
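The whole pipeline is then a single command. A usage sketch with the example arguments used throughout (Fengchao export, root word, its pinyin):

python six.py fengchao.csv 网络营销 wangluoyingxiao

The line counts it prints at the end give a quick sanity check of how the Fengchao keywords were distributed across the three result files.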