Using the Search Engine to Remove Irrelevant Keywords

SEO people who have wrestled with large keyword libraries know the usual mining channels well: Baidu Fengchao, off-the-shelf tools, rule-based generation, and so on. All of them produce large numbers of irrelevant, off-topic keywords, and removing those becomes a real headache.

Here is a neat trick. Say we are expanding food keywords: first search Baidu for the seed term 美食 (food) and collect the sites that make up the bulk of its SERP; then run each expanded keyword through the search and check whether most of the sites on its SERP are food sites. If they are, the keyword is food-related; if not, remove it. (Baidu itself reportedly uses its own search results for some of this kind of classification work internally, though that remains to be verified.)

The upside first: the whole thing can be scripted, so it is fast, hands-off, and works in bulk. The downside is that in some cases overly long-tail keywords, or keywords the site library doesn't cover, hit only a handful of sites and get deleted by mistake; how badly depends on your industry. The remedy is to inspect the computed results by hand and keep adjusting the cleaning rules; you can't expect a single script to solve this perfectly. I once spent more than an hour manually sampling results to tune the rules, back when the keyword library was quite large.
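
Conceptually, the test boils down to counting how many of a keyword's SERP domains also appear in the site library built from the seed keyword, and keeping the keyword only if the count clears a threshold. A minimal sketch (the names and example domains here are illustrative, not from the scripts below; the threshold is exactly the "rule" you end up tuning):

def is_relevant(serp_domains, site_library, min_hits=3):
    # count how many of this keyword's SERP domains hit the site library
    hits = sum(1 for d in serp_domains if d in site_library)
    return hits >= min_hits

# hypothetical site library built from the seed keyword's SERP
site_library = set(['meishij.net', 'xiachufang.com', 'douguo.com'])
print(is_relevant(['xiachufang.com', 'douguo.com', 'zhihu.com'], site_library, min_hits=2))
# True: two of the keyword's SERP domains are known food sites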

The Python implementation is simple: one script scrapes the SERPs, another does the counting. The steps:

1. Scrape the seed keyword's SERP and collect the main domains of its sites into a site library

2. Scrape the SERP of each expanded keyword and extract the main domains

3. For each keyword, count how many of its domains hit the site library

4. Tune the rules and filter the keywords

In steps 1 and 2, raise Baidu's rn URL parameter (the number of results shown, 50 at most) as high as possible so each SERP yields more sites and the scoring is more accurate.

For example:

https://www.baidu.com/s?wd=1&rn=50&pn=10
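
If one page is not enough, you can also page through results with the pn parameter, which (as the example URL suggests) is the offset of the first result. A sketch in Python 2, to match the scripts below:

import urllib

word = '美食'
urls = ['https://www.baidu.com/s?wd=%s&rn=50&pn=%d' % (urllib.quote_plus(word), pn)
        for pn in (0, 50, 100)]  # three pages of 50 results each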

On to the code (the scripts were written a long time ago, so take them as an illustration of the idea; I'll update them when I get the time).
1. Scraping the Baidu SERP

# coding:utf-8

import urllib,re,threading,random,pycurl,StringIO,time,csv
from bs4 import BeautifulSoup as bs

csvfile = open('serp_html.csv','wb') # csv output holding the scraped Baidu data; the analysis script below reads it

writer = csv.writer(csvfile,dialect='excel')
#TIME = 0.3

daili_list = [] # list holding the proxy IPs

# randomly pick one proxy IP from daili.txt
def ip():
    for x in open('daili.txt','r'):
        daili_list.append(x.strip())
    newip = random.choice(daili_list)
    daili_list[:] = [] # empty the list so IPs are not appended twice on the next call
    return newip

# pick a random User-Agent
def getUA():
    uaList = [
    'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
    'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
    'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
    'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)',
    'Mozilla/5.0+(Windows+NT+6.1;+rv:11.0)+Gecko/20100101+Firefox/11.0',
    'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+SV1)',
    'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+GTB7.1;+.NET+CLR+2.0.50727)',
    'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+KB974489)',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
    ]
    headers = random.choice(uaList)
    return headers

# fetch the page source; takes the url and the request headers
def getHtml(url,headers):
    while 1:
        try:
            newip = ip()
            c = pycurl.Curl()
            c.setopt(pycurl.MAXREDIRS,2)
            c.setopt(pycurl.REFERER, url)
            #c.setopt(pycurl.FOLLOWLOCATION, True)
            c.setopt(pycurl.CONNECTTIMEOUT, 10)
            c.setopt(pycurl.TIMEOUT,15)
            c.setopt(pycurl.ENCODING,'gzip,deflate')
            c.setopt(c.PROXY,newip)
            c.fp = StringIO.StringIO()
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.HTTPHEADER,headers)
            c.setopt(c.WRITEFUNCTION, c.fp.write)
            c.perform()
            code = c.getinfo(c.HTTP_CODE)
            html = c.fp.getvalue()
            if '="https://verify.baidu.com' in html:
                print 'captcha page, retrying'
                continue
            elif '302 Found' in html or code != 200:
                print 'proxy failed, retrying'
                continue
            else:
                return html

        except Exception, e:
            print e
            continue

# the work each thread performs
def getInfo(word):
    url = 'https://www.baidu.com/s?wd=%s&rn=50' % urllib.quote_plus(word) # build the SERP url for getHtml()

    # build the request headers
    headers = [
        "Host: www.baidu.com",
        "Connection: keep-alive",
        "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Upgrade-Insecure-Requests: 1",
        "User-Agent: %s" % getUA(),
        "Accept-Encoding: gzip, deflate, sdch",
        "Accept-Language: zh-CN,zh;q=0.8"
    ]

    html = getHtml(url,headers)

    mutex.acquire() # take the lock: the csv writer is shared across threads

    soup = bs(html, 'html.parser') # specify the parser explicitly

    tag = soup.find_all('a', {'class':'c-showurl'}) # display-url anchors on the SERP

    result = str(tag) # keep the raw markup; domains are extracted later

    writer.writerow([word, result])

    print '>> fetched: %s' % word

    mutex.release() # release the lock

def getRange(l, r):
    for i in url_list[l:r]:
        getInfo(i)

url_list = []
for line in open('/home/wwwroot/ftpuser/cashwar_seo/seo_scrip/kw_score/jingpin_fc_kw_养殖_new.txt','r'):
    url_list.append(line.strip())

totalThread = 50 # number of worker threads
gap = (len(url_list) - 1) / totalThread # keywords per thread (integer division)
mutex = threading.Lock() # lock serializing the shared csv writer

for i in range(0, len(url_list), gap): # start at 0 so the first keyword is not skipped
    t = threading.Thread(target=getRange, args=(i, i+gap,))
    t.start()
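
The script above stores the raw c-showurl markup per keyword; turning that into main domains can be sketched as below. This is a hypothetical helper, an in-Python alternative to the grep/sed pipeline commented out at the end of the analysis script, and its "last two labels" cut is a simplification that breaks on suffixes like .com.cn, one more rule you may have to adjust:

import re

def extract_domains(showurl_markup):
    # match host-like tokens such as www.douguo.com or m.xiachufang.com
    hosts = re.findall(r'(?:[\w-]+\.)+[a-zA-Z]{2,}', showurl_markup)
    # crude main-domain cut: keep the last two dot-separated labels
    return set('.'.join(h.split('.')[-2:]) for h in hosts)

print(extract_domains('[<a class="c-showurl">www.douguo.com/caipu/</a>]'))
# set(['douguo.com'])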

2. Analysis

#coding:utf-8

import csv,re,os

def main():
    csvfile = open('/home/wwwroot/ftpuser/cashwar_seo/seo_scrip/kw_score/serp_html.csv','rb')
    reader = csv.reader(csvfile)
    csvFinish = open('/home/wwwroot/ftpuser/cashwar_seo/seo_scrip/kw_score/result_finish.csv','wb')
    writer = csv.writer(csvFinish,dialect='excel')

    # load the site library once instead of re-reading the file for every keyword
    site_list = [url.strip() for url in open('/home/wwwroot/ftpuser/cashwar_seo/seo_scrip/kw_score/行业网站库.txt','r')]

    for line in reader:
        word = line[0]
        html = line[1]
        print ">> COUNTING: %s" % word

        # score = how many library domains appear in this keyword's SERP markup
        score = 0
        for newurl in site_list:
            # plain substring test; re.search on an unescaped domain would
            # treat the dots as wildcards and over-count
            if newurl in html:
                score += 1

        writer.writerow([word, score])

#os.system('python /home/wwwroot/ftpuser/cashwar_seo/seo_scrip/kw_score/baidu_data.py') # scrape the Baidu data first
main()

# Shell post-processing (when the query contains Latin letters, Baidu bolds the
# matched part of the displayed URLs, so sed strips the <b> tags first)
#os.system("cat result_finish.csv | awk -F, '{print $2}' | grep -v \"no\" | sed 's/<b>//g; s/<\/b>//g' | grep -Po \"(\w+(\-\w+)*\.){1,4}\w{1,}\" >> competitive_cache.txt")
#os.system("cat competitive_cache.txt | sort | uniq -c | sort -nr > competitive_count.txt")
#os.system("rm competitive_cache.txt")
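
Finally, step 4: filter result_finish.csv by a score threshold. A minimal sketch; THRESHOLD = 5 and the output filename are made-up starting points, to be tuned against manual samples as described above:

import csv

THRESHOLD = 5  # hypothetical cutoff; adjust after sampling the results

with open('result_finish.csv', 'rb') as f:  # 'rb' to match the Python 2 writer
    kept = [row[0] for row in csv.reader(f) if row and int(row[1]) >= THRESHOLD]

with open('keywords_clean.txt', 'w') as out:
    out.write('\n'.join(kept))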

(Note: the underlying idea comes from an article published by @zero that I can no longer find; the concrete implementation here is mine.)
Source: www.cashwar.cn
