【SEO】A Brief Look at Site-Group Content Scraping

Let me start by talking nonsense with a straight face. A one-person site group: spend big on domains, build sites like crazy, pile up templates, scrape in bulk, classify and aggregate, refresh and go live! This post covers the key points of taking scraped content all the way to publication.

1. Keyword mining

See "Batch-Mining Baidu Fengchao Keyword Data with Python" and "Batch-Mining Baidu Dropdown Keywords with Python", plus tools such as 5118.
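
For the dropdown keywords, a minimal sketch might look like the following. It queries Baidu's public suggest endpoint (suggestion.baidu.com/su); the endpoint, its GBK encoding, and the JSONP response shape are assumptions based on how the dropdown behaved at the time, so verify them before relying on this:

# -*- coding: utf-8 -*-
import re
import requests

def baidu_suggest(seed):
    resp = requests.get('https://suggestion.baidu.com/su', params={'wd': seed}, timeout=10)
    resp.encoding = 'gbk'  # the endpoint historically returned GBK
    m = re.search(r's:\[(.*?)\]', resp.text)  # JSONP: window.baidu.sug({q:...,s:[...]})
    return re.findall(r'"(.*?)"', m.group(1)) if m else []

for kw in baidu_suggest(u'站群'):
    print(kw)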

2. Content scraping

Grab anything even loosely related to your sites' content. Unlike a targeted crawler, this approach is not tied to any page template, which is why it is called broad scraping (泛采集). Python already has plenty of ready-made wheels for it:

They take all of two or three lines to use, so read up on them yourself; this blog also has plenty of examples, including using regular expressions to strip HTML while keeping the text. A quick Baidu search also turns up many "Python strip HTML keep text" snippets:
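
As one example of those wheels, here is a quick sketch with the readability-lxml package, which pulls the main content without a per-site template (the URL is a placeholder):

import requests
from readability import Document

html = requests.get('https://www.example.com/some-article', timeout=10).text
doc = Document(html)
print(doc.short_title())  # extracted title
print(doc.summary())      # main-content HTML, template boilerplate stripped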



HTML cleaning

Keep the main tags: p, img

Strip the unimportant attributes from tags

import re

a = re.sub(r'<(?!p|img|/p)[^<>]*?>','',content).strip()  # drop every tag except <p>, <img>, </p>
b = re.sub(r'<p[^>]*?>','<p>',a)  # normalize <p ...> to a bare <p>
newcontent = re.sub(r'alt="[^"]*?"','alt="%s"' % title,b).lower()  # rewrite img alt text to the title
Drop articles with fewer than 100 Chinese characters

text = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,::。?、~@#¥%……&*()“”《》]+".decode("utf8"), "".decode("utf8"),newcontent) 
text2 = re.sub('<[^>]*?>','',text) 
words_number = len(text2) 
Remove junk text

Things like "XXX网小编:XXX" bylines, email addresses, URLs, and so on.
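
A rough sketch of that cleanup; the byline pattern is a hypothetical guess and needs tuning per source:

# -*- coding: utf-8 -*-
import re

def strip_junk(text):
    # editor bylines such as "XX网小编:XXX" -- hypothetical pattern, tune per site
    text = re.sub(u'\S{0,10}网小编[::]\S{0,10}', u'', text)
    # email addresses
    text = re.sub(r'[\w.+-]+@[\w-]+(\.[\w-]+)+', '', text)
    # bare URLs
    text = re.sub(r'(https?://|www\.)\S+', '', text)
    return text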

3. HTTP proxies

Personally I use Abuyun: pricey, but fast, and ready-made integration code is available.

# -*-*-
# Thanks to 『│網亊隨楓︵ (QQ: 332110637)』 for sharing this source code
# -*-*-

#! -*- encoding:utf-8 -*-

from urllib import request

# Target page to request
targetUrl = "https://test.abuyun.com"
# targetUrl = "https://proxy.abuyun.com/switch-ip"
# targetUrl = "https://proxy.abuyun.com/current-ip"

# Proxy server
proxyHost = "http-pro.abuyun.com"
proxyPort = "9010"

# Proxy tunnel credentials
proxyUser = "H01234567890123P"
proxyPass = "0123456789012345"

proxyMeta = "https://%(user)s:%(pass)s@%(host)s:%(port)s" % {
"host" : proxyHost,
"port" : proxyPort,
"user" : proxyUser,
"pass" : proxyPass,
}

proxy_handler = request.ProxyHandler({
    "http"  : proxyMeta,
    "https" : proxyMeta,
})

#auth = request.HTTPBasicAuthHandler()
#opener = request.build_opener(proxy_handler, auth, request.HTTPHandler)

opener = request.build_opener(proxy_handler)

# opener.addheaders = [("Proxy-Switch-Ip", "yes")]
request.install_opener(opener)
resp = request.urlopen(targetUrl).read()

print(resp)
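
If you prefer requests over urllib, the same tunnel should look roughly like this; a sketch reusing the sample credentials above, not Abuyun's official snippet:

import requests

# Same tunnel credentials as the urllib sample above (sample values)
proxyHost, proxyPort = "http-pro.abuyun.com", "9010"
proxyUser, proxyPass = "H01234567890123P", "0123456789012345"

proxy_url = "http://%s:%s@%s:%s" % (proxyUser, proxyPass, proxyHost, proxyPort)
proxies = {"http": proxy_url, "https": proxy_url}

print(requests.get("https://test.abuyun.com", proxies=proxies, timeout=10).text)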

4. Publishing to a CMS

Publishing content to EmpireCMS with Python

#encoding=utf-8
import MySQLdb,time
import sys
reload(sys)
sys.setdefaultencoding('utf8')
 
time.strftime('%Y-%m-%d',time.localtime(time.time()))  # (result unused in this snippet)
 
 
a='2017年1-3月雅思考试报名截止日期'  # sample title
b='教育部考试中心雅思考试报名网站今日公布2017年1-3月雅思考试、用于英国签证及移民的雅思考试、用于英国签证及移民的学术类机考、雅思生活技能类考试的考试报名截止日期、准考证打印日期和成绩单寄送日期。同学们已可以报名2017年雅思考试。具体报名截止日期如下(点击查看2017年1-3月雅思考试时间)'  # sample body
 
title_list=[]
title_list.append(int(time.time()))  # newstime
title_list.append(a)                 # title


content_list = []
content_list.append(b)


try:
    conn=MySQLdb.connect(host='localhost',user='root',passwd='root',db='empirecms',port=3306,charset='utf8')
    cur=conn.cursor()

    # @@IDENTITY is MySQL's per-connection alias for LAST_INSERT_ID(),
    # so every statement below targets the row inserted first
    cur.execute("""INSERT INTO `phome_ecms_news` (`classid`, `userid`, `username`, `newstime`, `havehtml`, `title`, `smalltext`) VALUES ('2', '1', 'admin', %s, '1', %s, '')""",title_list)
    cur.execute("""update `phome_ecms_news` set filename = @@IDENTITY where id = @@IDENTITY""")
    cur.execute("""update `phome_ecms_news` set titleurl = concat('/sanwen/',@@IDENTITY,'.html') where id = @@IDENTITY""")
    cur.execute("""INSERT INTO `phome_ecms_news_data_1` (`id`, `classid`, `dokey`, `newstext`) VALUES (@@IDENTITY, '2', '1', %s); """,content_list)
    cur.execute("""INSERT INTO `phome_ecms_news_index` (`id`, `classid`, `checked`, `havehtml`) VALUES (@@IDENTITY, '2',  '1', '1')""")

    conn.commit()
    cur.close()
    conn.close()
except MySQLdb.Error,e:
    print "Mysql Error %d: %s" % (e.args[0], e.args[1])

Publishing content to DedeCMS (织梦) with Python

# coding:utf-8


import pycurl,StringIO
import sys
reload(sys)
sys.setdefaultencoding('utf8')

headers = [
"User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36",
"Cookie:PHPSESSID=a8p9lr4jksj5kh9k3nhjaa5pu7; DedeUserID=1; DedeUserID__ckMd5=0023ecbe319d14d9; DedeLoginTime=1470919547; DedeLoginTime__ckMd5=b05a3b1e5c4bce22",
"Content-Type: multipart/form-data; boundary=----WebKitFormBoundaryS1tN0Ueh3In2YSN8",
]

def curl(url):
    c = pycurl.Curl()
    c.setopt(pycurl.REFERER, 'https://yourdomain.com/dede/article_add.php?channelid=1')
    c.setopt(pycurl.FOLLOWLOCATION, True)   # follow redirects
    c.setopt(pycurl.MAXREDIRS,5)            # at most 5 redirects
    c.setopt(pycurl.CONNECTTIMEOUT, 60)     # connect timeout
    c.setopt(pycurl.TIMEOUT,120)            # download timeout
    c.setopt(pycurl.ENCODING, 'gzip,deflate')
    # c.setopt(c.PROXY,ip)  # proxy

    c.fp = StringIO.StringIO()
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.HTTPHEADER,headers)
    c.setopt(pycurl.POST, 1)
    c.setopt(pycurl.POSTFIELDS, data)       # data is the global form payload built below
    c.setopt(c.WRITEFUNCTION, c.fp.write)
    c.perform()

    code = c.getinfo(c.HTTP_CODE)   # HTTP status code
    html = c.fp.getvalue()          # response body
    return html

title = 'ITSEO培训怎么样'.encode('gbk','ignore')   # the DedeCMS backend runs in GBK, hence the encode
content = 'TSEO讲师 多年excel实战经验任职某通讯公司seo岗位,长期接触各类... 关注 私信 夜息 ITSEO创始人 ITSEO创始人,原途牛seo负责人。SEO顾问服务过多个行...'.encode('gbk','ignore')

data = open('data.txt').read()  # capture the submitted form with Fiddler and save the raw body locally as data.txt (keep the original encoding); it is used directly as the POST form data
data = data.replace('title@123456',title).replace('content@123456',content)

print curl('https://yourdomain.com/dede/article_add.php')

 

Publishing content to PHPCMS with Python

# coding=utf-8
'''Purpose: scrape article content from Baidu News (https://news.baidu.com/), which aggregates many industry sites and has already de-duplicated and filtered them, making it a good source for your own vertical.
Approach: 1) use a dict to map each site's domain to its extraction regexes and page encoding; 2) log scraped URLs to a file and check it to avoid re-scraping; 3) Baidu News refreshes every 5 minutes, so run this script every few minutes.
'''
import pycurl,StringIO,json,urllib,urllib2,re
import MySQLdb
import time 
from warnings import filterwarnings
import MySQLdb as Database
filterwarnings('ignore', category = Database.Warning) 
import sys
reload(sys)
sys.setdefaultencoding('utf8')
 
 
headers = [
    "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36",
    "Cookie: spversion=20130314; historystock=603158%7C*%7C1A0001%7C*%7C000967%7C*%7C603328; Hm_lvt_78c58f01938e4d85eaf619eae71b4ed1=1467682875,1467682943,1467682974,1468293176; Hm_lpvt_78c58f01938e4d85eaf619eae71b4ed1=1468293226",
]
 
def curl(url):
    c = pycurl.Curl()   # build a curl object
    #c.setopt(pycurl.REFERER, 'https://qy.m.58.com/')    # set the referer
    c.setopt(pycurl.FOLLOWLOCATION, True)   # follow redirects
    c.setopt(pycurl.MAXREDIRS,5)            # at most 5 redirects
    c.setopt(pycurl.CONNECTTIMEOUT, 60)     # connect timeout
    c.setopt(pycurl.TIMEOUT,120)            # download timeout
    c.setopt(pycurl.ENCODING, 'gzip,deflate')
    # c.setopt(c.PROXY,ip)  # proxy
    c.fp = StringIO.StringIO()
    c.setopt(pycurl.URL, url)   # URL to fetch
    c.setopt(pycurl.HTTPHEADER,headers)     # request headers
    # c.setopt(pycurl.POST, 1)
    # c.setopt(pycurl.POSTFIELDS, data)     # POST payload
    c.setopt(c.WRITEFUNCTION, c.fp.write)   # write callback into the string buffer
    c.perform()

    code = c.getinfo(c.HTTP_CODE)   # HTTP status code
    html = c.fp.getvalue()  # response body
    return html
 
# Extract an element with a regex
def search(req,html):
    text = re.search(req,html)
    if text:
        data = text.group(1)
    else:
        data = 'no'
    return data
 
# Clean the content: strip in-article urls, surplus tags, etc.
def content_sort(content):
    content = re.sub('<p.*?>','<p>',content,flags=re.I)
    content = re.sub('</?span.*?>','',content)
    content = re.sub('</?a.*?>','',content)
    content = re.sub('<!.*?>','',content)
    content = re.sub('</?img.*?>','',content,flags=re.I)
    content = re.sub('</?div.*?>','',content,flags=re.I)
    content = re.sub('</?iframe.*?>','',content)
    content = re.sub('</?center.*?>','',content)
    content = re.sub('</?[fF].*?>','',content)
    content = re.sub('<script.*?>[\s\S]*?</script>','',content)
    content = re.sub('</?strong.*?>','',content)
    content = re.sub('<INPUT.*?>','',content,flags=re.I)
    content = re.sub('<style.*?>[\s\S]*?</style>','',content)
    content = re.sub(' ','',content)  # strip stray spaces (likely leftover &nbsp; entities)
    return content
 
# Map: domain -> title/content regexes and page encoding
req_dict = {
'finance.sina.com.cn':
{'title':'<h1.*?>(.*?)</h1>','content':'<!-- 原始正文start -->([\s\S]*?)<!-- 原始正文end -->','decode':'utf-8'},
'stock.eastmoney.com':
{'title':'<h1.*?>(.*?)</h1>','content':'<div id="ContentBody" class="Body">([\s\S]*?)<div class="BodyEnd">','decode':'gbk'},
 
'finance.eastmoney.com':
{'title':'<h1.*?>(.*?)</h1>','content':'<div id="ContentBody" class="Body">([\s\S]*?)<div class="BodyEnd">','decode':'gbk'},#ok
 
'guba.eastmoney.com':
{'title':'<title>(.*?)_.*?</title>','content':'<div id="zwconbody">([\s\S]*?)<div class="zwconbtns clearfix">','decode':'utf-8'},#ok
'stock.jrj.com.cn':
{'title':'<title>(.*?)-','content':'<div class="texttit_m1">([\s\S]*?)<div id="itougu">','decode':'gbk'},
'hk.jrj.com.cn':
{'title':'<title>(.*?)-','content':'<div class="texttit_m1">([\s\S]*?)<div id="itougu">','decode':'gbk'},
'hkstock.cnfol.com':
{'title':'<title>(.*?)_.*?</title>','content':'<div class="ArtM" id="Content">([\s\S]*?)<!--正文结束-->','decode':'utf-8'},#ok
'sc.stock.cnfol.com':
{'title':'<title>(.*?)_.*?</title>','content':'<div class="ArtM" id="Content">([\s\S]*?)<!--正文结束-->','decode':'utf-8'},#ok
'money.163.com':
{'title':'<title>(.*?)_.*?</title>','content':'<div class="post_text".*?">([\s\S]*?)<!--.*?s -->','decode':'utf-8'},
 
'www.chinastock.com.cn':
{'title':'<div class="d_title">([\s\S]*?)</div>','content':'<div class="d_content" id="Zoom">([\s\S]*?)<div class="dleft_new_attachment">','decode':'utf-8'},
'stock.huagu.com':
{'title':'<h1 id="h1-title">([\s\S]*?)</h1>','content':'<div class="article_con" id="div-article-content">([\s\S]*?)<div class="clear"></div>','decode':'utf-8'},
'stock.sohu.com':
{'title':'<h1 itemprop="headline">([\s\S]*?)</h1>','content':'<div itemprop="articleBody">([\s\S]*?)<div class="original-title"','decode':'gbk'},
'stock.cngold.org':
{'title':'<title>(.*?)-.*?</title>','content':'<div class="det_content" id="zoom">([\s\S]*?)<div class="listPage">','decode':'utf-8'},
'hk.stock.hexun.com':
{'title':'<title>(.*?)[-_|].*?</title>','content':'<div class="art_contextBox">([\s\S]*?)<div class="showAll">','decode':'utf-8'},
'stock.gucheng.com':
{'title':'<title>(.*?)[-_|].*?</title>','content':'<div class="content">([\s\S]*?)</div>','decode':'utf-8'},
'www.cnstock.com':
{'title':'<title>(.*?)-.*?</title>','content':'<div class="content-inner" id="qmt_content_div">([\s\S]*?)</div>','decode':'gbk'},
'www.ccstock.cn':
{'title':'<title>(.*?)-.*?</title>','content':'<div id="newscontent">([\s\S]*?)</div>','decode':'utf-8'},
'news.emoney.cn':
{'title':'<title>(.*?)-.*?</title>','content':'<div class="RL_details_content">([\s\S]*?)<div class="PageNav">','decode':'utf-8'},
 
'finance.ce.cn':
{'title':'<title>(.*?)</title>','content':'<div class=TRS_Editor>([\s\S]*?)<textarea id="allinfo"','decode':'gbk'},
 
'www.p5w.net':
{'title':'<title>(.*?)[_-|].*?</title>','content':'<div class="text">([\s\S]*?)<div class="pages">','decode':'gbk'},
 
'www.nbd.com.cn':
{'title':'<title>(.*?)[_-|][\s\S]*?</title>','content':'<div class="main-left-article">([\s\S]*?)<div style="overflow:','decode':'utf-8'},
 
'stock.hexun.com':
{'title':'<title>(.*?)[-_|].*?</title>','content':'<div class="art_contextBox">([\s\S]*?)<div class="showAll">','decode':'gbk'},
 
'stock.caijing.com.cn':
{'title':'<title>(.*?)[-_|].*?</title>','content':'<div id="the_content".*?>([\s\S]*?)<div class="ar_writer"','decode':'utf-8'},
}
 
 
def id():
    '''Get the id the next post will take, to build its URL. I use phpcms, which needs the URL written into the database for the front end.'''
    con = MySQLdb.connect('localhost','root','','phpcmsv9',charset='utf8')
    with con:
        cur = con.cursor()
        cur.execute("select id from v9_news where title = title")  # counts every row; next id = rowcount + 1
        numrows = int(cur.rowcount)
        return numrows+1
 
def CmsSQL(title,content):
    '''Write the post into the phpcms tables'''
    value1 = []
    value1.append(content)
    value1.append(idnum)

    value2 = []
    value2.append(title)
    value2.append(urlid)
    value2.append(int(time.time()))  # inputtime
    value2.append(int(time.time()))  # updatetime

    db = MySQLdb.connect('localhost','root','','phpcmsv9',charset='utf8')
    cursor = db.cursor()
    cursor.execute("insert into v9_news_data (content,id) values(%s,%s)" ,value1)
    cursor.execute("insert into v9_news(title,catid,typeid,url,inputtime,updatetime) values(%s,6,0,%s,%s,%s)",value2)

    db.commit()
    db.close()
 
 
url = 'https://news.baidu.com/n?cmd=4&class=gegu&tn=rss'
urls = re.findall(r'<link><!\[CDATA\[(.*?)\]\]></link>',curl(url))  # article urls out of the RSS feed
urls.reverse()  # oldest first
for url in urls:
    with open('urls.txt') as f1:
        if url not in f1.read(): # skip urls we have scraped before
            url = url.strip()
            line = url.split('/')[2]    # the domain
            if req_dict.has_key(line): # do we have regexes written for this site?
                time.sleep(1)
                try:
                    title = search(req_dict[line]['title'],curl(url)).decode(req_dict[line]['decode']) # look up the site's regexes and encoding
                    content = url + search(req_dict[line]['content'],curl(url)).decode(req_dict[line]['decode'])
                except:
                    continue
                urlid = 'https://localhost/index.php?m=content&c=index&a=show&catid=6&id=%s' %id()
                idnum = int(id())
                print id(),content_sort(title)
                CmsSQL(content_sort(title),content_sort(content))
                f1w =open('urls.txt','a+')
                f1w.write(url+'\n')
                f1w.close()
            else:
                print u'no regexes for this domain'
                open('requrl','a+').write(url+'\n')
        else:
            print u'url already scraped'

Publishing content to WordPress with Python

#encoding=utf-8
'''A wp blog is good practice material for scraping; this script scrapes a wp blog as its source'''
import re,requests,time,random,urllib,threading,threadpool
from wordpress_xmlrpc import Client, WordPressPost
from wordpress_xmlrpc.methods.posts import GetPosts, NewPost
 
'''Log in'''
try:
    wp=Client('https://www.example.com/xmlrpc.php','wp的账号','wp的密码')
except Exception, e:
    wp=Client('https://www.example.com/xmlrpc.php','wp的账号','wp的密码')
post=WordPressPost()
 
 
'''Guard against re-scraping urls from the same site'''
f=open('url.txt','a+')
urls=f.read()
url_list=[m.strip() for m in open('url.txt').readlines()]
daili_list=[]
 
 
'''Filter out HTML tags'''
def filter_tags(htmlstr):
    re_cdata=re.compile('//<!\[CDATA\[[^>]*//\]\]>',re.I) # CDATA blocks
    re_script=re.compile('<\s*script[^>]*>[^<]*<\s*/\s*script\s*>',re.I) # scripts
    re_style=re.compile('<\s*style[^>]*>[^<]*<\s*/\s*style\s*>',re.I) # styles
    re_br=re.compile('<br\s*?/?>') # line breaks
    re_h=re.compile('</?\w+[^>]*>') # HTML tags
    re_comment=re.compile('<!--[^>]*-->') # HTML comments
    s=re_cdata.sub('',htmlstr) # drop CDATA
    s=re_script.sub('',s) # drop scripts
    s=re_style.sub('',s) # drop styles
    s=re_br.sub('\n',s) # turn <br> into newlines
    s=re_h.sub('',s) # drop HTML tags
    s=re_comment.sub('',s) # drop HTML comments
    blank_line=re.compile('\n+') # collapse extra blank lines
    s=blank_line.sub('\n',s)
    return s
 
'''Rotate user-agents'''
def getUA():
    uaList = [
    'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
    'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
    'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
    'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)',
    'Mozilla/5.0+(Windows+NT+6.1;+rv:11.0)+Gecko/20100101+Firefox/11.0',
    'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+SV1)',
    'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+GTB7.1;+.NET+CLR+2.0.50727)',
    'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+KB974489)',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    ]
    ua = random.choice(uaList)
    return ua
 
'''Regex extraction helper'''
def search(re_url,html):
    re_Data=re.findall(re_url,html)
    if re_Data:
        return re_Data[0]
    else:
        return 'no'
 
'''Rotate proxy IPs (re-reads daili.txt on each call)'''
def ip():
    for x in open('daili.txt'):
        x = x.strip()
        daili_list.append(x)
    newip = random.choice(daili_list)
    return newip
 
 
'''Fetch html'''
def gethtml(url,headers):
    while 1:
        try:
            newip=ip()
            proxies={"http":"http://%s"%newip.strip()}
            pages=requests.get(url,headers=headers,proxies=proxies,timeout=10)
            html=pages.content
            code=pages.status_code
            if '404' in html or '302 Found' in html or code != 200:
                print u'proxy failed, retrying'
                continue
            elif 'verify' in html:
                print u'hit a captcha, retrying'
                continue
            else:
                return html
        except Exception, e:
            # print e
            continue
 
'''Regex to pull article urls from the list pages; adjust for the target site'''
re_url=re.compile(r'<a href="(https://www\.example\.com/.*?\d+\.html)"')
 
'''Regex to pull the title and body content from article pages; the readability module is an alternative. Adjust for the target site.'''
re_title_content=re.compile(r'<h1 class="entry-title">(.*?)</h1>[\s\S]*?<div class="entry-content">([\s\S]*?)<div class="clear">')
 
 
 
'''Auto-publish the articles to wordpress via the wordpress-xmlrpc module'''
def getData(url):
    headers={'User-Agent':'%s'%getUA(),}  
    mutex.acquire()
    html=gethtml(url,headers)
    re_Data=re.findall(re_url,html)
     
    for i in re_Data:
        i=i.strip()
        if i not in url_list and i not in urls:
            page=gethtml(i,headers)
            page_Data=re.findall(re_title_content,page)
            for n in page_Data:
                # print type(n)
                try:
                    title=n[0]
                    content=filter_tags(n[1])
                except:
                    title=0
                    content=0
            if title and content:
                print title,content
                '''Publish to wp'''
                # post.title=title
                # post.content=content
                # post.post_status = 'publish'
                # wp.call(NewPost(post))
 
                url_list.append(i)
                 
                f.writelines(i+'\n')
 
                print 'Updates'
            else:
                pass
        else:
            print 'Noposts updates'
            continue      
    mutex.release()
 
 
def now_time(url):
    '''thread worker: each job handles the single list page it was given'''
    getData(url)
 
url_list = []  # list pages to crawl (getData also appends newly scraped article urls here)
for line in range(1,12):
    line = 'https://www.example.com/page/%d'%line
    url_list.append(line.strip())
 
mutex = threading.Lock()
pool = threadpool.ThreadPool(3)
reqs = threadpool.makeRequests(now_time, url_list)
[pool.putRequest(req) for req in reqs]
pool.wait()
