“破解”百度搜索结果中的加密地址&采集

做seo的应该都知道，百度搜索结果中的网站地址在2013年左右就加密了。通过数据抓包我们可以很清楚地看到这条链接是一个302跳转链接，而302响应的head中都会带一条Location参数，Location的值就是真实url。
获取百度76页搜索结果页真实地址,以新浪为例

#encoding=utf-8
import requests,re,time,random,urllib

# Search term: restrict results to Sina entertainment article URLs.
word='site:ent.sina.com.cn inurl:s/m'
seen=[]  # real URLs already written, used to skip duplicates
op_txt=open('urls.txt','a+')

# Baidu paginates 10 results per page; pn=0..750 covers 76 pages.
for i in range(0,760,10):
    baidu_url='http://www.baidu.com/s?wd=%s&pn=%d'%(urllib.quote_plus(word),i)

    urlfile=requests.get(url=baidu_url,timeout=20).content
    # Each result links through Baidu's encrypted redirector URL.
    re_url=re.compile(r'href = "(http://www\.baidu\.com/link\?url=.*?)"')
    url_list=re.findall(re_url,urlfile)
    for url in url_list:
        url=url.strip()
        # HEAD request: the redirector answers 302 and puts the real URL
        # in the Location header (requests.head does not follow redirects).
        # BUG FIX: added a timeout so a dead redirector cannot hang the run.
        header=requests.head(url,timeout=20).headers
        # BUG FIX: use .get() -- a response without a Location header used
        # to raise KeyError and abort the whole crawl.
        really_url=header.get('location')
        if really_url and really_url not in seen:
            seen.append(really_url)
            op_txt.write(really_url+'\n')
            print(really_url)
op_txt.close()

利用获取到的百度真实地址对新浪进行采集

#encoding=utf-8
import requests,re,time,random
from readability.readability import Document
from wordpress_xmlrpc import Client, WordPressPost
from wordpress_xmlrpc.methods.posts import GetPosts, NewPost

try:
    # Connect to the WordPress XML-RPC endpoint (account/password
    # placeholders; the real credentials go here).
    wp=Client('http://www.example.com/xmlrpc.php','账号','密码')
except:
    # Retry exactly once on any failure (e.g. a transient network error).
    # NOTE(review): a second failure still propagates; consider a real
    # retry loop with backoff instead of this duplicated call.
    wp=Client('http://www.example.com/xmlrpc.php','账号','密码')

# Single post object reused for every article published in the loop below.
post=WordPressPost()

def filter_tags(htmlstr):
    """Strip an HTML string down to plain text.

    Removes CDATA sections, <script> and <style> blocks, HTML comments
    and every tag except <br>/<img>, collapses blank lines, then decodes
    character entities via replaceCharEntity.
    """
    re_cdata=re.compile('//<!\[CDATA\[[^>]*//\]\]>',re.I)  # CDATA sections
    re_script=re.compile('<\s*script[^>]*>[^<]*<\s*/\s*script\s*>',re.I)  # <script> blocks
    re_style=re.compile('<\s*style[^>]*>[^<]*<\s*/\s*style\s*>',re.I)  # <style> blocks
    # BUG FIX: this definition was accidentally swallowed into the trailing
    # comment of the line above; restored as code (still unused -- the
    # <br>-to-newline substitution below stays disabled, as before).
    re_br=re.compile('<br\s*?/?>')
    # Every tag except <br>/<img> (opening or closing), case-insensitive.
    re_h=re.compile(r'<(?!\/?BR|\/?IMG)[^<>]*>',re.I)
    re_comment=re.compile('<!--[^>]*-->')  # HTML comments
    s=re_cdata.sub('',htmlstr)   # drop CDATA
    s=re_script.sub('',s)        # drop scripts
    s=re_style.sub('',s)         # drop styles
    # s=re_br.sub('\n',s)        # optionally convert <br> to newlines
    s=re_h.sub('',s)             # drop remaining tags
    s=re_comment.sub('',s)       # drop comments
    blank_line=re.compile('\n+') # collapse runs of blank lines
    s=blank_line.sub('\n',s)
    s=replaceCharEntity(s)       # decode &nbsp; &lt; etc.
    return s


def replaceCharEntity(htmlstr):
    """Replace HTML character entities with their literal characters.

    Handles the named/numeric entities in CHAR_ENTITIES (e.g. &lt; and
    &#60; both become '<'); unrecognised entities are dropped.  Returns
    the fully substituted string.
    """
    CHAR_ENTITIES={
        'nbsp':' ',
        '160':' ',
        'lt':'<',
        '60':'<',
        'gt':'>',
        '62':'>',
        'amp':'&',
        '38':'&',
        'quot':'"',
        '34':'"',
    }
    re_charEntity=re.compile(r'&#?(?P<name>\w+);')

    sz=re_charEntity.search(htmlstr)
    while sz:
        key=sz.group('name')  # entity name without the &...; wrapper, e.g. 'gt'
        try:
            htmlstr=re_charEntity.sub(CHAR_ENTITIES[key],htmlstr,1)
        except KeyError:
            # Unknown entity: drop it rather than leave the raw text.
            htmlstr=re_charEntity.sub('',htmlstr,1)
        sz=re_charEntity.search(htmlstr)
    # BUG FIX: the original `return` sat INSIDE the while loop, so only
    # the first entity was ever replaced -- and an input with no entity
    # at all skipped the loop and returned None.
    return htmlstr




def getUA():
    """Return one browser User-Agent string picked at random.

    Used to vary the UA header between requests so the crawler looks
    less like a single automated client.
    """
    ua_pool = (
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
        'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)',
        'Mozilla/5.0+(Windows+NT+6.1;+rv:11.0)+Gecko/20100101+Firefox/11.0',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+SV1)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+GTB7.1;+.NET+CLR+2.0.50727)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+KB974489)',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    )
    return random.choice(ua_pool)

def gethtml(url,headers):
    """Fetch url and return the response body.

    headers: dict of HTTP headers (e.g. a random User-Agent).
    Retries indefinitely on exceptions, non-200 status codes, or bodies
    that look like a proxy's bare '302 Found' interception page.
    """
    while 1:
        try:
            # newip=ip()
            # proxies={"http":"http://%s"%newip.strip()}
            # BUG FIX: headers must be a keyword argument; the original
            # passed it as the second positional argument, which
            # requests.get treats as `params` (query string), so the UA
            # header was never sent.
            pages=requests.get(url,headers=headers,timeout=10)
            html=pages.content
            code=pages.status_code
            # BUG FIX: the original `code != 200 in html` was a broken
            # chained comparison ((code != 200) and (200 in html)) that
            # raised TypeError whenever the status differed from 200.
            if code != 200 or '302 Found' in html:
                print(u'代理失效重试')  # proxy failed, retry
                continue
            else:
                return html
        except Exception:
            # Brief pause so persistent failures don't spin a hot loop.
            time.sleep(1)
            continue

def number(readable_article):
    a = re.sub(r'<script[\s\S]*?</script>|&#13;','',readable_article).strip()
    b = re.sub(r'<(?!p|img|/p|br|iframe)[^<>]*?>','',a).strip()
    text = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,::。?、~@#¥%……&*()“”《》]+".decode("utf8"), "".decode("utf8"),b)  #去除中英文标点符号
    text2 = re.sub('<[^>]*?>','',text)  #去除所有标签
    words_number = len(text2)
    return int(words_number)



# URLs already posted, one per line; reruns skip published articles.
url_list=[i.strip() for i in open('done_url.txt').readlines()]
# BUG FIX: keep one append-mode handle for recording successes; the
# original did open('done_url.txt').wirte(...) -- a typo AND the file
# was opened read-only, so finished URLs were never persisted.
done_file=open('done_url.txt','a')


with open('urls.txt','r') as f:
    for url in f:
        # BUG FIX: strip the trailing newline -- without it the dedup
        # check against url_list (whose entries ARE stripped) never
        # matched and articles were republished on every run.
        url=url.strip()
        if not url or url in url_list:
            continue
        headers={'User-Agent':'%s'%getUA(),}
        html=gethtml(url,headers)
        readable_article = Document(html).summary()
        readable_title = Document(html).short_title()
        # print readable_title
        num = number(readable_article)
        # Only publish articles with more than 100 visible characters.
        if num > 100:
            post.title=readable_title
            post.content=filter_tags(readable_article)
            post.post_status = 'publish'
            print('helloworld')
            try:
                wp.call(NewPost(post))
                print('Updates')
                url_list.append(url)
                done_file.write(url+'\n')
            except:
                print('Noposts updates')
                continue
done_file.close()

发表评论

电子邮件地址不会被公开。