seo应该都知道搜索结果中的网站地址在2013年前后就加密了。通过数据抓包我们可以很清楚地看到这条链接是302跳转链接,而302跳转的响应头(header)中都会带一条Location参数,Location的值就是真实url。
获取百度前76页搜索结果的真实地址,以新浪为例
# -*- coding: utf-8 -*-
"""Resolve the real landing URLs behind Baidu's encrypted result links.

Baidu SERP links are 302 redirects; the real URL is carried in the
``Location`` header of the response.  This script walks the first 76
result pages (pn=0..750 step 10) for the query, sends a HEAD request to
every result link and appends each newly seen real URL to ``urls.txt``.

Fixes over the original: Python 3 syntax (``print()``,
``urllib.parse.quote_plus``), the output file is closed via ``with``,
a missing ``Location`` header no longer raises ``KeyError``, HEAD
requests get a timeout, and dedup uses an O(1) set instead of a list.
"""
import re
from urllib.parse import quote_plus

import requests

word = 'site:ent.sina.com.cn inurl:s/m'
# Matches the encrypted redirect links on a Baidu result page.
re_url = re.compile(r'href = "(https://www\.baidu\.com/link\?url=.*?)"')
seen = set()

with open('urls.txt', 'a+') as op_txt:
    for pn in range(0, 760, 10):
        baidu_url = 'https://www.baidu.com/s?wd=%s&pn=%d' % (quote_plus(word), pn)
        html = requests.get(url=baidu_url, timeout=20).text
        for url in re_url.findall(html):
            url = url.strip()
            # The real URL lives in the 302 response's Location header;
            # .get() avoids a KeyError when the header is absent.
            really_url = requests.head(url, timeout=20).headers.get('location')
            if really_url and really_url not in seen:
                seen.add(really_url)
                op_txt.write(really_url + '\n')
                print(really_url)
利用获取到的百度真实地址,对新浪内容进行采集
# -*- coding: utf-8 -*-
"""Scrape articles from the resolved Baidu URLs and publish them to a
WordPress blog over XML-RPC.

Reads candidate URLs from ``urls.txt``, extracts the main article with
readability, strips unwanted HTML, and publishes every article longer
than 100 characters that is not already recorded in ``done_url.txt``.

Fixes over the original: Python 3 syntax; ``requests.get`` now passes
``headers=`` as a keyword (positionally it was ``params``); the broken
chained comparison ``code != 200 in html`` tests the status code alone;
URLs are stripped of their trailing newline so the done-list check can
actually match; ``done_url.txt`` is opened for append and ``.write``
(the original misspelled ``.wirte`` on a read-only handle).
"""
import random
import re

import requests
from readability.readability import Document
from wordpress_xmlrpc import Client, WordPressPost
from wordpress_xmlrpc.methods.posts import NewPost

# XML-RPC endpoint of the target blog (credentials are placeholders).
wp = Client('https://www.example.com/xmlrpc.php', '账号', '密码')


def filter_tags(htmlstr):
    """Strip CDATA sections, scripts, styles, comments and every HTML
    tag except <br>/<img>, collapse blank lines and decode entities."""
    re_cdata = re.compile(r'//<!\[CDATA\[[^>]*//\]\]>', re.I)
    re_script = re.compile(r'<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I)
    re_style = re.compile(r'<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I)
    re_h = re.compile(r'<(?!\/?BR|\/?IMG)[^<>]*>', re.I)  # all tags but br/img
    re_comment = re.compile(r'<!--[^>]*-->')
    s = re_cdata.sub('', htmlstr)
    s = re_script.sub('', s)
    s = re_style.sub('', s)
    s = re_h.sub('', s)
    s = re_comment.sub('', s)
    s = re.sub(r'\n+', '\n', s)  # collapse consecutive blank lines
    return replaceCharEntity(s)


def replaceCharEntity(htmlstr):
    """Replace common named/numeric HTML character entities with their
    literal characters; unknown entities are removed."""
    CHAR_ENTITIES = {
        'nbsp': ' ', '160': ' ',
        'lt': '<', '60': '<',
        'gt': '>', '62': '>',
        'amp': '&', '38': '&',
        'quot': '"', '34': '"',
    }
    re_charEntity = re.compile(r'&#?(?P<name>\w+);')
    sz = re_charEntity.search(htmlstr)
    while sz:
        key = sz.group('name')  # entity name without the '&'/';' wrapper
        try:
            htmlstr = re_charEntity.sub(CHAR_ENTITIES[key], htmlstr, 1)
        except KeyError:
            # Unknown entity: drop it entirely.
            htmlstr = re_charEntity.sub('', htmlstr, 1)
        sz = re_charEntity.search(htmlstr)
    return htmlstr


def getUA():
    """Return a random User-Agent string to vary the request fingerprint."""
    uaList = [
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
        'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)',
        'Mozilla/5.0+(Windows+NT+6.1;+rv:11.0)+Gecko/20100101+Firefox/11.0',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+SV1)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+GTB7.1;+.NET+CLR+2.0.50727)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+KB974489)',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    ]
    return random.choice(uaList)


def gethtml(url, headers):
    """Fetch *url* and return the page body as text.

    Retries forever on network errors or non-200/interstitial
    responses, matching the original best-effort behaviour.
    """
    while True:
        try:
            # BUG FIX: headers must be a keyword argument — the second
            # positional parameter of requests.get is `params`.
            pages = requests.get(url, headers=headers, timeout=10)
            html = pages.text
            # BUG FIX: the original `code != 200 in html` was a chained
            # comparison; test the status code by itself.
            if '302 Found' in html or pages.status_code != 200:
                print('代理失效重试')
                continue
            return html
        except requests.RequestException:
            continue


def number(readable_article):
    """Count the visible characters of *readable_article* after
    removing scripts, tags and Chinese/English punctuation."""
    a = re.sub(r'<script[\s\S]*?</script>| ', '', readable_article).strip()
    b = re.sub(r'<(?!p|img|/p|br|iframe)[^<>]*?>', '', a).strip()
    # Python 3 strings are unicode already — no .decode("utf8") needed.
    text = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,::。?、~@#¥%……&*()“”《》]+", "", b)
    text2 = re.sub('<[^>]*?>', '', text)  # drop any remaining tags
    return len(text2)


# Already-posted URLs; a missing done_url.txt simply means nothing yet.
try:
    with open('done_url.txt') as done_f:
        url_list = [line.strip() for line in done_f]
except FileNotFoundError:
    url_list = []

with open('urls.txt', 'r') as f:
    for url in f:
        # BUG FIX: strip the trailing newline — the original compared
        # 'url\n' against stripped entries, so duplicates were reposted.
        url = url.strip()
        headers = {'User-Agent': getUA()}
        html = gethtml(url, headers)
        doc = Document(html)
        readable_article = doc.summary()
        readable_title = doc.short_title()
        if number(readable_article) > 100 and url not in url_list:
            post = WordPressPost()  # fresh post object per article
            post.title = readable_title
            post.content = filter_tags(readable_article)
            post.post_status = 'publish'
            try:
                wp.call(NewPost(post))
                print('Updates')
                url_list.append(url)
                # BUG FIX: original did open('done_url.txt').wirte(...)
                # — misspelled .write on a read-only handle.
                with open('done_url.txt', 'a') as done:
                    done.write(url + '\n')
            except Exception:
                print('Noposts updates')