《Python多线程代理查百度收录》简说:
1,对比requests和pycurl两个爬虫模块,代码已经敲好,大家可回家自行折腾
2,没有跑很多数据来测试程序的稳定性,因为本来就有脚本在服务器上跑了,懒
3,懒,就折腾了一下队列Queue,多线程,面向对象,日后自行完善
4,功能包括轮换ip,cookies,ua,多线程等渣渣(注意:代码里代理ip和cookie的轮换默认是注释掉的,实际启用的只有ua轮换和多线程,需要的话自行打开)
5,如有不能使用的,先确认是在Python 2环境下运行(脚本用的是Python 2语法),再考虑换个运行环境,实在不行自己一边玩去
#coding: utf-8 import requests,threading,Queue,random,re,time,pycurl,StringIO,urllib class CheckStatus(threading.Thread): def __init__(self, queue): super(CheckStatus, self).__init__() self.queue = queue self.op_txt=open('no_index.txt','a') self.daili_list=[] def getUA(self): uaList = [ 'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)', 'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)', 'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1', 'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)', 'Mozilla/5.0+(Windows+NT+6.1;+rv:11.0)+Gecko/20100101+Firefox/11.0', 'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+SV1)', 'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+GTB7.1;+.NET+CLR+2.0.50727)', 'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+KB974489)', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36', 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36', "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET4.0C; .NET4.0E)", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)", "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1" ] headers = random.choice(uaList) return headers def daili_cookie(self): cookie_list = [ 'BIDUPSID=4B0DC2F54860625BA83681F98C507951; BAIDUID=791ED7F86F43AF44A3808AB244404E1A:FG=1; PSTM=1441808712; 
BDUSS=RINjR4TVFBeHpKLTNIREJ4MkFUT0h3SFdFWlQwdHJIdlZORzc5aW00QWpnQ2hXQVFBQUFBJCQAAAAAAAAAAAEAAAAJkstJv7TXvMTj1NnM-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACPzAFYj8wBWd0; BDSFRCVID=tc4sJeC62wRkfgj40DCH-qjWNeMhJHrTH6aov8OLjxwzgCDAMXfsEG0Pt7lQpYD-MjrsogKK0mOTHUcP; H_BDCLCKID_SF=JJ4O_C-5tCv8fjrzhJbM-J3H-UnLq5btX57Z0lOnMp05jpjDjT823PFTKPKtaxTnW56uXRPyMn3zODO_e6-bDjQ3DaAs-460aK_X3bRVKbk_jR-k-PnVep8qQhbZKxJmMgkeoxJtJK-2SnbVKU5mytKXhq6qWnvN3mn2LIOFfDDbbDtxD5_32JLHqx5Ka43tHD7yWCvd-M75OR5JLn7nDUFdhpDJJpvm3Ibv3xQ73hbAVUnjqt8hXpjyyGCftj_JtnIeVb3HbTrMHJo1btQhq4tehHRJ553eWDTm_Do5LJvtenFmDMOTyKuLMRJwKxr3WebH-pPKKR7-bh7sMR7b24-dQ-QuXP5e3mkjbP-5aUj2oq-zXt6KKP4syP4j2xRnWNT2bIcJ-J8XhI86j5rP; BDRCVFR[ltbVPlNi2ac]=mk3SLVN4HKm; BD_HOME=1; BD_UPN=123253; sug=3; sugstore=1; ORIGIN=0; bdime=0; H_PS_645EC=5894fstaLnB%2Bx%2F1GkrMZWqKZiK7vVRh2YO9qL7vORnC1%2BY%2BbXOz%2BVwgRSuL80CXajur4; WWW_ST=1443000293566; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BD_CK_SAM=1; BDSVRTM=146; H_PS_PSSID=17143_16716_1431_17100_12824_14430_12867_17245_17104_17182_17000_17003_17073_15864_17348_12413_13932_17351_14924_17050', 'BAIDUID=1F63B9A436CE0DBA3C7D1849367F30CB:FG=1; BIDUPSID=1F63B9A436CE0DBA3C7D1849367F30CB; PSTM=1441517552; BD_UPN=13314452; ispeed_lsm=10; ispeed=1; sug=3; ORIGIN=0; bdime=0; BDUSS=m5TYjhuODBCWHpQcVNYV2FDeS1BLUFzV0t3WTQwcTctUkV2S2x6M1ZBcjZMU2RXQVFBQUFBJCQAAAAAAAAAAAEAAAChsHQiuqPAtjIyOQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPqg~1X6oP9Vc; H_PS_645EC=217efvXBesXqzUCKdQMslc2uc5TwenrsDDar8Tir0uHuQfpJAglN689%2BHSNYep8LeRTy; BD_HOME=1; H_PS_PSSID=16230_17326_1447_12657_12824_14432_12867_17246_17105_14952_17001_17004_17072_15713_17347_11798_13932_17352_14554_17051; __bsi=12190823682724921622_00_0_I_R_166_0303_C02F_N_I_I_0; sugstore=1', 'Cookie: BAIDUID=1F63B9A436CE0DBA3C7D1849367F30CB:FG=1; BIDUPSID=1F63B9A436CE0DBA3C7D1849367F30CB; PSTM=1441517552; 
BDUSS=m5TYjhuODBCWHpQcVNYV2FDeS1BLUFzV0t3WTQwcTctUkV2S2x6M1ZBcjZMU2RXQVFBQUFBJCQAAAAAAAAAAAEAAAChsHQiuqPAtjIyOQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPqg~1X6oP9Vc; H_PS_PSSID=16230_17326_1447_12657_12824_14432_12867_17246_17105_14952_17001_17004_17072_15713_17347_11798_13932_17352_14554_17051; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0' ] cookie = random.choice(cookie_list) return cookie def ip(self): for x in open('/root/aliyun/seo/daili.txt'): x = x.strip() self.daili_list.append(x) newip = random.choice(self.daili_list) return newip def getHtml(self,line,headers): # print u'开始获取网页源码' while 1: try: url = 'https://www.baidu.com/s?wd=%s' % urllib.quote_plus(line.strip()) # newip = self.ip() # proxies={"http": "https://%s"%newip.strip()} # c = requests.post(url=url,headers=headers,proxies=proxies,timeout=30) # # c=requests.post(url,headers,timeout=10) # html = c.content c = pycurl.Curl() c.setopt(pycurl.MAXREDIRS,5) c.setopt(pycurl.REFERER, url) c.setopt(pycurl.FOLLOWLOCATION, True) c.setopt(pycurl.CONNECTTIMEOUT, 120) c.setopt(pycurl.TIMEOUT,120) c.setopt(pycurl.ENCODING,'gzip,deflate') #c.setopt(c.PROXY,ip) c.fp = StringIO.StringIO() c.setopt(pycurl.URL, url) c.setopt(pycurl.HTTPHEADER,headers) c.setopt(c.WRITEFUNCTION, c.fp.write) c.perform() code = c.getinfo(c.HTTP_CODE) #返回状态码 html = c.fp.getvalue() if '="https://verify.baidu.com' in html: print u'出验证码,重试' continue elif '302 Found' in html or code != 200: print u'代理失效,重试' continue else: return html except Exception, e: print e continue def run(self): while True: url = self.queue.get() status_index = self.getIndex(url) def getIndex(self, url): # headers={ # 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', # 'Accept-Encoding':'gzip, deflate, sdch', # 'Accept-Language':'zh-CN,zh;q=0.8', # 'Connection':'keep-alive', # # 'Cookie':'%s'%self.daili_cookie(), # 'Host':'www.baidu.com', # 'Upgrade-Insecure-Requests':'1', # 'User-Agent':'%s' % self.getUA(), 
# } headers = [ "Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Accept-Encoding:gzip, deflate, sdch", "Accept-Language:zh-CN,zh;q=0.8,en;q=0.6", "Cache-Control:max-age=0", "Connection:keep-alive", #"Cookie:BAIDUID=18BFE1C8A802F8458F26D043CD7CD624:FG=1; BDUSS=lpaNUg2NkloQTBKVVh4aVBsczJNLUc2QjEzN05wMXUzeE50WXZSQVNaRmU3WlZWQVFBQUFBJCQAAAAAAAAAAAEAAAAJkstJv7TXvMTj1NnM-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAF5gblVeYG5Vb; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a01833473155; BDSFRCVID=Vy8sJeCCxG3TKh3lHco6WY5CFWPhzzDzLlKH3J; H_BDCLCKID_SF=JbAjoKK5tKvbfP0kh-QJhnQH-UnLq5JIH67Z0lOnMp05ShvdDPv12bTL-q5mhU70LIbEXqbLBnRvOKO_e6t5D5J0jN-s-bbfHDJK0b7aHJOoDDvK2j75y4LdLp7xJh3i2n7QanOOJf3ZMqOD3p3s2RIv24vQBMkeWJQ2QJ8BJD_2hI3P; BIDUPSID=18BFE1C8A802F8458F26D043CD7CD624; PSTM=1433406316; BDRCVFR[ltbVPlNi2ac]=mk3SLVN4HKm; BD_UPN=123253; sug=3; sugstore=1; ORIGIN=0; bdime=0; H_PS_645EC=2002DrwijyvB4e2cepMJ9FuSgzu6vKJjbMOeRrfZjipiNRVem6mc9uqx%2FBzqlM7Z; BD_CK_SAM=1; BDSVRTM=14; H_PS_PSSID=13372_1428_14602_12772_14509_14444_10812_14600_12868_14622_10562_14501_12723_14626_14485_14244_11460_13936_8498 "Host:www.baidu.com", "RA-Sid:7739A016-20140918-030243-3adabf-48f828", "RA-Ver:2.10.4", "User-Agent:%s" % self.getUA() ] html = self.getHtml(url,headers) # print html if '抱歉,没有找到与' in html or '没有找到该URL' in html: print u'%s,未收录,写入文档'%url self.op_txt.writelines('%s\n'%url) else: print u'%s,已收录'%url self.queue.task_done() def main(): queue = Queue.Queue() for url in open("allurl.txt"): queue.put(url.strip()) for i in range(10): t = CheckStatus(queue) t.setDaemon(True) t.start() queue.join() print "done." if __name__ == '__main__': main()
实现功能:不要任性,就只是查个百度收录
先安装requests和pycurl模块(requests语法更简洁):
pip install requests
pip install pycurl
运行(先在脚本同目录准备好allurl.txt,每行一个要查的URL):
python shoulu.py
结果:未被收录的URL会追加写入no_index.txt
希望程序稳定,不然就尴尬了……不过我也不care,呵呵
更新一下,听说是9月1号,好了,下课了!居然貌似还是没凑够字数……