#-------------------------shell-------------------------
# Merge the columns of two files side by side
paste -d " " file1 file2 > mergefile
# Sum the numbers whose column N matches a condition (print column E, then sum)
cat file|awk '{if($N~"regex")print $E}'|awk '{sum+=$0}END{print sum}'
or
cat file|awk '{s[$1]+=$2}END{for(i in s){print i,s[i]}}'
# Garbled Chinese in curl output (caused by gzip compression)
curl -H "Accept-Encoding: gzip" www.domain.net|gunzip|more
# Split a file into 8000-line chunks
split -l 8000 -d -a 1 file word
# Two files with 2 columns each: merge them on matching values in column 1 into a
# 3-column file, filling in 0 where a key has no column-2 value in one of the files
awk 'FNR==NR{a[$1]=$2;next}{if($1 in a)a[$1]=a[$1]" "$2;else a[$1]="0 "$2}END{for(i in a)print i,a[i]}' file1 file2 > file3
or
awk -F"," 'NR==FNR{a[$1]=$2;next}{print $0","a[$1]}' file1 file2|sed -E 's/,$/,0/g'
# Replace newline / carriage-return characters (\n \r \r\n)
perl -p -e 's/\n//' filename
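# For comparison, a minimal Python sketch of the same column-1 join with zero filling
# (assumes whitespace-separated, two-column file1/file2; this helper is not from the original notes)
def merge_on_key(path1, path2):
    a, b = {}, {}
    for path, d in ((path1, a), (path2, b)):
        for line in open(path):
            parts = line.split()
            if len(parts) >= 2:
                d[parts[0]] = parts[1]
    # print key, value-from-file1, value-from-file2, using 0 when a key is missing from one file
    for key in sorted(set(a) | set(b)):
        print(' '.join([key, a.get(key, '0'), b.get(key, '0')]))

merge_on_key('file1', 'file2')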
#-------------------------aliyun/mysql-------------------------
# Log in
ssh -i {keyfile} -l work ***.***.***.***
# File transfer (scp)
scp -i {keyfile} {filename} work@***.***.***.***:/{directory}
mysql -u{***} -p{***} -h{***} # log in to MySQL
show databases; # list all databases
use kanzhun; # switch to the kanzhun database
show tables; # list all tables in the current database
describe table_name; # show the structure of a table
# Randomly sample 5000 records
select id from company order by rand() limit 5000;
# Multi-table join query
select company.full_name,city.name from city,company,company_salary where company_salary.company_id = company.id and company_salary.city_code = city.code order by rand() limit 5000;
# Regex search
select infor from daoru where infor regexp '^&';
# Replace a substring in a column (here: a full-width comma with a half-width comma)
update table_name set column_name=replace(column_name,'，',',');
# Strip newline characters from a column
update table_name set column_name=replace(column_name,concat(char(13),char(10)),'');
update table_name set column_name=replace(column_name,char(13),'');
update table_name set column_name=replace(column_name,char(10),'');
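# For reference, a minimal Python sketch of running the random-sample query above with MySQLdb
# (MySQLdb assumed installed; host/user/password below are placeholders, not from the original notes)
import MySQLdb

conn = MySQLdb.connect(host='127.0.0.1', user='user', passwd='password', db='kanzhun', charset='utf8')
cur = conn.cursor()
cur.execute("select id from company order by rand() limit 5000")
for (company_id,) in cur.fetchall():
    print(company_id)
cur.close()
conn.close()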
#-------------------------python-------------------------
#--- Web page scraping ---
import pycurl,StringIO,sys,chardet,random,requests,urllib,time,urllib2

def getHead():
    # pick a random User-Agent string
    uaList = [
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
        # ...... (more User-Agent strings elided in the original notes)
    ]
    headers = random.choice(uaList)
    return headers

# Check whether a proxy IP is usable
def proxycheckone(proxy):
    url = 'https://www.baidu.com/s?wd=python'
    proxy_url = proxy
    proxy_support = urllib2.ProxyHandler({'http': proxy_url}) # route requests through the proxy
    opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
    r = urllib2.Request(url)
    #r.add_header("Accept-Language","utf-8") # extra headers can help avoid 403 errors
    #r.add_header("Content-Type","text/html; charset=utf-8")
    #r.add_header("User-Agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.1.4322)")
    trycount = 1
    while trycount <= 2: # try at most twice
        try:
            T0 = time.time() # start time
            f = opener.open(r, timeout=2)
            data = f.read()
            if '百度搜索' in data and 'https://verify.baidu.com' not in data:
                # the Baidu search page came back, so this proxy can reach Baidu successfully
                T = time.time() - T0 # total connection time
                break
            else:
                return []
        except:
            time.sleep(1)
            trycount = trycount + 1
    if trycount > 2:
        return []
    else:
        print 'proxy: ' + proxy + ' response time: ' + str(T) # this line gets written to a text file
        return proxy

def daili_ip(dailistr):
    # pick a random proxy from a comma-separated string of verified proxies
    daili_list = dailistr.split(',')
    ip = random.choice(daili_list)
    return ip

def getHtml(url, headers, ip):
    while 1:
        try:
            c = pycurl.Curl()
            c.setopt(pycurl.MAXREDIRS, 5)
            c.setopt(pycurl.REFERER, url)
            c.setopt(pycurl.FOLLOWLOCATION, True)
            c.setopt(pycurl.CONNECTTIMEOUT, 60)
            c.setopt(pycurl.TIMEOUT, 120)
            c.setopt(pycurl.ENCODING, 'gzip,deflate')
            c.setopt(pycurl.USERAGENT, headers)
            c.setopt(c.PROXY, ip)
            c.fp = StringIO.StringIO()
            c.setopt(pycurl.URL, url)
            #c.setopt(pycurl.HTTPHEADER,["Accept-Encoding:gzip,deflate,sdch"])
            #c.setopt(pycurl.HTTPHEADER,header_list)
            c.setopt(c.WRITEFUNCTION, c.fp.write)
            c.perform()
            #code = c.getinfo(c.HTTP_CODE) # returns the HTTP status code
            html = c.fp.getvalue()
            return html
        except:
            continue

# daili_list is the list of candidate proxies (its source is not shown in the original notes)
daili_list_str = []
for ip in daili_list:
    ceshi_ip_a = proxycheckone(ip)
    if ceshi_ip_a:
        daili_list_str.append(ceshi_ip_a)
daili_str = ','.join(daili_list_str)

for url in open('url.txt'):
    url = url.strip()
    content = getHtml(url, getHead(), daili_ip(daili_str))
    typeEncode = sys.getfilesystemencoding()
    infoencode = chardet.detect(content).get('encoding', 'utf-8')
    html = content.decode(infoencode, 'ignore').encode(typeEncode)
    print html

#--- Download images ---
import urllib,os

filepath = os.getcwd()
if not os.path.exists(filepath):
    os.mkdir(filepath)
x = 1
print 'crawler ready...'
for line in open('logo_url.txt'):
    line = line.strip()
    id = line.split(',')[1]
    imgurl = line.split(',')[2]
    temp = '%s.jpg' % id
    print 'downloading image %s' % x
    print imgurl
    try:
        urllib.urlretrieve(imgurl, temp)
        x += 1
    except:
        continue
print 'all images downloaded, saved to ' + filepath
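# requests is imported above but never used; as an aside, a minimal alternative sketch of the same
# logo download loop using requests (assumes the same logo_url.txt format: id in field 1, URL in field 2)
import requests

for line in open('logo_url.txt'):
    parts = line.strip().split(',')
    if len(parts) < 3:
        continue
    try:
        resp = requests.get(parts[2], timeout=10)
        resp.raise_for_status()
        with open('%s.jpg' % parts[1], 'wb') as f:
            f.write(resp.content)
    except Exception:
        continue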
From Chuang-ge, www.kaopuseo.com