#-------------------------shell-------------------------
# Merge the columns of two files side by side
paste -d " " file1 file2 > mergefile
# Sum the numbers whose column N matches a condition (print column E, then sum)
cat file|awk '{if($N~"regex")print $E}'|awk '{sum+=$0}END{print sum}'
or
cat file|awk '{s[$1]+=$2}END{for(i in s){print i,s[i]}}'
# Garbled Chinese in curl output (caused by gzip compression)
curl -H "Accept-Encoding: gzip" www.domain.net|gunzip|more
# Split a file into 8000-line chunks
split -l 8000 -d -a 1 file word
# Two files with 2 columns each: merge them on matching values in column 1 into a
# 3-column file, filling in 0 where a key has no column-2 value in one of the files
awk 'FNR==NR{a[$1]=$2;next}{if($1 in a)a[$1]=a[$1]" "$2;else a[$1]="0 "$2}END{for(i in a)print i,a[i]}' file1 file2 > file3
or
awk -F"," 'NR==FNR{a[$1]=$2;next}{print $0","a[$1]}' file1 file2|sed -E 's/,$/,0/g'
# Replace newline / carriage-return characters (\n \r \r\n)
perl -p -e 's/\n//' filename
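# For comparison, a minimal Python sketch of the same column-1 join with zero filling
# (assumes whitespace-separated, two-column file1/file2; this helper is not from the original notes)
def merge_on_key(path1, path2):
    a, b = {}, {}
    for path, d in ((path1, a), (path2, b)):
        for line in open(path):
            parts = line.split()
            if len(parts) >= 2:
                d[parts[0]] = parts[1]
    # print key, value-from-file1, value-from-file2, using 0 when a key is missing from one file
    for key in sorted(set(a) | set(b)):
        print(' '.join([key, a.get(key, '0'), b.get(key, '0')]))

merge_on_key('file1', 'file2')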
#-------------------------aliyun/mysql-------------------------
# Log in
ssh -i {keyfile} -l work ***.***.***.***
# File transfer (scp)
scp -i {keyfile} {filename} work@***.***.***.***:/{directory}
mysql -u{***} -p{***} -h{***} # log in to MySQL
show databases; # list all databases
use kanzhun; # switch to the kanzhun database
show tables; # list all tables in the current database
describe table_name; # show the structure of a table
# Randomly sample 5000 records
select id from company order by rand() limit 5000;
# Multi-table join query
select company.full_name,city.name from city,company,company_salary where company_salary.company_id = company.id and company_salary.city_code = city.code order by rand() limit 5000;
# Regex search
select infor from daoru where infor regexp '^&';
# Replace a substring in a column (here: a full-width comma with a half-width comma)
update table_name set column_name=replace(column_name,'，',',');
# Strip newline characters from a column
update table_name set column_name=replace(column_name,concat(char(13),char(10)),'');
update table_name set column_name=replace(column_name,char(13),'');
update table_name set column_name=replace(column_name,char(10),'');
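# For reference, a minimal Python sketch of running the random-sample query above with MySQLdb
# (MySQLdb assumed installed; host/user/password below are placeholders, not from the original notes)
import MySQLdb

conn = MySQLdb.connect(host='127.0.0.1', user='user', passwd='password', db='kanzhun', charset='utf8')
cur = conn.cursor()
cur.execute("select id from company order by rand() limit 5000")
for (company_id,) in cur.fetchall():
    print(company_id)
cur.close()
conn.close()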
#-------------------------python-------------------------
#--- Web page scraping ---
import pycurl,StringIO,sys,chardet,random,requests,urllib,time,urllib2

def getHead():
    # pick a random User-Agent string
    uaList = [
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
        # ...... (more User-Agent strings elided in the original notes)
    ]
    headers = random.choice(uaList)
    return headers

# Check whether a proxy IP is usable
def proxycheckone(proxy):
    url = 'https://www.baidu.com/s?wd=python'
    proxy_url = proxy
    proxy_support = urllib2.ProxyHandler({'http': proxy_url}) # route requests through the proxy
    opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
    r = urllib2.Request(url)
    #r.add_header("Accept-Language","utf-8") # extra headers can help avoid 403 errors
    #r.add_header("Content-Type","text/html; charset=utf-8")
    #r.add_header("User-Agent","Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.1.4322)")
    trycount = 1
    while trycount <= 2: # try at most twice
        try:
            T0 = time.time() # start time
            f = opener.open(r, timeout=2)
            data = f.read()
            if '百度搜索' in data and 'https://verify.baidu.com' not in data:
                # the Baidu search page came back, so this proxy can reach Baidu successfully
                T = time.time() - T0 # total connection time
                break
            else:
                return []
        except:
            time.sleep(1)
            trycount = trycount + 1
    if trycount > 2:
        return []
    else:
        print 'proxy: ' + proxy + ' response time: ' + str(T) # this line gets written to a text file
        return proxy

def daili_ip(dailistr):
    # pick a random proxy from a comma-separated string of verified proxies
    daili_list = dailistr.split(',')
    ip = random.choice(daili_list)
    return ip

def getHtml(url, headers, ip):
    while 1:
        try:
            c = pycurl.Curl()
            c.setopt(pycurl.MAXREDIRS, 5)
            c.setopt(pycurl.REFERER, url)
            c.setopt(pycurl.FOLLOWLOCATION, True)
            c.setopt(pycurl.CONNECTTIMEOUT, 60)
            c.setopt(pycurl.TIMEOUT, 120)
            c.setopt(pycurl.ENCODING, 'gzip,deflate')
            c.setopt(pycurl.USERAGENT, headers)
            c.setopt(c.PROXY, ip)
            c.fp = StringIO.StringIO()
            c.setopt(pycurl.URL, url)
            #c.setopt(pycurl.HTTPHEADER,["Accept-Encoding:gzip,deflate,sdch"])
            #c.setopt(pycurl.HTTPHEADER,header_list)
            c.setopt(c.WRITEFUNCTION, c.fp.write)
            c.perform()
            #code = c.getinfo(c.HTTP_CODE) # returns the HTTP status code
            html = c.fp.getvalue()
            return html
        except:
            continue

# daili_list is the list of candidate proxies (its source is not shown in the original notes)
daili_list_str = []
for ip in daili_list:
    ceshi_ip_a = proxycheckone(ip)
    if ceshi_ip_a:
        daili_list_str.append(ceshi_ip_a)
daili_str = ','.join(daili_list_str)

for url in open('url.txt'):
    url = url.strip()
    content = getHtml(url, getHead(), daili_ip(daili_str))
    typeEncode = sys.getfilesystemencoding()
    infoencode = chardet.detect(content).get('encoding', 'utf-8')
    html = content.decode(infoencode, 'ignore').encode(typeEncode)
    print html

#--- Download images ---
import urllib,os

filepath = os.getcwd()
if not os.path.exists(filepath):
    os.mkdir(filepath)
x = 1
print 'crawler ready...'
for line in open('logo_url.txt'):
    line = line.strip()
    id = line.split(',')[1]
    imgurl = line.split(',')[2]
    temp = '%s.jpg' % id
    print 'downloading image %s' % x
    print imgurl
    try:
        urllib.urlretrieve(imgurl, temp)
        x += 1
    except:
        continue
print 'all images downloaded, saved to ' + filepath
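# requests is imported above but never used; as an aside, a minimal alternative sketch of the same
# logo download loop using requests (assumes the same logo_url.txt format: id in field 1, URL in field 2)
import requests

for line in open('logo_url.txt'):
    parts = line.strip().split(',')
    if len(parts) < 3:
        continue
    try:
        resp = requests.get(parts[2], timeout=10)
        resp.raise_for_status()
        with open('%s.jpg' % parts[1], 'wb') as f:
            f.write(resp.content)
    except Exception:
        continue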
From Chuang-ge, www.kaopuseo.com