Python 采集小程序

分类:CentOS运维 阅读:77404 次

# coding=utf8
#LINUXQQ for crawler data v0.1
import os
import re
import urllib

# Base URL that the relative links found on a list page are appended to.
videourl = 'http://www.centoscn.com/'
# Root directory; one sub-directory per downloaded item is created below it.
rootdir = 'D:\\video\\'

def progress(blocknum, blocksize, totalsize):
    """Print the download progress as a percentage.

    The signature matches the ``reporthook`` callback that
    ``urllib.urlretrieve`` invokes after each transferred block.

    Args:
        blocknum: Number of blocks transferred so far.
        blocksize: Size of a single block, in bytes.
        totalsize: Total size of the download in bytes. Zero or negative
            when the server did not report a size.

    Returns:
        None. The percentage, capped at 100 and formatted with two
        decimals, is written to stdout; nothing is written when the total
        size is unknown.
    """
    if totalsize <= 0:
        # Without a total there is no meaningful percentage; bail out rather
        # than divide by zero or print a negative number.
        return
    # The last block is usually only partly filled, so the raw ratio can
    # overshoot 100%.
    per = min(100.0 * blocknum * blocksize / totalsize, 100.0)
    # Single-argument parenthesised form works on both Python 2 and 3.
    print("%.2f%%" % per)

# contact(link, directory): open the page at videourl + link, search its body
# with a regular expression and, on a match, create rootdir + directory and
# download the matched URL (with 'html' replaced by 'swf') into that directory
# as study.swf, reporting progress through progress().
#
# NOTE(review): this listing looks like it was mangled when it was published:
# indentation is gone, several quotes are typographic, and the line that
# compiles the pattern is fused with the following re.search() call with the
# pattern text itself missing. The function cannot run as shown; recover the
# original pattern from the author's source before relying on it.
def contact(link,directory):
# NOTE(review): as printed this replaces '&' with '&'; presumably the original
# turned the HTML entity for an ampersand back into '&' -- confirm.
newlink = link.replace(‘&’,'&’)
newhtml = urllib.urlopen(str(videourl + newlink))
# NOTE(review): pattern literal lost here; only group(1) of the match is used
# further down.
newdata = re.compile(‘ req = re.search(newdata,newhtml.read())
if req:
# os.mkdir raises OSError when the directory already exists, so a second run
# for the same item stops here.
os.mkdir(rootdir + directory )
download = req.group(1).replace('html','swf')
# NOTE(review): urlretrieve is documented to return a (filename, headers)
# tuple and to raise on failure, so this condition looks always true and the
# 'download failure' branch is presumably never taken -- confirm.
if urllib.urlretrieve(download, rootdir + directory + '\\study.swf',progress):
print directory + 'download ok'
else:
print directory + 'download failure'

# crawler(url): download a list page, collect every regex match from it and
# call contact() with the first two captured groups of each match.
#
# NOTE(review): the `url` argument is never used; the page fetched is always
# the hard-coded address below, so every call made by the paging loop reads
# the same page. Presumably urlopen(url) was intended -- confirm.
def crawler(url):
html = urllib.urlopen('http://www.51zxw.net/list.aspx?cid=359')
# NOTE(review): the pattern literal is truncated and closed with a
# typographic quote (markup inside it seems to have been stripped on
# publication). It must yield at least two groups per match, because i[0] and
# i[1] are read below; recover the original pattern before use.
data = re.compile(r' ]*?>(.*?)‘, re.S|re.U)
req = re.findall(data,html.read())
for i in req:
# i[0] is passed as the link, i[1] as the directory name.
contact(i[0],i[1])

if __name__ == '__main__':
    # Script entry point: build the URL of each list page from 1 to 8
    # (inclusive) and hand it to crawler().
    last_page = 8
    for page in range(1, last_page + 1):
        url = 'http://www.51zxw.net/list.aspx?page=%d&cid=359' % page
        crawler(url)