require 'net/http'
require 'open-uri'

# Fetch a URL and return the response body as a String.
def query_url(url)
  Net::HTTP.get(URI.parse(url))
end
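# Example (hypothetical URL): query_url('http://example.com/') returns the raw
# HTML of the page as a String, ready to be scanned below.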
# Save the resource at url into dir; when no filename is given, use the last
# segment of the URL (everything after the final '/').
def save_url(url, dir, filename)
  filename = url[(url.rindex('/') + 1)..-1] if filename.nil? || filename.empty?
  Dir.mkdir(dir) if dir && !dir.empty? && !FileTest.exist?(dir)
  open(url) do |fin|
    File.open("#{dir}#{filename}", 'wb') do |fout|
      # Copy in 1 KB chunks so large images are not read into memory at once.
      while buf = fin.read(1024)
        fout.write(buf)
      end
    end
  end
end
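# Example (hypothetical URL and directory): the call below would create the
# downloads/ directory if needed and write the image to downloads/a.jpg, since
# the filename defaults to the last path segment of the URL:
#   save_url('http://example.com/pics/a.jpg', 'downloads/', nil)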
# Scan an HTML page for <img> tags and download every http image they reference
# into E:\TET\ (the Windows directory hard-coded below).
def download_page(content)
  # The trailing [^ni]. appears intended to skip URLs whose second-to-last
  # character is 'n' or 'i' (e.g. .png or .gif), keeping mainly .jpg images.
  content.scan(/<[Ii][Mm][Gg].* src="\S+[^ni]."/) { |match|
    match.scan(/http:\/\/\S+"/) { |img|
      img = img.gsub(/"/, '')
      puts "img:" + img
      begin
        save_url(img, "E:\\TET\\", nil)
      rescue => e
        # A failed download only skips this image; the scan continues.
        puts e
      end
    }
  }
end
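# Rough illustration (made-up tag): for a page containing
#   <img class="photo" src="http://example.com/pics/a.jpg">
# the outer scan matches the img/src fragment, the inner scan pulls out
# http://example.com/pics/a.jpg" and the gsub strips the trailing quote before
# the URL is handed to save_url.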
begin
  start_url = 'http://se.1ssdd.com/'
  print "Start crawling #{start_url}\n"
  content = query_url(start_url)
  next_host = "http://se.1ssdd.com/"
  threads = []
  i = 1
  # Each relative link of the form href="/xxx.html" gets its own thread, which
  # fetches the page and downloads the images found on it.
  content.scan(/<a href="\/(.*?\.html)"/) { |match|
    if !match.nil? && match.size > 0
      # Pass the counter into the thread so each page keeps its own index.
      threads << Thread.new(match, i) do |urls, idx|
        urls.each { |url|
          next_url = next_host + url
          puts next_url + "::" + idx.to_s
          page = query_url(next_url)
          download_page(page)
        }
      end
      i += 1
    end
  }
  threads.each { |thr| thr.join }
  # Finally, grab any images that appear on the start page itself.
  download_page(content)
  p "over"
end
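# To run the crawler (assuming the listing is saved as crawler.rb):
#   ruby crawler.rb
# Every matched link spawns a Thread, and threads.each { |thr| thr.join } waits
# for all of them to finish before the start page itself is scanned for images.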
Wednesday, April 22, 2009
A web crawler written in Ruby
A Ruby crawler I wrote a few days ago that grabs the images from a specified site.