2009年4月22日星期三

用 Ruby 写的一个网络爬虫程序

前几天写的一个 Ruby 爬虫，专门抓取指定网站的图片：
require 'monitor'
require 'net/http'
require 'open-uri'
# Performs an HTTP GET on +url+ and returns the response body as a String.
#
# url - String holding an absolute URL; it is parsed with URI.parse, so a
#       malformed value raises URI::InvalidURIError before any request is made.
def query_url(url)
  target = URI.parse(url)
  Net::HTTP.get(target)
end
# Downloads +url+ and stores its content as a binary file.
#
# url      - String URL of the resource. A string that is not a URL is handed
#            by URI.open to Kernel#open, so a local path is accepted as well.
# dir      - target directory, created when it does not exist yet (only the
#            last path component is created). nil or "" means the current
#            working directory.
# filename - name of the file to write. When nil or empty, the name is the
#            last "/"-separated segment of +url+ with any "?query" or
#            "#fragment" removed.
#
# Returns nil.
# Raises ArgumentError when no file name is given and none can be derived.
# Errors from opening the URL or writing the file are propagated unchanged.
def save_url(url, dir, filename)
  if filename.nil? || filename.empty?
    # Query strings and fragments are not part of the resource name and
    # contain characters ("?") that are illegal in Windows file names.
    filename = url.sub(/[?#].*\z/m, '').split('/').last.to_s
  end
  raise ArgumentError, "cannot derive a file name from #{url.inspect}" if filename.empty?

  target = filename
  unless dir.nil? || dir.empty?
    Dir.mkdir(dir) unless File.exist?(dir)
    # File.join inserts a separator only when +dir+ lacks one, so both
    # "out" and "out/" put the file inside the directory.
    target = File.join(dir, filename)
  end

  # URI.open is used because open-uri's Kernel#open override for URLs was
  # removed in Ruby 3.0.
  URI.open(url) do |fin|
    File.open(target, 'wb') { |fout| IO.copy_stream(fin, fout) }
  end
  nil
end
# Finds <img> tags in +content+ and downloads every absolute http:// URL found
# in them via save_url.
#
# content - String of HTML to scan.
# dir     - directory passed to save_url for every image; defaults to the
#           location the script has always used.
#
# A StandardError raised while saving one image is printed and the remaining
# images are still processed. Other exceptions (e.g. Interrupt) propagate, so
# the crawl can be aborted with Ctrl-C.
#
# Returns +content+ (the value of String#scan called with a block).
def download_page(content, dir = "E:\\TET\\")
  # The tail `[^ni]."` rejects a match whose second-to-last character before
  # the closing quote is "n" or "i".
  # NOTE(review): this looks like a deliberate filter against .png/.gif
  # images - confirm before loosening the pattern.
  content.scan(/<[Ii][Mm][Gg].* src="\S+[^ni]."/) do |tag|
    # Stop at the first quote or whitespace so that a following attribute
    # written without a space (src="..."alt="x") is not glued onto the URL.
    tag.scan(/http:\/\/[^"\s]+(?=")/) do |img|
      puts "img:" + img
      begin
        save_url(img, dir, nil)
      rescue => e
        puts e
      end
    end
  end
end
# Script entry point: fetch the start page, crawl every site-relative ".html"
# link on it in its own thread, download the images of each linked page, and
# finally download the images of the start page itself.
begin
  start_url = 'http://se.1ssdd.com/'
  print "开始搜索#{start_url}\n"
  content = query_url(start_url)

  # The pattern has one capture group, so scan yields one-element arrays;
  # flatten to plain paths and drop links that appear more than once.
  paths = content.scan(/<a href="\/(.*?\.html)"/).flatten.uniq

  # NOTE(review): one thread per link, with no upper bound on the number of
  # concurrent downloads.
  threads = paths.each_with_index.map do |path, index|
    # Pass the URL and its sequence number as thread arguments so each thread
    # works on its own copies instead of shared outer variables.
    Thread.new(start_url + path, index + 1) do |next_url, number|
      puts "#{next_url}::#{number}"
      begin
        download_page(query_url(next_url))
      rescue => e
        # Keep a single unreachable page from aborting the whole crawl when
        # the thread is joined.
        puts "#{next_url}: #{e}"
      end
    end
  end

  threads.each(&:join)
  download_page(content)
  p "over"
end

没有评论:

发表评论