Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- module Crawler
class WebCrawler
  # Crawls the web starting from +start_url+, yielding each parsed
  # document to the caller's block.
  #
  # NOTE(review): @urls_to_crawl (a Rinda::TupleSpace-style store with
  # write/take) and the @urls_status hash are assumed to be initialized
  # elsewhere (an initialize method not shown in this chunk) — confirm.
  def start(start_url)
    # Seed the queue with the initial URL, then enter the crawl loop,
    # forwarding every parsed document to the caller.
    @urls_to_crawl.write([:url, URI(start_url)])
    crawl do |doc|
      yield doc
    end
  end

  private

  # Main crawl loop: repeatedly takes a URL from the queue, marks it
  # visited, downloads and parses it, yields the document, and enqueues
  # any newly discovered URLs. Runs forever; `take` blocks when the
  # queue is empty.
  def crawl
    loop do
      url = @urls_to_crawl.take([:url, nil])[1]
      @urls_status[url.to_s] = true

      # download_resource yields the raw file; parse it with Hpricot.
      # `or next` skips this URL when the download fails (nil/false).
      doc = download_resource(url) do |file|
        Hpricot(file)
      end or next

      yield doc

      time_begin = Time.now
      add_new_urls(extract_urls(doc, url))
      # BUG FIX: removed stray `AccessDb('data.dta')` — AccessDb is a
      # class in this module, so calling it like a method raised
      # NoMethodError on every iteration (and even as AccessDb.new it
      # would only build and discard an unopened object).
      puts "Elapsed: #{Time.now - time_begin}"
    end
  end
end
class AccessDb
  # Thin wrapper around an ADO (WIN32OLE) connection to a Microsoft
  # Access (.mdb) database. Windows-only: requires the win32ole stdlib
  # and the Jet 4.0 OLEDB provider.
  #
  # mdb        - path to the .mdb file
  # connection - the live ADODB.Connection (nil until #open)
  # data       - rows from the last #query, as an array of row arrays
  # fields     - column names from the last #query
  attr_accessor :mdb, :connection, :data, :fields

  # No connection is made here; call #open before #query/#execute.
  def initialize(mdb = nil)
    @mdb = mdb
    @connection = nil
    @data = nil
    @fields = nil
  end

  # Opens an ADO connection to the .mdb file via the Jet OLEDB provider.
  def open
    connection_string = 'Provider=Microsoft.Jet.OLEDB.4.0;Data Source='
    connection_string << @mdb
    @connection = WIN32OLE.new('ADODB.Connection')
    @connection.Open(connection_string)
  end

  # Runs a SELECT, populating @fields with the column names and @data
  # with the result rows. Returns @data ([] for an empty result set).
  def query(sql)
    recordset = WIN32OLE.new('ADODB.Recordset')
    recordset.Open(sql, @connection)

    @fields = []
    recordset.Fields.each do |field|
      @fields << field.Name
    end

    begin
      # GetRows returns column-major data; transpose to row-major.
      # GetRows raises on an empty recordset, hence the rescue.
      @data = recordset.GetRows.transpose
    rescue StandardError
      @data = []
    end
    @data
  ensure
    # BUG FIX: close in an ensure so the COM recordset is not leaked
    # when Open or Fields raises. Closing a recordset that never opened
    # itself raises, so that error is deliberately swallowed.
    begin
      recordset.Close
    rescue StandardError
      nil
    end
  end

  # Executes a non-query SQL statement (INSERT/UPDATE/DELETE/DDL).
  def execute(sql)
    @connection.Execute(sql)
  end

  # Closes the underlying ADO connection.
  def close
    @connection.Close
  end
end
- end
Add comment
Please sign in to add a comment.