Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/env ruby
# Set to true to print progress messages and the total run time.
DEBUG = false
@start = Time.now if DEBUG

##############################################################
# Setup environment
##############################################################
# Directory containing this script; it doubles as the application root.
script_dir = File.dirname(__FILE__)
bundled_framework = File.join(script_dir, "framework")

if File.directory?(bundled_framework)
  # Prefer the framework copy that sits next to the script. The load path
  # entry is anchored to the script directory (not the current working
  # directory) so it resolves no matter where the script is launched from.
  $:.unshift(bundled_framework)
  require File.join(bundled_framework, "merb")
else
  require 'merb'
end

MERB_ROOT = script_dir
RAILS_ROOT = MERB_ROOT
# MERB_ENV is not defined in this file; presumably merb sets it -- TODO confirm.
RAILS_ENV = MERB_ENV

# Get Merb plugins and dependencies
require File.join(MERB_ROOT, 'config', 'dependencies.rb')
##############################################################
# Loading Only The Needed Dependencies
##############################################################
require 'fileutils'
require 'net/ftp'
require 'time'

require File.join(MERB_ROOT, 'lib', 'related_media.rb')
require File.join(MERB_ROOT, 'lib', 'cci', 'cci_parser.rb')
require File.join(MERB_ROOT, 'lib', 'cci', 'keywords.rb')
require File.join(MERB_ROOT, 'app', 'models', 'story')
require File.join(MERB_ROOT, 'app', 'models', 'cut')
require File.join(MERB_ROOT, 'app', 'models', 'keywords')
require File.join(MERB_ROOT, 'app', 'models', 'raw_cci')
require File.join(MERB_ROOT, 'app', 'models', 'category')
require File.join(MERB_ROOT, 'deps', 'plugins', 'acts_as_solr', 'init')
include Net
##############################################################
# Setup Constants and other misc. info
##############################################################
# Setup FTP Info
# NOTE(review): credentials are hard-coded in source; consider moving them
# into configuration or the environment.
#FTP_SERVER = 'wire2.wieck.com'
#FTP_USER = 'nytpull'
#FTP_PASSWORD = 't0rnad0'
FTP_SERVER = 'localhost'
FTP_USER = 'wieck'
FTP_PASSWORD = 'hurr1cane'
FTP_DIRECTORY = "WIRE_STORAGE"

# Create the tmp directory next to this script if it is missing.
# mkdir_p does nothing when the directory already exists, so no
# existence check is needed.
import_tmp_dir = File.join(File.dirname(__FILE__), "tmp")
FileUtils.mkdir_p(import_tmp_dir)

# Create our last import timestamp file with the current time if it doesn't exist.
# We can run a parser through previous files later if there are other files
# we haven't imported yet.
# Plain File I/O is used instead of shelling out to echo/cat so that paths
# containing spaces or shell metacharacters work.
last_import_file = File.join(import_tmp_dir, "last_cci_import")
unless File.exist?(last_import_file)
  File.open(last_import_file, "w") { |f| f.puts(Time.now.to_s) }
end

# Parse out the time of the last import from the file.
# strip (rather than chomp!) never returns nil when there is nothing to remove.
LAST_IMPORT = Time.parse(File.read(last_import_file).strip)

# This is used to quickly distinguish our raw wire files
FILE_REGEX = /^nytns.*$/

# The Date range is the indexes of the split ftp ls string.
# We use this later to find when the file was modified, and also where the
# file name begins (the name starts right after the last date field).
DATE_RANGE = [5, 7].freeze
##############################################################
# Start the party
##############################################################

# Recursively walks an FTP directory listing and imports every new wire file.
#
# ftp       - connection positioned in the directory that ls_output describes.
#             Must respond to chdir, ls and get.
# ls_output - Array of long-format listing lines for the current directory.
#             The first line is discarded (presumably the "total N" summary);
#             the caller's array is not modified.
# count     - recursion depth; only used to indent DEBUG output.
#
# A file is imported when its name matches FILE_REGEX and the modification
# time in its listing is later than LAST_IMPORT.
#
# Side effect: always finishes with ftp.chdir(".."), so the connection ends
# up one level above the directory this call was made for.
def process_files(ftp, ls_output, count)
  # Index of the first name field; everything from here on is the path.
  name_index = DATE_RANGE[1] + 1

  # Get rid of the first line of the output; it is not a file entry.
  ls_output.drop(1).each do |listing|
    fields = listing.split
    # Blank or malformed lines carry no name field; skip them rather than crash.
    next if fields.size <= name_index

    # Anything after the last date item is the path. This handles spaces.
    remote_path = fields[name_index..-1].join(" ")

    if listing.start_with?('d')
      # Never descend into the self/parent entries; that would recurse forever.
      next if remote_path == '.' || remote_path == '..'

      puts "#{'*' * count} dir: #{remote_path}" if DEBUG
      ftp.chdir(remote_path)
      # Recurse into the directory; the nested call changes back up when done.
      process_files(ftp, ftp.ls, count + 1)
    elsif remote_path =~ FILE_REGEX
      puts "#{'*' * count} CCI file: #{remote_path}" if DEBUG
      puts "#{'*' * count} Checking update time..." if DEBUG
      # Parse the updated time from the listing using the date range constant.
      updated_time = Time.parse(fields[DATE_RANGE[0]..DATE_RANGE[1]].join(" "))

      if updated_time > LAST_IMPORT
        import_cci_file(ftp, remote_path, count)
      else
        puts "#{'*' * count} Skipping stale file..." if DEBUG
      end
    end
  end

  # After this level of recursion, go back a directory to set up for the
  # caller's next iteration.
  ftp.chdir("..")
end

# Downloads one wire file into the local tmp directory, runs it through
# CCIParser and saves the resulting story. The local copy is removed even
# when fetching, parsing or saving raises.
def import_cci_file(ftp, remote_path, count)
  local_path = File.join(File.dirname(__FILE__), "tmp", remote_path)
  puts "#{'*' * count} Fetching file..." if DEBUG
  begin
    ftp.get(remote_path, local_path)
    puts "#{'*' * count} Processing received file" if DEBUG
    CCIParser.new(local_path).parse.save
  ensure
    # rm_f: the file may not exist if the download itself failed.
    FileUtils.rm_f(local_path)
  end
end
# Open the connection, log in, and walk the wire directory tree.
FTP.open(FTP_SERVER) do |ftp|
  # Login to the server...
  ftp.login(FTP_USER, FTP_PASSWORD)
  # Move into the initial directory
  ftp.chdir(FTP_DIRECTORY)
  # Start the loop! The listing of the starting directory seeds the recursion.
  process_files(ftp, ftp.ls, 1)
end

# Update the timestamp of the last import run.
# Written with File I/O rather than a shell `echo` so the script directory
# may contain spaces or shell metacharacters. This line is only reached when
# the FTP block above finished without raising.
# NOTE(review): the recorded time is the END of the run, compared later
# against listing times reported by the FTP server -- confirm clocks agree.
File.open(File.join(File.dirname(__FILE__), "tmp", "last_cci_import"), "w") do |stamp|
  stamp.puts(Time.now.to_s)
end

if DEBUG
  @finish = Time.now
  puts "Script took #{(@finish - @start).to_s} seconds"
end
Add Comment
Please, Sign In to add comment