Advertisement
Guest User

Untitled

a guest
Aug 29th, 2016
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.10 KB | None | 0 0
  require 'json'
  require 'set'
  require 'timeout'

  require 'nokogiri'
  require 'parallel'
  require 'pg'

  # Counts how many times each distinct element occurs in +array+.
  #
  # Elements are used as hash keys, so anything with sane #hash / #eql?
  # (strings, arrays of strings, ...) can be counted.
  #
  # @param array [Enumerable] the items to count
  # @return [Hash] element => number of occurrences; the hash has a default
  #   value of 0, so looking up an element that never occurred returns 0
  def score(array)
    array.each_with_object(Hash.new(0)) { |key, hash| hash[key] += 1 }
  end
  # Backfills n-gram counts for form 8-K filings.
  #
  # Selects a random ~10% sample of form_8k_ngrams rows whose n2_grams column
  # is still null, reads the matching filing from FORMS_DIR, strips markup and
  # stopwords, counts 2-, 3- and 4-word sequences and writes them back (as JSON)
  # together with the filing's ACCEPTANCE-DATETIME header value.

  # Directory that holds the downloaded 8-K filings.
  FORMS_DIR = '/path/to/forms/8K'

  # Upper bound, in seconds, for reading, parsing and storing a single filing.
  PROCESSING_TIMEOUT_SECONDS = 30

  # Statement that stores the computed n-grams for one filing.
  UPDATE_NGRAMS_SQL =
    'update form_8k_ngrams set n2_grams=$1,n3_grams=$2,n4_grams=$3,acceptance_datetime=$4 where filename=$5'

  # Words dropped before n-grams are built. Kept in a Set (built once) so the
  # per-word membership test does not rescan a 150-element array.
  STOPWORDS = Set.new(%w[
    i me my myself we our ours ourselves you your yours yourself yourselves
    he him his himself she her hers herself it its itself they them their
    theirs themselves what which who whom this that these those am is are was
    were be been being have has had having do does did doing a an the and but
    if or because as until while of at by for with about against between into
    through during before after above below to from up down in out on off over
    under again further then once here there when where why how all any both
    each few more most other some such no nor not only own same so than too
    very s t can will just don should now d ll m o re ve y ain aren couldn
    didn doesn hadn hasn haven isn ma mightn mustn needn shan shouldn wasn
    weren won wouldn
  ]).freeze

  # Opens a PostgreSQL connection configured from the postgres_* environment
  # variables, yields it, and closes it even if the block raises or times out.
  #
  # @yieldparam connection [PG::Connection]
  # @return [Object] whatever the block returns
  def with_pg_connection
    connection = PG.connect(
      dbname: ENV['postgres_db'],
      user: ENV['postgres_user'],
      password: ENV['postgres_user_pw']
    )
    yield connection
  ensure
    connection.close if connection
  end

  # Finds the on-disk copy of a filing: the exact file name if it exists and is
  # non-empty, otherwise the first (alphabetically) file sharing its basename.
  #
  # @param filename [String] bare file name of the filing (no directory part)
  # @return [String, nil] path to the file, or nil when nothing matches
  def locate_filing(filename)
    expected_file_path = File.join(FORMS_DIR, filename)
    # File.size? is nil for both a missing file and an empty one.
    if File.size?(expected_file_path)
      puts 'file exists'
      return expected_file_path
    end

    # Message refers to the exact path; a same-stem fallback is still tried.
    puts 'could not find file'
    # Strip only the final extension so multi-dot names keep their stem and
    # extension-less names do not degrade to a match-everything glob.
    stem = File.basename(filename, '.*')
    Dir.glob(File.join(FORMS_DIR, "#{stem}*")).min
  end

  # Extracts the ACCEPTANCE-DATETIME header value from the raw filing text.
  #
  # @param doc [String] raw filing contents
  # @return [String, nil] the rest of the header line, or nil if it is absent
  def acceptance_datetime_of(doc)
    header = doc.match('ACCEPTANCE-DATETIME>(.+)')
    header && header[1]
  end

  # Reduces a raw filing to its lowercase, stopword-free word list.
  #
  # @param doc [String] raw filing contents
  # @return [Array<String>] words in document order
  def significant_words(doc)
    # Fragment 0 is everything before the first <TEXT> tag; drop it first, then
    # discard uuencoded attachments (they contain a "begin 644" marker).
    sections = doc.split('<TEXT>').drop(1).reject { |fragment| fragment.include?('begin 644') }

    Nokogiri::HTML(sections.join(' ')).text
            .tr("\u00A0", ' ') # \s does not match NBSP, so normalise it before collapsing whitespace
            .gsub(/\s+/, ' ')
            .tr('“”’', "'")
            .downcase
            .delete(':;.')
            .split(' ')
            .reject { |word| STOPWORDS.include?(word) }
  end

  # Serialises the occurrence counts of every run of +n+ consecutive words,
  # ordered from least to most frequent.
  #
  # @param words [Array<String>] tokenised text
  # @param n [Integer] n-gram length
  # @return [String] JSON object; each n-gram (an Array) becomes a key via to_json
  def ngram_counts_json(words, n)
    score(words.each_cons(n).to_a).sort_by { |_ngram, count| count }.to_h.to_json
  end

  # Computes and stores the n-grams for one form_8k_ngrams row. Filings that
  # cannot be found, lack an acceptance datetime, cannot be scanned, or take
  # longer than PROCESSING_TIMEOUT_SECONDS are skipped.
  #
  # @param original_filename [String] filename column value (may include a path)
  # @param idx [Integer] position of the row in the work list, used for logging
  # @return [void]
  def process_filing(original_filename, idx)
    filename = File.basename(original_filename)
    path_to_actual_file = locate_filing(filename)
    return if path_to_actual_file.nil?

    Timeout.timeout(PROCESSING_TIMEOUT_SECONDS) do
      # File.read closes the handle and, unlike Kernel#open, never treats a
      # leading "|" in the path as a command to run.
      doc = File.read(path_to_actual_file)

      begin
        acceptance_datetime = acceptance_datetime_of(doc)
      rescue StandardError => e
        # e.g. the header scan fails on a file with an invalid byte sequence.
        warn "skipped #{filename}: #{e.message}"
        next
      end
      next if acceptance_datetime.nil?

      words = significant_words(doc)
      n2_grams, n3_grams, n4_grams = [2, 3, 4].map { |n| ngram_counts_json(words, n) }

      with_pg_connection do |connection|
        connection.exec_params(
          UPDATE_NGRAMS_SQL,
          [n2_grams, n3_grams, n4_grams, acceptance_datetime, original_filename]
        )
      end

      puts "success! #{idx}"
    end
  rescue Timeout::Error
    puts "skipped #{filename} because too slow"
  end

  pending_rows = with_pg_connection do |connection|
    connection.exec('select filename from form_8k_ngrams where n2_grams is null and random()<0.1').to_a
  end

  # Each file gets its own short-lived connection inside process_filing, so no
  # connection object is shared between Parallel workers.
  Parallel.each_with_index(pending_rows) do |result_row, idx|
    process_filing(result_row['filename'], idx)
  end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement