Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
require 'json'
require 'set'
require 'timeout'

require 'nokogiri'
require 'parallel'
require 'pg'
# Counts how many times each element occurs in +array+.
#
# The result keeps a default of 0 for missing keys (which is why this is not
# simply +Enumerable#tally+), so looking up an element that never occurred
# yields 0 instead of nil.
#
# @param array [Enumerable] elements to count; elements are used as hash keys
# @return [Hash{Object => Integer}] occurrence count per distinct element
def score(array)
  array.each_with_object(Hash.new(0)) { |key, hash| hash[key] += 1 }
end
# Directory that is searched for the 8-K filing files.
FORMS_DIR = '/path/to/forms/8K'

# Maximum number of seconds spent on a single filing before it is skipped.
FILE_TIMEOUT_SECONDS = 30

# Pattern whose first capture group is stored as the acceptance datetime.
ACCEPTANCE_DATETIME_PATTERN = /ACCEPTANCE-DATETIME>(.+)/

# Sizes of the n-grams that are computed and stored (n2_grams, n3_grams, n4_grams).
NGRAM_SIZES = [2, 3, 4].freeze

# Statement that writes the computed n-grams back to the row of one filing.
UPDATE_NGRAMS_SQL = 'update form_8k_ngrams set n2_grams=$1,n3_grams=$2,n4_grams=$3,' \
                    'acceptance_datetime=$4 where filename=$5'

# Words removed from the text before n-grams are built. Kept in a Set and
# built once, so membership checks do not rescan a list for every word.
STOPWORDS = Set.new(%w[
  i me my myself we our ours ourselves you your yours yourself yourselves
  he him his himself she her hers herself it its itself they them their
  theirs themselves what which who whom this that these those am is are was
  were be been being have has had having do does did doing a an the and but
  if or because as until while of at by for with about against between into
  through during before after above below to from up down in out on off over
  under again further then once here there when where why how all any both
  each few more most other some such no nor not only own same so than too
  very s t can will just don should now d ll m o re ve y ain aren couldn
  didn doesn hadn hasn haven isn ma mightn mustn needn shan shouldn wasn
  weren won wouldn
]).freeze

# Opens a PostgreSQL connection configured from the environment, yields it,
# and always closes it again, even when the block raises or is interrupted
# by the per-file timeout.
#
# @yieldparam conn [PG::Connection]
# @return [Object] whatever the block returns
def with_pg_connection
  conn = PG.connect(dbname: ENV['postgres_db'], user: ENV['postgres_user'], password: ENV['postgres_user_pw'])
  yield conn
ensure
  conn.close if conn
end

# Finds the file on disk for a filing.
#
# Tries the exact file name first (it must exist and be non-empty), then
# falls back to the first file whose name starts with the name minus its
# extension.
#
# @param filename [String] base name of the filing file
# @return [String, nil] path of the file to read, or nil when nothing matches
def locate_filing(filename)
  expected_file_path = File.join(FORMS_DIR, filename)
  # File.size? is nil for a missing or empty file, so it covers both checks.
  if File.size?(expected_file_path)
    puts 'file exists'
    return expected_file_path
  end

  # Printed whenever the exact path is unusable, even if the glob below
  # still finds a candidate.
  puts 'could not find file'
  # Only the extension is stripped; interior dots stay, and a name without
  # any dot keeps its full stem instead of degrading to a match-all glob.
  stem = File.basename(filename, '.*')
  Dir.glob(File.join(FORMS_DIR, "#{stem}*")).first
end

# Extracts the acceptance datetime from the raw filing.
#
# @param doc [String] raw file contents
# @return [String, nil] captured value, or nil when it is absent or the
#   document cannot be matched (presumably invalid bytes; the error is logged)
def acceptance_datetime_from(doc)
  match = doc.match(ACCEPTANCE_DATETIME_PATTERN)
  match && match[1]
rescue StandardError => e
  puts "skipped acceptance datetime lookup: #{e.class}: #{e.message}"
  nil
end

# Turns the raw filing into normalized plain text.
#
# Everything before the first '<TEXT>' marker is dropped, fragments that
# contain 'begin 644' are discarded (the original code calls the result
# "no_images"), and the rest is parsed as HTML and reduced to its text.
# The text is then normalized: non-breaking spaces become spaces, whitespace
# runs collapse to one space, curly quotes become "'", everything is
# lowercased and ':', ';' and '.' are removed.
#
# @param doc [String] raw file contents
# @return [String] normalized text
def filing_text(doc)
  # Drop the header first so that rejecting fragments can never shift which
  # fragment counts as the header, and an empty result stays an empty array.
  fragments = doc.split('<TEXT>').drop(1).reject { |fragment| fragment.include?('begin 644') }
  Nokogiri::HTML(fragments.join(' ')).text
          .tr("\u00A0", ' ')
          .gsub(/\s+/, ' ')
          .tr('“”’', "'")
          .downcase
          .delete(':;.')
end

# Counts all n-grams of size +n+ and serializes the counts, ordered by
# ascending count, as JSON.
#
# The hash keys are arrays of words; NOTE(review): the JSON generator is
# expected to stringify them as before, since the construction is unchanged.
#
# @param words [Array<String>] words with stopwords already removed
# @param n [Integer] n-gram size
# @return [String] JSON document
def ngram_counts_json(words, n)
  score(words.each_cons(n).to_a).sort_by { |_ngram, count| count }.to_h.to_json
end

# Reads one filing, computes its n-grams and stores them in the database.
# Does nothing when the filing has no acceptance datetime.
#
# @param path [String] file to read
# @param original_filename [String] value of the row's filename column
# @param idx [Integer] index of the row, used only for the progress message
# @return [void]
def process_filing(path, original_filename, idx)
  # File.read closes the file handle, unlike open(path).read.
  doc = File.read(path)
  # cik = doc.match(/CENTRAL INDEX KEY.+/)[0].split(/\s+/).last
  acceptance_datetime = acceptance_datetime_from(doc)
  return if acceptance_datetime.nil?

  words = filing_text(doc).split(' ').reject { |word| STOPWORDS.include?(word) }
  ngram_columns = NGRAM_SIZES.map { |n| ngram_counts_json(words, n) }

  # A fresh connection per filing, as in the original; presumably because
  # the parallel workers cannot share one connection.
  with_pg_connection do |conn|
    conn.exec_params(UPDATE_NGRAMS_SQL, ngram_columns + [acceptance_datetime, original_filename])
  end
  puts "success! #{idx}"
end

# Fetch a random sample (random()<0.1) of the rows that have no n-grams yet.
pending_rows = with_pg_connection do |conn|
  conn.exec('select filename from form_8k_ngrams where n2_grams is null and random()<0.1').to_a
end

Parallel.each_with_index(pending_rows) do |result_row, idx|
  original_filename = result_row['filename']
  filename = File.basename(original_filename)
  path_to_actual_file = locate_filing(filename)
  next if path_to_actual_file.nil?

  begin
    Timeout.timeout(FILE_TIMEOUT_SECONDS) do
      process_filing(path_to_actual_file, original_filename, idx)
    end
  rescue Timeout::Error
    puts "skipped #{filename} because too slow"
  end
end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement