
Untitled
By: a guest on
Jul 20th, 2012 | syntax:
None | size: 1.38 KB | hits: 8 | expires: Never
Identifying top recurring words from a list of e-mails based on a dictionary of interesting words
# Alternative pipelines for locating the words listed (one per line) in
# /tmp/list_of_interesting_words inside a directory tree of e-mails.
# grep flags used throughout: -o print only the match, -n line number,
# -b byte offset, -f read patterns from a file, -H always print the file name.

# Variant 1: convert and grep each e-mail on the fly.  --tag prefixes every
# output line with the input file name.  An explicit "." start path keeps
# find portable beyond GNU find.
find . -type f | parallel --tag 'eml-to-text {} | grep -o -n -b -f /tmp/list_of_interesting_words'

# Variant 2: unpack every e-mail once ({#} is parallel's job sequence number,
# used here as the output file name), then let -X hand many unpacked files
# to each grep invocation.
find . -type f | parallel 'eml-to-text {} >/tmp/unpacked/{#}'
find /tmp/unpacked -type f | parallel -X grep -H -o -n -b -f /tmp/list_of_interesting_words

# Variant 3: for a very large word list, split it into blocks of roughly 10k
# (--pipe --block 10k); --files stores each block in a file and prints that
# file's name, so /tmp/blocks_of_words ends up as a list of block files.
cat /tmp/list_of_interesting_words | parallel --pipe --block 10k --files > /tmp/blocks_of_words
# The outer parallel substitutes each block file for ",," and starts an inner
# parallel, which uses "//" in place of "::::" and takes the unpacked file
# names from stdin ("// -").
# NOTE(review): confirm stdin hand-off to the inner parallel with -j1.
find /tmp/unpacked -type f | parallel -j1 -I ,, parallel --arg-file-sep // -X grep -H -o -n -b -f ,, {} // - :::: /tmp/blocks_of_words

# Post-processing ("..." stands for one of the grep pipelines above whose
# output is file:line:offset:word, i.e. the word is the 4th ':' field).
... | sort -k4 -t: > index.by.word
# Use -F: so the separator is in effect for the FIRST record too; assigning
# FS inside the pattern only takes effect from the second line onwards.
... | sort -k4 -t: | tee index.by.word | awk -F: '{print $4}' | uniq -c

# Complete pipeline: -F fixed strings, -w whole words only.  With --tag the
# file name is separated by a tab, so the word is the 3rd ':' field here.
find . -type f | parallel --tag 'eml-to-text {} | grep -F -w -o -n -b -f /tmp/list_of_interesting_words' | sort -k3 -t: | tee index.by.word | awk -F: '{print $3}' | uniq -c
results <- empty list
for each email e:
    for each word w in e:
        if is_interesting_word(w, string_data_structure):
            add (filename, line_number, start_position, word) to results
# Report every occurrence of an interesting word in a text file, together
# with its 1-based line number and 1-based word position within that line.

# A frozenset gives O(1) membership tests and avoids shadowing the builtin
# ``list`` (the original snippet named this variable ``list``).
INTERESTING_WORDS = frozenset(['a', 'bunch', 'of', 'interesting', 'words'])


def find_interesting_words(lines, words=INTERESTING_WORDS):
    """Yield ``(word, line_number, word_number)`` for each match in *lines*.

    Args:
        lines: Iterable of text lines (an open text file works).
        words: Container of words to look for; any object supporting ``in``.

    Yields:
        Tuples of the matched word, its 1-based line number, and its 1-based
        position among the whitespace-separated words of that line.
    """
    for linepos, line in enumerate(lines, 1):
        for wordpos, word in enumerate(line.split(), 1):
            if word in words:
                yield word, linepos, wordpos


def main(path="file"):
    """Print one report line per interesting word found in the file at *path*."""
    with open(path) as f:
        for hit in find_interesting_words(f):
            # Single parenthesised argument: valid on both Python 2 and 3.
            print("%s found at line %s, word %s" % hit)


if __name__ == "__main__":
    main()