Advertisement
Guest User

Untitled

a guest
Jan 7th, 2011
169
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 0.62 KB | None | 0 0
  #!/bin/bash
  #
  # Build a frequency-ranked lexicon from a plain-text corpus.
  #
  # Inputs (read from the current directory):
  #   corpus.txt      raw text; words are separated by spaces or newlines
  #   dictionary.txt  one accepted word per line
  #
  # Outputs (written to the current directory, overwritten on each run):
  #   frequency.txt       "<count> <word>" (uniq -c format) for every distinct
  #                       lower-cased token, most frequent first
  #   corpus-lexicon.txt  the tokens from frequency.txt that appear verbatim as
  #                       a line of dictionary.txt, in frequency order
  #   lexicon.txt         "<word>,<count>" for each word in corpus-lexicon.txt
  #
  # Exits non-zero if an input file is unreadable or any pipeline stage fails.
  set -euo pipefail

  for required in corpus.txt dictionary.txt; do
    if [[ ! -r "$required" ]]; then
      printf 'error: cannot read %s\n' "$required" >&2
      exit 1
    fi
  done

  # Create a tally of all the words in the corpus.
  #
  # Spaces become line breaks, every byte that is not an ASCII letter or a
  # newline is dropped, and the result is lower-cased. tr is used instead of
  # sed because "\n" in a sed replacement or bracket expression is a GNU
  # extension. The tr operands are quoted so the shell cannot glob-expand
  # them, and LC_ALL=C keeps the a-z/A-Z ranges byte-exact.
  echo 'Creating tally of word frequencies...'
  LC_ALL=C tr ' ' '\n' < corpus.txt \
    | LC_ALL=C tr -cd 'a-zA-Z\n' \
    | LC_ALL=C tr '[:upper:]' '[:lower:]' \
    | sort \
    | uniq -c \
    | sort -rn > frequency.txt

  # Keep only the corpus words that are whole lines of the dictionary.
  # A single awk pass replaces one grep process per word. FILENAME is
  # compared against ARGV[1] (rather than the NR == FNR idiom) so an empty
  # dictionary is not confused with the frequency table. The $2 != "" guard
  # skips the tally line produced by empty tokens (blank lines or runs of
  # spaces in the corpus).
  echo 'Creating corpus lexicon...'
  awk '
    FILENAME == ARGV[1] { known[$0] = 1; next }
    $2 != "" && ($2 in known) { print $2 }
  ' dictionary.txt frequency.txt > corpus-lexicon.txt

  # Pair every lexicon word with its corpus count as "word,count".
  # frequency.txt is already sorted by descending count, so filtering it
  # keeps the same ordering as corpus-lexicon.txt.
  echo 'Creating lexicon...'
  awk '
    FILENAME == ARGV[1] { wanted[$0] = 1; next }
    $2 != "" && ($2 in wanted) { print $2 "," $1 }
  ' corpus-lexicon.txt frequency.txt > lexicon.txt
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement