Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/bin/bash
#
# Build a word-frequency lexicon from a text corpus.
#
# Inputs (read from the current directory):
#   corpus.txt      free text to tally
#   dictionary.txt  accepted words, one per line
#
# Outputs (overwritten in the current directory):
#   frequency.txt       "<count> <word>" lines in `uniq -c` format,
#                       most frequent first
#   corpus-lexicon.txt  corpus words that are also dictionary entries,
#                       most frequent first
#   lexicon.txt         "<word>,<count>" for each corpus-lexicon word
#
# `set -e` is deliberately not used: every stage is checked with an
# explicit `|| return`, and the script's exit status is that of main.
set -uo pipefail

#######################################
# Tally every word of corpus.txt into frequency.txt.
# A word is a run of non-whitespace characters with every character
# other than an ASCII letter removed, lower-cased ("Don't" -> "dont").
# Outputs: writes frequency.txt
# Returns: non-zero if any pipeline stage fails
#######################################
create_frequency() {
  # 1. Any whitespace (space, tab, CR, newline) ends a word.
  # 2. Drop everything that is neither a letter nor a line break.
  # 3. Fold to lower case; the classes are quoted so the shell cannot
  #    glob-expand them against file names in the current directory.
  # 4. Tokens that held no letters at all (e.g. "42") are now empty
  #    lines; discard them so they are not tallied as a blank word.
  tr -s '[:space:]' '\n' < corpus.txt \
    | tr -cd 'a-zA-Z\n' \
    | tr '[:upper:]' '[:lower:]' \
    | sed '/^$/d' \
    | sort \
    | uniq -c \
    | sort -rn > frequency.txt
}

#######################################
# Keep the tallied words that are exact dictionary.txt entries.
# Reads frequency.txt once, so its ordering is preserved.
# Outputs: writes corpus-lexicon.txt (one word per line)
# Returns: non-zero if awk fails
#######################################
create_corpus_lexicon() {
  # FILENAME == ARGV[1] selects the dictionary records; unlike the
  # NR == FNR idiom it stays correct when the dictionary is empty.
  awk '
    FILENAME == ARGV[1] { known[$0]; next }
    ($2 in known)       { print $2 }
  ' dictionary.txt frequency.txt > corpus-lexicon.txt
}

#######################################
# Pair every corpus-lexicon word with its tally.
# Outputs: writes lexicon.txt ("word,count" per line)
# Returns: non-zero if awk fails
#######################################
create_lexicon() {
  awk '
    FILENAME == ARGV[1] { wanted[$0]; next }
    ($2 in wanted)      { print $2 "," $1 }
  ' corpus-lexicon.txt frequency.txt > lexicon.txt
}

#######################################
# Run the three stages in order, announcing each on stdout.
# Returns: 1 if an input file is unreadable (nothing is written in
#          that case), otherwise the status of the first failing stage
#######################################
main() {
  local input
  for input in corpus.txt dictionary.txt; do
    if [[ ! -r "$input" ]]; then
      printf 'error: cannot read %s\n' "$input" >&2
      return 1
    fi
  done

  printf '%s\n' 'Creating tally of word frequencies...'
  create_frequency || return

  printf '%s\n' 'Creating corpus lexicon...'
  create_corpus_lexicon || return

  printf '%s\n' 'Creating lexicon...'
  create_lexicon || return
}

main
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement