Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/bin/bash
# leka, mallet interface script for topic modeling finnish
# tuukka.yla-anttila@helsinki.fi
#
# syntax: leka numberoftopics inputdir stopwordlist1 stopwordlist2
#
#   numberoftopics  positive integer; passed to mallet as both --num-topics
#                   and --optimize-interval
#   inputdir        directory holding the input files
#   stopwordlist1   whitespace-separated list of stopwords
#   stopwordlist2   second list, same format
#
# What it does, in order:
#   1. Copies every regular file under inputdir into backups-<timestamp>/.
#   2. Rewrites each of those files IN PLACE, dropping every line that is
#      exactly equal to a stopword.
#   3. Runs ./stemwords (finnish) on each regular file directly inside
#      inputdir, writing output<timestamp>/<name>-stemmed.
#   4. Runs ./mallet import-dir and ./mallet train-topics on output<timestamp>/.
#
# ./stemwords and ./mallet must be executable in the current directory.
#
# NOTE(review): stopword filtering is line based, so it assumes the input
# files hold one token per line -- confirm against the corpus preparation.
# NOTE: backups are flat; files with the same name in different
# subdirectories of inputdir overwrite each other in the backup directory.

set -euo pipefail

# Scratch directory for the merged stopword list and the filtered copy.
# Global on purpose: the EXIT trap runs after main's locals are gone.
work_dir=""

# Print an error message to stderr and abort.
die() {
  printf 'leka: %s\n' "$*" >&2
  exit 1
}

# Remove the scratch directory; installed as the EXIT trap.
cleanup() {
  if [[ -n "$work_dir" ]]; then
    rm -rf -- "$work_dir"
  fi
}

main() {
  if (( $# < 4 )); then
    printf 'syntax: %s numberoftopics inputdir stopwordlist1 stopwordlist2\n' \
      "${0##*/}" >&2
    exit 2
  fi

  local num_topics=$1
  local input_dir=$2
  local stopword_list_a=$3
  local stopword_list_b=$4

  # Validate everything BEFORE touching the input files: they are modified
  # in place, so a late failure would leave the corpus half processed.
  [[ "$num_topics" =~ ^[1-9][0-9]*$ ]] \
    || die "numberoftopics must be a positive integer: $num_topics"
  [[ -d "$input_dir" ]] || die "input directory not found: $input_dir"

  local list
  for list in "$stopword_list_a" "$stopword_list_b"; do
    [[ -r "$list" && ! -d "$list" ]] || die "cannot read stopword list: $list"
  done

  local tool
  for tool in ./stemwords ./mallet; do
    [[ -x "$tool" ]] || die "required tool not found or not executable: $tool"
  done

  local timestamp
  timestamp=$(date "+%d.%m.%Y-%H%M%S")
  local backup_dir="backups-$timestamp"
  local output_dir="output$timestamp"

  work_dir=$(mktemp -d) || die "mktemp failed"
  trap cleanup EXIT
  local stopword_patterns="$work_dir/stopwords"
  local filtered="$work_dir/filtered"

  # Merge both lists into one pattern file, one stopword per line. The lists
  # are split on blanks and newlines; the extra 'echo' keeps the last word of
  # one list from fusing with the first word of the next when a list has no
  # trailing newline.
  for list in "$stopword_list_a" "$stopword_list_b"; do
    tr ' \t' '\n\n' < "$list"
    echo
  done | sed '/^$/d' > "$stopword_patterns"

  # Collect the file list up front so nothing created later is picked up.
  local -a input_files=()
  local file
  while IFS= read -r -d '' file; do
    input_files+=("$file")
  done < <(find "$input_dir" -type f -print0)
  (( ${#input_files[@]} > 0 )) || die "no input files found in $input_dir"

  # create backup dir
  mkdir -p -- "$backup_dir"

  local word status
  for file in "${input_files[@]}"; do
    # create backups
    echo
    printf 'BACKING UP %s\n' "$file"
    cp -- "$file" "$backup_dir/" || die "backup failed: $file"

    # remove stopwords
    [[ -s "$stopword_patterns" ]] || continue
    while IFS= read -r word; do
      printf 'REMOVING STOPWORD %s from %s\n' "$word" "$file"
    done < "$stopword_patterns"

    # -F: stopwords are literal text, not regular expressions.
    # -x: only a line that IS the stopword is dropped; a bare substring match
    #     would also delete longer words that merely contain it.
    # grep exits 1 when it selects no lines (every line was a stopword);
    # only a status above 1 is a real error.
    status=0
    grep -v -x -F -f "$stopword_patterns" -- "$file" > "$filtered" || status=$?
    (( status <= 1 )) || die "stopword filtering failed: $file"
    # Copy back over the original rather than mv, so the file keeps its
    # permissions and no temporary file ever appears inside inputdir.
    cp -- "$filtered" "$file" || die "cannot rewrite: $file"
  done

  # create project directory
  printf 'CREATING OUTPUT DIRECTORY %s\n' "$output_dir"
  mkdir -p -- "$output_dir"

  # stem words (top-level regular files of inputdir only)
  local path name
  for path in "$input_dir"/*; do
    [[ -f "$path" ]] || continue
    name=${path##*/}
    echo
    printf 'STEMMING %s to %s\n' "$name" "$output_dir/$name-stemmed"
    echo
    ./stemwords -l finnish -i "$path" -o "$output_dir/$name-stemmed" \
      || die "stemming failed: $path"
  done

  # create mallet file
  printf 'CREATING MALLET FILE %s\n' "$output_dir/project.mallet"
  ./mallet import-dir \
    --input "$output_dir" \
    --output "$output_dir/project.mallet" \
    --keep-sequence \
    || die "mallet import-dir failed"

  # train topics
  echo "TOPIC MODELLING"
  local -a train_args=(
    --input "$output_dir/project.mallet"
    --num-topics "$num_topics"
    --optimize-interval "$num_topics"
    --output-state "$output_dir/words.gz"
    --output-topic-keys "$output_dir/topickeys.txt"
    --output-doc-topics "$output_dir/index.txt"
    --xml-topic-report "$output_dir/index.xml"
  )
  ./mallet train-topics "${train_args[@]}" || die "mallet train-topics failed"

  printf 'TOPIC MODELING OUTPUT WRITTEN in folder %s\n' "$output_dir"
}

main "$@"
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement