#!/bin/bash
# leka v0.1
# leka, mallet interface script for topic modeling finnish
# tuukka.yla-anttila@helsinki.fi
#
# syntax: leka numberoftopics inputdir stopwordlist1 stopwordlist2
#
# What the script does, in order:
#   1. copies every regular file under inputdir into backups-<timestamp>/
#   2. rewrites each input file IN PLACE without the words listed in the two
#      stopword lists (lists are split on spaces, tabs and newlines)
#   3. stems each input file with stemwords into output<timestamp>/
#   4. imports output<timestamp>/ into mallet and trains the topic model
#
# The backup and output directories are created in the current directory.
#
# Optional environment variables:
#   LEKA_STEMWORDS  stemmer command (default: ./stemwords)
#   LEKA_MALLET     mallet command  (default: ./mallet)

# Failures are checked explicitly at every step instead of relying on set -e.
set -u

# Print the command-line synopsis to stderr.
usage() {
  printf 'usage: %s numberoftopics inputdir stopwordlist1 stopwordlist2\n' \
    "${0##*/}" >&2
}

# Print an error message to stderr.
err() {
  printf 'leka: %s\n' "$*" >&2
}

# Print every whitespace-separated word of the given list files, one per line.
# Arguments: $@ - stopword list files
# Returns:   non-zero if a list cannot be read
print_stopwords() {
  local list_file
  for list_file in "$@"; do
    awk '{ for (i = 1; i <= NF; i++) print $i }' < "$list_file" || return 1
  done
}

# Copy src to dst, dropping every line that is exactly one of the stopwords.
# Stopwords are matched as literal strings against the whole line, so "ja"
# removes the line "ja" but keeps "jalka", and "." is not a wildcard.
# NOTE(review): this assumes the input files hold one word per line — confirm.
# Arguments: $1 - newline-separated stopwords (must not be empty)
#            $2 - file to read
#            $3 - file to write (must differ from $2)
# Returns:   0 on success, non-zero if grep itself failed
remove_stopwords() {
  local patterns=$1 src=$2 dst=$3 status=0
  grep -v -x -F -f <(printf '%s\n' "$patterns") -- "$src" > "$dst" || status=$?
  # grep exits with 1 when no line survives the filter; that is not an error.
  (( status <= 1 ))
}

# Run the whole pipeline.
# Arguments: $1 - number of topics (positive integer)
#            $2 - input directory
#            $3, $4 - stopword list files
# Returns:   0 on success, 2 on usage errors, 1 on any other failure
main() {
  if (( $# < 4 )); then
    usage
    return 2
  fi

  local num_topics=$1 input_dir=$2 stoplist_a=$3 stoplist_b=$4
  local stemwords_cmd=${LEKA_STEMWORDS:-./stemwords}
  local mallet_cmd=${LEKA_MALLET:-./mallet}

  # Validate everything before any input file is touched.
  if [[ ! "$num_topics" =~ ^[1-9][0-9]*$ ]]; then
    err "number of topics must be a positive integer: $num_topics"
    return 2
  fi
  if [[ ! -d "$input_dir" ]]; then
    err "input directory not found: $input_dir"
    return 1
  fi
  if ! command -v "$stemwords_cmd" > /dev/null 2>&1; then
    err "stemmer not found or not executable: $stemwords_cmd"
    return 1
  fi
  if ! command -v "$mallet_cmd" > /dev/null 2>&1; then
    err "mallet not found or not executable: $mallet_cmd"
    return 1
  fi

  # Drop trailing slashes so relative paths can be derived by prefix removal.
  while [[ "$input_dir" == */ && "$input_dir" != / ]]; do
    input_dir=${input_dir%/}
  done

  local stopword_patterns
  if ! stopword_patterns=$(print_stopwords "$stoplist_a" "$stoplist_b"); then
    err "cannot read stopword lists: $stoplist_a $stoplist_b"
    return 1
  fi

  # Collect the input files once (NUL-delimited, so any file name is safe) and
  # before the backup/output directories exist, so they are never picked up.
  local -a input_files=()
  local path
  while IFS= read -r -d '' path; do
    input_files+=("$path")
  done < <(find "$input_dir" -type f -print0)
  if (( ${#input_files[@]} == 0 )); then
    err "no input files found in $input_dir"
    return 1
  fi

  local timestamp
  timestamp=$(date "+%d.%m.%Y-%H%M%S") || return 1
  local backup_dir=backups-$timestamp
  local output_dir=output$timestamp

  # create backups, then remove stopwords

  if ! mkdir -p -- "$backup_dir"; then
    err "cannot create $backup_dir"
    return 1
  fi

  local rel backup_copy
  for path in "${input_files[@]}"; do
    # Keep the path relative to inputdir so equally named files in different
    # subdirectories do not overwrite each other in the backup.
    rel=${path#"$input_dir"/}
    backup_copy=$backup_dir/$rel

    printf '\nBACKING UP %s\n' "$path"
    if ! mkdir -p -- "$(dirname -- "$backup_copy")"; then
      err "cannot create backup directory for $path"
      return 1
    fi
    if ! cp -- "$path" "$backup_copy"; then
      err "backup of $path failed"
      return 1
    fi

    [[ -n "$stopword_patterns" ]] || continue

    # Filter from the fresh backup straight into the original path: one pass
    # per file and no temporary file next to the input.
    printf 'REMOVING STOPWORDS from %s\n' "$path"
    if ! remove_stopwords "$stopword_patterns" "$backup_copy" "$path"; then
      err "stopword removal failed for $path (original kept in $backup_copy)"
      return 1
    fi
  done

  # create project directory

  printf 'CREATING OUTPUT DIRECTORY %s\n' "$output_dir"
  if ! mkdir -p -- "$output_dir"; then
    err "cannot create $output_dir"
    return 1
  fi

  # stem words (same file list as above, so nested files are stemmed too)

  local stemmed
  for path in "${input_files[@]}"; do
    rel=${path#"$input_dir"/}
    stemmed=$output_dir/$rel-stemmed

    printf '\nSTEMMING %s to %s\n\n' "$path" "$stemmed"
    if ! mkdir -p -- "$(dirname -- "$stemmed")"; then
      err "cannot create output directory for $stemmed"
      return 1
    fi
    if ! "$stemwords_cmd" -l finnish -i "$path" -o "$stemmed"; then
      err "stemming failed for $path"
      return 1
    fi
  done

  # create mallet file

  printf 'CREATING MALLET FILE %s\n' "$output_dir/project.mallet"
  if ! "$mallet_cmd" import-dir \
    --input "$output_dir" \
    --output "$output_dir/project.mallet" \
    --keep-sequence; then
    err "mallet import-dir failed"
    return 1
  fi

  # train topics

  echo "TOPIC MODELLING"

  # The optimize interval is deliberately kept equal to the number of topics,
  # exactly as in the original command line.
  if ! "$mallet_cmd" train-topics \
    --input "$output_dir/project.mallet" \
    --num-topics "$num_topics" \
    --optimize-interval "$num_topics" \
    --output-state "$output_dir/words.gz" \
    --output-topic-keys "$output_dir/topickeys.txt" \
    --output-doc-topics "$output_dir/index.txt" \
    --xml-topic-report "$output_dir/index.xml"; then
    err "mallet train-topics failed"
    return 1
  fi

  echo "TOPIC MODELING OUTPUT WRITTEN in folder $output_dir"
}

main "$@"