Advertisement
Guest User

leka v0.1

a guest
Oct 23rd, 2014
143
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 1.70 KB | None | 0 0
  1. #!/bin/bash
  2. # leka, mallet interface script for topic modeling finnish
  3. # tuukka.yla-anttila@helsinki.fi
  4. # syntax: leka numberoftopics inputdir stopwordlist1 stopwordlist2
  5.  
  6. # set vars
  7.  
  8. inputfiles=$(find $2 -type f)
  9. inputfilenames=$(ls $2)
  10. stopwordsA=$(cat $3)
  11. stopwordsB=$(cat $4)
  12. timestamp=$(date "+%d.%m.%Y-%H%M%S")
  13.  
  14. # create backup dir
  15.  
  16. mkdir -p backups-$timestamp
  17.  
  18. # create backups
  19.  
  20. for file in $inputfiles
  21. do
  22.  
  23. echo
  24. echo "BACKING UP" $file
  25. cp -R $file backups-$timestamp/
  26.  
  27. # remove stopwords
  28.  
  29.     for stopwordA in $stopwordsA
  30.     do
  31.     echo "REMOVING STOPWORD" $stopwordA "from" $file
  32.     grep -v "$stopwordA" $file > $file-2; mv $file-2 $file
  33.     done
  34.  
  35.     for stopwordB in $stopwordsB
  36.     do
  37.     echo "REMOVING STOPWORD" $stopwordB "from" $file
  38.     grep -v "$stopwordB" $file > $file-2; mv $file-2 $file
  39.     done
  40. done
  41.  
  42. # create project directory
  43.  
  44. echo "CREATING OUTPUT DIRECTORY" output$timestamp
  45. mkdir -p output$timestamp
  46.  
  47. # stem words
  48.  
  49. for file in $inputfilenames
  50. do
  51.     echo
  52.     echo "STEMMING $file to output$timestamp/$file-stemmed"
  53.     echo
  54.     ./stemwords -l finnish -i $2/$file -o output$timestamp/$file-stemmed
  55. done
  56.  
  57. # create mallet file
  58.  
  59. echo "CREATING MALLET FILE" output$timestamp/project.mallet
  60. ./mallet import-dir --input output$timestamp --output output$timestamp/project.mallet --keep-sequence
  61.  
  62. # train topics
  63.  
  64. echo "TOPIC MODELLING"
  65.  
  66. ./mallet train-topics --input output$timestamp/project.mallet --num-topics $1 --optimize-interval $1 --output-state output$timestamp/words.gz --output-topic-keys output$timestamp/topickeys.txt --output-doc-topics output$timestamp/index.txt --xml-topic-report output$timestamp/index.xml
  67.  
  68. echo "TOPIC MODELING OUTPUT WRITTEN in folder output$timestamp"
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement