Advertisement
Guest User

Untitled

a guest
May 27th, 2015
248
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.88 KB | None | 0 0
  1. package classifier
  2.  
  3. import regexp "regexp"
  4. import snowball "github.com/kljensen/snowball"
  5. import bayesian "github.com/jbrukh/bayesian"
  6.  
  7.  
  // tokenRegexp matches a single token: a maximal run of Latin letters,
  // Russian letters (including ё/Ё, which lie outside the а-я/А-Я ranges)
  // and hyphens. It is compiled once at package scope rather than per call.
  var tokenRegexp = regexp.MustCompile("[a-zA-Zа-яА-ЯёЁ-]+")

  // tokenize breaks text into word tokens.
  //
  // A token is a maximal run of characters matched by tokenRegexp; every
  // other character (digits, whitespace, punctuation, ...) acts as a
  // separator. Empty tokens are never produced, so text that starts or ends
  // with a separator does not yield "" entries. The result is nil when text
  // contains no token at all.
  func tokenize(text string) []string {
  	return tokenRegexp.FindAllString(text, -1)
  }
  16.  
  17.  
  // cyrillicRegexp matches any single Russian letter, including ё/Ё, which
  // are not covered by the а-я/А-Я ranges. It is compiled once at package
  // scope rather than on every call.
  var cyrillicRegexp = regexp.MustCompile("[а-яА-ЯёЁ]")

  // guessLanguage returns "russian" when text contains at least one Russian
  // letter and "english" otherwise (including for empty text).
  func guessLanguage(text string) string {
  	if cyrillicRegexp.MatchString(text) {
  		return "russian"
  	}
  	return "english"
  }
  28.  
  29.  
  30. func stem(language string, words []string) ([]string) {
  31.  
  32. stemmedWords := make([]string, 0)
  33.  
  34. for _, word := range words {
  35.  
  36. stemmedWord, _ := snowball.Stem(word, language, true);
  37.  
  38. if stemmedWord == "" {
  39. continue
  40. }
  41.  
  42. stemmedWords = append(stemmedWords, stemmedWord)
  43. }
  44.  
  45. return stemmedWords
  46. }
  47.  
  48.  
  49. type Classifier struct {
  50. bayesian *bayesian.Classifier
  51. classes []string
  52. }
  53.  
  54.  
  55. func New(classes []string) Classifier {
  56.  
  57. bayesianClasses := make([]bayesian.Class, len(classes))
  58.  
  59. for i, word := range classes {
  60. bayesianClasses[i] = bayesian.Class(word)
  61. }
  62.  
  63. bayesianClassifier := bayesian.NewClassifier(bayesianClasses...)
  64.  
  65. classifier := Classifier {
  66. bayesian: bayesianClassifier,
  67. classes: classes,
  68. }
  69.  
  70. return classifier
  71. }
  72.  
  73.  
  74. func (classifier *Classifier) Learn(text string, class string) {
  75. lang := guessLanguage(text)
  76. words := tokenize(text)
  77. stemmedWords := stem(lang, words);
  78.  
  79. classifier.bayesian.Learn(stemmedWords, bayesian.Class(class));
  80. }
  81.  
  82.  
  83. func (classifier *Classifier) Classify(text string) ([]string) {
  84. lang := guessLanguage(text)
  85. words := tokenize(text)
  86. stemmedWords := stem(lang, words);
  87.  
  88. // ~ probs, likely, strict := classifier.Bayesian.LogScores(stemmedWords)
  89. _, likely, _ := classifier.bayesian.LogScores(stemmedWords)
  90.  
  91. classes := make([]string, 1)
  92.  
  93. classes[0] = classifier.classes[likely]
  94.  
  95. // TODO: return classes, sorted by probs[i] != -Inf for strict != true
  96.  
  97. return classes
  98. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement