Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package classifier
- import regexp "regexp"
- import snowball "github.com/kljensen/snowball"
- import bayesian "github.com/jbrukh/bayesian"
// tokenSeparatorRe matches every run of characters that cannot be part of a
// token: anything other than ASCII letters, Russian Cyrillic letters and '-'.
// The ranges а-я / А-Я do not contain ё / Ё (U+0451 / U+0401), so those two
// letters are listed explicitly; without them a word such as "ёлка" would be
// cut apart at the 'ё'. Compiled once at package scope rather than per call.
var tokenSeparatorRe = regexp.MustCompile(`[^a-zA-Zа-яА-ЯёЁ-]+`)

// tokenize splits text into tokens made of Latin letters, Russian Cyrillic
// letters and hyphens; every other run of characters acts as a separator.
//
// The result follows regexp.Split semantics: when text starts or ends with a
// separator (or is empty) the returned slice contains empty strings at the
// corresponding positions. Callers that need only real words must skip them.
func tokenize(text string) []string {
	return tokenSeparatorRe.Split(text, -1)
}
// cyrillicRe matches a single Russian Cyrillic letter. ё / Ё lie outside the
// contiguous а-я / А-Я ranges and are therefore listed explicitly. Compiled
// once at package scope rather than on every call.
var cyrillicRe = regexp.MustCompile(`[а-яА-ЯёЁ]`)

// guessLanguage returns "russian" when text contains at least one Russian
// Cyrillic letter and "english" otherwise (including for empty text).
func guessLanguage(text string) string {
	if cyrillicRe.MatchString(text) {
		return "russian"
	}
	return "english"
}
- func stem(language string, words []string) ([]string) {
- stemmedWords := make([]string, 0)
- for _, word := range words {
- stemmedWord, _ := snowball.Stem(word, language, true);
- if stemmedWord == "" {
- continue
- }
- stemmedWords = append(stemmedWords, stemmedWord)
- }
- return stemmedWords
- }
- type Classifier struct {
- bayesian *bayesian.Classifier
- classes []string
- }
- func New(classes []string) Classifier {
- bayesianClasses := make([]bayesian.Class, len(classes))
- for i, word := range classes {
- bayesianClasses[i] = bayesian.Class(word)
- }
- bayesianClassifier := bayesian.NewClassifier(bayesianClasses...)
- classifier := Classifier {
- bayesian: bayesianClassifier,
- classes: classes,
- }
- return classifier
- }
- func (classifier *Classifier) Learn(text string, class string) {
- lang := guessLanguage(text)
- words := tokenize(text)
- stemmedWords := stem(lang, words);
- classifier.bayesian.Learn(stemmedWords, bayesian.Class(class));
- }
- func (classifier *Classifier) Classify(text string) ([]string) {
- lang := guessLanguage(text)
- words := tokenize(text)
- stemmedWords := stem(lang, words);
- // ~ probs, likely, strict := classifier.Bayesian.LogScores(stemmedWords)
- _, likely, _ := classifier.bayesian.LogScores(stemmedWords)
- classes := make([]string, 1)
- classes[0] = classifier.classes[likely]
- // TODO: return classes, sorted by probs[i] != -Inf for strict != true
- return classes
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement