Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- using System;
- using System.Linq;
- using System.Collections.Generic;
- using System.Text.RegularExpressions;
- using Sentimentinator.Interfaces;
- namespace Sentimentinator
- {
/// <summary>
/// Splits raw comments into sentences and each sentence into lower-cased word tokens.
/// </summary>
class Tokenizer : ITokenizer
{
    // The regex is carefully constructed as is, so if a change is needed: be very wary of side-effects.
    // It splits on '.', '?' and '!', using a lookbehind to skip a list of abbreviations and
    // lookaheads to skip repeated punctuation, digits and a few domain-like suffixes.
    private const string DefaultSentenceSplit = @"(?<! adm| adr| afd| alm| ang| bl|\.a| bla| opg| org| dr| dvs| eks| fx| fhv| hhv| i\.e| ifl| ifb| hr| frk| jf| jvf|www| m| mio| ml| mr| mrs| pga)\.(?!\.|[0-9]|com|eks|dk|e\.|\?|\!)|\?(?!\?|\.|\!)|\!(?!\!|\.|\?)";

    // Matches every character that is NOT a lower-case letter (a-z, æ, ø, å), whitespace or ':'.
    // Built once and shared, instead of constructing a new Regex for every sentence.
    private static readonly Regex NonLetterPattern = new Regex(@"[^a-zæøå\s:]");

    // Pattern handed to Regex.Split when breaking a comment into sentences.
    private readonly string _sentenceSplitPattern;

    /// <summary>
    /// Creates a tokenizer that uses the built-in sentence-split pattern.
    /// </summary>
    public Tokenizer() : this(DefaultSentenceSplit)
    {
    }

    /// <summary>
    /// Creates a tokenizer that splits sentences with a caller-supplied regex instead of the default.
    /// </summary>
    /// <param name="regexSentenceSplit">Regex pattern passed as-is to <see cref="Regex.Split(string, string)"/>.</param>
    public Tokenizer(string regexSentenceSplit)
    {
        _sentenceSplitPattern = regexSentenceSplit;
    }

    /// <summary>
    /// Converts a list of raw comments into a list of tokenized comments, preserving order.
    /// </summary>
    /// <param name="comments">The raw comments.</param>
    /// <returns>One <see cref="TokenizedComment"/> per input comment.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="comments"/> or one of its elements is null.</exception>
    public List<TokenizedComment> TokenizeComments(List<string> comments)
    {
        if (comments == null)
        {
            throw new ArgumentNullException(nameof(comments));
        }

        return comments.Select(TokenizeComment).ToList();
    }

    /// <summary>
    /// Splits a raw comment into sentences and tokenizes each sentence.
    /// </summary>
    /// <param name="comment">A raw comment.</param>
    /// <returns>A <see cref="TokenizedComment"/> built from one token list per sentence.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="comment"/> is null.</exception>
    public TokenizedComment TokenizeComment(string comment)
    {
        if (comment == null)
        {
            throw new ArgumentNullException(nameof(comment));
        }

        List<List<string>> tokenizedSentences = SplitComment(comment)
            .Select(TokenizeSentence)
            .ToList();

        return new TokenizedComment(tokenizedSentences);
    }

    /// <summary>
    /// Splits a sentence into tokens on whitespace and drops empty tokens.
    /// </summary>
    private List<string> TokenizeSentence(string sentence)
    {
        // A null separator array means "split on any whitespace". RemoveNonLetters keeps every
        // whitespace character (\s), so splitting on ' ' alone would leave tabs inside tokens.
        return sentence
            .Split((char[])null, StringSplitOptions.RemoveEmptyEntries)
            .ToList();
    }

    /// <summary>
    /// Lower-cases a comment, separates it at [.?!] (trying to avoid splitting on popular
    /// abbreviations) and strips non-letter characters from each resulting sentence.
    /// </summary>
    private List<string> SplitComment(string s)
    {
        // Replace line breaks with a period, and add a period in case the comment has no end
        // punctuation. All three newline conventions are handled explicitly so the result does
        // not depend on the platform's Environment.NewLine. Lower-casing is culture-invariant
        // so that e.g. 'I' never becomes a dotless 'ı', which the letter filter would delete.
        string normalized = s
            .Replace("\r\n", ".")
            .Replace("\n", ".")
            .Replace("\r", ".")
            .ToLowerInvariant() + ".";

        // NOTE(review): because a '.' is always appended, the split usually yields a trailing
        // empty sentence (and therefore an empty token list). Kept as-is since callers may
        // rely on it - confirm whether empty sentences should be filtered out.
        return Regex.Split(normalized, _sentenceSplitPattern)
            .Select(sentence => RemoveNonLetters(sentence).Trim())
            .ToList();
    }

    /// <summary>
    /// Removes every character that is not a lower-case letter (a-z, æ, ø, å), whitespace or ':'.
    /// Side-effect: removes '!', '?', digits and emojis.
    /// </summary>
    private string RemoveNonLetters(string s)
    {
        return NonLetterPattern.Replace(s, "");
    }
}
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement