Advertisement
Guest User

Untitled

a guest
Feb 20th, 2019
75
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C# 3.28 KB | None | 0 0
  1. using System;
  2. using System.Linq;
  3. using System.Collections.Generic;
  4. using System.Text.RegularExpressions;
  5. using Sentimentinator.Interfaces;
  6.  
  7. namespace Sentimentinator
  8. {
  9.     class Tokenizer : ITokenizer
  10.     {
  11.         // The regex is carefully constructed as is, so if change needed: be very wary of side-effects.
  12.         private string defaultSentenceSplit = @"(?<! adm| adr| afd| alm| ang| bl|\.a| bla| opg| org| dr| dvs| eks| fx| fhv| hhv| i\.e| ifl| ifb| hr| frk| jf| jvf|www| m| mio| ml| mr| mrs| pga)\.(?!\.|[0-9]|com|eks|dk|e\.|\?|\!)|\?(?!\?|\.|\!)|\!(?!\!|\.|\?)";
  13.  
  14.         //default regex constructor
  15.         public Tokenizer()
  16.         {
  17.         }
  18.  
  19.         //constructor that makes user able to change default sentence splitter with own regex
  20.         public Tokenizer(string regexSentenceSplit)
  21.         {
  22.            defaultSentenceSplit = regexSentenceSplit;
  23.         }
  24.  
  25.         /*
  26.          * Takes a list of raw comments and converts them into a list of TokenizedComments
  27.          * Param: List of raw comments
  28.          * Returns: List<TokenizedComment>
  29.         */
  30.  
  31.         public List<TokenizedComment> TokenizeComments(List<string> comments)
  32.         {
  33.             var tokenizedComments = new List<TokenizedComment>();
  34.  
  35.             foreach (string comment in comments) {
  36.                 tokenizedComments.Add(TokenizeComment(comment));
  37.             }
  38.  
  39.             return tokenizedComments;
  40.         }
  41.  
  42.         /*
  43.             Takes a raw comment and splits it into sentences, which are then tokenized
  44.             Param: A raw comment
  45.             Returns: TokenizedComment
  46.         */
  47.  
  48.         public TokenizedComment TokenizeComment(string comment)
  49.         {
  50.             var sentences = SplitComment(comment);
  51.             var tokenComment = new List<List<string>>();
  52.  
  53.             foreach (string sentence in sentences){
  54.                 var tokens = TokenizeSentence(sentence);
  55.                 tokenComment.Add(tokens);
  56.             }
  57.  
  58.             return new TokenizedComment(tokenComment);
  59.         }
  60.  
  61.         /*
  62.          * Splits a sentence into tokens and filters out empty
  63.         */
  64.         private List<string> TokenizeSentence(string sentence)
  65.         {
  66.             var tokens = new List<string>(sentence.Split(" "));
  67.             tokens = tokens.Where(x => x != "").ToList<string>();
  68.  
  69.             return tokens;
  70.  
  71.         }
  72.  
  73.         /*
  74.          Takes a string and seperates at [.?!] and tries to avoid splitting on popular abbreviations
  75.          */
  76.         private List<string> SplitComment(string s)
  77.         {
  78.             var list = new List<string>();
  79.  
  80.             //replaces newlines with a punct aswell as adds a punct if comment does not have end punct.
  81.             s = s.Replace(Environment.NewLine, ".").ToLower() + ".";
  82.  
  83.             var sentences = Regex.Split(s, @defaultSentenceSplit);
  84.  
  85.             foreach (string sentence in sentences)
  86.                 list.Add(RemoveNonLetters(sentence).Trim());
  87.  
  88.             return list;
  89.         }
  90.  
  91.         /*
  92.          *  Removes non-letters
  93.          *  Side-effect: Removes !? and emojies
  94.         */
  95.  
  96.         private string RemoveNonLetters(string s)
  97.         {
  98.             var rgx = new Regex(@"[^a-zæøå\s:]");
  99.  
  100.             s = rgx.Replace(s,"");
  101.  
  102.             return s;
  103.         }
  104.  
  105.     }
  106. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement