ivandrofly

Text segmenter using Functional programming in C#

Sep 2nd, 2023
94
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C# 5.37 KB | None | 0 0
  1. using System.Collections.Immutable;
  2. using System.Text;
  3. using Text = System.ReadOnlyMemory<char>;
  4.  
  5. namespace TextSegmenter;
  6.  
  7. public class UnitTest1
  8. {
  9.     public delegate (ImmutableArray<Segment>, ReadOnlyMemory<char>)? SegmentParser(Text text);
  10.  
  11.     [Theory]
  12.     [InlineData("{tag}foobar")]
  13.     public void SegmentTest(string input)
  14.     {
  15.         var result = Parse(input);
  16.         // Span<char> span;
  17.         Assert.Equal(2, result.Count());
  18.     }
  19.    
  20.     [Theory]
  21.     [InlineData("<i>foo bar</i>")]
  22.     public void SegmentTestSeveral(string input)
  23.     {
  24.         var result = Parse(input);
  25.         // Span<char> span;
  26.         Assert.Equal(3, result.Count());
  27.     }
  28.  
  29.     [Theory]
  30.     [InlineData("- <font color=\"#0000ff\">Yi's Mom</font>: Hey, kids...\n- <font color=\"#800000\">(BOTH GASP)</font>")]
  31.     public void SegmentTestComplex(string input)
  32.     {
  33.         var result = Parse(input);
  34.         // Span<char> span;
  35.         Assert.Contains(result, segment => segment is TagOpen);
  36.     }
  37.  
  38.     private static readonly SegmentParser ParseAssTag = AssParser();
  39.     private static readonly SegmentParser ParseOpenTag = OpenTagParser();
  40.     private static readonly SegmentParser ParseClose = CloseTagParser();
  41.     private static readonly SegmentParser ParseText = TextParser();
  42.  
  43.     public IEnumerable<Segment> Parse(string input)
  44.     {
  45.         SegmentParser segmentParser = Multiple(ParseAssTag, ParseOpenTag, ParseClose, ParseText);
  46.  
  47.         // transform the result
  48.         return segmentParser(input.AsMemory()) switch
  49.         {
  50.             ({ Length: > 0 } segs, _) => segs.Select(seg => seg),
  51.             _ => new List<Segment>(),
  52.         };
  53.     }
  54.  
  55.     // note the values can be passed to be the exact match
  56.     private static SegmentParser AssParser()
  57.     {
  58.         return input => input.Length > 3 && input.Span.StartsWith("{")
  59.         // todo: fix "}" it's possible for it not be present
  60.             ? (ImmutableArray<Segment>.Empty.Add(new TagOpen(input[..(input.Span.IndexOf('}') + 1)])), input[(input.Span.IndexOf('}') + 1)..])
  61.             : null;
  62.     }
  63.  
  64.     private static SegmentParser OpenTagParser()
  65.     {
  66.         return text => (text.Length > 3 &&
  67.                         (text.Span.StartsWith("<i>", StringComparison.Ordinal) ||
  68.                          text.Span.StartsWith("<b>", StringComparison.Ordinal) ||
  69.                          text.Span.StartsWith("<font ", StringComparison.Ordinal) ||
  70.                          text.Span.StartsWith("<u>", StringComparison.Ordinal)))
  71.             ? (ImmutableArray<Segment>.Empty.Add(new TagOpen(text[..(text.Span.IndexOf('>') + 1)])), text[(text.Span.IndexOf('>') + 1)..])
  72.             : null;
  73.     }
  74.  
  75.     private static SegmentParser CloseTagParser()
  76.     {
  77.         return text => (text.Length > 3 &&
  78.                         (text.Span.StartsWith("</i>", StringComparison.Ordinal) ||
  79.                          text.Span.StartsWith("</b>", StringComparison.Ordinal) ||
  80.                          text.Span.StartsWith("</font>", StringComparison.Ordinal) ||
  81.                          text.Span.StartsWith("</u>", StringComparison.Ordinal)))
  82.             ? (ImmutableArray<Segment>.Empty.Add(new TagClose(text[..(text.Span.IndexOf('>') + 1)])), text[(text.Span.IndexOf('>') + 1)..])
  83.             : null;
  84.     }
  85.  
  86.     private static SegmentParser TextParser()
  87.     {
  88.         // get char while != < or {
  89.         // use string builder to accumulate vlaue
  90.         return text => AggregateChar(new StringBuilder(), text) switch
  91.         {
  92.             ({ Length: > 0 } s, var rest) => (ImmutableArray<Segment>.Empty.Add(new TextSegment(s.AsMemory())), rest),
  93.             _ => null
  94.         };
  95.  
  96.         (string, ReadOnlyMemory<char>) AggregateChar(StringBuilder sb, Text rest) =>
  97.             (rest.Length > 0 && rest.Span[0] != '{' && rest.Span[0] != '<') switch
  98.             {
  99.                 true => AggregateChar(sb.Append(rest.Span[0]), rest[1..]),
  100.                 false => (sb.ToString(), rest)
  101.             };
  102.     }
  103.    
  104.     // ReSharper disable once IdentifierTypo
  105.     private static SegmentParser Multiple(params SegmentParser[] segmentParsers)
  106.     {
  107.         return text => AggregateValue(ImmutableArray<Segment>.Empty, text); // todo: switch against this?
  108.  
  109.         (ImmutableArray<Segment>, ReadOnlyMemory<char>)? AggregateValue(ImmutableArray<Segment> accumulator, Text text) => text.Length > 0
  110.             // try parsing using one of the parser, as soon as one of them return a value start all over again
  111.             ? segmentParsers.Select(parser => parser(text)).FirstOrDefault(result => result != null) switch
  112.             {
  113.                 ({ Length: > 0 } segments, var rest) => AggregateValue(accumulator.AddRange(segments), rest),
  114.                 // var (result, rest)  =>  null,
  115.                 // _ => (accumulator, text)
  116.                 _ => (accumulator, text)
  117.             }
  118.             : (accumulator, text);
  119.     }
  120.  
    /// <summary>Base type of every segment; <paramref name="Content"/> is the text the segment carries.</summary>
    public abstract record Segment(ReadOnlyMemory<char> Content);

    /// <summary>A run of plain text; produced by the text parser in this file.</summary>
    public record TextSegment(ReadOnlyMemory<char> Content) : Segment(Content);

    /// <summary>An opening tag; produced by both the brace-tag parser and the opening-markup-tag parser in this file.</summary>
    public record TagOpen(ReadOnlyMemory<char> Content) : Segment(Content);

    /// <summary>A closing tag; produced by the closing-markup-tag parser in this file.</summary>
    public record TagClose(ReadOnlyMemory<char> Content) : Segment(Content);

    /// <summary>A symbol segment. NOTE(review): nothing in the code shown here creates one.</summary>
    public record Symbol(ReadOnlyMemory<char> Content) : Segment(Content);

    /// <summary>
    /// A segment that carries nested <paramref name="Segments"/> in addition to its content.
    /// NOTE(review): nothing in the code shown here creates one.
    /// </summary>
    public record HearingImpaired(ReadOnlyMemory<char> Content, ImmutableArray<Segment> Segments) : Segment(Content);
  132. }
Add Comment
Please, Sign In to add comment