Advertisement
Guest User

Untitled

a guest
Dec 22nd, 2012
38
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.91 KB | None | 0 0
  /// <summary>
  /// Returns a copy of <paramref name="src"/> in which every character that
  /// cannot be represented in 7-bit ASCII is replaced by a single space.
  /// ASCII characters, including a literal '?', are returned unchanged.
  /// </summary>
  /// <param name="src">The text to sanitize. Must not be null.</param>
  /// <returns>The ASCII-only version of <paramref name="src"/>.</returns>
  /// <exception cref="System.ArgumentNullException">
  /// <paramref name="src"/> is null.
  /// </exception>
  public static string fix_encoding(string src)
  {
      if (src == null)
      {
          throw new System.ArgumentNullException("src");
      }

      // Use a space as the encoder's replacement instead of the default '?'.
      // That way a question mark that is really present in the input can never
      // be confused with a substituted character and is left intact.
      Encoding asciiWithSpaces = Encoding.GetEncoding(
          "us-ascii",
          new EncoderReplacementFallback(" "),
          new DecoderReplacementFallback(" "));

      // Round-trip through bytes: encoding performs the substitution, decoding
      // turns the (now pure ASCII) bytes back into a string in one call.
      byte[] asciiBytes = asciiWithSpaces.GetBytes(src);
      return asciiWithSpaces.GetString(asciiBytes);
  }
  21.  
  // Extract the text of the current page with the simple (reading-order)
  // strategy, buffer it followed by a line terminator, then strip the
  // characters fix_encoding cannot represent in ASCII.
  StringWriter output = new StringWriter();
  SimpleTextExtractionStrategy extractionStrategy = new SimpleTextExtractionStrategy();
  string rawPageText = PdfTextExtractor.GetTextFromPage(reader, page, extractionStrategy);
  output.WriteLine(rawPageText);
  string bufferedText = output.ToString();
  currentText = fix_encoding(bufferedText);
  25.  
  /// <summary>
  /// Replaces every UTF-16 code unit outside the 7-bit ASCII range
  /// (U+0000 through U+007F) with a single space.
  /// </summary>
  /// <param name="src">The text to sanitize.</param>
  /// <returns>
  /// <paramref name="src"/> with each non-ASCII code unit replaced by a space.
  /// A character encoded as a surrogate pair therefore becomes two spaces.
  /// </returns>
  public static string remove_non_ascii(string src)
  {
      // The backslashes are required: without them the class would be made of
      // the literal characters 'u', '0', '7', 'F' and the range '0'..'u',
      // instead of the intended code-point range U+0000..U+007F.
      return Regex.Replace(src, @"[^\u0000-\u007F]", " ");
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement