Advertisement
Guest User

Untitled

a guest
Sep 23rd, 2014
250
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Java 7.25 KB | None | 0 0
  1. public class MailBodyParser
  2. {
  3.     //Two patterns / languages will be supported in this first Version (EN/DE)
  4.     public static final String[] FROM = {"Von", "From"};
  5.     public static final String[] TO = {"An", "To"};
  6.     public static final String[] SENT = {"Gesendet", "Sent"};
  7.     public static final String[] SUBJECT = {"Betreff", "Subject"};
  8.     public static final String[] CC = {"Cc", "Cc"};
  9.     public static final String[] IMPORTANCE = {"Wichtigkeit","Importance"};
  10.  
  11.     public static MailItemBody parseBody( MailBody body )
  12.     {
  13.         String plainText = body.getBodyPlain();
  14.         String regex = buildPlainRegEx();
  15.         String[] bodyParts = plainText.split(regex);
  16.  
  17.         //RegEx for fixing image tags
  18.         String regexImage = "IMAGE (.*?)>";
  19.         String regexLink = "HYPERLINK \"(.*?)\"([^\\s]*)";
  20.         String regexContentId = "cid:(.*)";
  21.         Pattern pattern = Pattern.compile(regexImage);
  22.         Pattern contentIdPattern = Pattern.compile(regexContentId);
  23.         Pattern hyperlinkPattern = Pattern.compile(regexLink);
  24.         Matcher matcher;
  25.  
  26.         //Trim the parts and replace things
  27.         for (int i = 0; i < bodyParts.length; i++) {
  28.  
  29.             //Check if outlook mail or not ==> apply with outlook mails, else
  30.             if( body.getBodyHtml().contains("urn:schemas-microsoft-com:office:word") )
  31.             {
  32.                 //Paragraph
  33.                 bodyParts[i] = bodyParts[i].replaceAll("\r\n\r\n \r\n\r\n", "\n\n");
  34.                 //New Lines
  35.                 bodyParts[i] = bodyParts[i].replaceAll("\r\n\r\n", "\n");
  36.  
  37.                 bodyParts[i] = bodyParts[i].replaceAll("\r\n", "\n");
  38.                 bodyParts[i] = bodyParts[i].trim();
  39.             }
  40.             else
  41.             {
  42.                 bodyParts[i] = bodyParts[i].replaceAll("\r\n", "\n");
  43.                 bodyParts[i] = bodyParts[i].trim();
  44.             }
  45.  
  46.             //Find images and fix indices
  47.             matcher = pattern.matcher(bodyParts[i]);
  48.             while( matcher.find() )
  49.             {
  50.                 String replaceWhat = matcher.group(0);
  51.                 String replaceWith = null;
  52.                 String attributes = matcher.group(1);
  53.                 String[] singleAttributes = attributes.split(" ");
  54.  
  55.                 for (int j = 0; j < singleAttributes.length; j++) {
  56.                     if( singleAttributes[j].startsWith("src=") )
  57.                     {
  58.                         Matcher m = contentIdPattern.matcher(singleAttributes[j]);
  59.                         if( m.find() )
  60.                         {
  61.                             String match = m.group(1);
  62.                             //Can occur if src is surrounded by ""
  63.                             if( match.endsWith("\"") )
  64.                                 match = match.replace("\"", "");
  65.  
  66.                             replaceWith = "IMAGE<!" + getAttachmentId( body, match ) + "!>";
  67.                         }
  68.                     }
  69.                 }
  70.  
  71.                 if( replaceWith == null )
  72.                     replaceWith = "--Image could not be restored from mail--";
  73.  
  74.                 bodyParts[i] = bodyParts[i].replace(replaceWhat, replaceWith);
  75.             }
  76.  
  77.             matcher = hyperlinkPattern.matcher(bodyParts[i]);
  78.             while( matcher.find() )
  79.             {
  80.                 String replaceWith = "HYPERLINK<!" + matcher.group(1) + "||" + matcher.group(2) + "!>";
  81.                 bodyParts[i] = bodyParts[i].replace(matcher.group(0), replaceWith);
  82.             }
  83.         }
  84.  
  85.         //anonymize the html body
  86.         String htmlText = body.getBodyHtml();
  87.  
  88.         //Loop over text and find the parts to replace
  89.         pattern = Pattern.compile(buildHtmlRegEx());
  90.         matcher = pattern.matcher(htmlText);
  91.  
  92.         /*
  93.          * TODO: ADMIN OPTION
  94.         while(matcher.find())
  95.         {
  96.             String from = matcher.group(1);
  97.             String to   = matcher.group(2);
  98.             String cc   = null;
  99.             if( matcher.groupCount() > 2 )
  100.                 cc = matcher.group(3);
  101.  
  102.             htmlText = htmlText.replaceAll(from, "n/a");
  103.             htmlText = htmlText.replaceAll(to, "n/a");
  104.             if( cc != null )
  105.                 htmlText = htmlText.replaceAll(cc, "n/a");
  106.         }
  107.          */
  108.         MailItemBody ret = new MailItemBody();
  109.         ret.setAttachmentsIds(toIntArray(body.getAttachmentIds()));
  110.         ret.setBodyParts(bodyParts);
  111.         ret.setAnonymizedHtmlBody(htmlText);
  112.         return ret;
  113.     }
  114.  
  115.     private static String getAttachmentId( MailBody body, String contentId )
  116.     {
  117.         String[] contentIds = body.getAttachmentContentIds();
  118.         for( int i = 0; i < contentIds.length; i++ )
  119.         {
  120.             if( contentId.equals(contentIds[i]) )
  121.                 return body.getAttachmentIds()[i];
  122.         }
  123.  
  124.         return null;
  125.     }
  126.  
  127.     private static int[] toIntArray( String[] array )
  128.     {
  129.         if( array == null )
  130.             return new int[0];
  131.        
  132.         int[] ret = new int[array.length];
  133.         for (int i = 0; i < ret.length; i++) {
  134.             ret[i] = Integer.parseInt(array[i]);
  135.         }
  136.         return ret;
  137.     }
  138.  
  139.     private static String buildPlainRegEx()
  140.     {
  141.         //Example: (Von|From):.*?\n(Gesendet|Sent):.*?\n(An|To):.*?\n(Betreff|Subject):.*?\n
  142.         String regex;
  143.         String fromTerm = "(?:";
  144.         String sentTerm = "(?:";
  145.         String toTerm = "(?:";
  146.         String ccTerm = "(?:(?:";
  147.         String subjectTerm = "(";
  148.         String importanceTerm = "(?:(?:";
  149.         String sep = ":.*?(?:\r)?\n";
  150.  
  151.         for (int i = 0; i < FROM.length; i++)
  152.         {
  153.             fromTerm += FROM[i] + ( i < (FROM.length - 1) ? "|":")" );
  154.             sentTerm += SENT[i] + ( i < (SENT.length - 1) ? "|":")" );
  155.             toTerm += TO[i] + ( i < (TO.length - 1) ? "|":")" );
  156.             ccTerm += CC[i] + ( i < (CC.length - 1) ? "|":"):.*?(?:\r)?\n)?" );
  157.             subjectTerm += SUBJECT[i] + ( i < (SUBJECT.length - 1) ? "|":")" );
  158.             importanceTerm += IMPORTANCE[i] + ( i < (IMPORTANCE.length - 1) ? "|":"):.*?(?:\r)?\n)?" );
  159.         }
  160.        
  161.         regex = "(?s)"+ fromTerm + sep + sentTerm + sep + toTerm + sep + ccTerm + subjectTerm + sep + importanceTerm;
  162.         return regex;
  163.     }
  164.  
  165.  
  166.     private static String buildHtmlRegEx()
  167.     {
  168.         //(?s)(Von|From):.*?(?:<.*?>)*(.*?)(?:<.*?>)*(Gesendet|Sent).*?(An|To):(?:<.*?>)*(.*?)<.*?>
  169.         String regex;
  170.         String fromTerm = "(?:";
  171.         String sentTerm = "(?:";
  172.         String toTerm = "(?:";
  173.         String ccTerm = "(?:(?:";
  174.         String subjectTerm = "(?:";
  175.         String dotAllFlag = "(?s)";
  176.  
  177.         for (int i = 0; i < FROM.length; i++)
  178.         {
  179.             fromTerm += FROM[i] + ( i < (FROM.length - 1) ? "|":")" );
  180.             sentTerm += SENT[i] + ( i < (SENT.length - 1) ? "|":")" );
  181.             toTerm += TO[i] + ( i < (TO.length - 1) ? "|":")" );
  182.             subjectTerm += SUBJECT[i] + ( i < (SUBJECT.length - 1) ? "|":")" );
  183.             ccTerm += CC[i] + ( i < (CC.length - 1) ? "|":"):(?:<.*?>)*(.*?)<.*?>)?" );
  184.         }
  185.  
  186.         regex = dotAllFlag + fromTerm + ":(?:<.*?>)+(.*?)(?:<.*?>)+" + sentTerm +
  187.                 ":.*?" + toTerm + ":(?:<.*?>)+(.*?)(?:<.*?>)+" + ccTerm + ".*?" + subjectTerm;
  188.         return regex;
  189.     }
  190. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement