Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- public class MailBodyParser
- {
- //Two patterns / languages will be supported in this first Version (EN/DE)
- public static final String[] FROM = {"Von", "From"};
- public static final String[] TO = {"An", "To"};
- public static final String[] SENT = {"Gesendet", "Sent"};
- public static final String[] SUBJECT = {"Betreff", "Subject"};
- public static final String[] CC = {"Cc", "Cc"};
- public static final String[] IMPORTANCE = {"Wichtigkeit","Importance"};
- public static MailItemBody parseBody( MailBody body )
- {
- String plainText = body.getBodyPlain();
- String regex = buildPlainRegEx();
- String[] bodyParts = plainText.split(regex);
- //RegEx for fixing image tags
- String regexImage = "IMAGE (.*?)>";
- String regexLink = "HYPERLINK \"(.*?)\"([^\\s]*)";
- String regexContentId = "cid:(.*)";
- Pattern pattern = Pattern.compile(regexImage);
- Pattern contentIdPattern = Pattern.compile(regexContentId);
- Pattern hyperlinkPattern = Pattern.compile(regexLink);
- Matcher matcher;
- //Trim the parts and replace things
- for (int i = 0; i < bodyParts.length; i++) {
- //Check if outlook mail or not ==> apply with outlook mails, else
- if( body.getBodyHtml().contains("urn:schemas-microsoft-com:office:word") )
- {
- //Paragraph
- bodyParts[i] = bodyParts[i].replaceAll("\r\n\r\n \r\n\r\n", "\n\n");
- //New Lines
- bodyParts[i] = bodyParts[i].replaceAll("\r\n\r\n", "\n");
- bodyParts[i] = bodyParts[i].replaceAll("\r\n", "\n");
- bodyParts[i] = bodyParts[i].trim();
- }
- else
- {
- bodyParts[i] = bodyParts[i].replaceAll("\r\n", "\n");
- bodyParts[i] = bodyParts[i].trim();
- }
- //Find images and fix indices
- matcher = pattern.matcher(bodyParts[i]);
- while( matcher.find() )
- {
- String replaceWhat = matcher.group(0);
- String replaceWith = null;
- String attributes = matcher.group(1);
- String[] singleAttributes = attributes.split(" ");
- for (int j = 0; j < singleAttributes.length; j++) {
- if( singleAttributes[j].startsWith("src=") )
- {
- Matcher m = contentIdPattern.matcher(singleAttributes[j]);
- if( m.find() )
- {
- String match = m.group(1);
- //Can occur if src is surrounded by ""
- if( match.endsWith("\"") )
- match = match.replace("\"", "");
- replaceWith = "IMAGE<!" + getAttachmentId( body, match ) + "!>";
- }
- }
- }
- if( replaceWith == null )
- replaceWith = "--Image could not be restored from mail--";
- bodyParts[i] = bodyParts[i].replace(replaceWhat, replaceWith);
- }
- matcher = hyperlinkPattern.matcher(bodyParts[i]);
- while( matcher.find() )
- {
- String replaceWith = "HYPERLINK<!" + matcher.group(1) + "||" + matcher.group(2) + "!>";
- bodyParts[i] = bodyParts[i].replace(matcher.group(0), replaceWith);
- }
- }
- //anonymize the html body
- String htmlText = body.getBodyHtml();
- //Loop over text and find the parts to replace
- pattern = Pattern.compile(buildHtmlRegEx());
- matcher = pattern.matcher(htmlText);
- /*
- * TODO: ADMIN OPTION
- while(matcher.find())
- {
- String from = matcher.group(1);
- String to = matcher.group(2);
- String cc = null;
- if( matcher.groupCount() > 2 )
- cc = matcher.group(3);
- htmlText = htmlText.replaceAll(from, "n/a");
- htmlText = htmlText.replaceAll(to, "n/a");
- if( cc != null )
- htmlText = htmlText.replaceAll(cc, "n/a");
- }
- */
- MailItemBody ret = new MailItemBody();
- ret.setAttachmentsIds(toIntArray(body.getAttachmentIds()));
- ret.setBodyParts(bodyParts);
- ret.setAnonymizedHtmlBody(htmlText);
- return ret;
- }
- private static String getAttachmentId( MailBody body, String contentId )
- {
- String[] contentIds = body.getAttachmentContentIds();
- for( int i = 0; i < contentIds.length; i++ )
- {
- if( contentId.equals(contentIds[i]) )
- return body.getAttachmentIds()[i];
- }
- return null;
- }
- private static int[] toIntArray( String[] array )
- {
- if( array == null )
- return new int[0];
- int[] ret = new int[array.length];
- for (int i = 0; i < ret.length; i++) {
- ret[i] = Integer.parseInt(array[i]);
- }
- return ret;
- }
- private static String buildPlainRegEx()
- {
- //Example: (Von|From):.*?\n(Gesendet|Sent):.*?\n(An|To):.*?\n(Betreff|Subject):.*?\n
- String regex;
- String fromTerm = "(?:";
- String sentTerm = "(?:";
- String toTerm = "(?:";
- String ccTerm = "(?:(?:";
- String subjectTerm = "(";
- String importanceTerm = "(?:(?:";
- String sep = ":.*?(?:\r)?\n";
- for (int i = 0; i < FROM.length; i++)
- {
- fromTerm += FROM[i] + ( i < (FROM.length - 1) ? "|":")" );
- sentTerm += SENT[i] + ( i < (SENT.length - 1) ? "|":")" );
- toTerm += TO[i] + ( i < (TO.length - 1) ? "|":")" );
- ccTerm += CC[i] + ( i < (CC.length - 1) ? "|":"):.*?(?:\r)?\n)?" );
- subjectTerm += SUBJECT[i] + ( i < (SUBJECT.length - 1) ? "|":")" );
- importanceTerm += IMPORTANCE[i] + ( i < (IMPORTANCE.length - 1) ? "|":"):.*?(?:\r)?\n)?" );
- }
- regex = "(?s)"+ fromTerm + sep + sentTerm + sep + toTerm + sep + ccTerm + subjectTerm + sep + importanceTerm;
- return regex;
- }
- private static String buildHtmlRegEx()
- {
- //(?s)(Von|From):.*?(?:<.*?>)*(.*?)(?:<.*?>)*(Gesendet|Sent).*?(An|To):(?:<.*?>)*(.*?)<.*?>
- String regex;
- String fromTerm = "(?:";
- String sentTerm = "(?:";
- String toTerm = "(?:";
- String ccTerm = "(?:(?:";
- String subjectTerm = "(?:";
- String dotAllFlag = "(?s)";
- for (int i = 0; i < FROM.length; i++)
- {
- fromTerm += FROM[i] + ( i < (FROM.length - 1) ? "|":")" );
- sentTerm += SENT[i] + ( i < (SENT.length - 1) ? "|":")" );
- toTerm += TO[i] + ( i < (TO.length - 1) ? "|":")" );
- subjectTerm += SUBJECT[i] + ( i < (SUBJECT.length - 1) ? "|":")" );
- ccTerm += CC[i] + ( i < (CC.length - 1) ? "|":"):(?:<.*?>)*(.*?)<.*?>)?" );
- }
- regex = dotAllFlag + fromTerm + ":(?:<.*?>)+(.*?)(?:<.*?>)+" + sentTerm +
- ":.*?" + toTerm + ":(?:<.*?>)+(.*?)(?:<.*?>)+" + ccTerm + ".*?" + subjectTerm;
- return regex;
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement