PatternRule.java

/**
 * Standard implementation of <code>IPredicateRule</code>.
 * It is capable of detecting a pattern which begins with a given start
 * sequence and ends with a given end sequence. If the end sequence is
 * not specified, it can be either end of line, end of file, or both. Additionally,
 * the pattern can be constrained to begin in a certain column. The rule can also
 * be used to check whether the text to scan covers half of the pattern, i.e. contains
 * the end sequence required by the rule.
 */
public class PatternRule implements IPredicateRule {

    /**
     * Comparator that orders <code>char[]</code> in decreasing array lengths.
     *
     * @since 3.1
     */
    private static class DecreasingCharArrayLengthComparator implements Comparator<char[]> {
        @Override
        public int compare(char[] o1, char[] o2) {
            return o2.length - o1.length;
        }
    }

    /** Internal setting for the un-initialized column constraint */
    protected static final int UNDEFINED= -1;

    /** The token to be returned on success */
    protected IToken fToken;
    /** The pattern's start sequence */
    protected char[] fStartSequence;
    /** The pattern's end sequence */
    protected char[] fEndSequence;
    /** The pattern's column constrain */
    protected int fColumn= UNDEFINED;
    /** The pattern's escape character */
    protected char fEscapeCharacter;
    /**
     * Indicates whether the escape character continues a line
     * @since 3.0
     */
    protected boolean fEscapeContinuesLine;
    /** Indicates whether end of line terminates the pattern */
    protected boolean fBreaksOnEOL;
    /** Indicates whether end of file terminates the pattern */
    protected boolean fBreaksOnEOF;

    /**
     * Line delimiter comparator which orders according to decreasing delimiter length.
     * @since 3.1
     */
    private Comparator<char[]> fLineDelimiterComparator= new DecreasingCharArrayLengthComparator();
    /**
     * Cached line delimiters.
     * @since 3.1
     */
    private char[][] fLineDelimiters;
    /**
     * Cached sorted {@linkplain #fLineDelimiters}.
     * @since 3.1
     */
    private char[][] fSortedLineDelimiters;

    /**
     * Creates a rule for the given starting and ending sequence.
     * When these sequences are detected the rule will return the specified token.
     * Alternatively, the sequence can also be ended by the end of the line.
     * Any character which follows the given escapeCharacter will be ignored.
     *
     * @param startSequence the pattern's start sequence
     * @param endSequence the pattern's end sequence, <code>null</code> is a legal value
     * @param token the token which will be returned on success
     * @param escapeCharacter any character following this one will be ignored
     * @param breaksOnEOL indicates whether the end of the line also terminates the pattern
     */
    public PatternRule(String startSequence, String endSequence, IToken token, char escapeCharacter, boolean breaksOnEOL) {
        Assert.isTrue(startSequence != null && !startSequence.isEmpty());
        Assert.isTrue(endSequence != null || breaksOnEOL);
        Assert.isNotNull(token);

        fStartSequence= startSequence.toCharArray();
        fEndSequence= (endSequence == null ? new char[0] : endSequence.toCharArray());
        fToken= token;
        fEscapeCharacter= escapeCharacter;
        fBreaksOnEOL= breaksOnEOL;
    }

    /**
     * Creates a rule for the given starting and ending sequence.
     * When these sequences are detected the rule will return the specified token.
     * Alternatively, the sequence can also be ended by the end of the line or the end of the file.
     * Any character which follows the given escapeCharacter will be ignored.
     *
     * @param startSequence the pattern's start sequence
     * @param endSequence the pattern's end sequence, <code>null</code> is a legal value
     * @param token the token which will be returned on success
     * @param escapeCharacter any character following this one will be ignored
     * @param breaksOnEOL indicates whether the end of the line also terminates the pattern
     * @param breaksOnEOF indicates whether the end of the file also terminates the pattern
     * @since 2.1
     */
    public PatternRule(String startSequence, String endSequence, IToken token, char escapeCharacter, boolean breaksOnEOL, boolean breaksOnEOF) {
        this(startSequence, endSequence, token, escapeCharacter, breaksOnEOL);
        fBreaksOnEOF= breaksOnEOF;
    }

    /**
     * Creates a rule for the given starting and ending sequence.
     * When these sequences are detected the rule will return the specified token.
     * Alternatively, the sequence can also be ended by the end of the line or the end of the file.
     * Any character which follows the given escapeCharacter will be ignored. An end of line
     * immediately after the given <code>lineContinuationCharacter</code> will not cause the
     * pattern to terminate even if <code>breakOnEOL</code> is set to true.
     *
     * @param startSequence the pattern's start sequence
     * @param endSequence the pattern's end sequence, <code>null</code> is a legal value
     * @param token the token which will be returned on success
     * @param escapeCharacter any character following this one will be ignored
     * @param breaksOnEOL indicates whether the end of the line also terminates the pattern
     * @param breaksOnEOF indicates whether the end of the file also terminates the pattern
     * @param escapeContinuesLine indicates whether the specified escape character is used for line
     *        continuation, so that an end of line immediately after the escape character does not
     *        terminate the pattern, even if <code>breakOnEOL</code> is set
     * @since 3.0
     */
    public PatternRule(String startSequence, String endSequence, IToken token, char escapeCharacter, boolean breaksOnEOL, boolean breaksOnEOF, boolean escapeContinuesLine) {
        this(startSequence, endSequence, token, escapeCharacter, breaksOnEOL, breaksOnEOF);
        fEscapeContinuesLine= escapeContinuesLine;
    }

    /**
     * Sets a column constraint for this rule. If set, the rule's token
     * will only be returned if the pattern is detected starting at the
     * specified column. If the column is smaller then 0, the column
     * constraint is considered removed.
     *
     * @param column the column in which the pattern starts
     */
    public void setColumnConstraint(int column) {
        if (column < 0)
            column= UNDEFINED;
        fColumn= column;
    }


    /**
     * Evaluates this rules without considering any column constraints.
     *
     * @param scanner the character scanner to be used
     * @return the token resulting from this evaluation
     */
    protected IToken doEvaluate(ICharacterScanner scanner) {
        return doEvaluate(scanner, false);
    }

    /**
     * Evaluates this rules without considering any column constraints. Resumes
     * detection, i.e. look sonly for the end sequence required by this rule if the
     * <code>resume</code> flag is set.
     *
     * @param scanner the character scanner to be used
     * @param resume <code>true</code> if detection should be resumed, <code>false</code> otherwise
     * @return the token resulting from this evaluation
     * @since 2.0
     */
    protected IToken doEvaluate(ICharacterScanner scanner, boolean resume) {

        if (resume) {

            if (endSequenceDetected(scanner))
                return fToken;

        } else {

            int c= scanner.read();
            if (c == fStartSequence[0]) {
                if (sequenceDetected(scanner, fStartSequence, false)) {
                    if (endSequenceDetected(scanner))
                        return fToken;
                }
            }
        }

        scanner.unread();
        return Token.UNDEFINED;
    }

    @Override
    public IToken evaluate(ICharacterScanner scanner) {
        return evaluate(scanner, false);
    }

    /**
     * Returns whether the end sequence was detected. As the pattern can be considered
     * ended by a line delimiter, the result of this method is <code>true</code> if the
     * rule breaks on the end of the line, or if the EOF character is read.
     *
     * @param scanner the character scanner to be used
     * @return <code>true</code> if the end sequence has been detected
     */
    protected boolean endSequenceDetected(ICharacterScanner scanner) {

        char[][] originalDelimiters= scanner.getLegalLineDelimiters();
        int count= originalDelimiters.length;
        if (fLineDelimiters == null || fLineDelimiters.length != count) {
            fSortedLineDelimiters= new char[count][];
        } else {
            while (count > 0 && Arrays.equals(fLineDelimiters[count - 1], originalDelimiters[count - 1]))
                count--;
        }
        if (count != 0) {
            fLineDelimiters= originalDelimiters;
            System.arraycopy(fLineDelimiters, 0, fSortedLineDelimiters, 0, fLineDelimiters.length);
            Arrays.sort(fSortedLineDelimiters, fLineDelimiterComparator);
        }

        int readCount= 1;
        int c;
        while ((c= scanner.read()) != ICharacterScanner.EOF) {
            if (c == fEscapeCharacter) {
                // Skip escaped character(s)
                if (fEscapeContinuesLine) {
                    c= scanner.read();
                    for (char[] fSortedLineDelimiter : fSortedLineDelimiters) {
                        if (c == fSortedLineDelimiter[0] && sequenceDetected(scanner, fSortedLineDelimiter, fBreaksOnEOF))
                            break;
                    }
                } else
                    scanner.read();

            } else if (fEndSequence.length > 0 && c == fEndSequence[0]) {
                // Check if the specified end sequence has been found.
                if (sequenceDetected(scanner, fEndSequence, fBreaksOnEOF))
                    return true;
            } else if (fBreaksOnEOL) {
                // Check for end of line since it can be used to terminate the pattern.
                for (char[] fSortedLineDelimiter : fSortedLineDelimiters) {
                    if (c == fSortedLineDelimiter[0] && sequenceDetected(scanner, fSortedLineDelimiter, fBreaksOnEOF))
                        return true;
                }
            }
            readCount++;
        }

        if (fBreaksOnEOF)
            return true;

        for (; readCount > 0; readCount--)
            scanner.unread();

        return false;
    }

    /**
     * Returns whether the next characters to be read by the character scanner
     * are an exact match with the given sequence. No escape characters are allowed
     * within the sequence. If specified the sequence is considered to be found
     * when reading the EOF character.
     *
     * @param scanner the character scanner to be used
     * @param sequence the sequence to be detected
     * @param eofAllowed indicated whether EOF terminates the pattern
     * @return <code>true</code> if the given sequence has been detected
     */
    protected boolean sequenceDetected(ICharacterScanner scanner, char[] sequence, boolean eofAllowed) {
        for (int i= 1; i < sequence.length; i++) {
            int c= scanner.read();
            if (c == ICharacterScanner.EOF && eofAllowed) {
                return true;
            } else if (c != sequence[i]) {
                // Non-matching character detected, rewind the scanner back to the start.
                // Do not unread the first character.
                scanner.unread();
                for (int j= i-1; j > 0; j--)
                    scanner.unread();
                return false;
            }
        }

        return true;
    }

    @Override
    public IToken evaluate(ICharacterScanner scanner, boolean resume) {
        if (fColumn == UNDEFINED)
            return doEvaluate(scanner, resume);

        int c= scanner.read();
        scanner.unread();
        if (c == fStartSequence[0])
            return (fColumn == scanner.getColumn() ? doEvaluate(scanner, resume) : Token.UNDEFINED);
        return Token.UNDEFINED;
    }

    @Override
    public IToken getSuccessToken() {
        return fToken;
    }
}