Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package stackoverflow;
- import java.util.ArrayList;
- import java.util.List;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
/**
 * A small regex-driven tokenizer.
 *
 * <p>Token patterns are registered with {@link #tokenPattern(String, int)},
 * {@link #tokenPattern(Pattern, int)} or {@link #tokenConstant(String, int)} and are
 * tried in registration order. Before every token the skip pattern (whitespace by
 * default) is consumed. When no registered pattern matches at the current position,
 * a single character is consumed and its char value is reported as the token type.
 *
 * <p>After each call to {@link #next()} the current token is exposed through
 * {@link #type()}, {@link #value()}, {@link #start()} and {@link #end()}.
 * {@code start()} and {@code end()} are absolute offsets into the source string.
 */
public class Tokenizer {

    /** Pairs a compiled pattern with the token type reported when it matches. */
    private static final class Pat {
        final Pattern pattern;
        final int type;

        Pat(Pattern pattern, int type) {
            this.pattern = pattern;
            this.type = type;
        }
    }

    /** Token type reported once the whole source has been consumed. */
    public static final int EOS = -1;

    /** Internal type of the catch-all pattern; replaced by the matched char's value. */
    public static final int ANY = -2;

    /** Type set when a pattern does not match at the current position. */
    public static final int UNKNOWN = -3;

    // DOTALL so that the catch-all really matches any character, including line
    // terminators that a custom skip pattern may leave in place.
    private static final Pat ANY_CHAR = new Pat(Pattern.compile(".", Pattern.DOTALL), ANY);

    private final List<Pat> patterns = new ArrayList<>();

    private Pat skip = new Pat(Pattern.compile("^\\s*"), 0);

    /** The text being tokenized; set by {@link #source(String)}. */
    public String string;

    /** Absolute offset in {@link #string} at which the next match attempt begins. */
    public int index;

    private int start;
    private int end;
    private int type;
    private String value;

    /**
     * Compiles {@code s}, prefixing it with {@code ^} unless it already starts with one.
     * Matching itself is anchored to the current position in {@link #next(Pat)}, so the
     * prefix is not what keeps alternations such as {@code a|b} from matching later input.
     */
    private Pattern compile(String s) {
        if (!s.startsWith("^")) {
            s = "^" + s;
        }
        return Pattern.compile(s);
    }

    /**
     * Registers a token pattern.
     *
     * @param p    pattern that must match starting at the current position
     * @param type token type to report on a match; must be {@code >= 0}
     * @return this tokenizer, for chaining
     * @throws IllegalArgumentException if {@code type} is negative
     */
    public Tokenizer tokenPattern(Pattern p, int type) {
        if (type < 0) {
            throw new IllegalArgumentException("type(" + type + ") must be >= 0");
        }
        patterns.add(new Pat(p, type));
        return this;
    }

    /**
     * Registers a token pattern given as regex source.
     *
     * @param s    regular expression
     * @param type token type to report on a match; must be {@code >= 0}
     * @return this tokenizer, for chaining
     * @throws IllegalArgumentException if {@code type} is negative
     */
    public Tokenizer tokenPattern(String s, int type) {
        return tokenPattern(compile(s), type);
    }

    /**
     * Registers a literal token; {@code s} is quoted so regex metacharacters are taken
     * verbatim.
     *
     * @param s    literal text of the token
     * @param type token type to report on a match; must be {@code >= 0}
     * @return this tokenizer, for chaining
     * @throws IllegalArgumentException if {@code type} is negative
     */
    public Tokenizer tokenConstant(String s, int type) {
        return tokenPattern(Pattern.quote(s), type);
    }

    /** Replaces the skip pattern with an already compiled one. */
    private Tokenizer skipPattern(Pattern p) {
        skip = new Pat(p, 0);
        return this;
    }

    /**
     * Replaces the pattern consumed before every token (whitespace by default).
     *
     * @param s regular expression for text to skip
     * @return this tokenizer, for chaining
     */
    public Tokenizer skipPattern(String s) {
        return skipPattern(compile(s));
    }

    /**
     * Sets the text to tokenize and rewinds to its beginning.
     *
     * @param string text to tokenize
     * @return this tokenizer, for chaining
     */
    public Tokenizer source(String string) {
        this.string = string;
        this.index = 0;
        return this;
    }

    /** @return absolute offset in the source at which the current token starts */
    public int start() {
        return start;
    }

    /** @return absolute offset in the source just past the current token */
    public int end() {
        return end;
    }

    /** @return type of the current token */
    public int type() {
        return type;
    }

    /** @return matched text of the current token ({@code "EOS"} at end of source) */
    public String value() {
        return value;
    }

    /**
     * Reports whether unread characters remain. This does not look past skippable text:
     * it may return {@code true} when only text matching the skip pattern is left, in
     * which case the following {@link #next()} yields {@link #EOS}.
     *
     * @return {@code true} if the current position is before the end of the source
     */
    public boolean hasNext() {
        return index < string.length();
    }

    /** Records the current token and returns its type. */
    private int set(int type, int start, int end, String value) {
        this.type = type;
        this.start = start;
        this.end = end;
        this.value = value;
        return type;
    }

    /**
     * Tries to match {@code pat} exactly at {@link #index}. On success the token is
     * recorded with absolute offsets and the position advances past it; otherwise
     * {@link #UNKNOWN} is recorded and the position is left unchanged. At the end of the
     * source {@link #EOS} is recorded.
     *
     * @return the type that was recorded
     */
    private int next(Pat pat) {
        int length = string.length();
        if (index >= length) {
            return set(EOS, length, length, "EOS");
        }
        // Match in place on a region instead of on substring(index): offsets stay
        // absolute and the remaining input is not copied for every attempt.
        Matcher m = pat.pattern.matcher(string);
        m.region(index, length);
        // lookingAt() only accepts a match that begins at the region start, so a
        // pattern that is not fully anchored cannot silently skip over input.
        if (!m.lookingAt()) {
            return set(UNKNOWN, index, index, "");
        }
        int matchedType = pat.type;
        if (matchedType == ANY) {
            // The catch-all reports the matched character itself as the type.
            matchedType = m.group().charAt(0);
        }
        set(matchedType, m.start(), m.end(), m.group());
        index = m.end();
        return matchedType;
    }

    /**
     * Advances to the next token: consumes skippable text, then tries the registered
     * patterns in order, and finally falls back to consuming a single character.
     *
     * @return this tokenizer, positioned on the new current token
     */
    public Tokenizer next() {
        next(skip);
        if (type == EOS) {
            return this;
        }
        for (Pat p : patterns) {
            next(p);
            if (type != UNKNOWN) {
                return this;
            }
        }
        next(ANY_CHAR);
        return this;
    }

    @Override
    public String toString() {
        return String.format("Tokenizer(type=%d value=%s string=%s index=%d)", type, value, string, index);
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement