Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- unit utokenizer;
- {$mode objfpc}{$H+}
- interface
- uses
- Classes, SysUtils;
- const
- {$WARNINGS OFF}
- IdentifierFirstSymbols: set of char = ['A' .. 'Z', 'a' .. 'z'];
- IdentifierSymbols: set of char = ['A' .. 'Z', 'a' .. 'z', '0' .. '9'];
- Digits: set of char = ['0' .. '9'];
- HexDigits: set of char = ['A' .. 'F', 'a' .. 'f', '0' .. '9'];
- ExponentDot = '.';
- ExponentE: set of char = ['E', 'e'];
- Signs: set of char = ['+', '-'];
- Constants: array [0..3] of string = ('true', 'false', 'iota', 'nil');
- Keywords: array [0 .. 24] of string =
- ('break', 'default', 'func', 'interface', 'select',
- 'case', 'defer', 'go', 'map', 'struct',
- 'chan', 'else', 'goto', 'package', 'switch',
- 'const', 'fallthrough', 'if', 'range', 'type',
- 'continue', 'for', 'import', 'return', 'var');
- Operators: array [0..46] of string =
- ('+', '&', '+=', '&=', '&&', '==', '!=', '(', ')', '-', '|', '-=', '|=', '||',
- '<', '<=', '[', ']', '*', '^', '*=', '^=', '<-', '>', '>=', '{', '}', '/', '<<',
- '/=', '<<=', '++', '=', ':=', ',', ';', '%', '>>', '%=', '>>=',
- '--', '!', '...', '.', ':', '&^', '&^=');
- PredeclaredTypes: array [0..19] of string =
- ('bool', 'byte', 'complex64', 'complex128', 'error', 'float32', 'float64',
- 'int', 'int8', 'int16', 'int32', 'int64', 'rune', 'string', 'uint', 'uint8',
- 'uint16', 'uint32', 'uint64', 'uintptr');
- PredeclaredFunctions: array [0..14] of string =
- ('append', 'cap', 'close', 'complex', 'copy', 'delete', 'imag', 'len',
- 'make', 'new', 'panic', 'print', 'println', 'real', 'recover');
- {$WARNINGS ON}
- CharLineEnd = #10;
- CommentsLine = '//';
- CommentsOpen = '/*';
- CommentsClose = '*/';
- type
- TStringQuote =
- (
- sqSingle,
- sqDouble,
- sqSingleAndDouble
- );
- TTokenKind = (tkILLEGAL, tkEOF, tkCOMMENT, tkIDENT,
- tkINT, tkFLOAT, tkIMAG, tkCHAR, tkSTRING,
- tkADD, tkSUB, tkMUL, tkQUO, tkREM, tkAND, tkOR, tkXOR, tkSHL, tkSHR,
- tkAND_NOT, tkADD_ASSIGN, tkSUB_ASSIGN, tkMUL_ASSIGN, tkQUO_ASSIGN,
- tkREM_ASSIGN, tkAND_ASSIGN, tkOR_ASSIGN, tkXOR_ASSIGN, tkSHL_ASSIGN,
- tkSHR_ASSIGN, tkAND_NOT_ASSIGN, tkLAND, tkLOR, tkARROW, tkINC, tkDEC,
- tkEQL, tkLSS, tkGTR, tkASSIGN, tkNOT, tkNEQ, tkLEQ, tkGEQ, tkDEFINE,
- tkELLIPSIS, tkLPAREN, tkLBRACK, tkLBRACE, tkCOMMA, tkPERIOD, tkRPAREN,
- tkRBRACK, tkRBRACE, tkSEMICOLON, tkCOLON,
- tkBREAK, tkCASE, tkCHAN, tkCONST, tkCONTINUE, tkDEFAULT, tkDEFER,
- tkELSE, tkFALLTHROUGH, tkFOR, tkFUNC, tkGO, tkGOTO, tkIF, tkIMPORT,
- tkINTERFACE, tkMAP, tkPACKAGE, tkRANGE, tkRETURN, tkSELECT, tkSTRUCT,
- tkSWITCH, tkTYPE, tkVAR, tkSTRIDENT,tkSKIPRESULT, tkUnknown);
- TPrecedence = (LowestPrec = 0 // non-operators
- , UnaryPrec = 6, HighestPrec = 7);
- const
- StringQuote = sqSingleAndDouble;
- type
- PToken = ^TToken;
- TToken = record
- Token: string;
- TextPos: integer;
- LexType: TTokenKind;
- end;
- TTokens = array of TToken;
- //TPTokens = array of PToken;
- { TTokenizer }
- TTokenizer = class(TObject)
- private
- CurrPos: integer;
- CurrTokenPos: integer;
- CurrChar: char;
- CurrToken: string;
- FTokensList: TTokens;
- FTokensCount: integer;
- FSource: string;
- procedure GetNextChar;
- procedure Add(Token: string; Pos: integer; LType: TTokenKind);
- procedure GetNumber;
- procedure GetIdentifier;
- procedure GetOthers;
- function isKeyword(const Ident: string): boolean;
- function isType(const Ident: string): boolean;
- function IdentToTokenKind(const Ident: string): TTokenKind;
- function NextChar: char;
- { function GenerateAnsi(const Str: string): string;}
- function GetToken(ind: integer): TToken;
- procedure SetSource(src: string);
- public
- constructor Create;
- destructor Destroy; override;
- procedure Analyze;
- function GetNextToken: PToken;
- function NextToken: PToken;
- function GetCurrentToken: PToken;
- function GetPreviousToken(const Step: integer): PToken;
- property Token[ind: integer]: TToken read GetToken;
- property TokensCount: integer read FTokensCount;
- property Source: string read FSource write SetSource;
- end;
- implementation
- uses utils;
- { TTokenizer }
- procedure TTokenizer.Add(Token: string; Pos: integer; LType: TTokenKind);
- begin
- if Token = '' then
- Exit;
- Inc(FTokensCount);
- SetLength(FTokensList, FTokensCount);
- FTokensList[FTokensCount - 1].Token := Token;
- FTokensList[FTokensCount - 1].TextPos := Pos;
- FTokensList[FTokensCount - 1].LexType := LType;
- end;
- procedure TTokenizer.Analyze;
- var
- tokenType: TTokenKind;
- begin
- SetLength(FTokensList, 0);
- GetNextChar;
- while CurrChar <> #0 do
- begin
- if CurrPos > Length(FSource) then
- Break;
- case CurrChar of
- 'A' .. 'Z', 'a' .. 'z':
- begin
- CurrToken := '';
- GetIdentifier;
- if isKeyword(CurrToken) then
- tokenType := IdentToTokenKind(CurrToken)
- else
- if isType(LowerCase(CurrToken)) and (Length(CurrToken) > 2) then
- tokenType := IdentToTokenKind(UpperCase(CurrToken))
- else
- tokenType := tkIdent;
- if CurrToken <> '' then
- Add(CurrToken, (CurrPos - 1) - Length(CurrToken), tokenType);
- CurrToken := '';
- end;
- '0' .. '9':
- begin
- CurrToken := '';
- GetNumber;
- if CurrToken <> '' then
- Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), tkInt);
- CurrToken := '';
- end;
- #1 .. #20, ' ':
- begin
- GetNextChar;
- end;
- else
- begin
- CurrToken := '';
- GetOthers;
- if CurrToken <> '' then
- begin
- tokenType := IdentToTokenKind(CurrToken);
- Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), tokenType);
- end
- else
- GetNextChar;
- CurrToken := '';
- //GetNextChar;
- end;
- //GetNextChar();
- end;
- end;
- end;
- function TTokenizer.GetNextToken: PToken;
- begin
- Inc(CurrTokenPos);
- if CurrTokenPos > FTokensCount - 1 then
- result := nil
- else
- result := @FTokensList[CurrTokenPos];
- end;
- function TTokenizer.NextToken: PToken;
- begin
- if CurrTokenPos + 1 > FTokensCount - 1 then
- Result := nil
- else
- Result := @FTokensList[CurrTokenPos+1];
- end;
- function TTokenizer.GetCurrentToken: PToken;
- begin
- if CurrTokenPos > (FTokensCount - 1) then
- Exit;
- Result := @FTokensList[CurrTokenPos];
- end;
- function TTokenizer.GetPreviousToken(const Step: integer): PToken;
- begin
- if (CurrTokenPos = 0) or ((CurrTokenPos - Step) < 0) then
- Exit;
- Result := @FTokensList[CurrTokenPos-step];
- end;
- constructor TTokenizer.Create;
- begin
- inherited;
- FTokensList := nil;
- FTokensCount := 0;
- CurrPos := 0;
- CurrTokenPos := 0;
- CurrChar := #0;
- CurrToken := '';
- end;
- destructor TTokenizer.Destroy;
- begin
- SetLength(FTokensList, 0);
- inherited;
- end;
- procedure TTokenizer.GetIdentifier;
- begin
- CurrToken := CurrChar;
- GetNextChar;
- while CharInSet(CurrChar, IdentifierSymbols) do
- begin
- CurrToken := CurrToken + CurrChar;
- GetNextChar;
- end;
- end;
- function TTokenizer.GetToken(ind: integer): TToken;
- begin
- if ind > (FTokensCount - 1) then
- Exit;
- Result := FTokensList[ind];
- end;
- procedure TTokenizer.GetNextChar;
- begin
- Inc(CurrPos);
- if CurrPos > Length(FSource) then
- CurrChar := #0
- else
- CurrChar := FSource[CurrPos];
- end;
- procedure TTokenizer.GetNumber;
- begin
- CurrToken := CurrChar;
- GetNextChar;
- while CharInSet(CurrChar, Digits) do
- begin
- CurrToken := CurrToken + CurrChar;
- GetNextChar;
- end;
- if CurrChar <> ExponentDot then
- Exit;
- if not CharInSet(NextChar, Digits) then
- Exit;
- CurrToken := CurrToken + ExponentDot;
- GetNextChar;
- while CharInSet(CurrChar, Digits) do
- begin
- CurrToken := CurrToken + CurrChar;
- GetNextChar;
- end;
- if not CharInSet(CurrChar, ExponentE) then
- Exit;
- CurrToken := CurrToken + 'E';
- GetNextChar;
- if CharInSet(CurrChar, Signs) then
- CurrToken := CurrToken + CurrChar;
- GetNextChar;
- while CharInSet(CurrChar, Digits) do
- begin
- CurrToken := CurrToken + CurrChar;
- GetNextChar;
- end;
- end;
- procedure TTokenizer.GetOthers;
- var
- i: integer;
- begin
- case CurrChar of
- ';', '(', ')', '[', ']', ',', '@', '{', '}','_':
- begin
- CurrToken := CurrChar;
- GetNextChar;
- Exit;
- end;
- '!':
- begin
- CurrToken := CurrChar;
- if (NextChar = '=') then
- begin
- GetNextChar;
- CurrToken := CurrToken + CurrChar;
- end;
- Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
- IdentToTokenKind(CurrToken));
- CurrToken := '';
- //GetNextChar;
- end;
- '.':
- begin
- CurrToken := CurrChar;
- if (NextChar = '.') then
- begin
- while NextChar = '.' do
- begin
- GetNextChar;
- CurrToken := CurrToken + CurrChar;
- end;
- end;
- Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
- IdentToTokenKind(CurrToken));
- CurrToken := '';
- //GetNextChar;
- end;
- ':':
- begin
- CurrToken := CurrChar;
- if (NextChar = '=') then
- begin
- GetNextChar;
- CurrToken := CurrToken + CurrChar;
- end;
- Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
- IdentToTokenKind(CurrToken));
- CurrToken := '';
- //GetNextChar;
- end;
- '=':
- begin
- CurrToken := CurrChar;
- if (NextChar = '=') then
- begin
- GetNextChar;
- CurrToken := CurrToken + CurrChar;
- end;
- Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
- IdentToTokenKind(CurrToken));
- CurrToken := '';
- //GetNextChar;
- end;
- '&':
- begin
- CurrToken := CurrChar;
- if (NextChar = '=') or (NextChar = '^') or (NextChar = '&') then
- begin
- GetNextChar;
- CurrToken := CurrToken + CurrChar;
- if (NextChar = '=') then
- begin
- GetNextChar;
- CurrToken := CurrToken + CurrChar;
- end;
- end;
- Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
- IdentToTokenKind(CurrToken));
- CurrToken := '';
- //GetNextChar;
- end;
- '<':
- begin
- CurrToken := CurrChar;
- if (NextChar = '<') or (NextChar = '-') then
- begin
- GetNextChar;
- CurrToken := CurrToken + CurrChar;
- if (NextChar = '=') then
- begin
- GetNextChar;
- CurrToken := CurrToken + CurrChar;
- end;
- end;
- Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
- IdentToTokenKind(CurrToken));
- CurrToken := '';
- //GetNextChar;
- end;
- '>':
- begin
- CurrToken := CurrChar;
- if (NextChar = '>') then
- begin
- GetNextChar;
- CurrToken := CurrToken + CurrChar;
- if (NextChar = '=') then
- begin
- GetNextChar;
- CurrToken := CurrToken + CurrChar;
- end;
- end;
- Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
- IdentToTokenKind(CurrToken));
- CurrToken := '';
- //GetNextChar;
- end;
- '|':
- begin
- CurrToken := CurrChar;
- if (NextChar = '=') or (NextChar = '|') then
- begin
- GetNextChar;
- CurrToken := CurrToken + CurrChar;
- end;
- Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
- IdentToTokenKind(CurrToken));
- CurrToken := '';
- //GetNextChar;
- end;
- '^':
- begin
- CurrToken := CurrChar;
- if (NextChar = '=') then
- begin
- GetNextChar;
- CurrToken := CurrToken + CurrChar;
- end;
- Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
- IdentToTokenKind(CurrToken));
- CurrToken := '';
- //GetNextChar;
- end;
- '%':
- begin
- CurrToken := CurrChar;
- if (NextChar = '=') then
- begin
- GetNextChar;
- CurrToken := CurrToken + CurrChar;
- end;
- Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
- IdentToTokenKind(CurrToken));
- CurrToken := '';
- //GetNextChar;
- end;
- '+':
- begin
- CurrToken := CurrChar;
- if (NextChar = '=') or (NextChar = '+') then
- begin
- GetNextChar;
- CurrToken := CurrToken + CurrChar;
- end;
- Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
- IdentToTokenKind(CurrToken));
- CurrToken := '';
- //GetNextChar;
- end;
- '-':
- begin
- CurrToken := CurrChar;
- if (NextChar = '=') or (NextChar = '-') then
- begin
- GetNextChar;
- CurrToken := CurrToken + CurrChar;
- end;
- Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
- IdentToTokenKind(CurrToken));
- CurrToken := '';
- //GetNextChar;
- end;
- '*':
- begin
- CurrToken := CurrChar;
- if (NextChar = '=') then
- begin
- GetNextChar;
- CurrToken := CurrToken + CurrChar;
- end;
- Add(CurrToken, CurrPos - cardinal(Length(CurrToken)),
- IdentToTokenKind(CurrToken));
- CurrToken := '';
- //GetNextChar;
- end;
- '"':
- begin
- if (StringQuote = sqDouble) or (StringQuote = sqSingleAndDouble) then
- begin
- Add('"', CurrPos, tkStrIdent);
- GetNextChar;
- while CurrChar <> '"' do
- begin
- CurrToken := CurrToken + CurrChar;
- GetNextChar;
- end;
- if CurrToken <> '' then
- Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), tkString);
- CurrToken := '"';
- GetNextChar;
- { Add('"', CurrPos, 2);
- GetNextChar; }
- end;
- Exit;
- end;
- '/':
- begin
- if NextChar = '*' then
- begin
- GetNextChar;
- CurrToken := '/';
- while CurrChar <> '/' do
- begin
- CurrToken := CurrToken + CurrChar;
- GetNextChar;
- end;
- CurrToken := CurrToken + '/';
- if CurrToken <> '' then
- Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), tkComment);
- CurrToken := '';
- //GetNextChar;
- end
- else
- if NextChar = '/' then
- begin
- CurrToken := CurrToken + CurrChar;
- GetNextChar;
- while CurrChar <> #10 do
- begin
- CurrToken := CurrToken + CurrChar;
- GetNextChar;
- end;
- if CurrToken <> '' then
- Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), tkComment);
- CurrToken := '';
- //GetNextChar;
- end
- else
- if (NextChar = '=') then
- begin
- GetNextChar;
- CurrToken := CurrToken + CurrChar;
- if CurrToken <> '' then
- Add(CurrToken, CurrPos - cardinal(Length(CurrToken)), tkComment);
- CurrToken := '';
- //GetNextChar;
- end;
- end
- else
- Exit;
- end;
- end;
- function TTokenizer.isKeyword(const Ident: string): boolean;
- var
- i: integer;
- begin
- Result := False;
- for i := low(Keywords) to high(Keywords) do
- if eq(Keywords[i], Ident) then
- begin
- Result := True;
- break;
- end;
- end;
- function TTokenizer.isType(const Ident: string): boolean;
- var
- i: integer;
- begin
- Result := False;
- for i := low(PredeclaredTypes) to high(PredeclaredTypes) do
- if Pos(ident, predeclaredTypes[i]) > 0 then
- begin
- Result := True;
- break;
- end;
- end;
- function TTokenizer.IdentToTokenKind(const Ident: string): TTokenKind;
- begin
- Result := tkUnknown;
- case ident of
- 'ILLEGAL': Result := tkILLEGAL;
- 'EOF': Result := tkEOF;
- 'COMMENT': Result := tkCOMMENT;
- 'IDENT': Result := tkIDENT;
- 'INT': Result := tkINT;
- 'FLOAT': Result := tkFLOAT;
- 'IMAG': Result := tkIMAG;
- 'CHAR': Result := tkCHAR;
- 'STRING': Result := tkSTRING;
- '_': Result := tkSKIPRESULT;
- '+': Result := tkADD;
- '-': Result := tkSUB;
- '*': Result := tkMUL;
- '/': Result := tkQUO;
- '%': Result := tkREM;
- '&': Result := tkAND;
- '|': Result := tkOR;
- '^': Result := tkXOR;
- '<<': Result := tkSHL;
- '>>': Result := tkSHR;
- '&^': Result := tkAND_NOT;
- '+=': Result := tkADD_ASSIGN;
- '-=': Result := tkSUB_ASSIGN;
- '*=': Result := tkMUL_ASSIGN;
- '/=': Result := tkQUO_ASSIGN;
- '%=': Result := tkREM_ASSIGN;
- '&=': Result := tkAND_ASSIGN;
- '|=': Result := tkOR_ASSIGN;
- '^=': Result := tkXOR_ASSIGN;
- '<<=': Result := tkSHL_ASSIGN;
- '>>=': Result := tkSHR_ASSIGN;
- '&^=': Result := tkAND_NOT_ASSIGN;
- '&&': Result := tkLAND;
- '||': Result := tkLOR;
- '<-': Result := tkARROW;
- '++': Result := tkINC;
- '--': Result := tkDEC;
- '==': Result := tkEQL;
- '<': Result := tkLSS;
- '>': Result := tkGTR;
- '=': Result := tkASSIGN;
- '!': Result := tkNOT;
- '!=': Result := tkNEQ;
- '<=': Result := tkLEQ;
- '>=': Result := tkGEQ;
- ':=': Result := tkDEFINE;
- '...': Result := tkELLIPSIS;
- '(': Result := tkLPAREN;
- '[': Result := tkLBRACK;
- '{': Result := tkLBRACE;
- '.': Result := tkPERIOD;
- ',': Result := tkCOMMA;
- ')': Result := tkRPAREN;
- ']': Result := tkRBRACK;
- '}': Result := tkRBRACE;
- ';': Result := tkSEMICOLON;
- ':': Result := tkCOLON;
- '"': Result := tkSTRIDENT;
- 'break': Result := tkBREAK;
- 'case': Result := tkCASE;
- 'chan': Result := tkCHAN;
- 'const': Result := tkCONST;
- 'continue': Result := tkCONTINUE;
- 'default': Result := tkDEFAULT;
- 'defer': Result := tkDEFER;
- 'else': Result := tkELSE;
- 'fallthrough': Result := tkFALLTHROUGH;
- 'for': Result := tkFOR;
- 'func': Result := tkFUNC;
- 'go': Result := tkGO;
- 'goto': Result := tkGOTO;
- 'if': Result := tkIF;
- 'import': Result := tkIMPORT;
- 'interface': Result := tkINTERFACE;
- 'map': Result := tkMAP;
- 'package': Result := tkPACKAGE;
- 'range': Result := tkRANGE;
- 'return': Result := tkRETURN;
- 'select': Result := tkSELECT;
- 'struct': Result := tkSTRUCT;
- 'switch': Result := tkSWITCH;
- 'type': Result := tkTYPE;
- 'var': Result := tkVAR;
- end;
- end;
- function TTokenizer.NextChar: char;
- begin
- if CurrPos + 1 > Length(FSource) then
- Result := #0
- else
- Result := FSource[CurrPos + 1];
- end;
- procedure TTokenizer.SetSource(src: string);
- var
- I, j: integer;
- tmp: string;
- begin
- try
- tmp := src;
- finally
- Self.FSource := tmp;
- tmp := '';
- end;
- end;
- end.
Advertisement
Add Comment
Please, Sign In to add comment