Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <string>
#include <vector>
// Container aliases used by the tokenizer state to store token payloads.
using string_list = std::vector<std::string>;  // identifier names and string constants
using int_list = std::vector<long long>;       // integer constants
using float_list = std::vector<long double>;   // floating-point constants
// Returns a std::string holding exactly `length` characters copied from
// `value`. The characters are copied verbatim (embedded '\0' included), so
// `value` need not be null-terminated but must reference at least `length`
// readable characters.
std::string substr(const char* value, size_t length){
    return std::string(value, length);
}
// Parses the first `length` characters of `value` as a base-10 integer.
//
// The characters are first copied into a null-terminated std::string because
// `value` is a slice of a larger buffer and is not terminated at `length`.
// Leading whitespace and an optional sign are accepted; parsing stops at the
// first character that is not part of the number. Returns 0 when no digits
// are found. Uses strtoll instead of atoll so that a literal too large for
// long long saturates at LLONG_MIN / LLONG_MAX instead of being undefined.
long long string_to_int(const char* value, size_t length){
    const std::string text(value, length);
    return strtoll(text.c_str(), 0, 10);
}
// Parses the first `length` characters of `value` as a floating-point number.
//
// The characters are first copied into a null-terminated std::string because
// `value` is a slice of a larger buffer and is not terminated at `length`.
// Parsing stops at the first character that cannot be part of the number;
// returns 0 when nothing can be parsed. Uses strtold rather than atof: atof
// parses to double, which would discard the extra precision of the
// long double return type.
long double string_to_float(const char* value, size_t length){
    const std::string text(value, length);
    return strtold(text.c_str(), 0);
}
- void int_list_add(int_list& list, long long value){
- list.push_back(value);
- }
- void string_list_add(string_list& list, const char* value, size_t length){
- list.push_back(substr(value, length));
- }
- void float_list_add(float_list& list, long double value){
- list.push_back(value);
- }
- size_t int_list_last(int_list& list){
- return list.size();
- }
- size_t string_list_last(string_list& list){
- return list.size();
- }
- size_t float_list_last(float_list& list){
- return list.size();
- }
- typedef struct{
- string_list identifiers;
- string_list constants_string;
- int_list constants_int;
- float_list constants_float;
- size_t id;
- } *state, state_value;
- state tok_state_create(){
- state ret = new state_value;
- ret->id = 0;
- return ret;
- }
- void tok_state_destroy(state t_state){
- delete t_state;
- }
- const char* tok_state_read_identifier(state t_state, size_t id){
- return t_state->identifiers[id - 1].c_str();
- }
- const char* tok_state_read_string(state t_state, size_t id){
- return t_state->constants_string[id - 1].c_str();
- }
- long long tok_state_read_int(state t_state, size_t id){
- return t_state->constants_int[id - 1];
- }
- long double tok_state_read_float(state t_state, size_t id){
- return t_state->constants_float[id - 1];
- }
// Punctuation tokens. A token's id is its index in this table. Index 0 is a
// placeholder so that id 0 can mean "no token", and the table is terminated
// by a null pointer.
const char* punct_tokens[] = {
    "Not A Token (Dummy)",               //  0
    ".", ",", "<", "<<", ">", ">>",      //  1 -  6
    ";", "+", "-", "/", "*", "!",        //  7 - 12
    "%", "^", "&", "(", ")", "=",        // 13 - 18
    "==", "[", "]", "{", "}", "?",       // 19 - 24
    ":", "|", "||", "&&", "~",           // 25 - 29
    nullptr                              // end-of-table sentinel
};
// Keyword tokens. A keyword's id is its index in this table plus 100.
// Index 0 is a placeholder and the table is terminated by a null pointer.
const char* key_tokens[] = {
    "Not A Token (Dummy)",  // 100 (placeholder)
    "if",                   // 101
    "while",                // 102
    "do",                   // 103
    "then",                 // 104
    "end",                  // 105
    nullptr                 // end-of-table sentinel
};
// Ids of the value-carrying token classes. They start at 500, above the
// ranges used for punctuation (below 100) and keywords (100-199).
enum tok_type{
    TOK_TYPE_INTEGER = 500,
    TOK_TYPE_FLOAT = 501,
    TOK_TYPE_STRING = 502,
    TOK_TYPE_IDENTIFIER = 503,
    TOK_TYPE_NONE = 504
};
- const char* get_token_from_id(size_t id){
- if (id < 100){
- return punct_tokens[id];
- }
- if (id < 200){
- return key_tokens[id - 100];
- }
- if (id >= 500){
- switch (id){
- case TOK_TYPE_INTEGER: return "Integer Constant";
- case TOK_TYPE_FLOAT: return "Float Constant ";
- case TOK_TYPE_STRING: return "String Constant ";
- case TOK_TYPE_IDENTIFIER: return "Identifier ";
- case TOK_TYPE_NONE: return "Unknown ";
- default:
- break;
- }
- }
- return "Not A Token (Dummy)";
- }
// Returns 1 when `c` may appear in an identifier (a letter according to
// isalpha, or an underscore) and 0 otherwise. Digits are not identifier
// characters for this tokenizer.
int is_identifier_char(char c){
    // isalpha requires a value representable as unsigned char (or EOF);
    // passing a negative plain char is undefined behavior.
    if (c == '_' || isalpha(static_cast<unsigned char>(c))){
        return 1;
    }
    return 0;
}
- size_t read_punct_token(const char* input, size_t size){
- size_t max_len = 0;
- size_t token_id = 0;
- for (size_t i = 1; punct_tokens[i] != 0; ++i){
- size_t len = strlen(punct_tokens[i]);
- if (len > max_len && len <= size && strncmp(punct_tokens[i], input, len) == 0){
- max_len = len;
- if (i == 1 && size > 1 && isdigit(input[1])){
- return 0; //Special case for floats
- }
- token_id = i;
- }
- }
- return token_id;
- }
- size_t read_key_token(const char* input, size_t size){
- size_t max_len = 0;
- size_t token_id = 0;
- for (size_t i = 1; key_tokens[i] != 0; ++i){
- size_t len = strlen(key_tokens[i]);
- if (len > max_len && len <= size && strncmp(key_tokens[i], input, len) == 0){
- max_len = len;
- token_id = i + 100;
- }
- }
- return token_id;
- }
- size_t is_punct_token_char(char c){
- for (size_t i = 1; punct_tokens[i] != 0; ++i){
- if (punct_tokens[i][0] == c){
- return 1;
- }
- }
- return 0;
- }
- void add_token(state t_state, tok_type type, const char* string, size_t length){
- switch (type){
- case TOK_TYPE_INTEGER:
- int_list_add(t_state->constants_int, string_to_int(string, length));
- t_state->id = int_list_last(t_state->constants_int);
- break;
- case TOK_TYPE_FLOAT:
- float_list_add(t_state->constants_float, string_to_float(string, length));
- t_state->id = float_list_last(t_state->constants_float);
- break;
- case TOK_TYPE_STRING:
- string_list_add(t_state->constants_string, string, length);
- t_state->id = string_list_last(t_state->constants_string);
- break;
- case TOK_TYPE_IDENTIFIER:
- string_list_add(t_state->identifiers, string, length);
- t_state->id = string_list_last(t_state->identifiers);
- break;
- default:
- //Do some error here
- break;
- }
- }
- size_t get_token(state t_state, char** input, size_t *size){
- if (t_state->id != 0){
- size_t id = t_state->id;
- t_state->id = 0;
- return id;
- }
- char* base = *input;
- size_t padding = 0;
- size_t length = 0;
- tok_type type = TOK_TYPE_NONE;
- while (*size > 0){
- if (isspace(*base)){
- base++;
- (*size)--;
- }
- else{
- break;
- }
- }
- size_t tok = read_punct_token(base, *size);
- if (tok){
- size_t len = +strlen(get_token_from_id(tok));
- *input = base + len;
- *size -= len;
- return tok;
- }
- tok = read_key_token(base, *size);
- if (tok){
- size_t len = +strlen(get_token_from_id(tok));
- *input = base + len;
- *size -= len;
- return tok;
- }
- while (*size - length > 0){
- if (length == 0 && type == TOK_TYPE_NONE){
- if (is_identifier_char(*base)){
- type = TOK_TYPE_IDENTIFIER;
- length++;
- }
- else if (*base == '"'){
- type = TOK_TYPE_STRING;
- padding = 1;
- base++;
- (*size)--;
- }
- else if (*base == '.' && *size > 1 && isdigit(base[1])){
- type = TOK_TYPE_FLOAT;
- }
- else if (isdigit(*base)){
- type = TOK_TYPE_INTEGER;
- }
- else if (is_punct_token_char(*base)){
- tok = read_punct_token(base, *size);
- if (tok){
- size_t len = strlen(punct_tokens[tok]);
- *input += len;
- *size -= len;
- return tok;
- }
- else{
- //do error
- }
- }
- }
- else{
- if (!isspace(base[length]) || type == TOK_TYPE_STRING){
- switch (type){
- case TOK_TYPE_INTEGER:
- if (isdigit(base[length])){
- length++;
- continue;
- }
- else if (base[length] == '.' || tolower(base[length]) == 'e'){
- type = TOK_TYPE_FLOAT;
- length++;
- continue;
- }
- break;
- case TOK_TYPE_FLOAT:
- if (isdigit(base[length]) || base[length] == '.' || base[length] == 'e'){
- length++;
- continue;
- }
- break;
- case TOK_TYPE_STRING:
- if (base[length] != '"'){
- length++;
- continue;
- }
- break;
- case TOK_TYPE_IDENTIFIER:
- if (is_identifier_char(base[length])){
- length++;
- continue;
- }
- break;
- default:
- break;
- }
- }
- //We only get here if this is a space or any of the switch cases didn't continue.
- add_token(t_state, type, base, length);
- *input = base + length + padding;
- *size -= length + padding;
- return type;
- }
- }
- *input = base + length + padding;
- *size -= length + padding;
- return 0;
- }
- int main(){
- const char* input = "if(1+1==4)then print"hi!";end";
- state s = tok_state_create();
- size_t size = strlen(input);
- size_t token;
- size_t token_prev = 0;
- printf("TokentMeaningnn");
- while ((token = get_token(s, (char**)&input, &size)) != 0){
- if (token_prev < 500){
- if (token < 500){
- printf("%dt%sn", token, get_token_from_id(token));
- }
- else{
- printf("%dt%s #", token, get_token_from_id(token));
- }
- }
- else{
- printf("%dt", token);
- switch (token_prev){
- case TOK_TYPE_IDENTIFIER: printf("%sn", tok_state_read_identifier(s, token)); break;
- case TOK_TYPE_STRING: printf("%sn", tok_state_read_string(s, token)); break;
- case TOK_TYPE_INTEGER: printf("%dn", tok_state_read_int(s, token)); break;
- case TOK_TYPE_FLOAT: printf("%fn", tok_state_read_float(s, token)); break;
- }
- }
- token_prev = token;
- }
- tok_state_destroy(s);
- }
- Token Meaning
- 101 if
- 16 (
- 500 Integer Constant #1 1
- 8 +
- 500 Integer Constant #2 1
- 19 ==
- 500 Integer Constant #3 4
- 17 )
- 104 then
- 503 Identifier #1 print
- 502 String Constant #1 hi!
- 7 ;
- 105 end
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement