Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #! /usr/bin/python
- import sys
- import types
- import re
def usage():
    """Print the command-line help text to standard output.

    The program name in the first line is taken from ``sys.argv[0]``.
    """
    print("""Usage: {} [word] ... [r_num] [r_id]
Takes all the words and regular expressions as arguments
or reads them as input if no arguments are supplied.
In the second case pass one expression per line
and finish with a new line like below:
[regex1]
[regex2]
...
[regexN]
""".format(sys.argv[0]))
def create_regex(expressions, regex_count=2):
    """Split ``expressions`` into literal words and compiled regexes.

    The last ``regex_count`` entries are compiled as regular expressions;
    every entry before them is kept unchanged as a literal word. When
    ``expressions`` holds fewer than ``regex_count`` entries, all of them
    are compiled and the word list is empty.

    Args:
        expressions: sequence of strings, literal words first, regular
            expressions last.
        regex_count: how many trailing entries are regular expressions
            (defaults to 2, the original fixed split).

    Returns:
        A tuple ``(re_objects, words)`` containing both lists.

    Raises:
        ValueError: if ``regex_count`` is negative.
        re.error: if one of the trailing entries is not a valid pattern.
    """
    if regex_count < 0:
        raise ValueError("regex_count must not be negative")
    # Clamp at 0 so short inputs (and regex_count == 0) slice correctly;
    # a plain negative index would wrap around for regex_count == 0.
    split_at = max(len(expressions) - regex_count, 0)
    words = expressions[:split_at]
    re_objects = [re.compile(exp) for exp in expressions[split_at:]]
    return (re_objects, words)
class Token(object):
    """A word paired with the identifier that was assigned to it."""

    def __init__(self, word, identifier):
        """Store the token text and its identifier.

        Args:
            word: the text of the token.
            identifier: the value identifying ``word``.
        """
        self.word = word
        self.identifier = identifier

    def __str__(self):
        """Return the token formatted as ``(word, identifier)``."""
        return "({}, {})".format(self.word, self.identifier)

    def __repr__(self):
        """Return an unambiguous representation for debugging output."""
        return "{}({!r}, {!r})".format(
            type(self).__name__, self.word, self.identifier
        )
class TokenManager(object):
    """Collect tokens in input order and number each distinct word once."""

    def __init__(self):
        """Create a manager with no known words and no tokens."""
        # Maps each distinct word to the identifier assigned to it.
        self.tokenDict = {}
        # Every added token, in the order add_token() was called.
        self.tokenList = []

    def _identifier_for(self, word):
        """Return the identifier of ``word``, assigning the next one if new.

        A new word gets the number of words known so far, so identifiers
        count up from 0 in order of first appearance.
        """
        # setdefault only inserts for unknown words; unlike a bare
        # ``except:`` around the lookup it cannot swallow unrelated errors.
        return self.tokenDict.setdefault(word, len(self.tokenDict))

    def add_token(self, word):
        """Append a token for ``word`` to the token list.

        Repeated words produce a new list entry that reuses the identifier
        of the first occurrence.
        """
        self.tokenList.append(Token(word, self._identifier_for(word)))

    def print_token(self):
        """Print a ``Token:`` header followed by one line per token."""
        print("Token:")
        for token in self.tokenList:
            print(token)
def _matches_entire_word(re_object, word):
    """Return True if ``re_object`` matches ``word`` from start to end."""
    fullmatch = getattr(re_object, "fullmatch", None)
    if fullmatch is not None:
        return fullmatch(word) is not None
    # Python 2 patterns have no fullmatch(): anchor a copy at the end instead.
    anchored = re.compile(r"(?:%s)\Z" % re_object.pattern, re_object.flags)
    return anchored.match(word) is not None


def check_word(re_objects, words, word):
    """Decide whether ``word`` is an accepted word.

    A word is accepted when it is one of the literal ``words`` or when at
    least one of the compiled patterns matches it completely. Matching only
    a prefix is not enough, so ``12ab`` is rejected by ``[0-9]+``.

    Args:
        re_objects: iterable of compiled regular expression objects.
        words: collection of literal words accepted as-is.
        word: the candidate string to check.

    Returns:
        True if the word is accepted, False otherwise.
    """
    if word in words:
        return True
    if word == "":
        # An empty word is never accepted through a pattern, even one that
        # can match the empty string.
        return False
    return any(_matches_entire_word(re_object, word) for re_object in re_objects)
def _read_lines(read_line):
    """Yield lines from ``read_line()`` until a blank line or end of input.

    Lines are produced one at a time, so the caller can react to each line
    before the next one is requested.
    """
    while True:
        try:
            line = read_line()
        except EOFError:
            # A closed stdin ends the input just like a blank line does.
            return
        if line == "":
            return
        yield line


def main():
    """Read the expressions, then tokenize standard input.

    Expressions come from the command-line arguments, or from standard
    input (one per line, terminated by a blank line) when no arguments are
    given. Afterwards input lines are read until a blank line; each
    space-separated word must pass ``check_word`` and is recorded as a
    token. The first rejected word is reported and ends the program with
    exit status -1 without printing the tokens collected so far.
    """
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3

    if len(sys.argv) == 1:
        # read expressions from stdin
        expressions = list(_read_lines(read_line))
    else:
        expressions = sys.argv[1:]
    if not expressions:
        # Nothing could ever be accepted without expressions, so stop
        # after showing the help text instead of reading input anyway.
        usage()
        sys.exit(-1)

    # two lists: for regular expressions and for constant words/characters
    re_objects, words = create_regex(expressions)
    tm = TokenManager()

    print("Now reading input...")
    for line in _read_lines(read_line):
        for word in line.split(" "):
            if not word:
                # Consecutive, leading or trailing spaces produce empty
                # pieces; they are separators, not words to reject.
                continue
            if not check_word(re_objects, words, word):
                print("Word not accepted: {}".format(word))
                sys.exit(-1)
            tm.add_token(word)
    tm.print_token()


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement